From f1182c8bc123e9b6690f3fbdc0021468e60ba05d Mon Sep 17 00:00:00 2001 From: Santhosh Thottingal Date: Tue, 12 May 2009 20:40:32 +0530 Subject: Various fixes on silpa and new modules-Inexact search --- silpa/common/modulemanager.py | 8 +- silpa/modules/__init__.py | 1 + silpa/modules/dictionary/dictionary.py | 8 +- silpa/modules/fortune/fortune.py | 27 ++-- silpa/modules/hyphenator/hyphenator.py | 3 +- silpa/modules/hyphenator/rules/hyph_bn_IN.dic | 5 +- silpa/modules/hyphenator/rules/hyph_ml_IN.dic | 2 +- silpa/modules/inexactsearch/__init__.py | 4 + silpa/modules/inexactsearch/inexactsearch.py | 173 ++++++++++++++++++++++++++ silpa/modules/payyans/payyans.py | 3 +- silpa/modules/transliterator/transliterate.py | 51 +++++++- silpa/silpa.conf | 7 +- 12 files changed, 259 insertions(+), 33 deletions(-) create mode 100644 silpa/modules/inexactsearch/__init__.py create mode 100644 silpa/modules/inexactsearch/inexactsearch.py (limited to 'silpa') diff --git a/silpa/common/modulemanager.py b/silpa/common/modulemanager.py index a3e9f1b..c6185dd 100644 --- a/silpa/common/modulemanager.py +++ b/silpa/common/modulemanager.py @@ -35,15 +35,15 @@ class ModuleManager: def getModulesInfoAsHTML(self): module_dict=getModulesList () response = "

Available Modules

" - response = response+"" + response = response+"
ModuleDescriptionStatus
" for action in module_dict: module_instance=self.getModuleInstance(action) if(module_instance!=None): - response = response+"" - response = response+"" + response = response+"" + response = response+"" else: response = response+"" - response = response+"" + response = response+"" return response+"
ModuleDescription
"+module_instance.get_module_name()+""+module_instance.get_info()+"OK
"+module_instance.get_module_name()+""+module_instance.get_info()+"
"+action.replace("_"," ")+"Error while retrieving module detailsFail
Error while retrieving module details
" if __name__ == '__main__': mm=ModuleManager() diff --git a/silpa/modules/__init__.py b/silpa/modules/__init__.py index 4fe98d1..78f7b19 100644 --- a/silpa/modules/__init__.py +++ b/silpa/modules/__init__.py @@ -10,4 +10,5 @@ from fortune import * from inexactsearch import * from dictionary import * from anagram import * +from normalizer import * diff --git a/silpa/modules/dictionary/dictionary.py b/silpa/modules/dictionary/dictionary.py index 7ba877a..4de0f76 100644 --- a/silpa/modules/dictionary/dictionary.py +++ b/silpa/modules/dictionary/dictionary.py @@ -28,6 +28,7 @@ import pickle class Dictionary(SilpaModule): def lookup_en_ml(self, key): + key=key.lower() self.dictFile=os.path.dirname(__file__) + "/data/dict.dat" pickled_dict=open(self.dictFile,'r') self.dictionary=pickle.load(pickled_dict) @@ -44,14 +45,15 @@ class Dictionary(SilpaModule):

English Malayalam Dictionary

Enter the word to lookup in the dictionary

-
+ - + +
""" if(form.has_key('word')): - search_key = form['word'].value.decode('utf-8') + search_key = form['word'].value response=response % search_key response = response+"

Search Results

" if(search_key==None): diff --git a/silpa/modules/fortune/fortune.py b/silpa/modules/fortune/fortune.py index cf31aad..bc83b51 100644 --- a/silpa/modules/fortune/fortune.py +++ b/silpa/modules/fortune/fortune.py @@ -1,7 +1,7 @@ # Fortune # -*- coding: utf-8 -*- # -# Copyright © 2008 Santhosh Thottingal +# Copyright © 2009 Santhosh Thottingal # Released under the GPLV3+ license import os,random @@ -13,24 +13,24 @@ class Fortune(SilpaModule): for line in infile: line=line.decode("utf-8") if line == "%\n": - yield result - result = [] + continue else: if(pattern==None): result.append(line) else: - if(line.find(pattern)==-1): + if(line.find(pattern)>0): result.append(line) if result: - yield result + return result + - def fortune_ml(self, word): - filename="./modules/fortune/database/fortune-ml" + def fortune_ml(self, pattern): + filename = os.path.join(os.path.dirname(__file__), 'database/fortune-ml') """ Pick a random fortune from a file """ - for index, fortune in enumerate(self.fortunes(file(filename),None)): - if random.random() < (1.0 / (index+1)): - chosen = fortune - + fortunes_list=self.fortunes(file(filename),pattern) + chosen="" + if fortunes_list: + chosen= random.choice(fortunes_list) return "".join(chosen) def process(self, form): @@ -46,9 +46,10 @@ class Fortune(SilpaModule): """ if(form.has_key('input_text')): text = form['input_text'].value .decode('utf-8') + response=response % text else: - text="" - response=response % text + text= None + response=response % "" result = self.fortune_ml(text) response = response+"

Random Quote

" response = response+""+result+"" diff --git a/silpa/modules/hyphenator/hyphenator.py b/silpa/modules/hyphenator/hyphenator.py index f5e8fe2..889aee3 100644 --- a/silpa/modules/hyphenator/hyphenator.py +++ b/silpa/modules/hyphenator/hyphenator.py @@ -234,9 +234,8 @@ class Hyphenator(SilpaModule): You can give the text in any language and even with mixed language

- + -
""" diff --git a/silpa/modules/hyphenator/rules/hyph_bn_IN.dic b/silpa/modules/hyphenator/rules/hyph_bn_IN.dic index 52acee5..42dca20 100644 --- a/silpa/modules/hyphenator/rules/hyph_bn_IN.dic +++ b/silpa/modules/hyphenator/rules/hyph_bn_IN.dic @@ -37,8 +37,9 @@ UTF-8 ৌ1 ৗ1 ্2 -ঃ1 -ং1 +2ঃ1 +2ং1 +2ঁ1 1ন ন্2 2ন্‍ diff --git a/silpa/modules/hyphenator/rules/hyph_ml_IN.dic b/silpa/modules/hyphenator/rules/hyph_ml_IN.dic index 44b50b9..36a70f2 100644 --- a/silpa/modules/hyphenator/rules/hyph_ml_IN.dic +++ b/silpa/modules/hyphenator/rules/hyph_ml_IN.dic @@ -43,7 +43,7 @@ UTF-8 ൌ1 ൗ1 ്2 -ഃ1 +2ഃ1 2ം1 1ന ന്2 diff --git a/silpa/modules/inexactsearch/__init__.py b/silpa/modules/inexactsearch/__init__.py new file mode 100644 index 0000000..d47e9c0 --- /dev/null +++ b/silpa/modules/inexactsearch/__init__.py @@ -0,0 +1,4 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- +import inexactsearch + diff --git a/silpa/modules/inexactsearch/inexactsearch.py b/silpa/modules/inexactsearch/inexactsearch.py new file mode 100644 index 0000000..0d1f35b --- /dev/null +++ b/silpa/modules/inexactsearch/inexactsearch.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- +# Paralperu +# Copyright 2008 Santhosh Thottingal +# http://www.smc.org.in +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+# +# If you find any bugs or have any suggestions email: santhosh.thottingal@gmail.com +# URL: http://www.smc.org.in + + +import sys +import re +from common import * + +class ApproximateSearch(SilpaModule): + + def syllabalize_ml(self, text): + signs = [ + u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40', u'\u0d41', + u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46', u'\u0d47', u'\u0d48', + u'\u0d4a', u'\u0d4b', u'\u0d4c', u'\u0d4d'] + limiters = ['.','\"','\'','`','!',';',',','?'] + + chandrakkala = u'\u0d4d' + lst_chars = [] + for char in text: + if char in limiters: + lst_chars.append(char) + elif char in signs: + lst_chars[-1] = lst_chars[-1] + char + else: + try: + if lst_chars[-1][-1] == chandrakkala: + lst_chars[-1] = lst_chars[-1] + char + else: + lst_chars.append(char) + except IndexError: + lst_chars.append(char) + + return lst_chars + + + def bigram_search(self, str1, str2, syllable_search=False): + """Return approximate string comparator measure (between 0.0 and 1.0) + using bigrams. + USAGE: + score = bigram(str1, str2) + + ARGUMENTS: + str1 The first string + str2 The second string + + DESCRIPTION: + Bigrams are two-character sub-strings contained in a string. For example, + 'peter' contains the bigrams: pe,et,te,er. + + This routine counts the number of common bigrams and divides by the + average number of bigrams. The resulting number is returned. + """ + + # Quick check if the strings are the same - - - - - - - - - - - - - - - - - - + # + if (str1 == str2): + result_string = "
"+str1+ "
" + return result_string + + bigr1 = [] + bigr2 = [] + + # Make a list of bigrams for both strings - - - - - - - - - - - - - - - - - - + # + for i in range(1,len(str1)): + bigr1.append(str1[i-1:i+1]) + for i in range(1,len(str2)): + bigr2.append(str2[i-1:i+1]) + + # Compute average number of bigrams - - - - - - - - - - - - - - - - - - - - - + # + average = (len(bigr1)+len(bigr2)) / 2.0 + if (average == 0.0): + return str1 + + # Get common bigrams - - - - - - - - - - - - - - - - - - - - - - - - - - - - + # + common = 0.0 + + if (len(bigr1) < len(bigr2)): # Count using the shorter bigram list + short_bigr = bigr1 + long_bigr = bigr2 + else: + short_bigr = bigr2 + long_bigr = bigr1 + + for b in short_bigr: + if (b in long_bigr): + common += 1.0 + long_bigr[long_bigr.index(b)] = [] # Mark this bigram as counted + + w = common / average + if(w>=0.6): + result_string = "
0.4) & (w<0.6)): + result_string = "
"+str1+ "
" + return result_string + def process(self,form): + response = """ +

Inexact Search

+

The search performed by search engines on Indic text is not effective. + It does not take care of the inflective or agglutinative nature of the language. + This application tries to solve that by using an inexact search algorithm based on maximum common bigram algorithm. + +

+

Enter the text for searching in the below text area. +

+
+ +
+ + +
+
+ """ + if(form.has_key('input_text')): + text = action=form['input_text'].value .decode('utf-8') + if(form.has_key('search_key')): + key = action=form['search_key'].value .decode('utf-8') + response=response % (text,key) + words=text.split(" ") + response = response+"

Search Results

" + response = response+"

Words in green are with exact match. Words in Yellow are with approximate Match." + response = response+" Move your mouse pointer over the words to get more information on matching.

" + else: + response = response+ "Enter a string to search." + return response % (text,"") + for word in words: + word=word.strip() + if(word>""): + response = response+ self.bigram_search(word, key) + response = response+ "
 
" + else: + response=response % ("","") + return response + def get_module_name(self): + return "Approximate Search" + def get_info(self): + return "Approximate Search for a string in the given text. Based on bigram search algorithm" + +def getInstance(): + return ApproximateSearch() diff --git a/silpa/modules/payyans/payyans.py b/silpa/modules/payyans/payyans.py index 1702307..8751347 100644 --- a/silpa/modules/payyans/payyans.py +++ b/silpa/modules/payyans/payyans.py @@ -273,7 +273,7 @@ class Payyans(SilpaModule):

Enter the text for detecting the language in the below text area.

-
+
Select Font : -
""" diff --git a/silpa/modules/transliterator/transliterate.py b/silpa/modules/transliterator/transliterate.py index 206a6bf..1b534cf 100644 --- a/silpa/modules/transliterator/transliterate.py +++ b/silpa/modules/transliterator/transliterate.py @@ -1,6 +1,6 @@ #! /usr/bin/env python # -*- coding: utf-8 -*- -# Paralperu +# Any Indian Language to any other Indian language transliterator # Copyright 2008 Santhosh Thottingal # http://www.smc.org.in # @@ -23,6 +23,48 @@ from common import * class Transliterator(SilpaModule): + def transliterate_ml_en(self, word): + virama=u"്" + #TODO: how to make this more generic so that more languages can be handled here? + #idea1: transliterate any langauge to a common language say hindi and the n do conversion? + #existing transliterate.py can be used? + #idea2: Have dictionaries for each language like english_xx_dict ? + #TODO: complete this + english_ml_dict={u'അ':'a',u'ആ':'a',u'ഇ':'a',u'ഈ':'a',u'ഉ':'a',u'ഊ':'a',u'ഋ':'a',\ + u'എ':'a',u'ഏ':'a',u'ഐ':'a',u'ഒ':'a',u'ഓ':'a',u'ഔ':'a',\ + u'ക':'k',u'ഖ':'kh',u'ഗ':'g',u'ഘ':'gh',u'ങ്ങ':'ng',\ + u'ച':'ch',u'ഛ':'chh',u'ജ':'j',u'ഝ':'jhh',u'ഞ':'nj',\ + u'ട':'t',u'ഠ':'th',u'ഡ':'d',u'ഢ':'dh',u'ണ':'n',\ + u'ത':'th',u'ഥ':'th',u'ദ':'d',u'ധ':'dh',u'ന':'n',\ + u'പ':'p',u'ഫ':'ph',u'ബ':'b',u'ഭ':'bh',u'മ':'m',\ + u'യ':'y',u'ര':'r',u'ല':'l', u'വ':'v', u'റ':'r',\ + u'ശ':'sa',u'ഷ':'sh',u'സ':'s', u'ഹ':'h',u'ള':'l',u'ഴ':'zh',\ + u'ാ':'a',u'ി':'i' ,u'ീ':'ee' ,u'ു':'u',\ + u'ൂ':'uu',u'ൃ':'ri' ,u'െ':'e' ,u'േ':'e',\ + u'ൈ':'ai',u'ൊ':'o' ,u'ോ':'oo' ,u'ൗ':'au'} + word_length =len(word) + index=0 + tx_string="" + while index""): src_lang_code=ld.detect_lang(word)[word] - tx_str = tx_str + if((target_lang_code=="en_US") and (src_lang_code=="ml_IN")): + tx_str=tx_str + self.transliterate_ml_en(word) + continue for chr in word: offset=ord(chr) + self.getOffset(src_lang_code, target_lang_code) if(offset>0): @@ -60,7 +104,7 @@ class Transliterator(SilpaModule): You can give the text in any language and even with mixed 
language

-
+
diff --git a/silpa/silpa.conf b/silpa/silpa.conf index 068112c..a70a482 100644 --- a/silpa/silpa.conf +++ b/silpa/silpa.conf @@ -1,13 +1,13 @@ # Silpa configuration file # This is a comment # Global properties -SILPA_BASE_URL=http://planet.smc.org.in/exp/silpa/index.py +SILPA_BASE_URL=http://smc.org.in/silpa SILPA_TEMPLATE=./templates/default/silpa.html SILPA_SITE_NAME=Silpa SILPA_LANGUAGE=Silpa SILPA_SITE_ADMIN_NAME=Santhosh SILPA_SITE_ADMIN_EMAIL=santhosh@silpa.org -SILPA_SITE_COPYRIGHT= Copyright 2008-2009. Silpa Team. All rights Reserved. +SILPA_SITE_COPYRIGHT= Copyright 2008-2009. Silpa Team. All rights Reserved. #Silpa Plugin Modules #Format: action=module. For eg: @@ -16,6 +16,7 @@ SILPA_SITE_COPYRIGHT= Copyright 2008-2009. Silpa Team. All rights Reserved. #An example entry #SILPA_ACTION.Sort=modules.sort #comment at the end of line is also allowed SILPA_ACTION.Transliterate=modules.transliterate #transliterator module +SILPA_ACTION.Normalize=modules.normalizer #normalizer module SILPA_ACTION.Stem=modules.stemmer #stemmer module for Indian Languages SILPA_ACTION.Detect_Language=utils #This is also a valid module SILPA_ACTION.Guess_Language=modules.guess_language #This is also a valid module @@ -23,7 +24,7 @@ SILPA_ACTION.To_Unicode=modules.payyans #Ascii to Unicode conversion Module SILPA_ACTION.To_ASCII=modules.payyans #Unicode to Ascii conversion Module SILPA_ACTION.Syllabalize=modules.syllabalizer #Syllabalizer module SILPA_ACTION.Hyphenate=modules.hyphenator #Syllabalizer module -SILPA_ACTION.Find_Meaning=modules.dictionary #English Malayalam Module +SILPA_ACTION.Dictionary=modules.dictionary #English Malayalam Dictionary Module SILPA_ACTION.Fortune=modules.fortune #Syllabalizer module SILPA_ACTION.Anagram=modules.anagram #Anagram module SILPA_ACTION.Approximate_Search=modules.inexactsearch #Approximate search -- cgit