summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSanthosh Thottingal <santhosh.thottingal@gmail.com>2009-05-12 20:40:32 +0530
committerSanthosh Thottingal <santhosh.thottingal@gmail.com>2009-05-12 20:40:32 +0530
commitf1182c8bc123e9b6690f3fbdc0021468e60ba05d (patch)
tree73acf5ce6f6f0a4046f587fdb28be4e97f49302a
parentee0f42962618bb4d3b607cc18248957c72f18961 (diff)
downloadAnjaliOldLipi.git-f1182c8bc123e9b6690f3fbdc0021468e60ba05d.tar.gz
AnjaliOldLipi.git-f1182c8bc123e9b6690f3fbdc0021468e60ba05d.tar.xz
AnjaliOldLipi.git-f1182c8bc123e9b6690f3fbdc0021468e60ba05d.zip
Various fixes on silpa and new modules-Inexact search
-rw-r--r--silpa/common/modulemanager.py8
-rw-r--r--silpa/modules/__init__.py1
-rw-r--r--silpa/modules/dictionary/dictionary.py8
-rw-r--r--silpa/modules/fortune/fortune.py27
-rw-r--r--silpa/modules/hyphenator/hyphenator.py3
-rw-r--r--silpa/modules/hyphenator/rules/hyph_bn_IN.dic5
-rw-r--r--silpa/modules/hyphenator/rules/hyph_ml_IN.dic2
-rw-r--r--silpa/modules/inexactsearch/__init__.py4
-rw-r--r--silpa/modules/inexactsearch/inexactsearch.py173
-rw-r--r--silpa/modules/payyans/payyans.py3
-rw-r--r--silpa/modules/transliterator/transliterate.py51
-rw-r--r--silpa/silpa.conf7
12 files changed, 259 insertions, 33 deletions
diff --git a/silpa/common/modulemanager.py b/silpa/common/modulemanager.py
index a3e9f1b..c6185dd 100644
--- a/silpa/common/modulemanager.py
+++ b/silpa/common/modulemanager.py
@@ -35,15 +35,15 @@ class ModuleManager:
def getModulesInfoAsHTML(self):
module_dict=getModulesList ()
response = "<h2>Available Modules</h2></hr>"
- response = response+"<table class=\"table1\"><tr><th>Module</th><th>Description</th><th>Status</th></tr>"
+ response = response+"<table class=\"table1\"><tr><th>Module</th><th>Description</th></tr>"
for action in module_dict:
module_instance=self.getModuleInstance(action)
if(module_instance!=None):
- response = response+"<tr><td><a href='?action="+ action +"'>"+module_instance.get_module_name()+"</a></td>"
- response = response+"<td>"+module_instance.get_info()+"</td><td>OK</td></tr>"
+ response = response+"<tr><td><a href='"+ action +"'>"+module_instance.get_module_name()+"</a></td>"
+ response = response+"<td>"+module_instance.get_info()+"</td></tr>"
else:
response = response+"<tr><td>"+action.replace("_"," ")+"</td>"
- response = response+"<td>Error while retrieving module details</td><td>Fail</td></tr>"
+ response = response+"<td>Error while retrieving module details</td></tr>"
return response+"</table>"
if __name__ == '__main__':
mm=ModuleManager()
diff --git a/silpa/modules/__init__.py b/silpa/modules/__init__.py
index 4fe98d1..78f7b19 100644
--- a/silpa/modules/__init__.py
+++ b/silpa/modules/__init__.py
@@ -10,4 +10,5 @@ from fortune import *
from inexactsearch import *
from dictionary import *
from anagram import *
+from normalizer import *
diff --git a/silpa/modules/dictionary/dictionary.py b/silpa/modules/dictionary/dictionary.py
index 7ba877a..4de0f76 100644
--- a/silpa/modules/dictionary/dictionary.py
+++ b/silpa/modules/dictionary/dictionary.py
@@ -28,6 +28,7 @@ import pickle
class Dictionary(SilpaModule):
def lookup_en_ml(self, key):
+ key=key.lower()
self.dictFile=os.path.dirname(__file__) + "/data/dict.dat"
pickled_dict=open(self.dictFile,'r')
self.dictionary=pickle.load(pickled_dict)
@@ -44,14 +45,15 @@ class Dictionary(SilpaModule):
<h2>English Malayalam Dictionary</h2></hr>
<p>Enter the word to lookup in the dictionary
</p>
- <form action="" method="get">
+ <form action="" method="post">
<input type="text" value="%s" name="word"/>
- <input type="submit" id="Find_Meaning" value="Find Meaning" name="action" style="width:12em;"/>
+ <input type="hidden" name="action" value="Dictionary">
+ <input type="submit" id="Find_Meaning" value="Find Meaning" style="width:12em;"/>
</br>
</form>
"""
if(form.has_key('word')):
- search_key = form['word'].value.decode('utf-8')
+ search_key = form['word'].value
response=response % search_key
response = response+"<h2>Search Results</h2></hr>"
if(search_key==None):
diff --git a/silpa/modules/fortune/fortune.py b/silpa/modules/fortune/fortune.py
index cf31aad..bc83b51 100644
--- a/silpa/modules/fortune/fortune.py
+++ b/silpa/modules/fortune/fortune.py
@@ -1,7 +1,7 @@
# Fortune
# -*- coding: utf-8 -*-
#
-# Copyright © 2008 Santhosh Thottingal
+# Copyright © 2009 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# Released under the GPLV3+ license
import os,random
@@ -13,24 +13,24 @@ class Fortune(SilpaModule):
for line in infile:
line=line.decode("utf-8")
if line == "%\n":
- yield result
- result = []
+ continue
else:
if(pattern==None):
result.append(line)
else:
- if(line.find(pattern)==-1):
+ if(line.find(pattern)>0):
result.append(line)
if result:
- yield result
+ return result
+
- def fortune_ml(self, word):
- filename="./modules/fortune/database/fortune-ml"
+ def fortune_ml(self, pattern):
+ filename = os.path.join(os.path.dirname(__file__), 'database/fortune-ml')
""" Pick a random fortune from a file """
- for index, fortune in enumerate(self.fortunes(file(filename),None)):
- if random.random() < (1.0 / (index+1)):
- chosen = fortune
-
+ fortunes_list=self.fortunes(file(filename),pattern)
+ chosen=""
+ if fortunes_list:
+ chosen= random.choice(fortunes_list)
return "".join(chosen)
def process(self, form):
@@ -46,9 +46,10 @@ class Fortune(SilpaModule):
"""
if(form.has_key('input_text')):
text = form['input_text'].value .decode('utf-8')
+ response=response % text
else:
- text=""
- response=response % text
+ text= None
+ response=response % ""
result = self.fortune_ml(text)
response = response+"<h2>Random Quote</h2></hr>"
response = response+"<b>"+result+"</b>"
diff --git a/silpa/modules/hyphenator/hyphenator.py b/silpa/modules/hyphenator/hyphenator.py
index f5e8fe2..889aee3 100644
--- a/silpa/modules/hyphenator/hyphenator.py
+++ b/silpa/modules/hyphenator/hyphenator.py
@@ -234,9 +234,8 @@ class Hyphenator(SilpaModule):
You can give the text in any language and even with mixed language
</p>
<form action="" method="post">
- <textarea cols='100' rows='25' name='input_text' id='id1'>%s</textarea>
+ <textarea name='input_text' id='id1'>%s</textarea>
<input type="submit" id="Hyphenate" value="Hyphenate" name="action" style="width:12em;"/>
- <input type="reset" value="Clear" style="width:12em;"/>
</br>
</form>
"""
diff --git a/silpa/modules/hyphenator/rules/hyph_bn_IN.dic b/silpa/modules/hyphenator/rules/hyph_bn_IN.dic
index 52acee5..42dca20 100644
--- a/silpa/modules/hyphenator/rules/hyph_bn_IN.dic
+++ b/silpa/modules/hyphenator/rules/hyph_bn_IN.dic
@@ -37,8 +37,9 @@ UTF-8
ৌ1
ৗ1
্2
-ঃ1
-ং1
+2ঃ1
+2ং1
+2ঁ1
1ন
ন্2
2ন্‍
diff --git a/silpa/modules/hyphenator/rules/hyph_ml_IN.dic b/silpa/modules/hyphenator/rules/hyph_ml_IN.dic
index 44b50b9..36a70f2 100644
--- a/silpa/modules/hyphenator/rules/hyph_ml_IN.dic
+++ b/silpa/modules/hyphenator/rules/hyph_ml_IN.dic
@@ -43,7 +43,7 @@ UTF-8
ൌ1
ൗ1
്2
-ഃ1
+2ഃ1
2ം1
1ന
ന്2
diff --git a/silpa/modules/inexactsearch/__init__.py b/silpa/modules/inexactsearch/__init__.py
new file mode 100644
index 0000000..d47e9c0
--- /dev/null
+++ b/silpa/modules/inexactsearch/__init__.py
@@ -0,0 +1,4 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+import inexactsearch
+
diff --git a/silpa/modules/inexactsearch/inexactsearch.py b/silpa/modules/inexactsearch/inexactsearch.py
new file mode 100644
index 0000000..0d1f35b
--- /dev/null
+++ b/silpa/modules/inexactsearch/inexactsearch.py
@@ -0,0 +1,173 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+# Inexact (approximate) search based on common bigrams
+# Copyright 2008 Santhosh Thottingal <santhosh.thottingal@gmail.com>
+# http://www.smc.org.in
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# If you find any bugs or have any suggestions email: santhosh.thottingal@gmail.com
+# URL: http://www.smc.org.in
+
+
+import sys
+import re
+from common import *
+
+class ApproximateSearch(SilpaModule):
+
+ def syllabalize_ml(self, text):
+ signs = [
+ u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40', u'\u0d41',
+ u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46', u'\u0d47', u'\u0d48',
+ u'\u0d4a', u'\u0d4b', u'\u0d4c', u'\u0d4d']
+ limiters = ['.','\"','\'','`','!',';',',','?']
+
+ chandrakkala = u'\u0d4d'
+ lst_chars = []
+ for char in text:
+ if char in limiters:
+ lst_chars.append(char)
+ elif char in signs:
+ lst_chars[-1] = lst_chars[-1] + char
+ else:
+ try:
+ if lst_chars[-1][-1] == chandrakkala:
+ lst_chars[-1] = lst_chars[-1] + char
+ else:
+ lst_chars.append(char)
+ except IndexError:
+ lst_chars.append(char)
+
+ return lst_chars
+
+
+ def bigram_search(self, str1, str2, syllable_search=False):
+ """Return approximate string comparator measure (between 0.0 and 1.0)
+ using bigrams.
+ USAGE:
+ score = bigram(str1, str2)
+
+ ARGUMENTS:
+ str1 The first string
+ str2 The second string
+
+ DESCRIPTION:
+ Bigrams are two-character sub-strings contained in a string. For example,
+ 'peter' contains the bigrams: pe,et,te,er.
+
+ This routine counts the number of common bigrams and divides by the
+ average number of bigrams. The resulting number is returned.
+ """
+
+ # Quick check if the strings are the same - - - - - - - - - - - - - - - - - -
+ #
+ if (str1 == str2):
+ result_string = "<div style='float: left; background-color: green;' title=\" Bigram comparator : string1: %s, string2: %s. Exact Match found" % (str1, str2)
+ result_string = result_string + "\">"+str1+ "</div>"
+ return result_string
+
+ bigr1 = []
+ bigr2 = []
+
+ # Make a list of bigrams for both strings - - - - - - - - - - - - - - - - - -
+ #
+ for i in range(1,len(str1)):
+ bigr1.append(str1[i-1:i+1])
+ for i in range(1,len(str2)):
+ bigr2.append(str2[i-1:i+1])
+
+ # Compute average number of bigrams - - - - - - - - - - - - - - - - - - - - -
+ #
+ average = (len(bigr1)+len(bigr2)) / 2.0
+ if (average == 0.0):
+ return str1
+
+ # Get common bigrams - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+ #
+ common = 0.0
+
+ if (len(bigr1) < len(bigr2)): # Count using the shorter bigram list
+ short_bigr = bigr1
+ long_bigr = bigr2
+ else:
+ short_bigr = bigr2
+ long_bigr = bigr1
+
+ for b in short_bigr:
+ if (b in long_bigr):
+ common += 1.0
+ long_bigr[long_bigr.index(b)] = [] # Mark this bigram as counted
+
+ w = common / average
+ if(w>=0.6):
+ result_string = "<div style='float: left; background-color: yellow;' title=\" Bigram comparator string 1: %s, string 2: %s" % (str1, str2)
+ else:
+ if((w>0.4) & (w<0.6)):
+ result_string = "<div style='float: left; background-color: grey;' title=\" Bigram comparator string 1: %s, string 2: %s" % (str1, str2)
+ else:
+ result_string = "<div style='float: left;' title=\" Bigram comparator string1: %s, string2: %s" % (str1, str2)
+ result_string = result_string + " Number of bigrams in String1: %i" % (len(bigr1))
+ result_string = result_string + " Number of bigrams in String2: %i" % (len(bigr2))
+ result_string = result_string + " Average: %i" % (average)
+ result_string = result_string + " Common: %i" % (common)
+ result_string = result_string + " Final approximate string weight: " + str(w)
+ result_string = result_string + "\">"+str1+ "</div>"
+ return result_string
+ def process(self,form):
+ response = """
+ <h2>Inexact Search</h2></hr>
+ <p>The search performed by search engines on Indic text is not effective.
+ It does not take care of the inflective or agglutinative nature of the language.
+ This application tries to solve that by using an inexact search algorithm based on maximum common bigram algorithm.
+
+ </p>
+ <p>Enter the text for searching in the below text area.
+ </p>
+ <form action="" method="post">
+ <textarea cols='100' rows='25' name='input_text' id='input_text'>%s</textarea>
+ <br/>
+ <input type="text" name="search_key" value="%s"/>
+ <input type="submit" id="Hyphenate" value="Approximate Search" name="action" style="width:12em;"/>
+ </br>
+ </form>
+ """
+ if(form.has_key('input_text')):
+ text = action=form['input_text'].value .decode('utf-8')
+ if(form.has_key('search_key')):
+ key = action=form['search_key'].value .decode('utf-8')
+ response=response % (text,key)
+ words=text.split(" ")
+ response = response+"<h2>Search Results</h2></hr>"
+ response = response+"<p>Words in green are with exact match. Words in Yellow are with approximate Match."
+ response = response+" Move your mouse pointer over the words to get more information on matching.</p></hr>"
+ else:
+ response = response+ "Enter a string to search."
+ return response % (text,"")
+ for word in words:
+ word=word.strip()
+ if(word>""):
+ response = response+ self.bigram_search(word, key)
+ response = response+ "<div style='float: left;'>&nbsp;</div>"
+ else:
+ response=response % ("","")
+ return response
+ def get_module_name(self):
+ return "Approximate Search"
+ def get_info(self):
+ return "Approximate Search for a string in the given text. Based on bigram search algorithm"
+
+def getInstance():
+ return ApproximateSearch()
diff --git a/silpa/modules/payyans/payyans.py b/silpa/modules/payyans/payyans.py
index 1702307..8751347 100644
--- a/silpa/modules/payyans/payyans.py
+++ b/silpa/modules/payyans/payyans.py
@@ -273,7 +273,7 @@ class Payyans(SilpaModule):
<p>Enter the text for detecting the language in the below text area.
</p>
<form action="" method="post">
- <textarea cols='100' rows='25' name='input_text' id='id1'>%s</textarea><br/>
+ <textarea name='input_text' id='id1'>%s</textarea><br/>
Select Font : <select id="font" name="%s" style="width:12em;">
<option value="karthika">Karthika</option>
<option value="bhavana">Bhavana</option>
@@ -282,7 +282,6 @@ class Payyans(SilpaModule):
<option value="manorama">Manorama</option>
</select>
<input type="submit" id="Convert To Unicode" value="%s" name="action" style="width:12em;"/>
- <input type="reset" value="Clear" style="width:12em;"/>
</br>
</form>
"""
diff --git a/silpa/modules/transliterator/transliterate.py b/silpa/modules/transliterator/transliterate.py
index 206a6bf..1b534cf 100644
--- a/silpa/modules/transliterator/transliterate.py
+++ b/silpa/modules/transliterator/transliterate.py
@@ -1,6 +1,6 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
-# Paralperu
+# Any Indian Language to any other Indian language transliterator
# Copyright 2008 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
@@ -23,6 +23,48 @@
from common import *
class Transliterator(SilpaModule):
+ def transliterate_ml_en(self, word):
+ virama=u"്"
+ #TODO: how to make this more generic so that more languages can be handled here?
+ #idea1: transliterate any language to a common language, say Hindi, and then do the conversion?
+ #existing transliterate.py can be used?
+ #idea2: Have dictionaries for each language like english_xx_dict ?
+ #TODO: complete this
+ english_ml_dict={u'അ':'a',u'ആ':'a',u'ഇ':'a',u'ഈ':'a',u'ഉ':'a',u'ഊ':'a',u'ഋ':'a',\
+ u'എ':'a',u'ഏ':'a',u'ഐ':'a',u'ഒ':'a',u'ഓ':'a',u'ഔ':'a',\
+ u'ക':'k',u'ഖ':'kh',u'ഗ':'g',u'ഘ':'gh',u'ങ്ങ':'ng',\
+ u'ച':'ch',u'ഛ':'chh',u'ജ':'j',u'ഝ':'jhh',u'ഞ':'nj',\
+ u'ട':'t',u'ഠ':'th',u'ഡ':'d',u'ഢ':'dh',u'ണ':'n',\
+ u'ത':'th',u'ഥ':'th',u'ദ':'d',u'ധ':'dh',u'ന':'n',\
+ u'പ':'p',u'ഫ':'ph',u'ബ':'b',u'ഭ':'bh',u'മ':'m',\
+ u'യ':'y',u'ര':'r',u'ല':'l', u'വ':'v', u'റ':'r',\
+ u'ശ':'sa',u'ഷ':'sh',u'സ':'s', u'ഹ':'h',u'ള':'l',u'ഴ':'zh',\
+ u'ാ':'a',u'ി':'i' ,u'ീ':'ee' ,u'ു':'u',\
+ u'ൂ':'uu',u'ൃ':'ri' ,u'െ':'e' ,u'േ':'e',\
+ u'ൈ':'ai',u'ൊ':'o' ,u'ോ':'oo' ,u'ൗ':'au'}
+ word_length =len(word)
+ index=0
+ tx_string=""
+ while index<word_length:
+ a_vowel=""
+ try:
+ if(index+1<word_length):
+ if(word[index+1]==virama):
+ a_vowel=""
+ else:
+ if(index+1<word_length):
+ if (english_ml_dict[word[index+1]] in ['a','e','i','o','u']):
+ a_vowel=""
+ else:
+ a_vowel="a"
+ if (english_ml_dict[word[index]] in ['a','e','i','o','u']):
+ a_vowel=""
+ tx_string=tx_string+ english_ml_dict[word[index]] + a_vowel
+ except:
+ tx_string=tx_string+ word[index]
+ index=index+1
+ return tx_string
+
def transliterate(self,text, target_lang_code):
mm=ModuleManager()
ld = mm.getModuleInstance("Detect Language")
@@ -31,7 +73,9 @@ class Transliterator(SilpaModule):
for word in words:
if(word.strip()>""):
src_lang_code=ld.detect_lang(word)[word]
- tx_str = tx_str
+ if((target_lang_code=="en_US") and (src_lang_code=="ml_IN")):
+ tx_str=tx_str + self.transliterate_ml_en(word)
+ continue
for chr in word:
offset=ord(chr) + self.getOffset(src_lang_code, target_lang_code)
if(offset>0):
@@ -60,7 +104,7 @@ class Transliterator(SilpaModule):
You can give the text in any language and even with mixed language
</p>
<form action="" method="post">
- <textarea cols='100' rows='25' name='input_text' id='id1'>%s</textarea></br>
+ <textarea name='input_text' id='id1'>%s</textarea></br>
<select id="trans-lang" name="trans-lang" style="width:12em;">
<option value="hi_IN">Hindi</option>
<option value="ml_IN">Malayalam</option>
@@ -71,6 +115,7 @@ class Transliterator(SilpaModule):
<option value="gu_IN">Gujarai</option>
<option value="pa_IN">Panjabi</option>
<option value="ka_IN">Kannada</option>
+ <option value="en_US">English</option>
</select>
<input type="submit" id="Transliterate" value="Transliterate" name="action" style="width:12em;"/>
<input type="reset" value="Clear" style="width:12em;"/>
diff --git a/silpa/silpa.conf b/silpa/silpa.conf
index 068112c..a70a482 100644
--- a/silpa/silpa.conf
+++ b/silpa/silpa.conf
@@ -1,13 +1,13 @@
# Silpa configuration file
# This is a comment
# Global properties
-SILPA_BASE_URL=http://planet.smc.org.in/exp/silpa/index.py
+SILPA_BASE_URL=http://smc.org.in/silpa
SILPA_TEMPLATE=./templates/default/silpa.html
SILPA_SITE_NAME=Silpa
SILPA_LANGUAGE=Silpa
SILPA_SITE_ADMIN_NAME=Santhosh
SILPA_SITE_ADMIN_EMAIL=santhosh@silpa.org
-SILPA_SITE_COPYRIGHT= Copyright 2008-2009. Silpa Team. All rights Reserved.
+SILPA_SITE_COPYRIGHT= Copyright 2008-2009. Silpa Team. All rights Reserved.
#Silpa Plugin Modules
#Format: action=module. For eg:
@@ -16,6 +16,7 @@ SILPA_SITE_COPYRIGHT= Copyright 2008-2009. Silpa Team. All rights Reserved.
#An example entry
#SILPA_ACTION.Sort=modules.sort #comment at the end of line is also allowed
SILPA_ACTION.Transliterate=modules.transliterate #transliterator module
+SILPA_ACTION.Normalize=modules.normalizer #normalizer module
SILPA_ACTION.Stem=modules.stemmer #stemmer module for Indian Languages
SILPA_ACTION.Detect_Language=utils #This is also a valid module
SILPA_ACTION.Guess_Language=modules.guess_language #This is also a valid module
@@ -23,7 +24,7 @@ SILPA_ACTION.To_Unicode=modules.payyans #Ascii to Unicode conversion Module
SILPA_ACTION.To_ASCII=modules.payyans #Unicode to Ascii conversion Module
SILPA_ACTION.Syllabalize=modules.syllabalizer #Syllabalizer module
SILPA_ACTION.Hyphenate=modules.hyphenator #Syllabalizer module
-SILPA_ACTION.Find_Meaning=modules.dictionary #English Malayalam Module
+SILPA_ACTION.Dictionary=modules.dictionary #English Malayalam Dictionary Module
SILPA_ACTION.Fortune=modules.fortune #Syllabalizer module
SILPA_ACTION.Anagram=modules.anagram #Anagram module
SILPA_ACTION.Approximate_Search=modules.inexactsearch #Approximate search