summary | refs | log | tree | commit | diff | stats
path: root/silpa/modules
diff options
context:
space:
mode:
Diffstat (limited to 'silpa/modules')
-rwxr-xr-x [-rw-r--r--]  silpa/modules/dictionary/dictionary.py  47
-rwxr-xr-x [-rw-r--r--]  silpa/modules/inexactsearch/inexactsearch.py  102
-rwxr-xr-x [-rw-r--r--]  silpa/modules/syllabalizer/syllabalizer.py  58
3 files changed, 143 insertions, 64 deletions
diff --git a/silpa/modules/dictionary/dictionary.py b/silpa/modules/dictionary/dictionary.py
index 4de0f76..927c06c 100644..100755
--- a/silpa/modules/dictionary/dictionary.py
+++ b/silpa/modules/dictionary/dictionary.py
@@ -1,6 +1,6 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
-# English Malayalam Dictionary
+# Dictionary
# Copyright 2008 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
@@ -24,49 +24,56 @@
from common import *
import os
-import pickle
+from dictdlib import DictDB
class Dictionary(SilpaModule):
- def lookup_en_ml(self, key):
- key=key.lower()
- self.dictFile=os.path.dirname(__file__) + "/data/dict.dat"
- pickled_dict=open(self.dictFile,'r')
- self.dictionary=pickle.load(pickled_dict)
- meaning=""
- if self.dictionary.has_key(key):
- meaningList=self.dictionary[key]
- for meaning_item in meaningList:
- meaning=meaning+meaning_item.strip() +"<br/>"
- else :
- meaning="No Meaning found"
- return meaning.decode('utf-8')
+ def getdef(self, word, dictionary):
+ dict_dir=os.path.join(os.path.dirname(__file__), 'dictionaries')
+ dictdata=dict_dir+ "/"+dictionary
+ dict=DictDB(dictdata)
+ meanings = dict.getdef(word)
+ meaningstring= ""
+ if (meanings==None):
+ meaningstring = "No definition found"
+ return meaningstring
+ for meaning in meanings:
+ meaningstring += meaning
+ return meaningstring.decode("utf-8")
def process(self,form):
response = """
- <h2>English Malayalam Dictionary</h2></hr>
+ <h2>Dictionary</h2></hr>
<p>Enter the word to lookup in the dictionary
</p>
<form action="" method="post">
- <input type="text" value="%s" name="word"/>
+ <p align="center">
+ Word : <input type="text" value="%s" name="word"/>
+ Dictionary :<select id="dictionary" name="dictionary" style="width:12em;">
+ <option value="freedict-eng-hin">English-Hindi</option>
+ <option value="freedict-eng-mal">English-Malayalam</option>
+ </select>
<input type="hidden" name="action" value="Dictionary">
+ </br>
<input type="submit" id="Find_Meaning" value="Find Meaning" style="width:12em;"/>
</br>
+ </p>
</form>
"""
if(form.has_key('word')):
search_key = form['word'].value
+ dictionary = form['dictionary'].value
response=response % search_key
response = response+"<h2>Search Results</h2></hr>"
if(search_key==None):
response = response+ "Enter a word to find meaning."
else:
- response = response+ self.lookup_en_ml(search_key)
+ response = response+ "<pre> "+ self.getdef(search_key,dictionary) + "</pre> "
else:
response=response % ""
return response
def get_module_name(self):
- return "English Malayalam Dictionary"
+ return "Dictionary"
def get_info(self):
- return "English Malayalam Dictionary. Dictionary is compiled by Kerala state IT Mission"
+ return "Bilingual Dictionaries"
def getInstance():
return Dictionary()
diff --git a/silpa/modules/inexactsearch/inexactsearch.py b/silpa/modules/inexactsearch/inexactsearch.py
index 0d1f35b..67e47d9 100644..100755
--- a/silpa/modules/inexactsearch/inexactsearch.py
+++ b/silpa/modules/inexactsearch/inexactsearch.py
@@ -1,6 +1,6 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
-# Paralperu
+# Approximate Search
# Copyright 2008 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
@@ -28,32 +28,11 @@ from common import *
class ApproximateSearch(SilpaModule):
- def syllabalize_ml(self, text):
- signs = [
- u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40', u'\u0d41',
- u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46', u'\u0d47', u'\u0d48',
- u'\u0d4a', u'\u0d4b', u'\u0d4c', u'\u0d4d']
- limiters = ['.','\"','\'','`','!',';',',','?']
-
- chandrakkala = u'\u0d4d'
- lst_chars = []
- for char in text:
- if char in limiters:
- lst_chars.append(char)
- elif char in signs:
- lst_chars[-1] = lst_chars[-1] + char
- else:
- try:
- if lst_chars[-1][-1] == chandrakkala:
- lst_chars[-1] = lst_chars[-1] + char
- else:
- lst_chars.append(char)
- except IndexError:
- lst_chars.append(char)
-
- return lst_chars
-
-
+ def syllabalize(self, text):
+ mm=ModuleManager()
+ syllabalizer = mm.getModuleInstance("Syllabalize")
+ return syllabalizer.syllabalize(text)
+
def bigram_search(self, str1, str2, syllable_search=False):
"""Return approximate string comparator measure (between 0.0 and 1.0)
using bigrams.
@@ -84,10 +63,19 @@ class ApproximateSearch(SilpaModule):
# Make a list of bigrams for both strings - - - - - - - - - - - - - - - - - -
#
- for i in range(1,len(str1)):
- bigr1.append(str1[i-1:i+1])
- for i in range(1,len(str2)):
- bigr2.append(str2[i-1:i+1])
+ if(syllable_search):
+ str1_syllables = self. syllabalize(str1)
+ str2_syllables = self. syllabalize(str2)
+ for i in range(1,len(str1_syllables)):
+ bigr1.append(str1_syllables[i-1:i+1])
+ for i in range(1,len(str2_syllables)):
+ bigr2.append(str2_syllables[i-1:i+1])
+ else:
+ for i in range(1,len(str1)):
+ bigr1.append(str1[i-1:i+1])
+ for i in range(1,len(str2)):
+ bigr2.append(str2[i-1:i+1])
+
# Compute average number of bigrams - - - - - - - - - - - - - - - - - - - - -
#
@@ -105,11 +93,22 @@ class ApproximateSearch(SilpaModule):
else:
short_bigr = bigr2
long_bigr = bigr1
-
- for b in short_bigr:
- if (b in long_bigr):
- common += 1.0
- long_bigr[long_bigr.index(b)] = [] # Mark this bigram as counted
+ if(syllable_search):
+ for b in short_bigr:
+ if (b in long_bigr):
+ if long_bigr.index(b) == short_bigr.index(b) :
+ common += 1.0
+ else:
+ dislocation=(long_bigr.index(b) - short_bigr.index(b))/ average
+ if dislocation < 0 :
+ dislocation = dislocation * -1
+ common += 1.0 - dislocation
+ long_bigr[long_bigr.index(b)] = [] # Mark this bigram as counted
+ else:
+ for b in short_bigr:
+ if (b in long_bigr):
+ common += 1.0
+ long_bigr[long_bigr.index(b)] = [] # Mark this bigram as counted
w = common / average
if(w>=0.6):
@@ -139,30 +138,47 @@ class ApproximateSearch(SilpaModule):
<form action="" method="post">
<textarea cols='100' rows='25' name='input_text' id='input_text'>%s</textarea>
<br/>
- <input type="text" name="search_key" value="%s"/>
- <input type="submit" id="Hyphenate" value="Approximate Search" name="action" style="width:12em;"/>
+ <p align="center">
+ Search :<input type="text" name="search_key" value="%s"/>
+ Algorithm : <select id="algorithm" name="algorithm" value="%s" style="width:12em;">
+ <option value="sb">Syllable Bigram</option>
+ <option value="lb">Letter Bigram</option>
+ </select>
</br>
+ <input type="hidden" name="action" value="Approximate Search">
+
+ <input type="submit" id="ApproximateSearch" value="Search" style="width:12em;"/>
+ </p>
</form>
"""
+ algorithm = 'sb'
+ if(form.has_key('algorithm')):
+ algorithm = form['algorithm'].value
if(form.has_key('input_text')):
text = action=form['input_text'].value .decode('utf-8')
if(form.has_key('search_key')):
- key = action=form['search_key'].value .decode('utf-8')
- response=response % (text,key)
+ key =form['search_key'].value .decode('utf-8')
+ response=response % (text,key,algorithm)
words=text.split(" ")
response = response+"<h2>Search Results</h2></hr>"
response = response+"<p>Words in green are with exact match. Words in Yellow are with approximate Match."
response = response+" Move your mouse pointer over the words to get more information on matching.</p></hr>"
else:
response = response+ "Enter a string to search."
- return response % (text,"")
+ return response % (text,"", algorithm)
for word in words:
word=word.strip()
if(word>""):
- response = response+ self.bigram_search(word, key)
+ if word[0]>'0' and word[0]<'Z':
+ response = response+ self.bigram_search(word, key,False)
+ else:
+ if algorithm == 'sb':
+ response = response+ self.bigram_search(word, key, True)
+ else:
+ response = response+ self.bigram_search(word, key, False)
response = response+ "<div style='float: left;'>&nbsp;</div>"
else:
- response=response % ("","")
+ response=response % ("","","sb")
return response
def get_module_name(self):
return "Approximate Search"
diff --git a/silpa/modules/syllabalizer/syllabalizer.py b/silpa/modules/syllabalizer/syllabalizer.py
index 706ee77..39c140f 100644..100755
--- a/silpa/modules/syllabalizer/syllabalizer.py
+++ b/silpa/modules/syllabalizer/syllabalizer.py
@@ -51,6 +51,54 @@ class Syllabalizer(SilpaModule):
lst_chars.append(char)
return lst_chars
+ def syllabalize_kn(self,text):
+ signs = [
+ u'\u0c82', u'\u0c83', u'\u0cbd', u'\u0cbe', u'\u0cbf', u'\u0cc0', u'\u0cc1',
+ u'\u0cc2', u'\u0cc3', u'\u0cc4', u'\u0cc6', u'\u0cc7', u'\u0cc8',
+ u'\u0cca', u'\u0ccb', u'\u0ccc', u'\u0ccd']
+ limiters = ['.','\"','\'','`','!',';',',','?']
+
+ halant = u'\u0ccd'
+ lst_chars = []
+ for char in text:
+ if char in limiters:
+ lst_chars.append(char)
+ elif char in signs:
+ lst_chars[-1] = lst_chars[-1] + char
+ else:
+ try:
+ if lst_chars[-1][-1] == halant:
+ lst_chars[-1] = lst_chars[-1] + char
+ else:
+ lst_chars.append(char)
+ except IndexError:
+ lst_chars.append(char)
+
+ return lst_chars
+ def syllabalize_bn(self,text):
+ signs = [
+ u'\u0981', u'\u0982', u'\u0983', u'\u09bd', u'\u09be', u'\u09bf', u'\u09c0', u'\u09c1',
+ u'\u09c2', u'\u09c3', u'\u09c4', u'\u09c6', u'\u09c7', u'\u09c8',
+ u'\u09ca', u'\u09cb', u'\u09cc', u'\u09cd', u'\u09d7']
+ limiters = ['.','\"','\'','`','!',';',',','?']
+
+ halant = u'\u09cd'
+ lst_chars = []
+ for char in text:
+ if char in limiters:
+ lst_chars.append(char)
+ elif char in signs:
+ lst_chars[-1] = lst_chars[-1] + char
+ else:
+ try:
+ if lst_chars[-1][-1] == halant:
+ lst_chars[-1] = lst_chars[-1] + char
+ else:
+ lst_chars.append(char)
+ except IndexError:
+ lst_chars.append(char)
+
+ return lst_chars
def syllabalize_hi(self,text):
signs = [
u'\u0902', u'\u0903', u'\u093e', u'\u093f', u'\u0940', u'\u0941',
@@ -176,11 +224,19 @@ class Syllabalizer(SilpaModule):
def syllabalize(self,text):
mm=ModuleManager()
ld = mm.getModuleInstance("Detect Language")
- lang=ld.detect_lang(text)[text]
+ lang = None
+ try:
+ lang=ld.detect_lang(text)[text]
+ except:
+ pass #FIXME
if(lang=="ml_IN"):
return self.syllabalize_ml(text)
if(lang=="hi_IN"):
return self.syllabalize_hi(text)
+ if(lang=="kn_IN"):
+ return self.syllabalize_kn(text)
+ if(lang=="bn_IN"):
+ return self.syllabalize_bn(text)
if(lang=="en_US"):
return self.syllabalize_en(text)
lst_chars=[]