summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Praveen Arimbrathodiyil <pravi.a@gmail.com> 2009-05-25 23:17:02 -0700
committer Praveen Arimbrathodiyil <pravi.a@gmail.com> 2009-05-25 23:17:02 -0700
commit 96c6fb9ec5e604b077ab9f770b99cc021c87d85f (patch)
tree 74a944cce1377f86c4cade549673e7be52c3d07d
parent 83cdafaa88657c95f20c9d493d37ccebd86c1b80 (diff)
parent 30c1b89bec818490131780389c918f8bcfb7aef6 (diff)
download AnjaliOldLipi.git-96c6fb9ec5e604b077ab9f770b99cc021c87d85f.tar.gz
AnjaliOldLipi.git-96c6fb9ec5e604b077ab9f770b99cc021c87d85f.tar.xz
AnjaliOldLipi.git-96c6fb9ec5e604b077ab9f770b99cc021c87d85f.zip
Merge branch 'master' of git://git.savannah.nongnu.org/smc
-rwxr-xr-x[-rw-r--r--] silpa/common/silparesponse.py 3
-rw-r--r-- silpa/doc/credits.html 14
-rw-r--r-- silpa/doc/todo.html 15
-rwxr-xr-x[-rw-r--r--] silpa/index.py 31
-rwxr-xr-x[-rw-r--r--] silpa/modules/dictionary/dictionary.py 47
-rwxr-xr-x[-rw-r--r--] silpa/modules/inexactsearch/inexactsearch.py 102
-rwxr-xr-x[-rw-r--r--] silpa/modules/syllabalizer/syllabalizer.py 58
-rw-r--r-- silpa/utils/langdetect.py 16
-rw-r--r-- silpa/utils/silpautils.py 18
9 files changed, 226 insertions, 78 deletions
diff --git a/silpa/common/silparesponse.py b/silpa/common/silparesponse.py
index 13cc5ca..5bd8af3 100644..100755
--- a/silpa/common/silparesponse.py
+++ b/silpa/common/silparesponse.py
@@ -18,6 +18,9 @@ class SilpaResponse:
html= "<div id=\"breadcrumb\"><a href=\"http://smc.org.in/silpa\">Home</a> /"
html=html+navPath+"</div>"
self.response=self.response.replace("$$SILPA_BREADCRUMB$$",html)
+ else:
+ html= "<div id=\"breadcrumb\"><a href=\"http://smc.org.in/silpa\">Home</a></div>"
+ self.response=self.response.replace("$$SILPA_BREADCRUMB$$",html)
def setContent(self,value):
if(value):
self.response=self.response.replace("$$SILPA_CONTENT$$",value)
diff --git a/silpa/doc/credits.html b/silpa/doc/credits.html
new file mode 100644
index 0000000..cc836ec
--- /dev/null
+++ b/silpa/doc/credits.html
@@ -0,0 +1,14 @@
+<h2>Credits</h2>
+Many people contributed in direct and indirect way in the development of silpa. This page attempts to list their names.
+<ul>
+<li>Baiju. M, Swathanthra Malayalam Computing for his mlsplit program for using it as a base for syllabalizer for many languages</li>
+<li>Laxminarayan Kamath for testing and feature suggestions</li>
+<li>Rajeesh Nambiar and Nishan Naseer of SMC for their contributions for Font converter</li>
+<li>Guess Language module is based on the python implementation by Kent S Johnson of guesslanguage.cpp by Jacob R Rideout for KDE
+ http://websvn.kde.org/branches/work/sonnet-refactoring/common/nlp/guesslanguage.cpp?view=markup
+ which itself is based on Language::Guess by Maciej Ceglowski
+ http://languid.cantbedone.org/
+
+</li>
+<li>IT Mission, Kerala government for providing English-Malayalam dictionary in GPL license</li>
+</ul>
diff --git a/silpa/doc/todo.html b/silpa/doc/todo.html
new file mode 100644
index 0000000..ece066b
--- /dev/null
+++ b/silpa/doc/todo.html
@@ -0,0 +1,15 @@
+<h2>TODO List</h2>
+<ul>
+<li>Sakavarsham ,Kollavarsham and Other Indian Calenders</li>
+<li>Spellcheck</li>
+<li>Linguistical sorting</li>
+<li>Stemmer</li>
+<li>OCR</li>
+<li>TTS</li>
+<li>Support more fonts for font converter, .doc and .pdf support</li>
+<li>Anagram for remaining languages</li>
+<li>Random Quote for remaining languages</li>
+<li>Crossword generator and solver for Indian Languages</li>
+<li>More dictionaries</li>
+<li>Web APIs</li>
+</ul>
diff --git a/silpa/index.py b/silpa/index.py
index 8a44a15..ccffe09 100644..100755
--- a/silpa/index.py
+++ b/silpa/index.py
@@ -1,4 +1,4 @@
-#!/home/.laboring/smcweb/bin/python
+#!/usr/bin/python
# -*- coding: utf-8 -*-
from common import *
@@ -12,21 +12,28 @@ def index(form):
action=form['action'].value
else:
action=None
+ handleStats()
response=SilpaResponse()
if(action):
module_manager=ModuleManager()
action=action.replace(" ","_")
- module_instance = module_manager.getModuleInstance(action)
- if(module_instance):
- response.setBreadcrumb(module_instance.get_module_name())
- response.setContent(module_instance.process(form))
- response.setErrorMessage(module_instance.get_errormessage())
- response.setSuccessMessage(module_instance.get_successmessage())
- else:
- response.setBreadcrumb("Coming Soon")
- response.setErrorMessage("Module not available")
- response.setContent(None)
- response.setSuccessMessage(None)
+ if action.endswith('.html') or action.endswith('.htm'):
+ response.setBreadcrumb(None)
+ response.setContent(getStaticContent(action))
+ response.setErrorMessage(None)
+ response.setSuccessMessage(None)
+ else:
+ module_instance = module_manager.getModuleInstance(action)
+ if(module_instance):
+ response.setBreadcrumb(module_instance.get_module_name())
+ response.setContent(module_instance.process(form))
+ response.setErrorMessage(module_instance.get_errormessage())
+ response.setSuccessMessage(module_instance.get_successmessage())
+ else:
+ response.setBreadcrumb("Coming Soon")
+ response.setErrorMessage("Module not available")
+ response.setContent(None)
+ response.setSuccessMessage(None)
else: #index
module_manager=ModuleManager()
response.setBreadcrumb("Welcome")
diff --git a/silpa/modules/dictionary/dictionary.py b/silpa/modules/dictionary/dictionary.py
index 4de0f76..927c06c 100644..100755
--- a/silpa/modules/dictionary/dictionary.py
+++ b/silpa/modules/dictionary/dictionary.py
@@ -1,6 +1,6 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
-# English Malayalam Dictionary
+# Dictionary
# Copyright 2008 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
@@ -24,49 +24,56 @@
from common import *
import os
-import pickle
+from dictdlib import DictDB
class Dictionary(SilpaModule):
- def lookup_en_ml(self, key):
- key=key.lower()
- self.dictFile=os.path.dirname(__file__) + "/data/dict.dat"
- pickled_dict=open(self.dictFile,'r')
- self.dictionary=pickle.load(pickled_dict)
- meaning=""
- if self.dictionary.has_key(key):
- meaningList=self.dictionary[key]
- for meaning_item in meaningList:
- meaning=meaning+meaning_item.strip() +"<br/>"
- else :
- meaning="No Meaning found"
- return meaning.decode('utf-8')
+ def getdef(self, word, dictionary):
+ dict_dir=os.path.join(os.path.dirname(__file__), 'dictionaries')
+ dictdata=dict_dir+ "/"+dictionary
+ dict=DictDB(dictdata)
+ meanings = dict.getdef(word)
+ meaningstring= ""
+ if (meanings==None):
+ meaningstring = "No definition found"
+ return meaningstring
+ for meaning in meanings:
+ meaningstring += meaning
+ return meaningstring.decode("utf-8")
def process(self,form):
response = """
- <h2>English Malayalam Dictionary</h2></hr>
+ <h2>Dictionary</h2></hr>
<p>Enter the word to lookup in the dictionary
</p>
<form action="" method="post">
- <input type="text" value="%s" name="word"/>
+ <p align="center">
+ Word : <input type="text" value="%s" name="word"/>
+ Dictionary :<select id="dictionary" name="dictionary" style="width:12em;">
+ <option value="freedict-eng-hin">English-Hindi</option>
+ <option value="freedict-eng-mal">English-Malayalam</option>
+ </select>
<input type="hidden" name="action" value="Dictionary">
+ </br>
<input type="submit" id="Find_Meaning" value="Find Meaning" style="width:12em;"/>
</br>
+ </p>
</form>
"""
if(form.has_key('word')):
search_key = form['word'].value
+ dictionary = form['dictionary'].value
response=response % search_key
response = response+"<h2>Search Results</h2></hr>"
if(search_key==None):
response = response+ "Enter a word to find meaning."
else:
- response = response+ self.lookup_en_ml(search_key)
+ response = response+ "<pre> "+ self.getdef(search_key,dictionary) + "</pre> "
else:
response=response % ""
return response
def get_module_name(self):
- return "English Malayalam Dictionary"
+ return "Dictionary"
def get_info(self):
- return "English Malayalam Dictionary. Dictionary is compiled by Kerala state IT Mission"
+ return "Bilingual Dictionaries"
def getInstance():
return Dictionary()
diff --git a/silpa/modules/inexactsearch/inexactsearch.py b/silpa/modules/inexactsearch/inexactsearch.py
index 0d1f35b..67e47d9 100644..100755
--- a/silpa/modules/inexactsearch/inexactsearch.py
+++ b/silpa/modules/inexactsearch/inexactsearch.py
@@ -1,6 +1,6 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
-# Paralperu
+# Approximate Search
# Copyright 2008 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
@@ -28,32 +28,11 @@ from common import *
class ApproximateSearch(SilpaModule):
- def syllabalize_ml(self, text):
- signs = [
- u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40', u'\u0d41',
- u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46', u'\u0d47', u'\u0d48',
- u'\u0d4a', u'\u0d4b', u'\u0d4c', u'\u0d4d']
- limiters = ['.','\"','\'','`','!',';',',','?']
-
- chandrakkala = u'\u0d4d'
- lst_chars = []
- for char in text:
- if char in limiters:
- lst_chars.append(char)
- elif char in signs:
- lst_chars[-1] = lst_chars[-1] + char
- else:
- try:
- if lst_chars[-1][-1] == chandrakkala:
- lst_chars[-1] = lst_chars[-1] + char
- else:
- lst_chars.append(char)
- except IndexError:
- lst_chars.append(char)
-
- return lst_chars
-
-
+ def syllabalize(self, text):
+ mm=ModuleManager()
+ syllabalizer = mm.getModuleInstance("Syllabalize")
+ return syllabalizer.syllabalize(text)
+
def bigram_search(self, str1, str2, syllable_search=False):
"""Return approximate string comparator measure (between 0.0 and 1.0)
using bigrams.
@@ -84,10 +63,19 @@ class ApproximateSearch(SilpaModule):
# Make a list of bigrams for both strings - - - - - - - - - - - - - - - - - -
#
- for i in range(1,len(str1)):
- bigr1.append(str1[i-1:i+1])
- for i in range(1,len(str2)):
- bigr2.append(str2[i-1:i+1])
+ if(syllable_search):
+ str1_syllables = self. syllabalize(str1)
+ str2_syllables = self. syllabalize(str2)
+ for i in range(1,len(str1_syllables)):
+ bigr1.append(str1_syllables[i-1:i+1])
+ for i in range(1,len(str2_syllables)):
+ bigr2.append(str2_syllables[i-1:i+1])
+ else:
+ for i in range(1,len(str1)):
+ bigr1.append(str1[i-1:i+1])
+ for i in range(1,len(str2)):
+ bigr2.append(str2[i-1:i+1])
+
# Compute average number of bigrams - - - - - - - - - - - - - - - - - - - - -
#
@@ -105,11 +93,22 @@ class ApproximateSearch(SilpaModule):
else:
short_bigr = bigr2
long_bigr = bigr1
-
- for b in short_bigr:
- if (b in long_bigr):
- common += 1.0
- long_bigr[long_bigr.index(b)] = [] # Mark this bigram as counted
+ if(syllable_search):
+ for b in short_bigr:
+ if (b in long_bigr):
+ if long_bigr.index(b) == short_bigr.index(b) :
+ common += 1.0
+ else:
+ dislocation=(long_bigr.index(b) - short_bigr.index(b))/ average
+ if dislocation < 0 :
+ dislocation = dislocation * -1
+ common += 1.0 - dislocation
+ long_bigr[long_bigr.index(b)] = [] # Mark this bigram as counted
+ else:
+ for b in short_bigr:
+ if (b in long_bigr):
+ common += 1.0
+ long_bigr[long_bigr.index(b)] = [] # Mark this bigram as counted
w = common / average
if(w>=0.6):
@@ -139,30 +138,47 @@ class ApproximateSearch(SilpaModule):
<form action="" method="post">
<textarea cols='100' rows='25' name='input_text' id='input_text'>%s</textarea>
<br/>
- <input type="text" name="search_key" value="%s"/>
- <input type="submit" id="Hyphenate" value="Approximate Search" name="action" style="width:12em;"/>
+ <p align="center">
+ Search :<input type="text" name="search_key" value="%s"/>
+ Algorithm : <select id="algorithm" name="algorithm" value="%s" style="width:12em;">
+ <option value="sb">Syllable Bigram</option>
+ <option value="lb">Letter Bigram</option>
+ </select>
</br>
+ <input type="hidden" name="action" value="Approximate Search">
+
+ <input type="submit" id="ApproximateSearch" value="Search" style="width:12em;"/>
+ </p>
</form>
"""
+ algorithm = 'sb'
+ if(form.has_key('algorithm')):
+ algorithm = form['algorithm'].value
if(form.has_key('input_text')):
text = action=form['input_text'].value .decode('utf-8')
if(form.has_key('search_key')):
- key = action=form['search_key'].value .decode('utf-8')
- response=response % (text,key)
+ key =form['search_key'].value .decode('utf-8')
+ response=response % (text,key,algorithm)
words=text.split(" ")
response = response+"<h2>Search Results</h2></hr>"
response = response+"<p>Words in green are with exact match. Words in Yellow are with approximate Match."
response = response+" Move your mouse pointer over the words to get more information on matching.</p></hr>"
else:
response = response+ "Enter a string to search."
- return response % (text,"")
+ return response % (text,"", algorithm)
for word in words:
word=word.strip()
if(word>""):
- response = response+ self.bigram_search(word, key)
+ if word[0]>'0' and word[0]<'Z':
+ response = response+ self.bigram_search(word, key,False)
+ else:
+ if algorithm == 'sb':
+ response = response+ self.bigram_search(word, key, True)
+ else:
+ response = response+ self.bigram_search(word, key, False)
response = response+ "<div style='float: left;'>&nbsp;</div>"
else:
- response=response % ("","")
+ response=response % ("","","sb")
return response
def get_module_name(self):
return "Approximate Search"
diff --git a/silpa/modules/syllabalizer/syllabalizer.py b/silpa/modules/syllabalizer/syllabalizer.py
index 706ee77..39c140f 100644..100755
--- a/silpa/modules/syllabalizer/syllabalizer.py
+++ b/silpa/modules/syllabalizer/syllabalizer.py
@@ -51,6 +51,54 @@ class Syllabalizer(SilpaModule):
lst_chars.append(char)
return lst_chars
+ def syllabalize_kn(self,text):
+ signs = [
+ u'\u0c82', u'\u0c83', u'\u0cbd', u'\u0cbe', u'\u0cbf', u'\u0cc0', u'\u0cc1',
+ u'\u0cc2', u'\u0cc3', u'\u0cc4', u'\u0cc6', u'\u0cc7', u'\u0cc8',
+ u'\u0cca', u'\u0ccb', u'\u0ccc', u'\u0ccd']
+ limiters = ['.','\"','\'','`','!',';',',','?']
+
+ halant = u'\u0ccd'
+ lst_chars = []
+ for char in text:
+ if char in limiters:
+ lst_chars.append(char)
+ elif char in signs:
+ lst_chars[-1] = lst_chars[-1] + char
+ else:
+ try:
+ if lst_chars[-1][-1] == halant:
+ lst_chars[-1] = lst_chars[-1] + char
+ else:
+ lst_chars.append(char)
+ except IndexError:
+ lst_chars.append(char)
+
+ return lst_chars
+ def syllabalize_bn(self,text):
+ signs = [
+ u'\u0981', u'\u0982', u'\u0983', u'\u09bd', u'\u09be', u'\u09bf', u'\u09c0', u'\u09c1',
+ u'\u09c2', u'\u09c3', u'\u09c4', u'\u09c6', u'\u09c7', u'\u09c8',
+ u'\u09ca', u'\u09cb', u'\u09cc', u'\u09cd', u'\u09d7']
+ limiters = ['.','\"','\'','`','!',';',',','?']
+
+ halant = u'\u09cd'
+ lst_chars = []
+ for char in text:
+ if char in limiters:
+ lst_chars.append(char)
+ elif char in signs:
+ lst_chars[-1] = lst_chars[-1] + char
+ else:
+ try:
+ if lst_chars[-1][-1] == halant:
+ lst_chars[-1] = lst_chars[-1] + char
+ else:
+ lst_chars.append(char)
+ except IndexError:
+ lst_chars.append(char)
+
+ return lst_chars
def syllabalize_hi(self,text):
signs = [
u'\u0902', u'\u0903', u'\u093e', u'\u093f', u'\u0940', u'\u0941',
@@ -176,11 +224,19 @@ class Syllabalizer(SilpaModule):
def syllabalize(self,text):
mm=ModuleManager()
ld = mm.getModuleInstance("Detect Language")
- lang=ld.detect_lang(text)[text]
+ lang = None
+ try:
+ lang=ld.detect_lang(text)[text]
+ except:
+ pass #FIXME
if(lang=="ml_IN"):
return self.syllabalize_ml(text)
if(lang=="hi_IN"):
return self.syllabalize_hi(text)
+ if(lang=="kn_IN"):
+ return self.syllabalize_kn(text)
+ if(lang=="bn_IN"):
+ return self.syllabalize_bn(text)
if(lang=="en_US"):
return self.syllabalize_en(text)
lst_chars=[]
diff --git a/silpa/utils/langdetect.py b/silpa/utils/langdetect.py
index 727ea6d..2ed7c6f 100644
--- a/silpa/utils/langdetect.py
+++ b/silpa/utils/langdetect.py
@@ -21,27 +21,39 @@ class LangDetect(SilpaModule):
index = 0
while index < length:
letter=word[index]
+ if not letter.isalpha():
+ index=index+1
+ continue
if ((letter >= u'ം') & (letter <=u'൯')):
result_dict[word]= "ml_IN"
+ break;
if ((letter >= u'ঁ') & (letter <= u'৺')):
result_dict[word]= "bn_IN"
+ break
if ((letter >= u'ँ') & (letter <= u'ॿ')):
result_dict[word]= "hi_IN"
+ break
if ((letter >=u'ઁ') & (letter <= u'૱')):
result_dict[word]= "gu_IN"
+ break
if ((letter >= u'ਁ') & (letter <=u'ੴ')):
result_dict[word]= "pa_IN"
+ break
if ((letter >= u'ಂ') & (letter <=u'ೲ')):
- result_dict[word]= "ka_IN"
+ result_dict[word]= "kn_IN"
+ break
if ((letter >= u'ଁ') & (letter <= u'ୱ')):
result_dict[word]= "or_IN"
+ break
if ((letter >=u'ஂ') & (letter <= u'௺')):
result_dict[word]= "ta_IN"
+ break
if ((letter >=u'ఁ') & (letter <= u'౯')):
result_dict[word]= "te_IN"
+ break
if ((letter <= u'z')):
result_dict[word]= "en_US"
-
+ break
index=index+1
word_iter=word_iter+1
return result_dict
diff --git a/silpa/utils/silpautils.py b/silpa/utils/silpautils.py
index 02556af..25a3df8 100644
--- a/silpa/utils/silpautils.py
+++ b/silpa/utils/silpautils.py
@@ -14,6 +14,24 @@ def getModulesList():
if(item.startswith("SILPA_ACTION.")):
action_dict[item.replace("SILPA_ACTION.","")]=conf_dict[item]
return action_dict
+def getStaticContent(page):
+ try:
+ return open("doc/"+page).read()
+ except:
+ return "Could not find the requested page "+ page
+def handleStats():
+ Hits="0"
+ try:
+ InFile = open("count.dat", "r") # Text file with total hits
+ Hits = InFile.readline()
+ except:
+ pass
+ x = int(Hits) + 1
+ h = str(x)
+ OutFile = open("count.dat", "w")
+ OutFile.write(str(x))
+ OutFile.close()
+
def loadConfiguration():
conf_dict={}
conffile = codecs. open("silpa.conf",encoding='utf-8', errors='ignore')