summaryrefslogtreecommitdiffstats
path: root/silpa/modules/inexactsearch/inexactsearch.py
diff options
context:
space:
mode:
Diffstat (limited to 'silpa/modules/inexactsearch/inexactsearch.py')
-rw-r--r--silpa/modules/inexactsearch/inexactsearch.py173
1 files changed, 173 insertions, 0 deletions
diff --git a/silpa/modules/inexactsearch/inexactsearch.py b/silpa/modules/inexactsearch/inexactsearch.py
new file mode 100644
index 0000000..0d1f35b
--- /dev/null
+++ b/silpa/modules/inexactsearch/inexactsearch.py
@@ -0,0 +1,173 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+# Paralperu
+# Copyright 2008 Santhosh Thottingal <santhosh.thottingal@gmail.com>
+# http://www.smc.org.in
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# If you find any bugs or have any suggestions email: santhosh.thottingal@gmail.com
+# URL: http://www.smc.org.in
+
+
+import sys
+import re
+from common import *
+
+class ApproximateSearch(SilpaModule):
+
+ def syllabalize_ml(self, text):
+ signs = [
+ u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40', u'\u0d41',
+ u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46', u'\u0d47', u'\u0d48',
+ u'\u0d4a', u'\u0d4b', u'\u0d4c', u'\u0d4d']
+ limiters = ['.','\"','\'','`','!',';',',','?']
+
+ chandrakkala = u'\u0d4d'
+ lst_chars = []
+ for char in text:
+ if char in limiters:
+ lst_chars.append(char)
+ elif char in signs:
+ lst_chars[-1] = lst_chars[-1] + char
+ else:
+ try:
+ if lst_chars[-1][-1] == chandrakkala:
+ lst_chars[-1] = lst_chars[-1] + char
+ else:
+ lst_chars.append(char)
+ except IndexError:
+ lst_chars.append(char)
+
+ return lst_chars
+
+
+ def bigram_search(self, str1, str2, syllable_search=False):
+ """Return approximate string comparator measure (between 0.0 and 1.0)
+ using bigrams.
+ USAGE:
+ score = bigram(str1, str2)
+
+ ARGUMENTS:
+ str1 The first string
+ str2 The second string
+
+ DESCRIPTION:
+ Bigrams are two-character sub-strings contained in a string. For example,
+ 'peter' contains the bigrams: pe,et,te,er.
+
+ This routine counts the number of common bigrams and divides by the
+ average number of bigrams. The resulting number is returned.
+ """
+
+ # Quick check if the strings are the same - - - - - - - - - - - - - - - - - -
+ #
+ if (str1 == str2):
+ result_string = "<div style='float: left; background-color: green;' title=\" Bigram comparator : string1: %s, string2: %s. Exact Match found" % (str1, str2)
+ result_string = result_string + "\">"+str1+ "</div>"
+ return result_string
+
+ bigr1 = []
+ bigr2 = []
+
+ # Make a list of bigrams for both strings - - - - - - - - - - - - - - - - - -
+ #
+ for i in range(1,len(str1)):
+ bigr1.append(str1[i-1:i+1])
+ for i in range(1,len(str2)):
+ bigr2.append(str2[i-1:i+1])
+
+ # Compute average number of bigrams - - - - - - - - - - - - - - - - - - - - -
+ #
+ average = (len(bigr1)+len(bigr2)) / 2.0
+ if (average == 0.0):
+ return str1
+
+ # Get common bigrams - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+ #
+ common = 0.0
+
+ if (len(bigr1) < len(bigr2)): # Count using the shorter bigram list
+ short_bigr = bigr1
+ long_bigr = bigr2
+ else:
+ short_bigr = bigr2
+ long_bigr = bigr1
+
+ for b in short_bigr:
+ if (b in long_bigr):
+ common += 1.0
+ long_bigr[long_bigr.index(b)] = [] # Mark this bigram as counted
+
+ w = common / average
+ if(w>=0.6):
+ result_string = "<div style='float: left; background-color: yellow;' title=\" Bigram comparator string 1: %s, string 2: %s" % (str1, str2)
+ else:
+ if((w>0.4) & (w<0.6)):
+ result_string = "<div style='float: left; background-color: grey;' title=\" Bigram comparator string 1: %s, string 2: %s" % (str1, str2)
+ else:
+ result_string = "<div style='float: left;' title=\" Bigram comparator string1: %s, string2: %s" % (str1, str2)
+ result_string = result_string + " Number of bigrams in String1: %i" % (len(bigr1))
+ result_string = result_string + " Number of bigrams in String2: %i" % (len(bigr2))
+ result_string = result_string + " Average: %i" % (average)
+ result_string = result_string + " Common: %i" % (common)
+ result_string = result_string + " Final approximate string weight: " + str(w)
+ result_string = result_string + "\">"+str1+ "</div>"
+ return result_string
+ def process(self,form):
+ response = """
+ <h2>Inexact Search</h2></hr>
+ <p>The search performed by search engines on Indic text is not effective.
+ It does not take care of the inflective or agglutinative nature of the language.
+ This application tries to solve that by using an inexact search algorithm based on maximum common bigram algorithm.
+
+ </p>
+ <p>Enter the text for searching in the below text area.
+ </p>
+ <form action="" method="post">
+ <textarea cols='100' rows='25' name='input_text' id='input_text'>%s</textarea>
+ <br/>
+ <input type="text" name="search_key" value="%s"/>
+ <input type="submit" id="Hyphenate" value="Approximate Search" name="action" style="width:12em;"/>
+ </br>
+ </form>
+ """
+ if(form.has_key('input_text')):
+ text = action=form['input_text'].value .decode('utf-8')
+ if(form.has_key('search_key')):
+ key = action=form['search_key'].value .decode('utf-8')
+ response=response % (text,key)
+ words=text.split(" ")
+ response = response+"<h2>Search Results</h2></hr>"
+ response = response+"<p>Words in green are with exact match. Words in Yellow are with approximate Match."
+ response = response+" Move your mouse pointer over the words to get more information on matching.</p></hr>"
+ else:
+ response = response+ "Enter a string to search."
+ return response % (text,"")
+ for word in words:
+ word=word.strip()
+ if(word>""):
+ response = response+ self.bigram_search(word, key)
+ response = response+ "<div style='float: left;'>&nbsp;</div>"
+ else:
+ response=response % ("","")
+ return response
+ def get_module_name(self):
+ return "Approximate Search"
+ def get_info(self):
+ return "Approximate Search for a string in the given text. Based on bigram search algorithm"
+
+def getInstance():
+ return ApproximateSearch()