diff options
Diffstat (limited to 'silpa/modules/inexactsearch/inexactsearch.py')
-rw-r--r-- | silpa/modules/inexactsearch/inexactsearch.py | 173 |
1 files changed, 173 insertions, 0 deletions
diff --git a/silpa/modules/inexactsearch/inexactsearch.py b/silpa/modules/inexactsearch/inexactsearch.py new file mode 100644 index 0000000..0d1f35b --- /dev/null +++ b/silpa/modules/inexactsearch/inexactsearch.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- +# Paralperu +# Copyright 2008 Santhosh Thottingal <santhosh.thottingal@gmail.com> +# http://www.smc.org.in +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
#
# If you find any bugs or have any suggestions email: santhosh.thottingal@gmail.com
# URL: http://www.smc.org.in


import sys
import re

try:
    from html import escape as _stdlib_escape  # Python 3
except ImportError:
    from cgi import escape as _stdlib_escape  # Python 2

from common import *


def _html_escape(text):
    """Escape ``&``, ``<``, ``>`` and double quotes in *text* for HTML output."""
    return _stdlib_escape(text, quote=True)


def _form_text(form, name):
    """Return the value of form field *name* as text.

    Byte strings are decoded as UTF-8; values that are already text are
    returned unchanged.
    """
    value = form[name].value
    if isinstance(value, bytes):
        value = value.decode('utf-8')
    return value


class ApproximateSearch(SilpaModule):
    """Module that highlights the words of a text that resemble a search key."""

    def syllabalize_ml(self, text):
        """Group the characters of *text* into syllable-like units.

        A character listed in ``signs`` is glued to the preceding unit, and
        so is any non-limiter character that follows a unit ending in the
        chandrakkala (U+0D4D).  Punctuation listed in ``limiters`` always
        starts a unit of its own.

        Returns a list of strings whose concatenation equals *text*.
        """
        signs = [
            u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40', u'\u0d41',
            u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46', u'\u0d47', u'\u0d48',
            u'\u0d4a', u'\u0d4b', u'\u0d4c', u'\u0d4d']
        limiters = ['.', '\"', '\'', '`', '!', ';', ',', '?']

        chandrakkala = u'\u0d4d'
        lst_chars = []
        for char in text:
            # A leading character has nothing to attach to, even when it is a
            # sign; start a new unit instead of indexing an empty list.
            if char in limiters or not lst_chars:
                lst_chars.append(char)
            elif char in signs or lst_chars[-1][-1] == chandrakkala:
                lst_chars[-1] += char
            else:
                lst_chars.append(char)

        return lst_chars

    def bigram_search(self, str1, str2, syllable_search=False):
        """Compare two strings by their common bigrams and render the result.

        USAGE:
          html = self.bigram_search(str1, str2)

        ARGUMENTS:
          str1             The word to display
          str2             The string it is compared with
          syllable_search  Accepted for interface compatibility; currently
                           not used by the comparison

        DESCRIPTION:
          Bigrams are two-character sub-strings contained in a string. For
          example, 'peter' contains the bigrams: pe,et,te,er.

          The weight is the number of common bigrams divided by the average
          number of bigrams of the two strings.

        RETURNS:
          An HTML fragment: str1 (HTML-escaped) inside a floated <div> whose
          title attribute describes the comparison.  The background is green
          for an exact match, yellow for a weight >= 0.6, grey for a weight
          strictly between 0.4 and 0.6 and unset otherwise.  When the strings
          differ and neither has a bigram, the escaped str1 is returned
          without a <div>.
        """
        # Both strings end up inside markup (element text and a double-quoted
        # title attribute), so they must be escaped before interpolation.
        safe1 = _html_escape(str1)
        safe2 = _html_escape(str2)

        # Quick check if the strings are the same
        if str1 == str2:
            result_string = (
                "<div style='float: left; background-color: green;' "
                "title=\" Bigram comparator : string1: %s, string2: %s. "
                "Exact Match found" % (safe1, safe2))
            return result_string + "\">" + safe1 + "</div>"

        # Make a list of bigrams for both strings
        bigr1 = [str1[i - 1:i + 1] for i in range(1, len(str1))]
        bigr2 = [str2[i - 1:i + 1] for i in range(1, len(str2))]

        # Compute average number of bigrams
        average = (len(bigr1) + len(bigr2)) / 2.0
        if average == 0.0:
            return safe1

        # Count using the shorter bigram list; work on a copy of the longer
        # one so that every bigram in it is matched at most once.
        if len(bigr1) < len(bigr2):
            short_bigr, unmatched = bigr1, list(bigr2)
        else:
            short_bigr, unmatched = bigr2, list(bigr1)

        common = 0
        for bigram in short_bigr:
            if bigram in unmatched:
                common += 1
                unmatched.remove(bigram)

        w = common / average
        if w >= 0.6:
            result_string = (
                "<div style='float: left; background-color: yellow;' "
                "title=\" Bigram comparator string 1: %s, string 2: %s"
                % (safe1, safe2))
        elif 0.4 < w < 0.6:
            result_string = (
                "<div style='float: left; background-color: grey;' "
                "title=\" Bigram comparator string 1: %s, string 2: %s"
                % (safe1, safe2))
        else:
            result_string = (
                "<div style='float: left;' "
                "title=\" Bigram comparator string1: %s, string2: %s"
                % (safe1, safe2))
        result_string += " Number of bigrams in String1: %i" % len(bigr1)
        result_string += " Number of bigrams in String2: %i" % len(bigr2)
        # The average may be fractional (e.g. 2.5); do not truncate it.
        result_string += " Average: %.1f" % average
        result_string += " Common: %i" % common
        result_string += " Final approximate string weight: " + str(w)
        result_string += "\">" + safe1 + "</div>"
        return result_string

    def process(self, form):
        """Build the HTML page for the inexact search form.

        *form* is indexed by field name and each field exposes ``.value``.
        The fields read are 'input_text' and 'search_key'.

        Without 'input_text' the empty form is returned.  With 'input_text'
        but no 'search_key' the form is returned pre-filled, followed by a
        prompt for a search string.  With both, every whitespace-separated
        word of the text is compared with the key through bigram_search and
        the rendered words are appended after the form.

        Returns the page as a string.
        """
        response = """
        <h2>Inexact Search</h2></hr>
        <p>The search performed by search engines on Indic text is not effective.
        It does not take care of the inflective or agglutinative nature of the language.
        This application tries to solve that by using an inexact search algorithm based on maximum common bigram algorithm.

        </p>
        <p>Enter the text for searching in the below text area.
        </p>
        <form action="" method="post">
        <textarea cols='100' rows='25' name='input_text' id='input_text'>%s</textarea>
        <br/>
        <input type="text" name="search_key" value="%s"/>
        <input type="submit" id="Hyphenate" value="Approximate Search" name="action" style="width:12em;"/>
        </br>
        </form>
        """
        if 'input_text' not in form:
            return response % ("", "")

        text = _form_text(form, 'input_text')
        if 'search_key' not in form:
            return (response % (_html_escape(text), "")
                    + "Enter a string to search.")

        key = _form_text(form, 'search_key')
        parts = [
            # User input is echoed back into the page, so escape it.
            response % (_html_escape(text), _html_escape(key)),
            "<h2>Search Results</h2></hr>",
            "<p>Words in green are with exact match. Words in Yellow are with approximate Match.",
            " Move your mouse pointer over the words to get more information on matching.</p></hr>",
        ]
        # split() without an argument also breaks on the newlines and tabs a
        # textarea can contain, and never yields empty words.
        for word in text.split():
            parts.append(self.bigram_search(word, key))
            # &nbsp; keeps the floated spacer from collapsing to zero width.
            parts.append("<div style='float: left;'>&nbsp;</div>")
        return "".join(parts)

    def get_module_name(self):
        """Return the display name of this module."""
        return "Approximate Search"

    def get_info(self):
        """Return a one-line description of this module."""
        return "Approximate Search for a string in the given text. Based on bigram search algorithm"


def getInstance():
    """Return a new ApproximateSearch instance."""
    return ApproximateSearch()