From 8ce8904c7366944801f426af20dec665a2f410cf Mon Sep 17 00:00:00 2001 From: Santhosh Thottingal Date: Tue, 24 Mar 2009 22:58:13 +0530 Subject: Hyphenator module-adding to base template --- silpa/modules/hyphenator/hyphenator.py | 303 +++++++++++++++++++++++++++++---- silpa/silpa.py | 4 + silpa/templates/base.py | 6 +- 3 files changed, 277 insertions(+), 36 deletions(-) mode change 100644 => 100755 silpa/modules/hyphenator/hyphenator.py mode change 100644 => 100755 silpa/silpa.py (limited to 'silpa') diff --git a/silpa/modules/hyphenator/hyphenator.py b/silpa/modules/hyphenator/hyphenator.py old mode 100644 new mode 100755 index 9efe8de..288af50 --- a/silpa/modules/hyphenator/hyphenator.py +++ b/silpa/modules/hyphenator/hyphenator.py @@ -1,35 +1,272 @@ #! /usr/bin/env python # -*- coding: utf-8 -*- -# Malayalam Rule Based Normalizer -# Copyright 2008 Santhosh Thottingal , -# http://www.smc.org.in -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Library General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -# -# If you find any bugs or have any suggestions email: santhosh.thottingal@gmail.com -# URL: http://www.smc.org.in - - -import string - -class Hyphenator: - - def __init__(self): - self.langauge ="" - - def Normalize(self, word): - - return word - +""" + +This is a Pure Python module to hyphenate text. 
import re
import sys

# Public names for ``from hyphenator import *``.  This has to be a real
# sequence: a bare parenthesised string is iterated character by character,
# which makes the star-import fail while looking up "H", "y", ...
__all__ = ("Hyphenator",)

# Cache of Hyph_dict objects, keyed by the filename they were loaded from.
hdcache = {}

# Precompiled helpers used while reading pattern lines:
#   parse_hex(repl, s) substitutes every ``^^hh`` escape found in s;
#   parse(s) splits s into (digit, non-digit) pairs, either part may be ''.
parse_hex = re.compile(r'\^{2}([0-9a-f]{2})').sub
parse = re.compile(r'(\d?)(\D?)').findall

# The text type and the "code point -> character" function have different
# names on Python 2 and Python 3; resolve them once at import time.
try:
    _unichr, _text_type = unichr, unicode
except NameError:
    _unichr, _text_type = chr, str


def _to_text(word):
    """Return *word* as text; byte strings are decoded as latin1."""
    if isinstance(word, _text_type):
        return word
    return word.decode('latin1')


def hexrepl(matchObj):
    """Replacement callback for parse_hex: turn a ``^^hh`` match into its character."""
    return _unichr(int(matchObj.group(1), 16))


class dint(int):
    """
    Just an int some other data can be stuck to in a data attribute.
    Call with ref=other to use the data from the other dint.
    """
    def __new__(cls, value, data=None, ref=None):
        obj = int.__new__(cls, value)
        # isinstance() rather than a truth test on ref: a dint whose integer
        # value is 0 is falsy but still carries data worth copying.
        if isinstance(ref, dint):
            obj.data = ref.data
        else:
            obj.data = data
        return obj


class parse_alt(object):
    """
    Parse nonstandard hyphen pattern alternative.

    The alternative has the form ``change[,index,cut]``.  When index and cut
    are omitted the change covers the whole pattern (index 1, cut equal to
    the number of letters in the pattern).

    The instance is used in place of int() while the pattern values are
    converted one by one: every call moves the stored index one position to
    the left, and a call with an odd value returns a dint carrying
    (change, index, cut), where index is relative to that hyphenation point.
    """
    def __init__(self, pat, alt):
        alt = alt.split(',')
        self.change = alt[0]
        if len(alt) > 2:
            self.index = int(alt[1])
            # cut is the exact number of characters that get replaced
            self.cut = int(alt[2])
        else:
            self.index = 1
            self.cut = len(re.sub(r'[\d\.]', '', pat))
        # a leading '.' takes up a slot in the pattern but not in the word
        if pat.startswith('.'):
            self.index += 1

    def __call__(self, val):
        self.index -= 1
        val = int(val)
        if val & 1:
            return dint(val, (self.change, self.index, self.cut))
        return val


class Hyph_dict(object):
    """
    Reads a hyph_*.dic file and stores the hyphenation patterns.
    Parameters:
    -filename : filename of hyph_*.dic to read

    Attributes:
    -patterns : maps the letters of a pattern to (start offset, values)
    -cache    : maps a lower-cased word to its list of positions
    -maxlen   : length of the longest pattern key (0 if there is none)
    """
    def __init__(self, filename):
        self.patterns = {}
        # Read bytes and decode explicitly with the charset named on the
        # first line, so the same code works on Python 2 and Python 3.
        f = open(filename, 'rb')
        try:
            charset = f.readline().strip().decode('latin1')
            if charset.startswith('charset '):
                charset = charset[8:].strip()

            for raw in f:
                pat = raw.decode(charset).strip()
                if not pat or pat[0] == '%':
                    continue
                # replace ^^hh with the real character
                pat = parse_hex(hexrepl, pat)
                # read nonstandard hyphen alternatives
                if '/' in pat:
                    pat, alt = pat.split('/', 1)
                    factory = parse_alt(pat, alt)
                else:
                    factory = int
                # the factory must see the values in pattern order, because
                # parse_alt counts its calls to track the current position
                tag, value = zip(*[(s, factory(i or "0"))
                                   for i, s in parse(pat)])
                # if only zeros, skip this pattern
                if max(value) == 0:
                    continue
                # chop zeros from beginning and end, and store start offset.
                start, end = 0, len(value)
                while not value[start]:
                    start += 1
                while not value[end - 1]:
                    end -= 1
                self.patterns[''.join(tag)] = start, value[start:end]
        finally:
            # close the file even when decoding or parsing raises
            f.close()
        self.cache = {}
        # a dictionary without any usable pattern is valid: nothing matches
        if self.patterns:
            self.maxlen = max(len(key) for key in self.patterns)
        else:
            self.maxlen = 0

    def positions(self, word):
        """
        Returns a list of positions where the word can be hyphenated.
        E.g. for the dutch word 'lettergrepen' this method returns
        the list [3, 6, 9].

        Each position is a 'data int' (dint) with a data attribute.
        If the data attribute is not None, it contains a tuple with
        information about nonstandard hyphenation at that point:
        (change, index, cut)

        change: is a string like 'ff=f', that describes how hyphenation
            should take place.
        index: where to substitute the change, counting from the current
            point
        cut: how many characters to remove while substituting the nonstandard
            hyphenation
        """
        word = word.lower()
        points = self.cache.get(word)
        if points is None:
            prepWord = '.%s.' % word
            # res[k] is the strongest value seen for the gap before prepWord[k]
            res = [0] * (len(prepWord) + 1)
            for i in range(len(prepWord) - 1):
                for j in range(i + 1, min(i + self.maxlen, len(prepWord)) + 1):
                    p = self.patterns.get(prepWord[i:j])
                    if p:
                        offset, value = p
                        s = slice(i + offset, i + offset + len(value))
                        res[s] = [max(v, r) for v, r in zip(value, res[s])]

            # odd values mark hyphenation points; i - 1 undoes the leading '.'
            points = [dint(i - 1, ref=r) for i, r in enumerate(res) if r % 2]
            self.cache[word] = points
        return points


class Hyphenator(object):
    """
    Reads a hyph_*.dic file and stores the hyphenation patterns.
    Provides methods to hyphenate strings in various ways.
    Parameters:
    -filename : filename of hyph_*.dic to read
    -left: make the first syllable not shorter than this
    -right: make the last syllable not shorter than this
    -cache: if true (default), use a cached copy of the dic file, if possible

    left and right may also later be changed:
      h = Hyphenator(file)
      h.left = 1
    """
    def __init__(self, filename, left=2, right=2, cache=True):
        self.left = left
        self.right = right
        if not cache or filename not in hdcache:
            hdcache[filename] = Hyph_dict(filename)
        self.hd = hdcache[filename]

    def positions(self, word):
        """
        Returns a list of positions where the word can be hyphenated.
        See also Hyph_dict.positions. The points that are too far to
        the left or right are removed.
        """
        word = _to_text(word)
        right = len(word) - self.right
        return [i for i in self.hd.positions(word) if self.left <= i <= right]

    def iterate(self, word):
        """
        Iterate over all hyphenation possibilities, the longest first.
        Every item is a (first part, last part) tuple without the hyphen.
        Byte strings are decoded as latin1 first.
        """
        word = _to_text(word)
        for p in reversed(self.positions(word)):
            if p.data:
                # get the nonstandard hyphenation data
                change, index, cut = p.data
                if word.isupper():
                    change = change.upper()
                c1, c2 = change.split('=')
                yield word[:p + index] + c1, c2 + word[p + index + cut:]
            else:
                yield word[:p], word[p:]

    def wrap(self, word, width, hyphen='-'):
        """
        Return the longest possible first part and the last part of the
        hyphenated word. The first part has the hyphen already attached.
        Returns None, if there is no hyphenation point before width, or
        if the word could not be hyphenated.
        """
        width -= len(hyphen)
        for w1, w2 in self.iterate(word):
            if len(w1) <= width:
                return w1 + hyphen, w2

    def inserted(self, word, hyphen='-'):
        """
        Returns the word as a string with all the possible hyphens inserted.
        E.g. for the dutch word 'lettergrepen' this method returns
        the string 'let-ter-gre-pen'. The hyphen string to use can be
        given as the second parameter, that defaults to '-'.
        Byte strings are decoded as latin1 first.
        """
        word = _to_text(word)
        l = list(word)
        # work from right to left so earlier positions stay valid
        for p in reversed(self.positions(word)):
            if p.data:
                # get the nonstandard hyphenation data
                change, index, cut = p.data
                if word.isupper():
                    change = change.upper()
                l[p + index: p + index + cut] = change.replace('=', hyphen)
            else:
                l.insert(p, hyphen)
        return ''.join(l)

    __call__ = iterate

Hyphenation

+

Enter the text for hyphenation in the below text area. + You can give the text in any language and even with mixed language +

+
+ + + +
+
+ """ + if(form.has_key('input_text')): + text = action=form['input_text'].value .decode('utf-8') + response=response % text + text = self.hyphenate(text) + response = response+"

Language Detection Results

" + response = response+"" + else: + response=response % "" + return response +if __name__ == "__main__": + h=Hyphenate() + dict_file = sys.argv[1] + word = sys.argv[2].decode('utf-8') + + h = Hyphenator(dict_file, left=1, right=1) + for i in h(word): + print i + diff --git a/silpa/silpa.py b/silpa/silpa.py old mode 100644 new mode 100755 index a658877..377f1bc --- a/silpa/silpa.py +++ b/silpa/silpa.py @@ -21,6 +21,10 @@ def index(form): response.setBreadcrumb(action) ldetector=LangDetect() response. setContent(ldetector.process(form)) + if(action=="Hyphenate"): + response.setBreadcrumb(action) + hyphenator=Hyphenate() + response. setContent(hyphenator.process(form)) response.setBreadcrumb("Coming Soon") response.setContent("Not implemented in current version...!") return response.toString(); diff --git a/silpa/templates/base.py b/silpa/templates/base.py index 00d6586..c4be9c4 100644 --- a/silpa/templates/base.py +++ b/silpa/templates/base.py @@ -63,7 +63,7 @@ http://creativecommons.org/licenses/GPL/2.0/