summary refs log tree commit diff stats
path: root/silpa
diff options
context:
space:
mode:
author Santhosh Thottingal <santhosh.thottingal@gmail.com> 2009-03-24 22:58:13 +0530
committer Santhosh Thottingal <santhosh.thottingal@gmail.com> 2009-03-24 22:58:13 +0530
commit 8ce8904c7366944801f426af20dec665a2f410cf (patch)
tree abaf461423ec91606173fa4d5c968e072ae69ac7 /silpa
parent 8de2fcccd538d72730a4b34cf75f1b44294f8435 (diff)
download AnjaliOldLipi.git-8ce8904c7366944801f426af20dec665a2f410cf.tar.gz
AnjaliOldLipi.git-8ce8904c7366944801f426af20dec665a2f410cf.tar.xz
AnjaliOldLipi.git-8ce8904c7366944801f426af20dec665a2f410cf.zip
Hyphenator module-adding to base template
Diffstat (limited to 'silpa')
-rwxr-xr-x [-rw-r--r--] silpa/modules/hyphenator/hyphenator.py 303
-rwxr-xr-x [-rw-r--r--] silpa/silpa.py 4
-rw-r--r-- silpa/templates/base.py 6
3 files changed, 277 insertions, 36 deletions
diff --git a/silpa/modules/hyphenator/hyphenator.py b/silpa/modules/hyphenator/hyphenator.py
index 9efe8de..288af50 100644..100755
--- a/silpa/modules/hyphenator/hyphenator.py
+++ b/silpa/modules/hyphenator/hyphenator.py
@@ -1,35 +1,272 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
-# Malayalam Rule Based Normalizer
-# Copyright 2008 Santhosh Thottingal <santhosh.thottingal@gmail.com>,
-# http://www.smc.org.in
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-#
-# If you find any bugs or have any suggestions email: santhosh.thottingal@gmail.com
-# URL: http://www.smc.org.in
-
-
-import string
-
-class Hyphenator:
-
- def __init__(self):
- self.langauge =""
-
- def Normalize(self, word):
-
- return word
-
+"""
+
+This is a Pure Python module to hyphenate text.
+
+It is inspired by Ruby's Text::Hyphen, but currently reads standard *.dic files,
+that must be installed separately.
+
+In the future it might be nice if dictionaries could be distributed together with
+this module, in a slightly prepared form, like in Ruby's Text::Hyphen.
+
+Wilbert Berendsen, March 2008
+info@wilbertberendsen.nl
+
+License: LGPL.
+
+"""
+
+import sys
+import re
+from utils import *
+__all__ = ("Hyphenator")
+
+# cache of per-file Hyph_dict objects
+hdcache = {}
+
+# precompile some stuff
+parse_hex = re.compile(r'\^{2}([0-9a-f]{2})').sub
+parse = re.compile(r'(\d?)(\D?)').findall
+
+def hexrepl(matchObj):
+ return unichr(int(matchObj.group(1), 16))
+
+
+class parse_alt(object):
+ """
+ Parse nonstandard hyphen pattern alternative.
+ The instance returns a special int with data about the current position
+ in the pattern when called with an odd value.
+ """
+ def __init__(self, pat, alt):
+ alt = alt.split(',')
+ self.change = alt[0]
+ if len(alt) > 2:
+ self.index = int(alt[1])
+ self.cut = int(alt[2]) + 1
+ else:
+ self.index = 1
+ self.cut = len(re.sub(r'[\d\.]', '', pat)) + 1
+ if pat.startswith('.'):
+ self.index += 1
+
+ def __call__(self, val):
+ self.index -= 1
+ val = int(val)
+ if val & 1:
+ return dint(val, (self.change, self.index, self.cut))
+ else:
+ return val
+
+
+class dint(int):
+ """
+ Just an int some other data can be stuck to in a data attribute.
+ Call with ref=other to use the data from the other dint.
+ """
+ def __new__(cls, value, data=None, ref=None):
+ obj = int.__new__(cls, value)
+ if ref and type(ref) == dint:
+ obj.data = ref.data
+ else:
+ obj.data = data
+ return obj
+
+
+class Hyph_dict(object):
+ """
+ Reads a hyph_*.dic file and stores the hyphenation patterns.
+ Parameters:
+ -filename : filename of hyph_*.dic to read
+ """
+ def __init__(self, filename):
+ self.patterns = {}
+ f = open(filename)
+ charset = f.readline().strip()
+ if charset.startswith('charset '):
+ charset = charset[8:].strip()
+
+ for pat in f:
+ pat = pat.decode(charset).strip()
+ if not pat or pat[0] == '%': continue
+ # replace ^^hh with the real character
+ pat = parse_hex(hexrepl, pat)
+ # read nonstandard hyphen alternatives
+ if '/' in pat:
+ pat, alt = pat.split('/', 1)
+ factory = parse_alt(pat, alt)
+ else:
+ factory = int
+ tag, value = zip(*[(s, factory(i or "0")) for i, s in parse(pat)])
+ # if only zeros, skip this pattern
+ if max(value) == 0: continue
+ # chop zeros from beginning and end, and store start offset.
+ start, end = 0, len(value)
+ while not value[start]: start += 1
+ while not value[end-1]: end -= 1
+ self.patterns[''.join(tag)] = start, value[start:end]
+ f.close()
+ self.cache = {}
+ self.maxlen = max(map(len, self.patterns.keys()))
+
+ def positions(self, word):
+ """
+ Returns a list of positions where the word can be hyphenated.
+ E.g. for the dutch word 'lettergrepen' this method returns
+ the list [3, 6, 9].
+
+ Each position is a 'data int' (dint) with a data attribute.
+ If the data attribute is not None, it contains a tuple with
+ information about nonstandard hyphenation at that point:
+ (change, index, cut)
+
+ change: is a string like 'ff=f', that describes how hyphenation
+ should take place.
+ index: where to substitute the change, counting from the current
+ point
+ cut: how many characters to remove while substituting the nonstandard
+ hyphenation
+ """
+ word = word.lower()
+ points = self.cache.get(word)
+ if points is None:
+ prepWord = '.%s.' % word
+ res = [0] * (len(prepWord) + 1)
+ for i in range(len(prepWord) - 1):
+ for j in range(i + 1, min(i + self.maxlen, len(prepWord)) + 1):
+ p = self.patterns.get(prepWord[i:j])
+ if p:
+ offset, value = p
+ s = slice(i + offset, i + offset + len(value))
+ res[s] = map(max, value, res[s])
+
+ points = [dint(i - 1, ref=r) for i, r in enumerate(res) if r % 2]
+ self.cache[word] = points
+ return points
+
+
+class Hyphenator(object):
+ """
+ Reads a hyph_*.dic file and stores the hyphenation patterns.
+ Provides methods to hyphenate strings in various ways.
+ Parameters:
+ -filename : filename of hyph_*.dic to read
+ -left: make the first syllable not shorter than this
+ -right: make the last syllable not shorter than this
+ -cache: if true (default), use a cached copy of the dic file, if possible
+
+ left and right may also later be changed:
+ h = Hyphenator(file)
+ h.left = 1
+ """
+ def __init__(self, filename, left=2, right=2, cache=True):
+ self.left = left
+ self.right = right
+ if not cache or filename not in hdcache:
+ hdcache[filename] = Hyph_dict(filename)
+ self.hd = hdcache[filename]
+
+ def positions(self, word):
+ """
+ Returns a list of positions where the word can be hyphenated.
+ See also Hyph_dict.positions. The points that are too far to
+ the left or right are removed.
+ """
+ right = len(word) - self.right
+ return [i for i in self.hd.positions(word) if self.left <= i <= right]
+
+ def iterate(self, word):
+ """
+ Iterate over all hyphenation possibilities, the longest first.
+ """
+ if isinstance(word, str):
+ word = word.decode('latin1')
+ for p in reversed(self.positions(word)):
+ if p.data:
+ # get the nonstandard hyphenation data
+ change, index, cut = p.data
+ if word.isupper():
+ change = change.upper()
+ c1, c2 = change.split('=')
+ yield word[:p+index] + c1, c2 + word[p+index+cut:]
+ else:
+ yield word[:p], word[p:]
+
+ def wrap(self, word, width, hyphen='-'):
+ """
+ Return the longest possible first part and the last part of the
+ hyphenated word. The first part has the hyphen already attached.
+ Returns None, if there is no hyphenation point before width, or
+ if the word could not be hyphenated.
+ """
+ width -= len(hyphen)
+ for w1, w2 in self.iterate(word):
+ if len(w1) <= width:
+ return w1 + hyphen, w2
+
+ def inserted(self, word, hyphen='-'):
+ """
+ Returns the word as a string with all the possible hyphens inserted.
+ E.g. for the dutch word 'lettergrepen' this method returns
+ the string 'let-ter-gre-pen'. The hyphen string to use can be
+ given as the second parameter, that defaults to '-'.
+ """
+ if isinstance(word, str):
+ word = word.decode('latin1')
+ l = list(word)
+ for p in reversed(self.positions(word)):
+ if p.data:
+ # get the nonstandard hyphenation data
+ change, index, cut = p.data
+ if word.isupper():
+ change = change.upper()
+ l[p + index : p + index + cut] = change.replace('=', hyphen)
+ else:
+ l.insert(p, hyphen)
+ return ''.join(l)
+
+ __call__ = iterate
+
+
+class Hyphenate:
+
+ def hyphenate(self,text):
+ ldetector=LangDetect()
+ detected_lang_dict =ldetector.detect_lang(text)
+ response=""
+ for key in detected_lang_dict:
+ hyphenator=Hyphenator("./rules/"+detected_lang_dict[key],left=1, right=1)
+ response=response+hyphenator.inserted(key)
+ return response
+ def process(self, form):
+ response = """
+ <h2>Hyphenation</h2></hr>
+ <p>Enter the text for hyphenation in the below text area.
+ You can give the text in any language and even with mixed language
+ </p>
+ <form action="" method="post">
+ <textarea cols='100' rows='25' name='input_text' id='input_text'>%s</textarea>
+ <input type="submit" id="Hyphenate" value="Hyphenate" name="action" style="width:12em;"/>
+ <input type="reset" value="Clear" style="width:12em;"/>
+ </br>
+ </form>
+ """
+ if(form.has_key('input_text')):
+ text = action=form['input_text'].value .decode('utf-8')
+ response=response % text
+ text = self.hyphenate(text)
+ response = response+"<h2>Language Detection Results</h2></hr>"
+ response = response+"<textarea cols='100' rows='25' name='output_text' id='output_text'>"+text+"</textarea>"
+ else:
+ response=response % ""
+ return response
+if __name__ == "__main__":
+ h=Hyphenate()
+ dict_file = sys.argv[1]
+ word = sys.argv[2].decode('utf-8')
+
+ h = Hyphenator(dict_file, left=1, right=1)
+ for i in h(word):
+ print i
+
diff --git a/silpa/silpa.py b/silpa/silpa.py
index a658877..377f1bc 100644..100755
--- a/silpa/silpa.py
+++ b/silpa/silpa.py
@@ -21,6 +21,10 @@ def index(form):
response.setBreadcrumb(action)
ldetector=LangDetect()
response. setContent(ldetector.process(form))
+ if(action=="Hyphenate"):
+ response.setBreadcrumb(action)
+ hyphenator=Hyphenate()
+ response. setContent(hyphenator.process(form))
response.setBreadcrumb("Coming Soon")
response.setContent("Not implemented in current version...!")
return response.toString();
diff --git a/silpa/templates/base.py b/silpa/templates/base.py
index 00d6586..c4be9c4 100644
--- a/silpa/templates/base.py
+++ b/silpa/templates/base.py
@@ -63,7 +63,7 @@ http://creativecommons.org/licenses/GPL/2.0/
<ul id="nav-secondary">
<li class="first"><a href="?action=Detect+Language">Language Detection</a></li>
- <li><a href="?action=spellcheck">Spellcheck</a></li>
+ <li><a href="?action=Spellcheck">Spellcheck</a></li>
<li class="active"><a href="#">Font Conversion</a>
<ul>
<li class="first"><a href="?action=To+Unicode">Ascii to Unicode</a></li>
@@ -72,8 +72,8 @@ http://creativecommons.org/licenses/GPL/2.0/
</li>
<li><a href="#">Lemmatizer</a></li>
<li><a href="#">Normalizer</a></li>
-
- <li class="last"><a href="#">Sort</a></li>
+ <li class="last"><a href="?action=Sort">Sort</a></li>
+ <li class="last"><a href="?action=Hyphenate">Hyphenate</a></li>
</ul>
</div>