diff options
Diffstat (limited to 'silpa/modules/hyphenator/hyphenator.py')
-rw-r--r-- | silpa/modules/hyphenator/hyphenator.py | 281 |
1 files changed, 281 insertions, 0 deletions
"""

This is a Pure Python module to hyphenate text.

It is inspired by Ruby's Text::Hyphen, but currently reads standard *.dic files,
that must be installed separately.

In the future it's maybe nice if dictionaries could be distributed together with
this module, in a slightly prepared form, like in Ruby's Text::Hyphen.

Wilbert Berendsen, March 2008
info@wilbertberendsen.nl

License: LGPL.

"""

import sys
import re

try:
    from html import escape as _escape_html  # Python 3
except ImportError:
    from cgi import escape as _escape_html  # Python 2

from common import *

try:
    unichr
except NameError:
    # Python 3 has no unichr(); chr() covers the full Unicode range there.
    unichr = chr

# cache of per-file Hyph_dict objects
hdcache = {}

# precompile some stuff
parse_hex = re.compile(r'\^{2}([0-9a-f]{2})').sub
parse = re.compile(r'(\d?)(\D?)').findall


def hexrepl(matchObj):
    """Return the character for the two hex digits captured by a ^^hh match."""
    return unichr(int(matchObj.group(1), 16))


class parse_alt(object):
    """
    Parse nonstandard hyphen pattern alternative.
    The instance returns a special int with data about the current position
    in the pattern when called with an odd value.
    """
    def __init__(self, pat, alt):
        # alt has the form "change[,index,cut]"
        alt = alt.split(',')
        self.change = alt[0]
        if len(alt) > 2:
            self.index = int(alt[1])
            self.cut = int(alt[2]) + 1
        else:
            # no explicit index/cut: derive the cut from the number of
            # characters in the pattern that are neither digits nor dots
            self.index = 1
            self.cut = len(re.sub(r'[\d\.]', '', pat)) + 1
        if pat.startswith('.'):
            self.index += 1

    def __call__(self, val):
        # Called once per parsed (digit, char) pair of the pattern, so index
        # counts down as the position in the pattern advances.
        self.index -= 1
        val = int(val)
        if val & 1:
            return dint(val, (self.change, self.index, self.cut))
        return val


class dint(int):
    """
    Just an int some other data can be stuck to in a data attribute.
    Call with ref=other to use the data from the other dint.
    """
    def __new__(cls, value, data=None, ref=None):
        obj = int.__new__(cls, value)
        # isinstance() rather than a truthiness/type() test, so that a dint
        # of value 0 or a dint subclass still hands over its data.
        if isinstance(ref, dint):
            obj.data = ref.data
        else:
            obj.data = data
        return obj


class Hyph_dict(object):
    """
    Reads a hyph_*.dic file and stores the hyphenation patterns.
    Parameters:
    -filename : filename of hyph_*.dic to read
    """
    def __init__(self, filename):
        self.patterns = {}
        # Binary mode: every line is decoded explicitly with the charset
        # named on the first line.  The with-block closes the file even when
        # decoding or parsing fails.
        with open(filename, 'rb') as f:
            charset = f.readline().strip()
            if charset.startswith(b'charset '):
                charset = charset[8:].strip()
            charset = charset.decode('ascii')

            for pat in f:
                pat = pat.decode(charset).strip()
                if not pat or pat[0] == '%':
                    continue
                # replace ^^hh with the real character
                pat = parse_hex(hexrepl, pat)
                # read nonstandard hyphen alternatives
                if '/' in pat:
                    pat, alt = pat.split('/', 1)
                    factory = parse_alt(pat, alt)
                else:
                    factory = int
                tag, value = zip(*[(s, factory(i or "0"))
                                   for i, s in parse(pat)])
                # if only zeros, skip this pattern
                if max(value) == 0:
                    continue
                # chop zeros from beginning and end, and store start offset.
                start, end = 0, len(value)
                while not value[start]:
                    start += 1
                while not value[end - 1]:
                    end -= 1
                self.patterns[''.join(tag)] = start, value[start:end]
        self.cache = {}
        # A dictionary without any usable pattern yields maxlen 0 instead of
        # max() raising ValueError on an empty sequence.
        self.maxlen = max(map(len, self.patterns)) if self.patterns else 0

    def positions(self, word):
        """
        Returns a list of positions where the word can be hyphenated.
        E.g. for the dutch word 'lettergrepen' this method returns
        the list [3, 6, 9].

        Each position is a 'data int' (dint) with a data attribute.
        If the data attribute is not None, it contains a tuple with
        information about nonstandard hyphenation at that point:
        (change, index, cut)

        change: is a string like 'ff=f', that describes how hyphenation
            should take place.
        index: where to substitute the change, counting from the current
            point
        cut: how many characters to remove while substituting the nonstandard
            hyphenation
        """
        word = word.lower()
        points = self.cache.get(word)
        if points is None:
            # the dots mark the word boundaries, as in the patterns
            prepWord = '.%s.' % word
            res = [0] * (len(prepWord) + 1)
            for i in range(len(prepWord) - 1):
                for j in range(i + 1, min(i + self.maxlen, len(prepWord)) + 1):
                    p = self.patterns.get(prepWord[i:j])
                    if p:
                        offset, value = p
                        s = slice(i + offset, i + offset + len(value))
                        # keep the highest value seen at each position
                        res[s] = map(max, value, res[s])

            # odd values mark hyphenation points; i - 1 compensates for the
            # leading dot of prepWord
            points = [dint(i - 1, ref=r) for i, r in enumerate(res) if r % 2]
            self.cache[word] = points
        return points


class Hyphenator(SilpaModule):
    """
    Hyphenates words using the patterns of a hyph_*.dic file.

    The patterns are not given to the constructor; call loadHyphDict(lang)
    to select the dictionary before using positions(), iterate(), wrap()
    or inserted().

    Attributes:
    -left: make the first syllabe not shorter than this (default 2)
    -right: make the last syllabe not shorter than this (default 2)
    -hd: the Hyph_dict currently in use, set by loadHyphDict()

    left and right may be changed on an instance:
    h = Hyphenator()
    h.left = 1
    """
    left = 2
    right = 2

    def loadHyphDict(self, lang, cache=True):
        """
        Select the hyphenation dictionary for lang and store it in self.hd.

        Parameters:
        -lang: language code used to build the rules file name
        -cache: if true (default), use a cached copy of the dic file,
            if possible

        The file name is relative to the current working directory.
        """
        filename = "./modules/hyphenator/rules/hyph_" + lang + ".dic"
        if not cache or filename not in hdcache:
            hdcache[filename] = Hyph_dict(filename)
        self.hd = hdcache[filename]

    def positions(self, word):
        """
        Returns a list of positions where the word can be hyphenated.
        See also Hyph_dict.positions. The points that are too far to
        the left or right are removed.
        """
        right = len(word) - self.right
        return [i for i in self.hd.positions(word) if self.left <= i <= right]

    def iterate(self, word):
        """
        Iterate over all hyphenation possibilities, the longest first.
        Yields (first part, last part) tuples. A byte string is decoded
        as latin1 first.
        """
        # bytes is str on Python 2, so this is the original str check there
        if isinstance(word, bytes):
            word = word.decode('latin1')
        for p in reversed(self.positions(word)):
            if p.data:
                # get the nonstandard hyphenation data
                change, index, cut = p.data
                if word.isupper():
                    change = change.upper()
                c1, c2 = change.split('=')
                yield word[:p + index] + c1, c2 + word[p + index + cut:]
            else:
                yield word[:p], word[p:]

    def wrap(self, word, width, hyphen='-'):
        """
        Return the longest possible first part and the last part of the
        hyphenated word. The first part has the hyphen already attached.
        Returns None, if there is no hyphenation point before width, or
        if the word could not be hyphenated.
        """
        width -= len(hyphen)
        for w1, w2 in self.iterate(word):
            if len(w1) <= width:
                return w1 + hyphen, w2
        return None

    def inserted(self, word, hyphen='-'):
        """
        Returns the word as a string with all the possible hyphens inserted.
        E.g. for the dutch word 'lettergrepen' this method returns
        the string 'let-ter-gre-pen'. The hyphen string to use can be
        given as the second parameter, that defaults to '-'.
        """
        # bytes is str on Python 2, so this is the original str check there
        if isinstance(word, bytes):
            word = word.decode('latin1')
        l = list(word)
        # work from the end so that earlier positions stay valid
        for p in reversed(self.positions(word)):
            if p.data:
                # get the nonstandard hyphenation data
                change, index, cut = p.data
                if word.isupper():
                    change = change.upper()
                l[p + index: p + index + cut] = change.replace('=', hyphen)
            else:
                l.insert(p, hyphen)
        return ''.join(l)

    def process(self, form):
        """
        Build the HTML page of the module.

        form is indexed with 'input_text' and the field's .value is read
        (presumably a cgi.FieldStorage - verify against the caller). When
        the field is present, every space separated word of it is
        hyphenated with the dictionary of its detected language and listed
        in a result table below the input form.

        All user supplied text is HTML-escaped before it is put into the
        page. A word whose rules file cannot be opened is shown
        unhyphenated instead of aborting the whole response.
        """
        template = """
        <h2>Hyphenate Text</h2></hr>
        <p>Enter the text for hyphenation in the below text area.
        Language of each word will be detected.
        You can give the text in any language and even with mixed language
        </p>
        <form action="" method="post">
        <textarea name='input_text' id='id1'>%s</textarea>
        <input type="submit" id="Hyphenate" value="Hyphenate" name="action" style="width:12em;"/>
        </br>
        </form>
        """
        if 'input_text' not in form:
            return template % ""

        raw = form['input_text'].value
        # byte strings (the Python 2 case) arrive UTF-8 encoded
        text = raw.decode('utf-8') if isinstance(raw, bytes) else raw

        parts = [template % _escape_html(text, True)]
        parts.append("<h2>Hyphenation Results</h2></hr>")
        parts.append("<table class=\"table1\"><tr><th>Word</th><th>Hyphenated Word</th></tr>")
        detector = None
        for word in text.split(" "):
            word = word.strip()
            if not word:
                continue
            if detector is None:
                # looked up once, on the first real word, instead of once
                # per word
                detector = ModuleManager().getModuleInstance("Detect Language")
            lang = detector.detect_lang(word)[word]
            try:
                self.loadHyphDict(lang)
            except IOError:
                # no readable rules file for this language
                hyph_word = word
            else:
                hyph_word = self.inserted(word)
            parts.append("<tr><td>" + _escape_html(word, True) + "</td><td>"
                         + _escape_html(hyph_word, True) + "</td></tr>")
        parts.append("</table> ")
        return ''.join(parts)

    def get_module_name(self):
        # NOTE(review): "Hyphentator" looks like a typo of "Hyphenator"; left
        # unchanged because the name may be used as a lookup key elsewhere.
        return "Hyphentator"

    def get_info(self):
        return "Hyphenates each word in the text in all possible positions"


def getInstance():
    """Return a new Hyphenator instance."""
    return Hyphenator()


if __name__ == "__main__":
    # usage: hyphenator.py <path to hyph_*.dic> <word>
    dict_file = sys.argv[1]
    word = sys.argv[2]

    # Hyphenator defines neither a (filename, left, right) constructor nor
    # __call__, so configure the instance explicitly and use iterate(),
    # which also takes care of decoding a byte string as latin1.
    h = Hyphenator()
    h.left = 1
    h.right = 1
    h.hd = Hyph_dict(dict_file)

    for i in h.iterate(word):
        print(i)