diff options
Diffstat (limited to 'silpa/modules/syllabalizer/syllabalizer.py')
-rw-r--r-- | silpa/modules/syllabalizer/syllabalizer.py | 149 |
1 files changed, 149 insertions, 0 deletions
#!/usr/bin/env python
"""mlsplit - Split Malayalam words into letters

This script splits Malayalam words into letters.
Ref: http://tinyurl.com/3v729s

The Syllabalizer class below applies the same splitting rule to Hindi
(Devanagari) text and also carries a heuristic splitter for English words.

Copyright (C) 2008 Baiju M <baiju.m.mail AT gmail.com>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or (at
your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import sys
import re
import codecs
from langdetect import LangDetect


# Malayalam dependent signs: anusvara, visarga, vowel signs, virama and the
# au length mark.  Each of them attaches to the letter that precedes it.
_ML_SIGNS = frozenset([
    u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40', u'\u0d41',
    u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46', u'\u0d47', u'\u0d48',
    u'\u0d4a', u'\u0d4b', u'\u0d4c', u'\u0d4d', u'\u0d57',
])
_ML_VIRAMA = u'\u0d4d'

# Devanagari dependent signs.  On top of the code points mirrored from the
# Malayalam table this includes candrabindu (U+0901), nukta (U+093C) and the
# candra E / candra O vowel signs (U+0945, U+0949), which are combining marks
# too and must not be split off as separate letters.
_HI_SIGNS = frozenset([
    u'\u0901', u'\u0902', u'\u0903', u'\u093c', u'\u093e', u'\u093f',
    u'\u0940', u'\u0941', u'\u0942', u'\u0943', u'\u0944', u'\u0945',
    u'\u0946', u'\u0947', u'\u0948', u'\u0949', u'\u094a', u'\u094b',
    u'\u094c', u'\u094d',
])
_HI_VIRAMA = u'\u094d'

# Lookup tables for the English heuristic.  Only lower-case letters are
# listed, so the position-based passes below ignore upper-case letters.
_EN_VOWELS = frozenset('aeiouy')
_EN_CONSONANTS = frozenset('bcdfghjklmnpqrstvwxz')
_EN_VOWEL_PAIRS = frozenset([
    'ai', 'au', 'aw', 'ee', 'ea', 'oa', 'oi', 'ou', 'oo', 'ow', 'oy', 'uu',
])
_EN_CONSONANT_BLENDS = frozenset([
    'bl', 'br', 'ch', 'chr', 'cl', 'cr', 'dr', 'fl', 'fr', 'gl', 'gr', 'kn',
    'pl', 'pr', 'sc', 'sh', 'sk', 'sl', 'sm', 'sn', 'sp', 'spr', 'squ', 'st',
    'str', 'sw', 'th', 'tr', 'thr', 'nt', 'wh',
])
# Four-letter windows whose two middle vowels must stay together.
_EN_UNSPLIT_WINDOWS = frozenset(['tion', 'dual', 'nion', 'quir', 'tiou'])

# The patterns capture every letter they re-emit, so a case-insensitive match
# keeps the casing found in the input instead of rewriting it.
_EN_DIGIT_BEFORE_DIGIT = re.compile(r"([0-9])(?=[0-9])")
_EN_I_VOWEL_CONSONANT = re.compile(
    r"(i)([aeiuy])([bcdfghjklmnpqrstvwxz])", re.IGNORECASE)
_EN_THE_VOWEL_CONSONANT = re.compile(
    r"(the)([aeiouy])([bcdfghjklmnpqrstvwxz])", re.IGNORECASE)
_EN_DOUBLE_CONSONANT = re.compile(
    r"([bcdfghjklmnpqrstvwxz])(\1)([^ ])", re.IGNORECASE)


def _split_clusters(text, signs, virama):
    """Group the characters of text into letter clusters.

    A character is glued to the previous cluster when it is one of signs, or
    when the previous cluster ends with virama (so conjuncts stay in one
    piece).  Any other character starts a new cluster.  A sign that has
    nothing before it starts a cluster of its own instead of raising
    IndexError.
    """
    clusters = []
    for char in text:
        if clusters and (char in signs or clusters[-1][-1] == virama):
            clusters[-1] += char
        else:
            clusters.append(char)
    return clusters


def _skip_consonants(text, index):
    """Return the first position at or after index that is not a consonant."""
    while index < len(text) and text[index] in _EN_CONSONANTS:
        index += 1
    return index


class Syllabalizer:
    """Split text into letter clusters (Malayalam, Hindi) or mark English
    syllable boundaries."""

    def syllabalize_ml(self, text):
        """Split Malayalam text into a list of letter clusters.

        Dependent signs are attached to the preceding letter, and whatever
        follows a chandrakkala (U+0D4D) is attached to the same cluster.
        Returns an empty list for empty text.
        """
        return _split_clusters(text, _ML_SIGNS, _ML_VIRAMA)

    def syllabalize_hi(self, text):
        """Split Hindi (Devanagari) text into a list of letter clusters.

        Uses the same rule as syllabalize_ml with the Devanagari sign table
        and virama (U+094D).  Returns an empty list for empty text.
        """
        return _split_clusters(text, _HI_SIGNS, _HI_VIRAMA)

    # Source: http://www.python-forum.org/pythonforum/viewtopic.php?f=14&t=5810#p42091
    # Author: Cabu
    def syllabalize_en(self, text):
        """Mark syllable boundaries in English text.

        Unlike the other methods this returns a single string, not a list:
        the input padded with one space on each side, with separator
        characters inserted at the detected cut points ('#' between digits,
        '+', '_', '-' and '/' between letters, depending on the rule that
        fired).  For example 'trumpet' gives ' trum-pet ' and 'water' gives
        ' wa/ter '.
        """
        # The padding lets every rule look one character to either side
        # without running off the string.
        text = " " + text + " "

        # Cut numbers in digits
        text = _EN_DIGIT_BEFORE_DIGIT.sub(r"\1#", text)

        # Cut i / vowel (- o) / consonant
        text = _EN_I_VOWEL_CONSONANT.sub(r"\1+\2+\3", text)

        # Cut the / vowel / consonant
        text = _EN_THE_VOWEL_CONSONANT.sub(r"\1+\2+\3", text)

        # Cut vowel / vowel except for pairs
        position = 0
        while position < len(text) - 1:
            # text[0] is the padding space, so position - 1 is never
            # evaluated for position == 0.
            if (text[position] in _EN_VOWELS
                    and text[position + 1] in _EN_VOWELS
                    and text[position:position + 2] not in _EN_VOWEL_PAIRS
                    and text[position - 1:position + 3]
                    not in _EN_UNSPLIT_WINDOWS):
                text = text[:position + 1] + "_" + text[position + 1:]
            position += 1

        # Cut consonant / consonant (ll, mm, ...)
        text = _EN_DOUBLE_CONSONANT.sub(r"\1-\2\3", text)

        # Cut vowel / consonant vowel
        start = 0
        while start < len(text) - 1:
            if text[start] in _EN_VOWELS and text[start + 1] in _EN_CONSONANTS:
                end = _skip_consonants(text, start + 1)
                run = text[start + 1:end]
                if (end < len(text)
                        and (run in _EN_CONSONANTS
                             or run in _EN_CONSONANT_BLENDS)
                        and text[end] in _EN_VOWELS
                        # a word-final 'e' does not open a new syllable
                        and text[end:end + 2] != "e "):
                    text = text[:start + 1] + "/" + text[start + 1:]
            start += 1

        # Cut vowel consonant / consonant+ vowel (trumpet, simple,
        # understanding, ...)
        start = 0
        while start < len(text) - 1:
            if text[start] in _EN_VOWELS and text[start + 1] in _EN_CONSONANTS:
                end = _skip_consonants(text, start + 2)
                # end > start + 2 means at least two consonants sit between
                # the two vowels.
                if (end < len(text) and end > start + 2
                        and text[end] in _EN_VOWELS
                        and text[start + 1:end] not in _EN_CONSONANT_BLENDS):
                    text = text[:start + 2] + "-" + text[start + 2:]
            start += 1

        # Return the words splitted
        return text

    def syllabalize(self, text):
        """Detect the language of text and split it with the matching method.

        The language code comes from LangDetect.detect_lang, which is defined
        outside this file.  "ml_IN", "hi_IN" and "en_US" select the
        Malayalam, Hindi and English methods; any other result falls back to
        a list of the individual characters of text.
        """
        lang = LangDetect().detect_lang(text)
        if lang == "ml_IN":
            return self.syllabalize_ml(text)
        if lang == "hi_IN":
            return self.syllabalize_hi(text)
        if lang == "en_US":
            return self.syllabalize_en(text)
        return list(text)