summaryrefslogtreecommitdiffstats
path: root/silpa/modules/syllabalizer/syllabalizer.py
diff options
context:
space:
mode:
Diffstat (limited to 'silpa/modules/syllabalizer/syllabalizer.py')
-rw-r--r--silpa/modules/syllabalizer/syllabalizer.py149
1 files changed, 149 insertions, 0 deletions
diff --git a/silpa/modules/syllabalizer/syllabalizer.py b/silpa/modules/syllabalizer/syllabalizer.py
new file mode 100644
index 0000000..deed058
--- /dev/null
+++ b/silpa/modules/syllabalizer/syllabalizer.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python
+"""mlsplit - Split Malayalam words into letters
+
+This script splits Malayalam words into letters.
+Ref: http://tinyurl.com/3v729s
+
+
+
+Copyright (C) 2008 Baiju M <baiju.m.mail AT gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or (at
+your option) any later version.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+import re
+import codecs
+from langdetect import LangDetect
class Syllabalizer:
    """Split text into syllable-like units for Malayalam, Hindi and English.

    ``syllabalize`` detects the language with ``LangDetect`` and dispatches
    to one of the language-specific methods.  The Malayalam and Hindi
    methods return a list of character clusters; the English method returns
    a single string in which the cut points are marked (see
    ``syllabalize_en``).
    """

    # Malayalam dependent signs: anusvara, visarga, vowel signs and virama.
    _ML_SIGNS = frozenset([
        u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40', u'\u0d41',
        u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46', u'\u0d47', u'\u0d48',
        u'\u0d4a', u'\u0d4b', u'\u0d4c', u'\u0d4d',
    ])
    _ML_VIRAMA = u'\u0d4d'  # chandrakkala

    # Devanagari dependent signs: anusvara, visarga, vowel signs and virama.
    _HI_SIGNS = frozenset([
        u'\u0902', u'\u0903', u'\u093e', u'\u093f', u'\u0940', u'\u0941',
        u'\u0942', u'\u0943', u'\u0944', u'\u0946', u'\u0947', u'\u0948',
        u'\u094a', u'\u094b', u'\u094c', u'\u094d',
    ])
    _HI_VIRAMA = u'\u094d'

    # English tables.  Membership tests are lower-case only, exactly like
    # the original lists; only the regular expressions ignore case.
    _EN_VOWELS = frozenset('aeiouy')
    _EN_CONSONANTS = frozenset('bcdfghjklmnpqrstvwxz')
    _EN_VOWEL_PAIRS = frozenset([
        'ai', 'au', 'aw', 'ee', 'ea', 'oa', 'oi', 'ou', 'oo', 'ow', 'oy',
        'uu',
    ])
    _EN_CONSONANT_BLENDS = frozenset([
        'bl', 'br', 'ch', 'chr', 'cl', 'cr', 'dr', 'fl', 'fr', 'gl', 'gr',
        'kn', 'pl', 'pr', 'sc', 'sh', 'sk', 'sl', 'sm', 'sn', 'sp', 'spr',
        'squ', 'st', 'str', 'sw', 'th', 'tr', 'thr', 'nt', 'wh',
    ])
    # Four-letter contexts in which two adjacent vowels are never separated.
    _EN_VOWEL_CONTEXTS = frozenset(['tion', 'dual', 'nion', 'quir', 'tiou'])

    _EN_DIGIT_PAIR = re.compile(r"([0-9])([0-9])")
    _EN_I_VOWEL = re.compile(
        r"i([aeiuy])([bcdfghjklmnpqrstvwxz])", re.IGNORECASE)
    _EN_THE_VOWEL = re.compile(
        r"the([aeiouy])([bcdfghjklmnpqrstvwxz])", re.IGNORECASE)
    _EN_DOUBLE_CONSONANT = re.compile(
        r"([bcdfghjklmnpqrstvwxz])\1([^ ])", re.IGNORECASE)

    @staticmethod
    def _group_clusters(text, signs, virama):
        """Group the characters of ``text`` into clusters.

        A character is glued to the previous cluster when it is one of
        ``signs`` or when the previous cluster ends with ``virama``;
        otherwise it starts a new cluster.  A sign at the very start of the
        text has nothing to attach to and becomes a cluster of its own
        (the original code raised ``IndexError`` in that case).
        """
        clusters = []
        for char in text:
            if clusters and (char in signs or clusters[-1][-1] == virama):
                clusters[-1] += char
            else:
                clusters.append(char)
        return clusters

    def syllabalize_ml(self, text):
        """Return the list of Malayalam character clusters in ``text``."""
        return self._group_clusters(text, self._ML_SIGNS, self._ML_VIRAMA)

    def syllabalize_hi(self, text):
        """Return the list of Devanagari character clusters in ``text``."""
        return self._group_clusters(text, self._HI_SIGNS, self._HI_VIRAMA)

    def _cut_adjacent_vowels(self, text):
        """Insert "_" between two adjacent vowels that are not a known pair."""
        vowels = self._EN_VOWELS
        pos = 0
        # The length is re-read on every pass: inserting "_" grows the text.
        while pos < len(text) - 1:
            if (text[pos] in vowels
                    and text[pos + 1] in vowels
                    and text[pos:pos + 2] not in self._EN_VOWEL_PAIRS
                    and text[pos - 1:pos + 3] not in self._EN_VOWEL_CONTEXTS):
                text = text[:pos + 1] + "_" + text[pos + 1:]
            pos += 1
        return text

    def _cut_vowel_before_onset(self, text):
        """Insert "/" after a vowel followed by one consonant or blend + vowel."""
        vowels = self._EN_VOWELS
        consonants = self._EN_CONSONANTS
        start = 0
        while start < len(text) - 1:
            if text[start] in vowels and text[start + 1] in consonants:
                # Find the end of the consonant run that follows the vowel.
                end = start + 1
                while end <= len(text) - 1 and text[end] in consonants:
                    end += 1
                if end <= len(text) - 1:
                    onset = text[start + 1:end]
                    # "e " is a word-final e; it does not open a new
                    # syllable, so no cut is made before it.
                    if ((onset in consonants
                         or onset in self._EN_CONSONANT_BLENDS)
                            and text[end] in vowels
                            and text[end:end + 2] != "e "):
                        text = text[:start + 1] + "/" + text[start + 1:]
            start += 1
        return text

    def _cut_inside_consonant_run(self, text):
        """Insert "-" after the first consonant of a non-blend run before a vowel."""
        vowels = self._EN_VOWELS
        consonants = self._EN_CONSONANTS
        start = 0
        while start < len(text) - 1:
            if text[start] in vowels and text[start + 1] in consonants:
                end = start + 2
                while end <= len(text) - 1 and text[end] in consonants:
                    end += 1
                # end > start + 2 means the run holds at least two consonants.
                if (end <= len(text) - 1
                        and end > start + 2
                        and text[end] in vowels
                        and text[start + 1:end]
                        not in self._EN_CONSONANT_BLENDS):
                    text = text[:start + 2] + "-" + text[start + 2:]
            start += 1
        return text

    # Source: http://www.python-forum.org/pythonforum/viewtopic.php?f=14&t=5810#p42091
    # Author: Cabu
    def syllabalize_en(self, text):
        """Mark the syllable boundaries of English ``text``.

        Returns one string: ``text`` padded with a space on each side, with
        the markers "#", "+", "_", "-" and "/" inserted at the cut points.
        The "i"/"the" rules write the matched prefix back in lower case.
        """
        text = " " + text + " "

        # Cut numbers into digits.  Matches cannot overlap, so a second pass
        # is needed to split the pairs the first pass left joined.
        for _ in range(2):
            text = self._EN_DIGIT_PAIR.sub(r"\1#\2", text)

        # Cut i / vowel (except o) / consonant.
        text = self._EN_I_VOWEL.sub(r"i+\1+\2", text)

        # Cut the / vowel / consonant.
        text = self._EN_THE_VOWEL.sub(r"the+\1+\2", text)

        # Cut vowel / vowel except for pairs.
        text = self._cut_adjacent_vowels(text)

        # Cut consonant / consonant (ll, mm, ...).
        text = self._EN_DOUBLE_CONSONANT.sub(r"\1-\1\2", text)

        # Cut vowel / consonant vowel.
        text = self._cut_vowel_before_onset(text)

        # Cut vowel consonant / consonant+ vowel (trumpet, simple, ...).
        text = self._cut_inside_consonant_run(text)

        return text

    def syllabalize(self, text):
        """Detect the language of ``text`` and split it accordingly.

        Returns what the matching ``syllabalize_*`` method returns; for any
        other detected language, returns the list of single characters.
        """
        lang = LangDetect().detect_lang(text)
        if lang == "ml_IN":
            return self.syllabalize_ml(text)
        if lang == "hi_IN":
            return self.syllabalize_hi(text)
        if lang == "en_US":
            return self.syllabalize_en(text)
        return list(text)