1 files changed, 149 insertions, 0 deletions
diff --git a/silpa/modules/syllabalizer/syllabalizer.py b/silpa/modules/syllabalizer/syllabalizer.py
new file mode 100644
index 0000000..deed058
--- /dev/null
+++ b/silpa/modules/syllabalizer/syllabalizer.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python
+"""mlsplit - Split Malayalam words into letters
+
+This script splits Malayalam words into letters.
+Ref: http://tinyurl.com/3v729s
+
+
+
+Copyright (C) 2008 Baiju M <baiju.m.mail AT gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or (at
+your option) any later version.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+import re
+import codecs
+from langdetect import LangDetect
+class Syllabalizer:
+	"""Split Malayalam, Hindi or English text into syllable-like units."""
+
+	def _split_clusters(self, text, signs, virama):
+		"""Group the characters of text into clusters; return them as a list.
+
+		A character found in signs joins the previous cluster, and so does any
+		character that directly follows virama. Everything else starts a new
+		cluster, including a sign that has no cluster before it.
+		"""
+		clusters = []
+		for char in text:
+			# Testing for emptiness first keeps a leading sign from raising
+			# IndexError on clusters[-1].
+			if clusters and (char in signs or clusters[-1][-1] == virama):
+				clusters[-1] += char
+			else:
+				clusters.append(char)
+		return clusters
+
+	def syllabalize_ml(self,text):
+		"""Return text split into Malayalam letter clusters (list of strings)."""
+		signs = [
+		u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40', u'\u0d41',
+		u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46', u'\u0d47', u'\u0d48',
+		u'\u0d4a', u'\u0d4b', u'\u0d4c', u'\u0d4d']
+		return self._split_clusters(text, signs, u'\u0d4d')
+
+	def syllabalize_hi(self,text):
+		"""Return text split into Devanagari letter clusters (list of strings)."""
+		signs = [
+		u'\u0902', u'\u0903', u'\u093e', u'\u093f', u'\u0940', u'\u0941',
+		u'\u0942', u'\u0943', u'\u0944', u'\u0946', u'\u0947', u'\u0948',
+		u'\u094a', u'\u094b', u'\u094c', u'\u094d']
+		return self._split_clusters(text, signs, u'\u094d')
+
+	#Source: http://www.python-forum.org/pythonforum/viewtopic.php?f=14&t=5810#p42091
+	#Author: Cabu
+	def syllabalize_en(self,text):
+		"""Return text as one string with syllable break markers inserted.
+
+		The text is padded with a space on each side, and the characters
+		"#", "+", "_", "/" and "-" are inserted where a break is detected.
+		"""
+		text = " " + text + " "
+		vowel_list       = ['a', 'e', 'i', 'o', 'u', 'y']
+		vowel_pairs      = ['ai', 'au', 'aw', 'ee','ea', 'oa', 'oi', 'ou', 'oo', 'ow', 'oy', 'uu']
+		consonant_list   = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'z']
+		consonant_blends = ['bl', 'br', 'ch', 'chr', 'cl', 'cr', 'dr', 'fl', 'fr', 'gl', 'gr', 'kn', 'pl', 'pr',
+							'sc', 'sh', 'sk', 'sl', 'sm', 'sn', 'sp', 'spr', 'squ', 'st', 'str', 'sw',
+							'th', 'tr', 'thr', 'nt', 'wh']
+
+		# Cut numbers in digits; the second pass splits the pairs that the
+		# first, non-overlapping pass left joined (e.g. "1#23" -> "1#2#3")
+		p = re.compile ("([0-9])([0-9])", re.IGNORECASE)
+		for i in range (2):
+			text = p.sub ("\\1#\\2", text)
+
+		# Cut i / vowel (- o) / consonant
+		p = re.compile ("i([aeiuy])([bcdfghjklmnpqrstvwxz])", re.IGNORECASE)
+		text = p.sub ("i+\\1+\\2", text)
+
+		# Cut the / vowel / consonant
+		p = re.compile ("the([aeiouy])([bcdfghjklmnpqrstvwxz])", re.IGNORECASE)
+		text = p.sub ("the+\\1+\\2", text)
+
+		# Cut vowel / vowel except for pairs
+		position = 0
+		while position < len (text)-1:
+			if text [position] in vowel_list and text [position+1] in vowel_list:
+				if not (text [position:position+2] in vowel_pairs):
+					# Keep the vowels together inside these four-letter groups
+					if not (text [position-1:position+3] in ["tion", "dual", "nion", "quir", "tiou"]):
+						text = text [:position+1] + "_" + text [position+1:]
+			position = position + 1
+
+		# Cut consonant / consonant (ll, mm, ...)
+		p = re.compile ("([bcdfghjklmnpqrstvwxz])\\1([^ ])", re.IGNORECASE)
+		text = p.sub ("\\1-\\1\\2", text)
+
+		# Cut vowel / consonant vowel
+		start = 0
+		while start < len (text)-1:
+			if text [start] in vowel_list and text [start+1] in consonant_list:
+				end = start + 1
+				while end <= len (text)-1 and text [end] in consonant_list:
+					end = end + 1
+				# Split only before a lone consonant or a known blend, not before a final "e "
+				if end <= len (text)-1 and (text [start+1:end] in consonant_list or text [start+1:end] in consonant_blends) and text [end] in vowel_list and text [end:end+2] != "e ":
+					text = text [:start+1] + "/" + text [start+1:]
+			start = start + 1
+
+		# Cut vowel consonant / consonant+ vowel (trumpet, simple, understanding, ...)
+		start = 0
+		while start < len (text)-1:
+			if text [start] in vowel_list and text [start+1] in consonant_list:
+				end = start + 2
+				while end <= len (text)-1 and text [end] in consonant_list:
+					end = end + 1
+				if end <= len (text)-1 and end > start+2 and text [end] in vowel_list:
+					if not (text [start+1:end] in consonant_blends):
+						text = text [:start+2] + "-" + text [start+2:]
+			start = start + 1
+
+		# Return the text with the break markers in place
+		return text
+
+	def syllabalize(self,text):
+		"""Detect the language of text and split it with the matching method.
+
+		Malayalam and Hindi give a list of clusters, English gives a marked-up
+		string, and any other detection result gives a list of characters.
+		"""
+		lang = LangDetect().detect_lang(text)
+		if lang == "ml_IN":
+			return self.syllabalize_ml(text)
+		if lang == "hi_IN":
+			return self.syllabalize_hi(text)
+		if lang == "en_US":
+			return self.syllabalize_en(text)
+		return list(text)