#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Ngram
# Copyright 2008-2009 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Library General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#

import codecs
import pickle
import pydot
import os,sys
from optparse import OptionParser
VERSION=0.1
MAX_TREE_DEPTH=1000
PICKLED_TREE="ngram.pyo"
class NgramNode:
	def __init__(self, node_value="*", rank=1, child_list=None):
		self.node_value=node_value
		self.rank=rank
		self.child_list=child_list
		self.desc="Start Node"
	def setNode(self, node_value="*", rank=None,childs=None, child_list=None):
		self.node_value=node_value
		self.rank=rank
		self.child_list=child_list
	def getName(self):
		return self.node_value	
	def getDesc(self):
		return self.desc
	def setDesc(self,desc):
		self.desc	=  desc
		return self.desc	
	def getRank(self):
		return self.rank	
	def setRank(self, rank):
		self.rank = rank
	def incrRank(self, incr=1):
		self.rank =  self.rank	+ incr
		return self.rank		
	def getChildList(self):
		if(self.child_list!=None):
			return self.child_list
		else:
			return None	
	def getChildByName(self,child_name):
		if(self.child_list==None):
			return None
		for child in self.child_list:
			if(child.getName()==child_name):
				return child
	def childIndex(self,childnode):
		if(self.child_list==None):
			return -1
		for child in self.child_list:
			if(child.getName()==childnode.getName()):
				return self.child_list.index(child)
		return -2		
	def printChildList(self):
		if(self.child_list==None):
			return None
		for child in self.child_list:
			print child,
	def addChildNode(self, node):
		if(node!=None):
			if(self.child_list==None):
				self.child_list=[]
			#Check whether this node is already present in the Ngram Tree	
			member_index=self.childIndex(node)
			if(member_index>=0):
				#Node already present.Incrementing Rank
				self.child_list[member_index].incrRank()
			else:
				self.child_list.append(node)		
		#Keep it sorted as per the ranks		
		self.child_list.sort()		
	def removeChildNode(self, node):
		if(node!=None & self.child_list!=None):
			self.child_list.remove(node)		
	def __str__(self):
		return "Node:  %s[%d]" % (self.node_value, self.rank)
	'''Recursively traverse through the tree and print the nodes-Depth First Traversal'''	
	def toString(self):
		print "Node:  %s[%d]" % (self.node_value, self.rank)
		child_list=self.getChildList()
		if(child_list!=None):
			for child_node in child_list :
				child_node.toString()
	'''Defining the less than operater of the object'''			
	def __lt__(self, node):
		return 	self.getRank() < node.getRank()
	'''Defining the greater than operater of the object'''
	def __gt__(self, node):
		return 	self.getRank() > node.getRank()
	'''Defining the equal-to operater of the object'''	
	def __eq__(self, node):
		if(node==None):
			return False
		return 	(self.getName() == node.getName()) & (self.getRank() == node.getRank())
	'''Defining the comparison of two object instances. Required for sorting the list of objects'''	
	def __cmp__(self, node):
		if(node==None):
			return 1
		if(self.getName()==node.getName()):
			return cmp(self.getRank(), node.getRank())
		else:
			return 1	
	

#Syllable Node Class
#Extends NgramNode class
class SyllableNode(NgramNode):
	def __str__(self):
		return ("Syllable: %s[%d]" % (self.node_value, self.rank )).encode('utf-8')
#Word Node Class
#Extends NgramNode class
class WordNode(NgramNode):
	def __str__(self):
		return  ("Word: %s[%d]" % (self.node_value, self.rank )).encode('utf-8')

class NGram:
	def __init__(self, text=None, language=None):
		self.text=None
		self.language=None
		try:
			#Try loading picked tree object
			self.ngrams=pickle.load(open(PICKLED_TREE))
			print "Loaded the ngram from " + PICKLED_TREE
		except:	
			#Initialize with empty node
			self.ngrams=NgramNode()
			print "New one"
		self.search_depth=0	
	def getRoot(self, node_name=None):
		if(node_name==None):
			return self.ngrams
		else:
			return self.searchNode(node_name)
				
	def searchNodeByName(self, node_name, current_node=None, depth=MAX_TREE_DEPTH):
		if(current_node==None):
			current_node=self.getRoot()
			self.search_depth = 0
		if(self.search_depth==depth):
			return None
		if(current_node.getName() == node_name):
			print "Found at depth", self.search_depth
			return current_node
		else:
			child_list=current_node.getChildList()
			if(child_list==None):
				return None
			else:
				child_list=child_list	
			self.search_depth = self.search_depth+1				
			for child_node in child_list :
				result_node=self.searchNodeByName(node_name,child_node, depth)
				if(result_node!=None):
					return result_node
	def printNgram(self, current_node=None):
		if(current_node==None):
			current_node=self.getRoot()
		print current_node
		child_list=current_node.getChildList()
		
		if(child_list==None):
			return None
		else:
			child_list.sort()	
		for child_node in child_list :
			self.printNgram(child_node)
	def toDot(self,  graph , current_node=None):
		if(current_node==None):
			current_node=self.getRoot()
		child_list=current_node.getChildList()
		if(child_list!=None):
			key=current_node.getName()
			for child_node in child_list:
				value=child_node.getName()
				if((key!=None) & ord(key[len(key)-1])<=0x0901 & len(key)>1):
					key=key[0:len(key)-1]
				if(value!=None):	
					if((ord(value[len(value)-1])<=0x0901) & len(value)>1):
						value=value[0:len(value)-2]	
					graph.add_edge(pydot.Edge(key.encode('utf-8'),value.encode('utf-8')))
					self.toDot(graph,child_node)
	def toGraph(self, output_image_file):
		graph=pydot.Dot()
		self.toDot(graph)
		#print graph.to_string().encode('utf-8')
		graph.write(output_image_file,"dot", "png" )
		
	def addSyllables(self,text, window_size=2):
		words=text.split(" ")
		ngrams = []
		for word in words:
			#TODO-Normalize before taking ngram!!!
			word = "*"+word+"]"
			syllables = self.syllabalize_ml(word)
			syllable_count = len(syllables)
			window_start = 0
			window_end = 0
			while window_start + window_size <= syllable_count:
				if(window_start + window_size < syllable_count):
					window_end = window_start + window_size
				else:
					window_end = syllable_count	
				ngrams.append(syllables[window_start:window_end])
				window_start = window_start+1
		return 	ngrams
	'''Syllabalize a given Malayalam string. Based on ml-split code by Baiju M'''		
	def syllabalize_ml(self,text):
		signs = [
		u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40', u'\u0d41',
		u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46', u'\u0d47', u'\u0d48',
		u'\u0d4a', u'\u0d4b', u'\u0d4c', u'\u0d4d']
		limiters = ['.','\"','\'','`','!',';',',','?', ']']
		chandrakkala = u'\u0d4d'
		lst_chars = []
		for char in text:
			if char in limiters:
				lst_chars.append(char)
			elif char in signs:
				lst_chars[-1] = lst_chars[-1] + char
			else:
				try:
					if lst_chars[-1][-1] == chandrakkala :
						lst_chars[-1] = lst_chars[-1] + char
					else:
						lst_chars.append(char)
				except IndexError:
					lst_chars.append(char)

		return lst_chars	
	def addWords(self,text, window_size=2):
		text = "* "+text+" ]"
		words = text.split(" ")
		ngrams = []
		word_count = len(words)
		window_start = 0
		window_end = 0
		while window_start + window_size <= word_count:
			if(window_start + window_size < word_count):
				window_end = window_start + window_size
			else:
				window_end = word_count	
			words[window_start:window_end]	
			ngrams.append(words[window_start:window_end])
			window_start = window_start+1
		return 	ngrams
	def populateSyllableNgram(self, text):
		ngrams = self.addSyllables(text)
		for ngram in ngrams:
			ngram_str=""
			for item in ngram:
				if(item.strip()>""):
					if(ngram_str==""):
						ngram_str=ngram_str+ item 
					else:
						
						if(ngram_str=="["):
							parent_node=self.getRoot()
						else:	
							parent_node=self.searchNodeByName(ngram_str,self.getRoot())
						if(parent_node==None):
							print "Parent node not found for " + item
						else:	
							parent_node.addChildNode(SyllableNode(item))
							print ngram_str+ " -> "+item 	
		#pickle the tree				
		pickle.dump(self.getRoot(),open(PICKLED_TREE,'w'))
 	def populateWordNgram(self, text):
		ng = NGram () 
		ngrams = ng.addWords(text)
		for ngram in ngrams:
			ngram_str=""
			for item in ngram:
				if(item.strip()>""):
					if(ngram_str==""):
						ngram_str=ngram_str+ item 
					else:
						if(ngram_str=="*"):
							parent_node=self.getRoot()
						else:	
							parent_node=self.searchNodeByName(ngram_str,self.getRoot())
						if(parent_node==None):
							print "Parent node not found for " + item
						else:	
							parent_node.addChildNode(WordNode(item))
							print ngram_str+ " -> "+item 	
		#pickle the tree				
		pickle.dump(self.getRoot(),open(PICKLED_TREE,'w'))	
if __name__ == "__main__":
	usage = "usage: %prog [options] inputfile"
	parser = OptionParser(version="%prog 0.1",description="Malayalama NGram Analyser")
	parser.set_usage(usage)
	parser.add_option("-g", "--generate-graph", dest="gen_graph",help="Generates a graph in png format to visualize the ngram")
	parser.add_option("-p", "--print", action="store_true",default=False,dest="print_ngram",help="Print the Ngram")
	parser.add_option("-i", "--input-file", dest="input_file",help="Input File for learning")
	parser.add_option("-s", "--suggest-syllables", dest="suggest_syllables",help="Suggest next possible syllables for the given letter/syllable ")
	parser.add_option("-w", "--suggest-words", dest="suggest_words",help="Suggest next possible words for the given word ")
	(options, args) = parser.parse_args()
	
	if(options.gen_graph):
		ng = NGram () 	
		ng.toGraph(options.gen_graph)
	if(options.	input_file):
		if not os.path.exists(options.input_file):
			print "File Doesnot Existis"
			sys.exit(1)
		else:
			corpus_file = codecs. open(options.input_file,encoding='utf-8', errors='ignore')
			ng = NGram () 	
			while 1:
	   			text = unicode( corpus_file.readline())	
	   			if text == "":
					break
				text= text + " ]"	
				ng.populateSyllableNgram(text)
				ng.populateWordNgram(text)
			print "Populated"
	if(options.	print_ngram):
		ng = NGram () 	
		print ng.getRoot().toString()
	if(options.	suggest_syllables):
		ng = NGram () 	
		print "Searching for" + options.suggest_words
		print ng.searchNodeByName(unicode(options.	suggest_syllables))
	if(options.	suggest_syllables):
		ng = NGram () 	
		print "Searching  for "+ options.suggest_words
		print ng.searchNodeByName(unicode(options.	suggest_words))