#!/usr/bin/python3
import os
import os.path
import sqlite3
from argparse import ArgumentParser
from operator import itemgetter
from math import log
from sys import float_info

import utils
from myconfig import MyConfig
from dirwalk import walkIndex

config = MyConfig()

#change cwd to the word recognizer directory
words_dir = config.getWordRecognizerDir()
os.chdir(words_dir)
#chdir done


############################################################
#                Create Bigram Database                    #
############################################################

CREATE_BIGRAM_DDL = '''
CREATE TABLE bigram (
    prefix TEXT NOT NULL,
    postfix TEXT NOT NULL,
    freq INTEGER NOT NULL
);
'''

CREATE_BIGRAM_PREFIX_INDEX_DDL = '''
CREATE INDEX bigram_prefix_index ON bigram(prefix);
'''

CREATE_BIGRAM_POSTFIX_INDEX_DDL = '''
CREATE INDEX bigram_postfix_index ON bigram(postfix);
'''

SELECT_ALL_NGRAM_DML = '''
SELECT words, freq FROM ngram;
'''

INSERT_BIGRAM_DML = '''
INSERT INTO bigram(prefix, postfix, freq) VALUES (?, ?, ?);
'''


def createBigramSqlite(workdir):
    print(workdir, 'create bigram')

    filename = config.getBigramFileName()
    filepath = workdir + os.sep + filename
    print(filepath)

    #remove any stale bigram database left over from a previous run
    if os.access(filepath, os.F_OK):
        os.unlink(filepath)

    conn = sqlite3.connect(filepath)
    cur = conn.cursor()
    cur.execute(CREATE_BIGRAM_DDL)
    cur.execute(CREATE_BIGRAM_PREFIX_INDEX_DDL)
    cur.execute(CREATE_BIGRAM_POSTFIX_INDEX_DDL)
    conn.commit()

    if conn:
        conn.close()


def populateBigramSqlite(workdir):
    print(workdir, 'populate bigram')

    sep = config.getWordSep()

    filename = config.getBigramFileName()
    filepath = workdir + os.sep + filename
    bigram_conn = sqlite3.connect(filepath)
    bigram_cur = bigram_conn.cursor()

    length = 2
    filename = config.getNgramFileName(length)
    filepath = workdir + os.sep + filename
    ngram_conn = sqlite3.connect(filepath)
    ngram_cur = ngram_conn.cursor()

    #begin processing: copy every 2-gram into the bigram table,
    #splitting the stored words string into (prefix, postfix)
    rows = ngram_cur.execute(SELECT_ALL_NGRAM_DML).fetchall()
    for row in rows:
        (words_str, freq) = row

        words = words_str.strip(sep).split(sep, 1)
        assert len(words) == length

        (prefix, postfix) = words
        bigram_cur.execute(INSERT_BIGRAM_DML, (prefix, postfix, freq))
        #print(prefix, postfix, freq)

    bigram_conn.commit()
    ngram_conn.commit()

    if bigram_conn:
        bigram_conn.close()
    if ngram_conn:
        ngram_conn.close()


############################################################
#                Information Entropy Model                 #
############################################################

def computeEntropy(freqs):
    '''Shannon entropy (in nats) of a list of frequencies.'''
    #print(freqs)
    totalfreq = sum(freqs)
    freqs = [freq / float(totalfreq) for freq in freqs]

    #the normalized frequencies must sum to 1, within float rounding
    assert abs(1 - sum(freqs)) < len(freqs) * float_info.epsilon

    entropy = - sum([freq * log(freq) for freq in freqs])
    return entropy
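#a worked example: computeEntropy([1, 1, 1, 1]) normalizes each frequency
#to 0.25 and returns -4 * 0.25 * log(0.25) = log(4), approximately 1.386
#nats, while a single neighbour such as computeEntropy([42]) returns 0.;
#the more diverse a word's neighbours, the higher its entropy.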
############################################################
#                  Get Threshold Pass                      #
############################################################

SELECT_PREFIX_DML = '''
SELECT prefix, freq FROM bigram WHERE postfix = ?;
'''

SELECT_POSTFIX_DML = '''
SELECT postfix, freq FROM bigram WHERE prefix = ?;
'''


def computePrefixEntropy(cur, word):
    '''Entropy of the words that immediately precede the given word.'''
    rows = cur.execute(SELECT_PREFIX_DML, (word, )).fetchall()

    if 0 == len(rows):
        return 0.

    freqs = []
    for row in rows:
        (prefix, freq) = row
        assert freq >= 1
        freqs.append(freq)

    return computeEntropy(freqs)


def computePostfixEntropy(cur, word):
    '''Entropy of the words that immediately follow the given word.'''
    rows = cur.execute(SELECT_POSTFIX_DML, (word, )).fetchall()

    if 0 == len(rows):
        return 0.

    freqs = []
    for row in rows:
        (postfix, freq) = row
        assert freq >= 1
        freqs.append(freq)

    return computeEntropy(freqs)


def computeThreshold(conn, tag):
    '''Derive the prefix or postfix entropy threshold from known words.'''
    cur = conn.cursor()
    wordswithentropy = []

    wordlistfile = open(config.getWordsListFileName(), "r")

    for oneline in wordlistfile.readlines():
        oneline = oneline.rstrip(os.linesep)

        if len(oneline) == 0:
            continue

        word = oneline

        entropy = 0.
        if "prefix" == tag:
            entropy = computePrefixEntropy(cur, word)
        elif "postfix" == tag:
            entropy = computePostfixEntropy(cur, word)
        else:
            raise ValueError('invalid tag value.')

        #print(word, entropy)

        if entropy < config.getMinimumEntropy():
            continue

        wordswithentropy.append((word, entropy))

    wordlistfile.close()
    conn.commit()

    #ascending sort by entropy
    wordswithentropy.sort(key=itemgetter(1))

    #pick the threshold so the top NewWordThreshold fraction of known
    #words pass it; keep pos >= 1 because list[-0] is list[0]
    pos = max(int(len(wordswithentropy) * config.getNewWordThreshold()), 1)
    (word, threshold) = wordswithentropy[-pos]
    print(word, tag, threshold)
    return threshold


############################################################
#                      Get Word Pass                       #
############################################################

def filterPartialWord(workdir, conn, prethres, postthres):
    '''Keep partial words whose entropies pass both thresholds.'''
    words_set = set()
    cur = conn.cursor()

    filepath = workdir + os.sep + config.getPartialWordFileName()
    partialwordfile = open(filepath, "r")

    filepath = workdir + os.sep + config.getNewWordFileName()
    newwordfile = open(filepath, "w")

    for oneline in partialwordfile.readlines():
        oneline = oneline.rstrip(os.linesep)

        if len(oneline) == 0:
            continue

        (word, prefix, postfix, freq) = oneline.split(None, 3)

        if word in words_set:
            continue

        entropy = computePrefixEntropy(cur, word)
        if entropy < prethres:
            continue

        entropy = computePostfixEntropy(cur, word)
        if entropy < postthres:
            continue

        print(word)
        newwordfile.writelines([word, os.linesep])
        words_set.add(word)

    newwordfile.close()
    partialwordfile.close()
    conn.commit()


############################################################
#                      Handle Index                        #
############################################################

def handleOneIndex(indexpath, subdir, indexname):
    print(indexpath, subdir, indexname)

    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)

    if not utils.check_epoch(indexstatus, 'PartialWord'):
        raise utils.EpochError('Please do the partial word pass first.\n')
    if utils.check_epoch(indexstatus, 'NewWord'):
        return

    workdir = config.getWordRecognizerDir() + os.sep + \
        subdir + os.sep + indexname
    print(workdir)

    #build the bigram database, derive both thresholds, then filter
    createBigramSqlite(workdir)
    populateBigramSqlite(workdir)

    filename = config.getBigramFileName()
    filepath = workdir + os.sep + filename
    conn = sqlite3.connect(filepath)

    prethres = computeThreshold(conn, "prefix")
    indexstatus['NewWordPrefixThreshold'] = prethres
    postthres = computeThreshold(conn, "postfix")
    indexstatus['NewWordPostfixThreshold'] = postthres
    utils.store_status(indexstatuspath, indexstatus)

    filterPartialWord(workdir, conn, prethres, postthres)

    conn.commit()
    if conn:
        conn.close()

    #sign epoch
    utils.sign_epoch(indexstatus, 'NewWord')
    utils.store_status(indexstatuspath, indexstatus)


if __name__ == '__main__':
    parser = ArgumentParser(description='Recognize new words.')
    parser.add_argument('--indexdir', action='store', \
                        help='index directory', \
                        default=config.getTextIndexDir())

    args = parser.parse_args()
    print(args)
    walkIndex(handleOneIndex, args.indexdir)
    print('done')
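#example invocation (the script file name below is illustrative):
#
#    python3 newword.py --indexdir /path/to/text/index
#
#each index found under --indexdir is handled once; indices already
#signed with the 'NewWord' epoch are skipped on re-runs.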