#!/usr/bin/python3
import ast
import os
import os.path
from argparse import ArgumentParser
from operator import itemgetter

import utils
from myconfig import MyConfig
from dirwalk import walkIndex

# Project configuration object used by every function in this module.
config = MyConfig()

# Change cwd to the generate-punctuation directory, creating it on first use.
# exist_ok=True replaces the separate exists() check, so a directory that
# appears between the check and the creation no longer causes a failure.
puncts_dir = config.getGeneratePunctuationDir()
os.makedirs(puncts_dir, exist_ok=True)
os.chdir(puncts_dir)
# chdir done

# Punctuation marks searched for at the start of a token-0 entry.
# The order is important: handleOneText takes the first mark that matches as a
# prefix, so '……' has to come before '…' or the longer mark is never selected.
Punct_Search = ['……', '…', '，', '。', '；', '？', '！', '：', '“', '”', '、']

# Merged table over all indexes: (token, word) -> [[punct, freq], ...].
# Filled by loadOnePrune and written out by the script entry point.
all_punct_pairs = {}

############################################################
#                  Handle File                             #
############################################################

def handleOneText(infile, punct_pairs):
    """Count the punctuation that follows each entry of one segmented text.

    Reads ``infile`` plus the configured segment postfix. Every non-empty line
    is expected to be ``"<token id> <string>"``. When a line with token id 0
    starts with one of the marks in ``Punct_Search`` and the previous
    non-empty line had a non-zero token id, the counter for that previous
    ``(token, string)`` and the matched mark is incremented.

    Args:
        infile: Path of the text, without the segment postfix.
        punct_pairs: Dict updated in place. Maps ``(token, string)`` to a list
            of ``[punct, count]`` lists.

    Raises:
        ValueError: If a non-empty line has no space separator or its first
            field is not an integer.
    """
    print(infile)

    prev_token, prev_str = 0, ''
    # The context manager closes the file even when a malformed line raises.
    with open(infile + config.getSegmentPostfix(), 'r') as docfile:
        # Iterate lazily instead of loading the whole file with readlines().
        for oneline in docfile:
            oneline = oneline.rstrip(os.linesep)
            if not oneline:
                continue

            cur_token, cur_str = oneline.split(" ", 1)
            cur_token = int(cur_token)

            # A pair is only recorded when a token-0 entry follows an entry
            # with a non-zero token id.
            if prev_token != 0 and cur_token == 0:
                # First match wins, which is why Punct_Search is ordered.
                cur_punct = ''
                for punct in Punct_Search:
                    if cur_str.startswith(punct):
                        cur_punct = punct
                        break

                if cur_punct != '':
                    puncts = punct_pairs.setdefault((prev_token, prev_str), [])
                    for entry in puncts:
                        if entry[0] == cur_punct:
                            entry[1] += 1
                            break
                    else:
                        # Mark not seen yet after this entry.
                        puncts.append([cur_punct, 1])

            # Every processed line becomes the context for the next one.
            prev_token, prev_str = cur_token, cur_str

def prunePunctPair(workdir, threshold, infilename, outfilename):
    """Drop punctuation counts below ``threshold`` and save the remainder.

    Args:
        workdir: Directory that holds both the input and the output file.
        threshold: Minimum frequency an entry needs in order to be kept.
        infilename: Name of the file holding the ``repr`` of a dict that maps
            ``(token, word)`` to a list of ``[punct, freq]`` lists.
        outfilename: Name of the file that receives the ``repr`` of the
            pruned dict. Keys left without any entry are omitted.

    Raises:
        ValueError, SyntaxError: If the input file is not a plain Python
            literal.
    """
    # Load the punct pairs from the text file.
    # SECURITY: this used to be eval(), which runs any code found in the file.
    # literal_eval only accepts literals, which is all repr() produces here.
    with open(os.path.join(workdir, infilename), 'r') as f:
        punct_pairs = ast.literal_eval(f.read())

    # Prune the punct pairs below threshold.
    newpunctpairs = {}
    for key, puncts in punct_pairs.items():
        kept = [[punct, freq] for punct, freq in puncts if freq >= threshold]
        if kept:
            newpunctpairs[key] = kept

    # Save the punct pairs to the text file.
    with open(os.path.join(workdir, outfilename), 'w') as f:
        f.write(repr(newpunctpairs))


############################################################
#                  Handle Index                            #
############################################################

def loadPunctPairFromOneIndex(indexpath, workdir):
    """Collect the punctuation pairs of every text listed in one index file.

    Each non-empty index line is ``"<title>#<textpath>"``. Texts whose
    segmented file is shorter than the configured minimum size are skipped;
    the others are passed to ``handleOneText``. The collected dict is written
    as ``repr`` text to the per-index punctuation file inside ``workdir``.

    Args:
        indexpath: Path of the index file to read.
        workdir: Directory that receives the per-index punctuation file.

    Raises:
        ValueError: If a non-empty line does not contain exactly one ``#``.
    """
    punct_pairs = {}

    # The context manager closes the index even if one of the texts fails.
    with open(indexpath, 'r') as indexfile:
        for oneline in indexfile:
            # remove trailing '\n'
            oneline = oneline.rstrip(os.linesep)
            if not oneline:
                # A blank line names no text; skip it instead of crashing on
                # the unpack below (handleOneText skips blank lines too).
                continue

            (title, textpath) = oneline.split('#')
            infile = config.getTextDir() + textpath
            infilesize = utils.get_file_length(
                infile + config.getSegmentPostfix())
            if infilesize < config.getMinimumFileSize():
                print("Skipping " + title + '#' + textpath)
                continue

            print("Proccessing " + title + '#' + textpath)
            handleOneText(infile, punct_pairs)
            print("Processed " + title + '#' + textpath)

    # Save the punct pairs to the text file.
    punctfile = os.path.join(workdir,
                             config.getPunctuationPerIndexFileName())
    with open(punctfile, 'w') as f:
        f.write(repr(punct_pairs))


def handleOneIndex(indexpath, subdir, indexname):
    """Generate and prune the punctuation pairs of one index.

    Does nothing when ``utils.check_epoch`` reports that the 'Punctuation'
    epoch is already present in the index status. Otherwise the per-index
    file is generated and pruned inside
    ``<punctuation dir>/<subdir>/<indexname>``, and the 'Punctuation' epoch is
    signed and stored.

    Args:
        indexpath: Path of the index file.
        subdir: Sub-directory component used to build the work directory.
        indexname: Index name, used as the last work directory component.

    Raises:
        utils.EpochError: If the 'Segment' epoch check fails for this index.
    """
    print(indexpath, subdir, indexname)

    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'Segment'):
        raise utils.EpochError('Please segment first.\n')
    if utils.check_epoch(indexstatus, 'Punctuation'):
        return

    workdir = config.getGeneratePunctuationDir() + os.sep + \
        subdir + os.sep + indexname
    print(workdir)

    # exist_ok=True replaces the exists() check, so a directory created in
    # between the check and the creation no longer raises.
    os.makedirs(workdir, exist_ok=True)

    # Load all word and punctuation pairs in this index.
    loadPunctPairFromOneIndex(indexpath, workdir)

    # Prune the pairs in the current index.
    prunePunctPair(workdir,
                   config.getPunctuationPerIndexPruneThreshold(),
                   config.getPunctuationPerIndexFileName(),
                   config.getPunctuationPruneIndexFileName())

    # Sign the epoch only after both steps finished without raising.
    utils.sign_epoch(indexstatus, 'Punctuation')
    utils.store_status(indexstatuspath, indexstatus)

def loadOnePrune(indexpath, subdir, indexname):
    """Merge the pruned punctuation pairs of one index into the global table.

    Reads the pruned per-index file from
    ``<punctuation dir>/<subdir>/<indexname>`` and adds its frequencies to the
    module-level ``all_punct_pairs`` dict, which is modified in place.

    Args:
        indexpath: Path of the index file (only printed here).
        subdir: Sub-directory component used to build the work directory.
        indexname: Index name, used as the last work directory component.

    Raises:
        ValueError, SyntaxError: If the pruned file is not a plain Python
            literal.
    """
    print(indexpath, subdir, indexname)

    workdir = config.getGeneratePunctuationDir() + os.sep + \
        subdir + os.sep + indexname
    print(workdir)

    # Load the punct pairs from the text file.
    # SECURITY: this used to be eval(), which runs any code found in the file.
    # literal_eval only accepts literals, which is all repr() produces here.
    punctfile = os.path.join(workdir,
                             config.getPunctuationPruneIndexFileName())
    with open(punctfile, 'r') as f:
        punct_pairs = ast.literal_eval(f.read())

    # Merge into all punct pairs.
    for key, puncts in punct_pairs.items():
        if key not in all_punct_pairs:
            all_punct_pairs[key] = puncts
            continue

        # Sum the frequencies per punctuation in a single pass. A dict keeps
        # first-seen order, so the merged list is the same on every run; the
        # previous set-based merge produced an arbitrary order.
        totals = {}
        for punct, freq in all_punct_pairs[key] + puncts:
            totals[punct] = totals.get(punct, 0) + freq

        all_punct_pairs[key] = [[punct, freq]
                                for punct, freq in totals.items()]


def exportAllPunctPairs(workdir, infilename, outfilename):
    """Export a punctuation pair file as a plain text table.

    Writes one line ``"<token> <word> <punct> <freq>"`` per entry. The
    entries of each ``(token, word)`` key are ordered by descending frequency.

    Args:
        workdir: Directory that holds the input file.
        infilename: Name of the file holding the ``repr`` of a dict that maps
            ``(token, word)`` to a list of ``[punct, freq]`` lists.
        outfilename: Path of the table to write. It is opened as given and is
            not joined with ``workdir``.

    Raises:
        ValueError, SyntaxError: If the input file is not a plain Python
            literal.
    """
    # Load the punct pairs from the text file.
    # SECURITY: this used to be eval(), which runs any code found in the file.
    # literal_eval only accepts literals, which is all repr() produces here.
    with open(os.path.join(workdir, infilename), 'r') as f:
        punct_pairs = ast.literal_eval(f.read())

    with open(outfilename, 'w') as tablefile:
        for (token, word), puncts in punct_pairs.items():
            # sorted() is stable, so entries with equal frequency keep their
            # stored order.
            for punct, freq in sorted(puncts, key=itemgetter(1), reverse=True):
                # Write '\n', not os.linesep: a text-mode file already
                # translates it, and os.linesep would give '\r\r\n' on Windows.
                tablefile.write(
                    "{0} {1} {2} {3}\n".format(token, word, punct, freq))

if __name__ == '__main__':
    # Command line: the index directory is the only option.
    cli = ArgumentParser(description='Generate punctuation.')
    cli.add_argument(
        '--indexdir',
        action='store',
        help='index directory',
        default=config.getTextIndexDir(),
    )
    options = cli.parse_args()
    print(options)

    # First walk generates and prunes the pairs of every index; the second
    # walk merges the pruned per-index pairs into all_punct_pairs.
    for visitor in (handleOneIndex, loadOnePrune):
        walkIndex(visitor, options.indexdir)

    # Save the merged punct pairs to a text file.
    merged_dir = config.getGeneratePunctuationDir()
    merged_name = config.getPunctuationAllIndexFileName()
    with open(os.path.join(merged_dir, merged_name), 'w') as merged_file:
        merged_file.write(repr(all_punct_pairs))

    # Prune the merged pairs with the all-index threshold.
    prune_dir = config.getGeneratePunctuationDir()
    prune_threshold = config.getPunctuationAllIndexPruneThreshold()
    prune_source = config.getPunctuationAllIndexFileName()
    prune_target = config.getPunctuationPruneAllIndexFileName()
    prunePunctPair(prune_dir, prune_threshold, prune_source, prune_target)

    # Export all the remaining pairs as a text table.
    export_dir = config.getGeneratePunctuationDir()
    export_source = config.getPunctuationPruneAllIndexFileName()
    export_target = config.getPunctuationTextFileName()
    exportAllPunctPairs(export_dir, export_source, export_target)

    print('done')