diff options
Diffstat (limited to 'silpa/modules')
-rw-r--r-- | silpa/modules/ngram/sample_ngram.py | 57 | ||||
-rw-r--r-- | silpa/modules/ngram/visualizer.py | 19 | ||||
-rw-r--r-- | silpa/modules/syllabalizer/syllabalizer.py | 10 |
3 files changed, 51 insertions, 35 deletions
diff --git a/silpa/modules/ngram/sample_ngram.py b/silpa/modules/ngram/sample_ngram.py index d373aa8..ec4b184 100644 --- a/silpa/modules/ngram/sample_ngram.py +++ b/silpa/modules/ngram/sample_ngram.py @@ -26,6 +26,27 @@ import codecs import pickle import sys from visualizer import NGramVisualizer +from optparse import OptionParser + +def getData(new_file_name): + line = [] + line_number = 0 + rule_number = 0 + corpus="" + data_file = codecs. open(new_file_name,encoding='utf-8', errors='ignore') + while 1: + line_number = line_number +1 + text = unicode( data_file.readline()) + if text == "": + break + if text[0] == '#': + continue + line_number = line_number +1 + line = text.strip() + if(line == ""): + continue + corpus=corpus+" "+line + return corpus def printGraph(corpus,start_word): ngv=NGramVisualizer () @@ -38,18 +59,24 @@ def printGraph(corpus,start_word): print graph.to_string().encode("utf-8") if __name__ == "__main__": - """ - python sample_ngram.py <input file> <corpus path> 1 - this will generate the corpus for the given input file, if corpus specified at - corpus path is empty. Else it will recreate the corpus with the additional data. - - python sample_ngram.py <corpus path> <start word> 2 - this will generate the graph for the given start word in the given corpus at corpus path. - - This is just a crude attempt, a lot more improvement is to be done. - """ - if sys.argv[3] == "1": - ngv=NGramVisualizer () - ngv.loadCorpus(sys.argv[1],sys.argv[2]) - elif sys.argv[3] == "2": - printGraph(sys.argv[1],sys.argv[2]) + usage = "usage: %prog [options] INPUTDATA CORPUSFILE" + parser = OptionParser(version="%prog 1.0",description="Sample program to add data INPUTDATA to the corpus in CORPUSFILE") + parser.set_usage(usage) + parser.add_option("-s", "--start-word", dest="start_word",action="store_true",default=False,help="Creates a graph beginning from INPUTDATA") + parser.add_option("-f", "--file", action="store_true",default=False,dest="infile",help="Gets Data from file INPUTDATA") + (options, args) = parser.parse_args() + if len(args) != 2 : + parser.error("incorrect number of arguments") + if options.infile and options.start_word: + parser.error("options -f and -s are mutually exclusive") + ngv=NGramVisualizer () + if options.infile: + data = getData(args[0]) + # print data + ngv.loadCorpus(data,args[1]) + elif options.start_word: + printGraph(args[1],args[0]) + else: + # print args[0] + data = args[0].decode("utf-8") + ngv.loadCorpus(data,args[1]) diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py index e2c1136..585c11f 100644 --- a/silpa/modules/ngram/visualizer.py +++ b/silpa/modules/ngram/visualizer.py @@ -26,7 +26,7 @@ import pickle class NGramVisualizer: depth=0 - def loadCorpus(self,new_file_name,corpus_file_name): + def loadCorpus(self,corpus,corpus_file_name): limiters = [".","!","?",",",";"] try: corpusfile = open(corpus_file_name) @@ -35,23 +35,6 @@ class NGramVisualizer: else: graph_dict = pickle.load(corpusfile) # graph_dict = dict() - line = [] - line_number = 0 - rule_number = 0 - corpus="" - data_file = codecs. open(new_file_name,encoding='utf-8', errors='ignore') - while 1: - line_number = line_number +1 - text = unicode( data_file.readline()) - if text == "": - break - if text[0] == '#': - continue - line_number = line_number +1 - line = text.strip() - if(line == ""): - continue - corpus=corpus+" "+line sentences=[] sentence = "" start = 0 diff --git a/silpa/modules/syllabalizer/syllabalizer.py b/silpa/modules/syllabalizer/syllabalizer.py index 5cfde82..706ee77 100644 --- a/silpa/modules/syllabalizer/syllabalizer.py +++ b/silpa/modules/syllabalizer/syllabalizer.py @@ -32,11 +32,14 @@ class Syllabalizer(SilpaModule): u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40', u'\u0d41', u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46', u'\u0d47', u'\u0d48', u'\u0d4a', u'\u0d4b', u'\u0d4c', u'\u0d4d'] + limiters = ['.','\"','\'','`','!',';',',','?'] chandrakkala = u'\u0d4d' lst_chars = [] for char in text: - if char in signs: + if char in limiters: + lst_chars.append(char) + elif char in signs: lst_chars[-1] = lst_chars[-1] + char else: try: @@ -53,11 +56,14 @@ class Syllabalizer(SilpaModule): u'\u0902', u'\u0903', u'\u093e', u'\u093f', u'\u0940', u'\u0941', u'\u0942', u'\u0943', u'\u0944', u'\u0946', u'\u0947', u'\u0948', u'\u094a', u'\u094b', u'\u094c', u'\u094d'] + limiters = ['.','\"','\'','`','!',';',',','?'] chandrakkala = u'\u094d' lst_chars = [] for char in text: - if char in signs: + if char in limiters: + lst_chars.append(char) + elif char in signs: lst_chars[-1] = lst_chars[-1] + char else: try: |