diff options
Diffstat (limited to 'silpa/modules/ngram/sample_ngram.py')
-rw-r--r-- | silpa/modules/ngram/sample_ngram.py | 82 |
1 files changed, 82 insertions, 0 deletions
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Ngram
# Copyright 2009 Jinesh K J <jinesh.k@gmail.com>
# Copyright 2009 Swathantra Malayalam Computing <smc-discuss@googlegroups.com>
# http://www.smc.org.in
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Library General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# If you find any bugs or have any suggestions email: jinesh.k@gmail.com or smc-discuss@googlegroups.com
# URL: http://www.smc.org.in
#
# Origin: silpa/modules/ngram/sample_ngram.py
# (new file mode 100644, index 0000000..ec4b184)
"""Sample command-line driver for the n-gram visualizer.

Usage: sample_ngram.py [options] INPUTDATA CORPUSFILE

* no option: INPUTDATA is literal text that is loaded into CORPUSFILE.
* -f:        INPUTDATA is a UTF-8 text file whose content is loaded into
             CORPUSFILE.
* -s:        INPUTDATA is a start word; the graph read from CORPUSFILE and
             beginning at that word is printed in DOT syntax.
"""
import pydot
import codecs
import pickle
import sys
from visualizer import NGramVisualizer
from optparse import OptionParser


def _to_unicode(value):
    """Return ``value`` as text, decoding it from UTF-8 if it is a byte string.

    On Python 2 command-line arguments are byte strings and get decoded; a
    value that is already text is returned unchanged.
    """
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return value


def getData(new_file_name):
    """Read a UTF-8 text file and return its content as one corpus string.

    Lines whose first character is ``#`` are skipped, and so are lines that
    are empty once surrounding whitespace is stripped.  Every remaining line
    is stripped and appended to the result preceded by a single space, so a
    non-empty result always begins with a space.  Bytes that are not valid
    UTF-8 are dropped (``errors='ignore'``).

    Returns an empty string when no line qualifies.
    """
    pieces = []
    # The context manager guarantees the file is closed, even on error.
    with codecs.open(new_file_name, encoding='utf-8',
                     errors='ignore') as data_file:
        for text in data_file:
            # ``text`` is never empty here: even a blank line holds "\n".
            if text[0] == '#':
                continue
            line = text.strip()
            if line:
                pieces.append(line)
    # One join instead of repeated concatenation (linear, not quadratic).
    return "".join(" " + piece for piece in pieces)


def printGraph(corpus, start_word):
    """Print, in DOT syntax, the graph that begins at ``start_word``.

    ``corpus`` is the path of a pickled graph dictionary and ``start_word``
    is the word the generated graph starts from (UTF-8 bytes or text).
    """
    ngv = NGramVisualizer()
    # NOTE(security): unpickling can execute arbitrary code.  Only pass corpus
    # files that come from a trusted source.
    # Pickle data must be read in binary mode.
    with open(corpus, "rb") as corpus_file:
        graph_dict = pickle.load(corpus_file)
    graph = ngv.generate_graph(graph_dict, pydot.Dot(),
                               _to_unicode(start_word))
    dot_source = graph.to_string()
    if sys.version_info[0] < 3:
        # Python 2 needs explicit UTF-8 bytes for non-ASCII output.
        dot_source = dot_source.encode("utf-8")
    print(dot_source)


def main():
    """Parse the command line and run the requested action."""
    usage = "usage: %prog [options] INPUTDATA CORPUSFILE"
    parser = OptionParser(version="%prog 1.0",description="Sample program to add data INPUTDATA to the corpus in CORPUSFILE")
    parser.set_usage(usage)
    parser.add_option("-s", "--start-word", dest="start_word",action="store_true",default=False,help="Creates a graph beginning from INPUTDATA")
    parser.add_option("-f", "--file", action="store_true",default=False,dest="infile",help="Gets Data from file INPUTDATA")
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.error("incorrect number of arguments")
    if options.infile and options.start_word:
        parser.error("options -f and -s are mutually exclusive")
    input_data, corpus_file = args

    if options.start_word:
        printGraph(corpus_file, input_data)
        return

    # Both remaining modes load text into the corpus; they differ only in
    # where the text comes from.
    if options.infile:
        data = getData(input_data)
    else:
        data = _to_unicode(input_data)
    ngv = NGramVisualizer()
    ngv.loadCorpus(data, corpus_file)


if __name__ == "__main__":
    main()