summary refs log tree commit diff stats
path: root/silpa
diff options
context:
space:
mode:
author Jinesh K J <jinsbond007@jinesh.cvit> 2009-04-07 03:10:02 +0530
committer Jinesh K J <jinsbond007@jinesh.cvit> 2009-04-07 03:10:02 +0530
commit 1b62cfff2a910765b700bbea15786a1d62d800ef (patch)
tree 459ff9e5ea25fe93a6114060d6e65f0ab1601559 /silpa
parent 97f2231e942f67450b27f161da11e863334921ba (diff)
download Rachana.git-1b62cfff2a910765b700bbea15786a1d62d800ef.tar.gz
Rachana.git-1b62cfff2a910765b700bbea15786a1d62d800ef.tar.xz
Rachana.git-1b62cfff2a910765b700bbea15786a1d62d800ef.zip
Updated the ngram module with option parser
Diffstat (limited to 'silpa')
-rw-r--r-- silpa/modules/ngram/sample_ngram.py 57
-rw-r--r-- silpa/modules/ngram/visualizer.py 19
2 files changed, 43 insertions, 33 deletions
diff --git a/silpa/modules/ngram/sample_ngram.py b/silpa/modules/ngram/sample_ngram.py
index d373aa8..ec4b184 100644
--- a/silpa/modules/ngram/sample_ngram.py
+++ b/silpa/modules/ngram/sample_ngram.py
@@ -26,6 +26,27 @@ import codecs
import pickle
import sys
from visualizer import NGramVisualizer
+from optparse import OptionParser
+
+def getData(new_file_name):
+ line = []
+ line_number = 0
+ rule_number = 0
+ corpus=""
+ data_file = codecs. open(new_file_name,encoding='utf-8', errors='ignore')
+ while 1:
+ line_number = line_number +1
+ text = unicode( data_file.readline())
+ if text == "":
+ break
+ if text[0] == '#':
+ continue
+ line_number = line_number +1
+ line = text.strip()
+ if(line == ""):
+ continue
+ corpus=corpus+" "+line
+ return corpus
def printGraph(corpus,start_word):
ngv=NGramVisualizer ()
@@ -38,18 +59,24 @@ def printGraph(corpus,start_word):
print graph.to_string().encode("utf-8")
if __name__ == "__main__":
- """
- python sample_ngram.py <input file> <corpus path> 1
- this will generate the corpus for the given input file, if corpus specified at
- corpus path is empty. Else it will recreate the corpus with the additional data.
-
- python sample_ngram.py <corpus path> <start word> 2
- this will generate the graph for the given start word in the given corpus at corpus path.
-
- This is just a crude attempt, a lot more improvement is to be done.
- """
- if sys.argv[3] == "1":
- ngv=NGramVisualizer ()
- ngv.loadCorpus(sys.argv[1],sys.argv[2])
- elif sys.argv[3] == "2":
- printGraph(sys.argv[1],sys.argv[2])
+ usage = "usage: %prog [options] INPUTDATA CORPUSFILE"
+ parser = OptionParser(version="%prog 1.0",description="Sample program to add data INPUTDATA to the corpus in CORPUSFILE")
+ parser.set_usage(usage)
+ parser.add_option("-s", "--start-word", dest="start_word",action="store_true",default=False,help="Creates a graph beginning from INPUTDATA")
+ parser.add_option("-f", "--file", action="store_true",default=False,dest="infile",help="Gets Data from file INPUTDATA")
+ (options, args) = parser.parse_args()
+ if len(args) != 2 :
+ parser.error("incorrect number of arguments")
+ if options.infile and options.start_word:
+ parser.error("options -f and -s are mutually exclusive")
+ ngv=NGramVisualizer ()
+ if options.infile:
+ data = getData(args[0])
+ # print data
+ ngv.loadCorpus(data,args[1])
+ elif options.start_word:
+ printGraph(args[1],args[0])
+ else:
+ # print args[0]
+ data = args[0].decode("utf-8")
+ ngv.loadCorpus(data,args[1])
diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py
index e2c1136..585c11f 100644
--- a/silpa/modules/ngram/visualizer.py
+++ b/silpa/modules/ngram/visualizer.py
@@ -26,7 +26,7 @@ import pickle
class NGramVisualizer:
depth=0
- def loadCorpus(self,new_file_name,corpus_file_name):
+ def loadCorpus(self,corpus,corpus_file_name):
limiters = [".","!","?",",",";"]
try:
corpusfile = open(corpus_file_name)
@@ -35,23 +35,6 @@ class NGramVisualizer:
else:
graph_dict = pickle.load(corpusfile)
# graph_dict = dict()
- line = []
- line_number = 0
- rule_number = 0
- corpus=""
- data_file = codecs. open(new_file_name,encoding='utf-8', errors='ignore')
- while 1:
- line_number = line_number +1
- text = unicode( data_file.readline())
- if text == "":
- break
- if text[0] == '#':
- continue
- line_number = line_number +1
- line = text.strip()
- if(line == ""):
- continue
- corpus=corpus+" "+line
sentences=[]
sentence = ""
start = 0