summaryrefslogtreecommitdiffstats
path: root/silpa/modules/ngram/sample_ngram.py
diff options
context:
space:
mode:
Diffstat (limited to 'silpa/modules/ngram/sample_ngram.py')
-rw-r--r--silpa/modules/ngram/sample_ngram.py82
1 files changed, 82 insertions, 0 deletions
diff --git a/silpa/modules/ngram/sample_ngram.py b/silpa/modules/ngram/sample_ngram.py
new file mode 100644
index 0000000..ec4b184
--- /dev/null
+++ b/silpa/modules/ngram/sample_ngram.py
@@ -0,0 +1,82 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+# Ngram
+# Copyright 2009 Jinesh K J <jinesh.k@gmail.com>
+# Copyright 2009 Swathantra Malayalam Computing <smc-discuss@googlegroups.com>
+# http://www.smc.org.in
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# If you find any bugs or have any suggestions email: jinesh.k@gmail.com or smc-discuss@googlegroups.com
+# URL: http://www.smc.org.in
+import pydot
+import codecs
+import pickle
+import sys
+from visualizer import NGramVisualizer
+from optparse import OptionParser
+
def getData(new_file_name):
    """Read a UTF-8 text file and return its usable lines as one corpus string.

    new_file_name -- path of the text file to read.

    The file is decoded as UTF-8; bytes that cannot be decoded are dropped
    (errors='ignore').  A line is skipped when its very first character is
    '#' (the check is made before stripping, so an indented '#' is NOT a
    comment) or when nothing is left after stripping surrounding whitespace.

    Every remaining line is stripped and appended to the result preceded by
    a single space, so a non-empty result always starts with a space.
    Returns an empty string when no line qualifies.
    """
    pieces = []
    # The context manager guarantees the file is closed, even on errors.
    with codecs.open(new_file_name, encoding='utf-8', errors='ignore') as data_file:
        # The codecs stream already yields decoded text; iterating never
        # produces an empty string, so indexing text[0] is safe.
        for text in data_file:
            if text[0] == '#':
                continue
            line = text.strip()
            if line == "":
                continue
            pieces.append(" " + line)
    # Join once instead of growing the string line by line.
    return "".join(pieces)
+
def printGraph(corpus,start_word):
    """Print the graph generated from a pickled corpus file for start_word.

    corpus     -- path of the file containing the pickled graph data.
    start_word -- UTF-8 encoded byte string; it is decoded to unicode
                  before being handed to the visualizer.

    The pickled object is passed, together with a fresh pydot.Dot() and the
    decoded start word, to NGramVisualizer.generate_graph(); the returned
    graph's to_string() output is encoded as UTF-8 and printed to stdout.
    """
    ngv = NGramVisualizer()
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only use this with corpus files from a trusted source.
    # The file is opened in the default mode, as before -- confirm how the
    # corpus was written before switching to binary mode.
    with open(corpus) as corpus_file:
        graph_dict = pickle.load(corpus_file)
    uni_start_word = start_word.decode("utf-8")
    graph = ngv.generate_graph(graph_dict, pydot.Dot(), uni_start_word)
    print(graph.to_string().encode("utf-8"))
+
if __name__ == "__main__":
    # Command-line entry point.  Exactly two positional arguments are
    # required: INPUTDATA and CORPUSFILE.
    #   -f : INPUTDATA is a file whose contents are loaded into the corpus.
    #   -s : INPUTDATA is a start word; the graph for it is printed.
    #   neither: INPUTDATA itself is the text loaded into the corpus.
    parser = OptionParser(
        usage="usage: %prog [options] INPUTDATA CORPUSFILE",
        version="%prog 1.0",
        description="Sample program to add data INPUTDATA to the corpus in CORPUSFILE")
    parser.add_option(
        "-s", "--start-word",
        dest="start_word", action="store_true", default=False,
        help="Creates a graph beginning from INPUTDATA")
    parser.add_option(
        "-f", "--file",
        dest="infile", action="store_true", default=False,
        help="Gets Data from file INPUTDATA")
    (options, args) = parser.parse_args()

    # parser.error() prints the message and exits, so each check below is
    # only reached when the previous one passed.
    if len(args) != 2:
        parser.error("incorrect number of arguments")
    if options.infile and options.start_word:
        parser.error("options -f and -s are mutually exclusive")

    input_data, corpus_file = args
    ngv = NGramVisualizer()
    if options.infile:
        # INPUTDATA names a text file to read.
        ngv.loadCorpus(getData(input_data), corpus_file)
    elif options.start_word:
        # INPUTDATA is the word the printed graph starts from.
        printGraph(corpus_file, input_data)
    else:
        # INPUTDATA is the literal text, given as UTF-8 bytes.
        ngv.loadCorpus(input_data.decode("utf-8"), corpus_file)