summary refs log tree commit diff stats
path: root/silpa/modules/ngram/visualizer.py
diff options
context:
space:
mode:
author aashiks <aashiks@gmail.com> 2009-04-06 14:55:08 +0530
committer aashiks <aashiks@gmail.com> 2009-04-06 14:55:08 +0530
commit 061d7889a5f4b5faea4abc71996ae9bea259018d (patch)
tree 8d8c6665873c028e5fe548741fe242286dc3c9ee /silpa/modules/ngram/visualizer.py
parent 7bbd521108e0b0b095bd761d8d3db27195707999 (diff)
parent 97f2231e942f67450b27f161da11e863334921ba (diff)
download Rachana.git-061d7889a5f4b5faea4abc71996ae9bea259018d.tar.gz
Rachana.git-061d7889a5f4b5faea4abc71996ae9bea259018d.tar.xz
Rachana.git-061d7889a5f4b5faea4abc71996ae9bea259018d.zip
Merge branch 'master' of ssh://aashiks@git.sv.nongnu.org/srv/git/smc
Diffstat (limited to 'silpa/modules/ngram/visualizer.py')
-rw-r--r-- silpa/modules/ngram/visualizer.py 49
1 files changed, 37 insertions, 12 deletions
diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py
index 0debe28..e2c1136 100644
--- a/silpa/modules/ngram/visualizer.py
+++ b/silpa/modules/ngram/visualizer.py
@@ -22,18 +22,27 @@
# URL: http://www.smc.org.in
import pydot
import codecs
+import pickle
class NGramVisualizer:
depth=0
- def loadCorpus(self, corpus_file_name):
- graph_dict = dict()
+ def loadCorpus(self,new_file_name,corpus_file_name):
+ limiters = [".","!","?",",",";"]
+ try:
+ corpusfile = open(corpus_file_name)
+ except IOError:
+ graph_dict = dict()
+ else:
+ graph_dict = pickle.load(corpusfile)
+ # graph_dict = dict()
line = []
line_number = 0
rule_number = 0
- corpus_file = codecs. open(corpus_file_name,encoding='utf-8', errors='ignore')
+ corpus=""
+ data_file = codecs. open(new_file_name,encoding='utf-8', errors='ignore')
while 1:
line_number = line_number +1
- text = unicode( corpus_file.readline())
+ text = unicode( data_file.readline())
if text == "":
break
if text[0] == '#':
@@ -42,10 +51,22 @@ class NGramVisualizer:
line = text.strip()
if(line == ""):
continue
+ corpus=corpus+" "+line
+ sentences=[]
+ sentence = ""
+ start = 0
+ for index in range(0,len(corpus)):
+ for delimit in limiters:
+ if corpus[index] == delimit:
+ sentence = corpus[start:index]
+ sentences.append(sentence)
+ start = index+1
+ for line in sentences:
words=line.split(" ")
word_count=len(words)
prev_word=""
for word in words:
+ #print word
word=word.strip()
if(prev_word==""):
prev_word=word
@@ -56,8 +77,10 @@ class NGramVisualizer:
else:
graph_dict[prev_word]=word
prev_word=word
- prev_word=""
- return graph_dict
+ prev_word=""
+
+ pickle.dump(graph_dict,open(corpus_file_name,'w'))
+ #return graph_dict
def generate_full_graph(self, start_word, graph_dict,outputimage):
for key in graph_dict.iterkeys():
@@ -93,10 +116,12 @@ class NGramVisualizer:
return graph
-if __name__ == "__main__":
- ngv=NGramVisualizer ()
- graph_dict=ngv.loadCorpus ("hi.txt")
- graph=pydot.Dot()
- graph=ngv.generate_graph(graph_dict, graph,u"भारत")
- print graph.to_string().encode("utf-8")
+#if __name__ == "__main__":
+# ngv=NGramVisualizer ()
+# graph_dict = dict()
+# graph_dict=ngv.loadCorpus ("ml.txt",graph_dict)
+# pickle.dump(graph_dict,open('ngram_ml.txt','w'))
+# graph=pydot.Dot()
+# graph=ngv.generate_graph(graph_dict, graph,u"നീലത്തിമിംഗലങ്ങള്‍ക്ക്")
+# print graph.to_string().encode("utf-8")
#graph.write("ngvgraph-hi.png","dot", "raw" )