summary refs log tree commit diff stats
path: root/scripts/pyutil.py
diff options
context:
space:
mode:
author Peng Huang <shawn.p.huang@gmail.com> 2009-10-05 11:54:20 +0800
committer Peng Huang <shawn.p.huang@gmail.com> 2009-10-05 11:54:20 +0800
commit 81070b09ca9a8bec2ab76006aa049f460904e23a (patch)
tree a01abcd4766aa6fbb825851a67cfda71804063cb /scripts/pyutil.py
parent d866b6b936220d6f3f95a24a0d3c762186134ba6 (diff)
download ibus-libpinyin-81070b09ca9a8bec2ab76006aa049f460904e23a.tar.gz
ibus-libpinyin-81070b09ca9a8bec2ab76006aa049f460904e23a.tar.xz
ibus-libpinyin-81070b09ca9a8bec2ab76006aa049f460904e23a.zip
Move all scripts to $top_srcdir/scripts
Diffstat (limited to 'scripts/pyutil.py')
-rw-r--r-- scripts/pyutil.py 148
1 files changed, 148 insertions, 0 deletions
diff --git a/scripts/pyutil.py b/scripts/pyutil.py
new file mode 100644
index 0000000..48edde7
--- /dev/null
+++ b/scripts/pyutil.py
@@ -0,0 +1,148 @@
+# -*- coding: utf-8 -*-
+# vim:set et sts=4 sw=4:
+#
+# ibus-pinyin - The PinYin engine for IBus
+#
+# Copyright (c) 2007-2008 Peng Huang <shawn.p.huang@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+from pydict import *
+
+class PinYinWord:
+    """A single pinyin syllable, or a bare initial (sheng mu) prefix.
+
+    The text is resolved against the PINYIN_DICT, SHENGMU_DICT and
+    ID_SHENGMU_DICT tables pulled in from ``pydict``.  Text that is a key
+    of PINYIN_DICT counts as a complete syllable; anything else must be a
+    key of SHENGMU_DICT.
+    """
+
+    # Spellings that are rewritten before any table lookup.
+    correct_dict = {"nve": "nue", "lve": "lue"}
+
+    def __init__(self, pinyin):
+        """Normalise *pinyin* and resolve its table ids.
+
+        Raises KeyError when the (normalised) text is a key of neither
+        PINYIN_DICT nor SHENGMU_DICT, or when the initial returned by
+        split() is missing from SHENGMU_DICT.
+        """
+        self._pinyin = self.correct_dict.get(pinyin, pinyin)
+        self._is_completed = self.is_valid_pinyin()
+        if self._is_completed:
+            sheng_mu, _yun_mu = self.split()
+            self._pinyin_id = PINYIN_DICT[self._pinyin]
+            self._sheng_mu_id = SHENGMU_DICT[sheng_mu]
+        else:
+            # Incomplete input: only the initial can be resolved, so
+            # _pinyin_id is deliberately left unset on this path.
+            self._sheng_mu_id = SHENGMU_DICT[self._pinyin]
+
+    def is_valid_pinyin(self):
+        """Return True when the text is a key of PINYIN_DICT."""
+        return self._pinyin in PINYIN_DICT
+
+    def get_sheng_mu_id(self):
+        """Return the SHENGMU_DICT id resolved in __init__."""
+        return self._sheng_mu_id
+
+    def get_shengmu(self):
+        """Return the initial looked up from ID_SHENGMU_DICT by its id."""
+        return ID_SHENGMU_DICT[self._sheng_mu_id]
+
+    def get_pinyin_id(self):
+        """Return the PINYIN_DICT id.
+
+        Raises AttributeError for an incomplete syllable, because the id
+        is only assigned when the text is a key of PINYIN_DICT.
+        """
+        return self._pinyin_id
+
+    def get_pinyin(self):
+        """Return the normalised pinyin text."""
+        return self._pinyin
+
+    def get_pattern(self, mohu=False):
+        """Return the match pattern for this word.
+
+        "%" marks the open part of the pattern (presumably an SQL LIKE
+        wildcard -- the consumer is not part of this file).
+
+        mohu -- when False, a complete syllable is returned unchanged and
+                an incomplete one gets a trailing "%".  Otherwise the
+                pattern is loosened: the initials zh/ch/sh/z/c/s are cut
+                to their first letter plus "%", and the finals
+                ing/in/en/eng/an/ang are cut to their first two letters
+                plus "%".
+        """
+        # Kept as an equality test so that non-bool arguments behave
+        # exactly as before (e.g. None selects the loosened branch).
+        if mohu == False:
+            if self.is_valid_pinyin():
+                return self._pinyin
+            return self._pinyin + "%"
+
+        if not self.is_valid_pinyin():
+            if self._pinyin in ("zh", "ch", "sh"):
+                return self._pinyin[0] + "%"
+            return self._pinyin + "%"
+
+        shengmu = self.get_shengmu()
+        yunmu = self._pinyin[len(shengmu):]
+        if shengmu in ("zh", "ch", "sh", "z", "c", "s"):
+            shengmu = shengmu[0] + "%"
+        if yunmu in ("ing", "in", "en", "eng", "an", "ang"):
+            yunmu = yunmu[0:2] + "%"
+        return shengmu + yunmu
+
+    def split(self):
+        """Split a complete syllable into (initial, final).
+
+        A two-letter initial is tried before a one-letter one; when
+        neither prefix is a key of SHENGMU_DICT the initial is "".
+
+        Raises Exception when the text is not a key of PINYIN_DICT.
+        """
+        if not self.is_valid_pinyin():
+            # The message used to reference an undefined name, which
+            # turned this error into a NameError.
+            raise Exception(
+                "Pinyin '%s' is not a valid pinyin!" % self._pinyin)
+        if self._pinyin[:2] in SHENGMU_DICT:
+            return self._pinyin[:2], self._pinyin[2:]
+        elif self._pinyin[:1] in SHENGMU_DICT:
+            return self._pinyin[:1], self._pinyin[1:]
+        else:
+            return "", self._pinyin[:]
+
+    def __str__(self):
+        return self._pinyin
+
+class PinYinString:
+    """Placeholder type; no behaviour is implemented yet."""
+
+    def __init__(self, string):
+        """Accept *string* and ignore it (nothing is stored)."""
+
+def load_pinyin_table(_file):
+    """Build a per-character pinyin frequency table.
+
+    _file -- iterable of UTF-8 encoded lines, each holding three
+             whitespace-separated fields: hanzi, pinyin and an integer
+             frequency.  Blank lines are skipped.
+
+    Returns a dict mapping hanzi -> {pinyin: frequency}; frequencies of
+    repeated (hanzi, pinyin) pairs are added together.
+
+    Raises ValueError when a non-blank line does not have exactly three
+    fields or its frequency is not an integer.
+    """
+    hanzi_dic = {}
+    for line in _file:
+        # split() without arguments already discards surrounding
+        # whitespace, so no separate strip() is needed.
+        fields = line.decode("utf-8").split()
+        if not fields:
+            continue
+        hanzi, pinyin, freq = fields
+        readings = hanzi_dic.setdefault(hanzi, {})
+        readings[pinyin] = readings.get(pinyin, 0) + int(freq)
+
+    return hanzi_dic
+
+def load_phrase_pinyin_freq(_file):
+    """Group (phrase, pinyin, frequency) records by phrase.
+
+    _file -- iterable of UTF-8 encoded lines, each holding three
+             whitespace-separated fields: phrase, pinyin and an integer
+             frequency.  Blank lines are skipped.
+
+    Every "u:" in the pinyin field is rewritten to "v".
+
+    Returns a dict mapping phrase -> list of (phrase, pinyin, frequency)
+    tuples in input order.
+
+    Raises ValueError when a non-blank line does not have exactly three
+    fields or its frequency is not an integer.
+    """
+    phrases_dic = {}
+    for line in _file:
+        fields = line.decode("utf-8").split()
+        if not fields:
+            continue
+        phrase, pinyin, freq = fields
+        pinyin = pinyin.replace(u"u:", u"v")
+        phrases_dic.setdefault(phrase, []).append((phrase, pinyin, int(freq)))
+
+    return phrases_dic
+
+def load_phrase_pinyin(_file):
+    """Group (phrase, pinyin) records by phrase, with a frequency of 0.
+
+    _file -- iterable of UTF-8 encoded lines, each holding two
+             whitespace-separated fields: phrase and pinyin.  Blank lines
+             are skipped.
+
+    Every "u:" in the pinyin field is rewritten to "v".
+
+    Returns a dict mapping phrase -> list of (phrase, pinyin, 0) tuples in
+    input order.
+
+    Raises ValueError when a non-blank line does not have exactly two
+    fields.
+    """
+    phrases_dic = {}
+    for line in _file:
+        fields = line.decode("utf-8").split()
+        if not fields:
+            continue
+        phrase, pinyin = fields
+        pinyin = pinyin.replace(u"u:", u"v")
+        phrases_dic.setdefault(phrase, []).append((phrase, pinyin, 0))
+
+    return phrases_dic
+
+def load_sogou_phrases(_file):
+    """Read tab-separated "phrase<TAB>frequency" records.
+
+    _file -- iterable of UTF-8 encoded lines.  Fields are separated by
+             one or more tab characters; only the first two are used.
+             Blank lines are skipped.
+
+    Returns a dict mapping phrase -> (phrase, frequency).  A phrase that
+    occurs more than once keeps its last record.
+
+    Raises IndexError when a non-blank line has no tab-separated second
+    field, and ValueError when that field is not an integer.
+    """
+    import re
+    # Compiled once; a plain raw string keeps the pattern valid on
+    # interpreters that reject the ur"" literal prefix.
+    tab_run = re.compile(r"\t+")
+    dic = {}
+    for line in _file:
+        text = line.decode("utf8")
+        if not text.strip():
+            continue
+        fields = tab_run.split(text)
+        # int() tolerates the trailing newline left on the last field.
+        dic[fields[0]] = (fields[0], int(fields[1]))
+    return dic
+