import extract_emoji.py

author: Peng Wu <alexepico@gmail.com> 2018-09-21 17:46:59 +0800
committer: Peng Wu <alexepico@gmail.com> 2018-09-21 17:46:59 +0800
commit: 4b6017c2149b1f1ca1f7ec3260310da9dbf40260 (patch)
tree: 1793f72d90e6bf9cd3e56a8f22f607280cf34e89
parent: 78c7636176951a8528dc9e2793d32a42680e434a (diff)
download: ibus-libpinyin-4b6017c2149b1f1ca1f7ec3260310da9dbf40260.tar.gz
ibus-libpinyin-4b6017c2149b1f1ca1f7ec3260310da9dbf40260.tar.xz
ibus-libpinyin-4b6017c2149b1f1ca1f7ec3260310da9dbf40260.zip
1 files changed, 146 insertions, 0 deletions
diff --git a/scripts/extract_emoji.py b/scripts/extract_emoji.py
new file mode 100644
index 0000000..2fc7d85
--- /dev/null
+++ b/scripts/extract_emoji.py
@@ -0,0 +1,146 @@
+#!/usr/bin/python3
+import os
+import operator
+from argparse import ArgumentParser
+
+import xml.etree.ElementTree as ET
+
+header = '''/* This file is generated by python scripts. Don't edit this file directly.
+ */
+'''
+
+alphabet = "abcdefghijklmnopqrstuvwxyz"
+
+eng_emojis = []
+chs_emojis = []
+
+def load_emoji(filename):
+    tree = ET.parse(filename)
+    root = tree.getroot()
+
+    emojis = {}
+    for annotation in root.findall('.//annotation'):
+        for word in annotation.text.split('|'):
+            word = word.strip()
+
+            # only keep the first encountered emoji
+            if not word in emojis:
+                # print(annotation.get('cp'))
+                emojis[word] = annotation.get('cp')
+
+    return emojis
+
+# no space allowed for English emoji
+
+def filter_English_emoji(emojis):
+    emojis_copy = {}
+
+    for key, value in emojis.items():
+        if ' ' in key:
+            continue
+
+        if len(key) > 6:
+            continue
+
+        # only accept alphabet
+        isalphabet = True
+        for c in key:
+            if not c in alphabet:
+                isalphabet = False
+
+        if not isalphabet:
+            continue
+
+        #print(key, value)
+        emojis_copy[key] = value
+
+    return emojis_copy
+
+
+# less than four characters for Chinese emoji
+
+def filter_Chinese_emoji(emojis):
+    emojis_copy = {}
+
+    for key, value in emojis.items():
+        if len(key) > 2:
+            continue
+
+        # just reject alphabet
+        isalnum = False
+        for c in key:
+            if c.isdigit() or c in alphabet or c in alphabet.upper():
+                isalnum = True
+
+        if isalnum:
+            continue
+
+        #print(key, value)
+        emojis_copy[key] = value
+
+    return emojis_copy
+
+def prepare_emojis():
+    global eng_emojis, chs_emojis
+    eng_emojis = filter_English_emoji(load_emoji('en.xml'))
+    chs_emojis = filter_Chinese_emoji(load_emoji('yue_Hans.xml'))
+
+    eng_emojis = [(key, value) for key, value in eng_emojis.items()]
+    chs_emojis = [(key, value) for key, value in chs_emojis.items()]
+
+    compare = operator.itemgetter(0)
+    eng_emojis = sorted(eng_emojis, key=compare)
+    chs_emojis = sorted(chs_emojis, key=compare)
+
+
+def gen_english_emojis():
+    entries = []
+    for match, string in eng_emojis:
+        match = '"{0}"'.format(match)
+        entry = '{0:<10}, "{1}"'.format(match, string)
+        entries.append(entry)
+    return ',\n'.join(entries)
+
+def gen_chinese_emojis():
+    entries = []
+    for match, string in chs_emojis:
+        match = '"{0}"'.format(match)
+        entry = '{0:<10}, "{1}"'.format(match, string)
+        entries.append(entry)
+    return ',\n'.join(entries)
+
+
+def get_table_content(tablename):
+    # English Emojis
+    if tablename == 'ENGLISH_EMOJIS':
+        return gen_english_emojis()
+    # Chinese Emojis
+    if tablename == 'CHINESE_EMOJIS':
+        return gen_chinese_emojis()
+
+def expand_file(filename):
+    infile = open(filename, "r")
+    print(header)
+    for line in infile.readlines():
+        line = line.rstrip(os.linesep)
+        if len(line) < 3:
+            print(line)
+            continue
+        if line[0] == '@' and line[-1] == '@':
+            tablename = line[1:-1]
+            print(get_table_content(tablename))
+        else:
+            print(line)
+
+
+### main function ###
+if __name__ == "__main__":
+    parser = ArgumentParser(description='Generate header file from template.')
+    parser.add_argument('infile', action='store', \
+                        help='input file.')
+
+    args = parser.parse_args()
+    #print(args)
+
+    prepare_emojis()
+    expand_file(args.infile)
author	Peng Wu <alexepico@gmail.com>	2018-09-21 17:46:59 +0800
committer	Peng Wu <alexepico@gmail.com>	2018-09-21 17:46:59 +0800
commit	4b6017c2149b1f1ca1f7ec3260310da9dbf40260 (patch)
tree	1793f72d90e6bf9cd3e56a8f22f607280cf34e89
parent	78c7636176951a8528dc9e2793d32a42680e434a (diff)
download	ibus-libpinyin-4b6017c2149b1f1ca1f7ec3260310da9dbf40260.tar.gz ibus-libpinyin-4b6017c2149b1f1ca1f7ec3260310da9dbf40260.tar.xz ibus-libpinyin-4b6017c2149b1f1ca1f7ec3260310da9dbf40260.zip