### Copy from https://github.com/iqiyi/FASPell ### """ Requirements: - java (required only if tree edit distance is used) - numpy """ import numpy as np from subprocess import Popen, PIPE, STDOUT import os import argparse IDCS = {'\u2ff0': 2, # 12 ideographic description characters and their capacity of son nodes '\u2ff1': 2, '\u2ff2': 3, '\u2ff3': 3, '\u2ff4': 2, '\u2ff5': 2, '\u2ff6': 2, '\u2ff7': 2, '\u2ff8': 2, '\u2ff9': 2, '\u2ffa': 2, '\u2ffb': 2, } PINYIN = {'ā': ['a', 1], 'á': ['a', 2], 'ǎ': ['a', 3], 'à': ['a', 4], 'ē': ['e', 1], 'é': ['e', 2], 'ě': ['e', 3], 'è': ['e', 4], 'ī': ['i', 1], 'í': ['i', 2], 'ǐ': ['i', 3], 'ì': ['i', 4], 'ō': ['o', 1], 'ó': ['o', 2], 'ǒ': ['o', 3], 'ò': ['o', 4], 'ū': ['u', 1], 'ú': ['u', 2], 'ǔ': ['u', 3], 'ù': ['u', 4], 'ǖ': ['ü', 1], 'ǘ': ['ü', 2], 'ǚ': ['ü', 3], 'ǜ': ['ü', 4], '': ['m', 2], 'ń': ['n', 2], 'ň': ['n', 3], 'ǹ': ['n', 4], } # APTED_JAR_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'apted.jar') APTED_JAR_PATH = 'apted.jar' def tree_edit_distance(tree_a, tree_b): """ We use APTED algorithm proposed by M. Pawlik and N. Augsten github link: https://github.com/DatabaseGroup/apted """ p = Popen(['java', '-jar', APTED_JAR_PATH, '-t', tree_a, tree_b], stdout=PIPE, stderr=STDOUT) res = [line for line in p.stdout] res = res[0] res = res.strip() res = float(res) return res def edit_distance(string_a, string_b, name='Levenshtein'): """ >>> edit_distance('abcde', 'avbcude') 2 >>> edit_distance(['至', '刂'], ['亻', '至', '刂']) 1 >>> edit_distance('fang', 'qwe') 4 >>> edit_distance('fang', 'hen') 3 """ size_x = len(string_a) + 1 size_y = len(string_b) + 1 matrix = np.zeros((size_x, size_y), dtype=int) for x in range(size_x): matrix[x, 0] = x for y in range(size_y): matrix[0, y] = y for x in range(1, size_x): for y in range(1, size_y): if string_a[x - 1] == string_b[y - 1]: matrix[x, y] = min( matrix[x - 1, y] + 1, matrix[x - 1, y - 1], matrix[x, y - 1] + 1 ) else: if name == 'Levenshtein': matrix[x, y] = min( matrix[x - 1, y] + 1, matrix[x - 1, y - 1] + 1, matrix[x, y - 1] + 1 ) else: # Canonical matrix[x, y] = min( matrix[x - 1, y] + 1, matrix[x - 1, y - 1] + 2, matrix[x, y - 1] + 1 ) return matrix[size_x - 1, size_y - 1] class CharFuncs(object): def __init__(self, char_meta_fname): self.data = self.load_char_meta(char_meta_fname) self.char_dict = dict([(c, 0) for c in self.data]) self.safe = {'\u2ff0': 'A', # to eliminate the bug that, in Windows CMD, char ⿻ and ⿵ are encoded to be the same. '\u2ff1': 'B', '\u2ff2': 'C', '\u2ff3': 'D', '\u2ff4': 'E', '\u2ff5': 'F', '\u2ff6': 'G', '\u2ff7': 'H', '\u2ff8': 'I', '\u2ff9': 'J', '\u2ffa': 'L', '\u2ffb': 'M', } @staticmethod def load_char_meta(fname): data = {} f = open(fname, 'r', encoding='utf-8') for line in f: items = line.strip().split('\t') code_point = items[0] char = items[1] pronunciation = items[2] decompositions = items[3:] assert char not in data data[char] = {"code_point": code_point, "pronunciation": pronunciation, "decompositions": decompositions} return data def shape_distance(self, char1, char2, safe=True, as_tree=False): """ >>> c = CharFuncs('data/char_meta.txt') >>> c.shape_distance('田', '由') 1 >>> c.shape_distance('牛', '午') 1 """ assert char1 in self.data assert char2 in self.data def safe_encode(decomp): tree = '' for c in string_to_tree(decomp): if c not in self.safe: tree += c else: tree += self.safe[c] return tree def safe_encode_string(decomp): tree = '' for c in decomp: if c not in self.safe: tree += c else: tree += self.safe[c] return tree decomps_1 = self.data[char1]["decompositions"] decomps_2 = self.data[char2]["decompositions"] distance = 1e5 if as_tree: for decomp1 in decomps_1: for decomp2 in decomps_2: if not safe: ted = tree_edit_distance(string_to_tree(decomp1), string_to_tree(decomp2)) else: ted = tree_edit_distance(safe_encode(decomp1), safe_encode(decomp2)) distance = min(distance, ted) else: for decomp1 in decomps_1: for decomp2 in decomps_2: if not safe: ed = edit_distance(decomp1, decomp2) else: ed = edit_distance(safe_encode_string(decomp1), safe_encode_string(decomp2)) distance = min(distance, ed) return distance def pronunciation_distance(self, char1, char2): """ >>> c = CharFuncs('data/char_meta.txt') >>> c.pronunciation_distance('田', '由') 3.4 >>> c.pronunciation_distance('牛', '午') 2.6 """ assert char1 in self.data assert char2 in self.data pronunciations1 = self.data[char1]["pronunciation"] pronunciations2 = self.data[char2]["pronunciation"] if pronunciations1[0] == 'null' or pronunciations2 == 'null': return 0.0 else: pronunciations1 = pronunciations1.split(';') # separate by lan pronunciations2 = pronunciations2.split(';') # separate by lan distance = 0.0 count = 0 for pron_lan1, pron_lan2 in zip(pronunciations1, pronunciations2): if (pron_lan1 == 'null') or (pron_lan2 == 'null'): pass else: distance_lan = 1e5 for p1 in pron_lan1.split(','): for p2 in pron_lan2.split(','): distance_lan = min(distance_lan, edit_distance(p1, p2)) distance += distance_lan count += 1 return distance / count @staticmethod def load_dict(fname): data = {} f = open(fname, 'r', encoding='utf-8') for line in f: char, freq = line.strip().split('\t') assert char not in data data[char] = freq return data def similarity(self, char1, char2, weights=(0.8, 0.2, 0.0), as_tree=False): """ this function returns weighted similarity. When used in FASPell, each weight can only be 0 or 1. """ # assert char1 in self.char_dict # assert char2 in self.char_dict shape_w, sound_w, freq_w = weights if char1 in self.char_dict and char2 in self.char_dict: shape_sim = self.shape_similarity(char1, char2, as_tree=as_tree) sound_sim = self.pronunciation_similarity(char1, char2) freq_sim = 1.0 - self.char_dict[char2] / len(self.char_dict) return shape_sim * shape_w + sound_sim * sound_w + freq_sim * freq_w else: return 0.0 def shape_similarity(self, char1, char2, safe=True, as_tree=False): """ >>> c = CharFuncs('data/char_meta.txt') >>> c.shape_similarity('牛', '午') 0.8571428571428572 >>> c.shape_similarity('田', '由') 0.8888888888888888 """ assert char1 in self.data assert char2 in self.data def safe_encode(decomp): tree = '' for c in string_to_tree(decomp): if c not in self.safe: tree += c else: tree += self.safe[c] return tree def safe_encode_string(decomp): tree = '' for c in decomp: if c not in self.safe: tree += c else: tree += self.safe[c] return tree decomps_1 = self.data[char1]["decompositions"] decomps_2 = self.data[char2]["decompositions"] similarity = 0.0 if as_tree: for decomp1 in decomps_1: for decomp2 in decomps_2: if not safe: ted = tree_edit_distance(string_to_tree(decomp1), string_to_tree(decomp2)) else: ted = tree_edit_distance(safe_encode(decomp1), safe_encode(decomp2)) normalized_ted = 2 * ted / (len(decomp1) + len(decomp2) + ted) similarity = max(similarity, 1 - normalized_ted) else: for decomp1 in decomps_1: for decomp2 in decomps_2: if not safe: ed = edit_distance(decomp1, decomp2) else: ed = edit_distance(safe_encode_string(decomp1), safe_encode_string(decomp2)) normalized_ed = ed / max(len(decomp1), len(decomp2)) similarity = max(similarity, 1 - normalized_ed) return similarity def pronunciation_similarity(self, char1, char2): """ >>> c = CharFuncs('data/char_meta.txt') >>> c.pronunciation_similarity('牛', '午') 0.27999999999999997 >>> c.pronunciation_similarity('由', '田') 0.09 """ assert char1 in self.data assert char2 in self.data pronunciations1 = self.data[char1]["pronunciation"] pronunciations2 = self.data[char2]["pronunciation"] if pronunciations1[0] == 'null' or pronunciations2 == 'null': return 0.0 else: pronunciations1 = pronunciations1.split(';') # separate by lan pronunciations2 = pronunciations2.split(';') # separate by lan similarity = 0.0 count = 0 for pron_lan1, pron_lan2 in zip(pronunciations1, pronunciations2): if (pron_lan1 == 'null') or (pron_lan2 == 'null'): pass else: similarity_lan = 0.0 for p1 in pron_lan1.split(','): for p2 in pron_lan2.split(','): tmp_sim = 1 - edit_distance(p1, p2) / max(len(p1), len(p2)) similarity_lan = max(similarity_lan, tmp_sim) similarity += similarity_lan count += 1 return similarity / count if count else 0 def string_to_tree(string): """ This function converts ids string to a string that can be used as a tree input to APTED. Any Error raised by this function implies that the input string is invalid. >>> string_to_tree('⿱⿱⿰丿㇏⿰丿㇏⿱⿰丿㇏⿰丿㇏') # 炎 '{⿱{⿱{⿰{丿}{㇏}}{⿰{丿}{㇏}}}{⿱{⿰{丿}{㇏}}{⿰{丿}{㇏}}}}' >>> string_to_tree('⿱⿰丿㇏⿱一⿱⿻一丨一') # 全 '{⿱{⿰{丿}{㇏}}{⿱{一}{⿱{⿻{一}{丨}}{一}}}}' >>> string_to_tree('⿱⿰丿㇏⿻⿱一⿱⿻一丨一丷') # 金 '{⿱{⿰{丿}{㇏}}{⿻{⿱{一}{⿱{⿻{一}{丨}}{一}}}{丷}}}' >>> string_to_tree('⿻⿻⿻一丨一⿴⿱⿰丨𠃌一一') # 車 '{⿻{⿻{⿻{一}{丨}}{一}}{⿴{⿱{⿰{丨}{𠃌}}{一}}{一}}}' >>> string_to_tree('⿻⿻⿻一丨⿰丿㇏⿴⿱⿰丨𠃌一一') # 東 '{⿻{⿻{⿻{一}{丨}}{⿰{丿}{㇏}}}{⿴{⿱{⿰{丨}{𠃌}}{一}}{一}}}' >>> string_to_tree('丿') # 丿 '{丿}' >>> string_to_tree('⿻') # ⿻ '{⿻}' """ if string[0] in IDCS and len(string) != 1: bracket_stack = [] tree = [] def add_brackets(num): if num == 2: bracket_stack.extend(['}', '{', '}']) else: bracket_stack.extend(['}', '{', '}', '{', '}']) tree.append('{') global_just_put = '{' for c in string: tree.append(c) if c in IDCS: assert global_just_put != '}' add_brackets(IDCS[c]) global_just_put = '{' else: just_put = '' while just_put != '{' and bracket_stack: just_put = bracket_stack.pop(-1) tree.append(just_put) global_just_put = just_put res = ''.join(tree) assert res[-1] == '}' else: assert len(string) == 1 or string == 'null' res = string[0] return '{' + res + '}' def pinyin_map(standard_pinyin): """ >>> pinyin_map('xuě') 'xue3' >>> pinyin_map('xue') 'xue' >>> pinyin_map('lǜ') 'lü4' >>> pinyin_map('fá') 'fa2' """ tone = '' pinyin = '' assert ' ' not in standard_pinyin for c in standard_pinyin: if c in PINYIN: pinyin += PINYIN[c][0] assert tone == '' tone = str(PINYIN[c][1]) else: pinyin += c pinyin += tone return pinyin def parse_args(): usage = '\n1. You can compute character similarity by:\n' \ 'python char_sim.py 午 牛 年 千\n' \ '\n' \ '2. You can use ted in computing character similarity by:\n' \ 'python char_sim.py 午 牛 年 千 -t\n' \ '\n' parser = argparse.ArgumentParser( description='A script to compute Chinese character (Kanji) similarity', usage=usage) parser.add_argument('multiargs', nargs='*', type=str, default=None, help='Chinese characters in question') parser.add_argument('--ted', '-t', action="store_true", default=False, help='True=to use tree edit distence (TED)' 'False=to use string edit distance') args = parser.parse_args() return args if __name__ == '__main__': args = parse_args() c = CharFuncs('data/char_meta.txt') if not args.ted: for i, c1 in enumerate(args.multiargs): for c2 in args.multiargs[i:]: if c1 != c2: print(f'For character pair ({c1}, {c2}):') print(f' v-sim = {c.shape_similarity(c1, c2)}') print(f' p-sim = {c.pronunciation_similarity(c1, c2)}\n') else: for i, c1 in enumerate(args.multiargs): for c2 in args.multiargs[i:]: if c1 != c2: print(f'For character pair ({c1}, {c2}):') print(f' v-sim = {c.shape_similarity(c1, c2, as_tree=True)}') print(f' p-sim = {c.pronunciation_similarity(c1, c2)}\n')