OpenCompass/opencompass/datasets/lawbench/utils/char_smi.py

456 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

### Copied from https://github.com/iqiyi/FASPell ###
"""
Requirements:
- java (required only if tree edit distance is used)
- numpy
"""
import numpy as np
from subprocess import Popen, PIPE, STDOUT
import os
import argparse
# The 12 ideographic description characters (U+2FF0 .. U+2FFB), each mapped to
# its capacity of son nodes: U+2FF2 and U+2FF3 take three, all others take two.
IDCS = {
    chr(code_point): 3 if code_point in (0x2ff2, 0x2ff3) else 2
    for code_point in range(0x2ff0, 0x2ffc)
}
# Tone-marked pinyin letters mapped to [plain letter, tone number].
PINYIN = {
    'ā': ['a', 1], 'á': ['a', 2], 'ǎ': ['a', 3], 'à': ['a', 4],
    'ē': ['e', 1], 'é': ['e', 2], 'ě': ['e', 3], 'è': ['e', 4],
    'ī': ['i', 1], 'í': ['i', 2], 'ǐ': ['i', 3], 'ì': ['i', 4],
    'ō': ['o', 1], 'ó': ['o', 2], 'ǒ': ['o', 3], 'ò': ['o', 4],
    'ū': ['u', 1], 'ú': ['u', 2], 'ǔ': ['u', 3], 'ù': ['u', 4],
    'ǖ': ['ü', 1], 'ǘ': ['ü', 2], 'ǚ': ['ü', 3], 'ǜ': ['ü', 4],
    # "m" with an acute accent. The key had degraded to an empty string, which
    # can never equal a single character, so it is spelled with an escape here.
    '\u1e3f': ['m', 2],
    # NOTE(review): legacy GB18030-2000 private-use code point for the same
    # letter; presumably what older data files contain -- confirm against the data.
    '\ue7c7': ['m', 2],
    'ń': ['n', 2], 'ň': ['n', 3], 'ǹ': ['n', 4],
}
# Path of the APTED jar that tree_edit_distance passes to `java -jar`.
# The commented-out alternative would locate the jar next to this module; the
# active value is a bare relative file name.
# NOTE(review): presumably the jar therefore has to sit in the working directory
# of the process whenever tree edit distance is used -- confirm.
# APTED_JAR_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'apted.jar')
APTED_JAR_PATH = 'apted.jar'
def tree_edit_distance(tree_a, tree_b):
    """
    Return the tree edit distance between two bracket-notation trees.

    We use APTED algorithm proposed by M. Pawlik and N. Augsten
    github link: https://github.com/DatabaseGroup/apted

    The distance is computed by running ``java -jar APTED_JAR_PATH -t tree_a tree_b``
    and parsing the first line the process prints (stderr is merged into stdout).

    :param tree_a: first tree, in the string form produced by ``string_to_tree``
    :param tree_b: second tree, in the same form
    :return: the distance as a float
    :raises RuntimeError: if the process prints nothing at all
    :raises ValueError: if the first printed line is not a number
        (for example an error message from java)
    """
    command = ['java', '-jar', APTED_JAR_PATH, '-t', tree_a, tree_b]
    # The context manager closes the pipe and waits for the child on exit, so
    # neither a file descriptor nor an unreaped process is left behind.
    with Popen(command, stdout=PIPE, stderr=STDOUT) as process:
        output, _ = process.communicate()
    lines = output.splitlines()
    if not lines:
        raise RuntimeError('APTED produced no output; is java installed and is the jar reachable?')
    return float(lines[0].strip())
def edit_distance(string_a, string_b, name='Levenshtein'):
    """
    Edit distance between two sequences (strings or lists), by dynamic programming.

    Insertions and deletions always cost 1. A substitution costs 1 when
    ``name`` is 'Levenshtein' and 2 for any other value ("Canonical").

    >>> edit_distance('abcde', 'avbcude')
    2
    >>> edit_distance(['', ''], ['', '', ''])
    1
    >>> edit_distance('fang', 'qwe')
    4
    >>> edit_distance('fang', 'hen')
    3
    """
    rows = len(string_a) + 1
    cols = len(string_b) + 1
    substitution_cost = 1 if name == 'Levenshtein' else 2

    # table[i, j] holds the distance between the first i items of string_a
    # and the first j items of string_b.
    table = np.zeros((rows, cols), dtype=int)
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)

    for i, item_a in enumerate(string_a, start=1):
        for j, item_b in enumerate(string_b, start=1):
            diagonal = table[i - 1, j - 1]
            if item_a != item_b:
                diagonal = diagonal + substitution_cost
            table[i, j] = min(table[i - 1, j] + 1, diagonal, table[i, j - 1] + 1)
    return table[rows - 1, cols - 1]
class CharFuncs(object):
    """
    Shape and pronunciation comparison of characters, backed by a metadata file.

    ``self.data`` maps each character to a dict with the keys "code_point",
    "pronunciation" and "decompositions" (see ``load_char_meta``).
    A pronunciation field is either the string 'null' or several per-language
    entries separated by ';'; each entry is 'null' or a ','-separated list of
    alternative readings.
    """

    def __init__(self, char_meta_fname):
        """
        :param char_meta_fname: path of the tab-separated character metadata file
        """
        self.data = self.load_char_meta(char_meta_fname)
        # Every known character starts with a frequency rank of 0.
        self.char_dict = dict.fromkeys(self.data, 0)
        # ASCII stand-ins for the ideographic description characters, to
        # eliminate the bug that, in Windows CMD, char ⿻ and ⿵ are encoded to
        # be the same.
        self.safe = {'\u2ff0': 'A',
                     '\u2ff1': 'B',
                     '\u2ff2': 'C',
                     '\u2ff3': 'D',
                     '\u2ff4': 'E',
                     '\u2ff5': 'F',
                     '\u2ff6': 'G',
                     '\u2ff7': 'H',
                     '\u2ff8': 'I',
                     '\u2ff9': 'J',
                     '\u2ffa': 'L',
                     '\u2ffb': 'M', }

    @staticmethod
    def load_char_meta(fname):
        """
        Read the character metadata file.

        Each non-blank line holds tab-separated fields: code point, character,
        pronunciation, then one or more decompositions.

        :param fname: path of the UTF-8 encoded file
        :return: dict mapping character -> {"code_point", "pronunciation", "decompositions"}
        :raises AssertionError: if a character appears on more than one line
        """
        data = {}
        # "with" guarantees the file is closed even if a line is malformed.
        with open(fname, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:  # tolerate blank lines (e.g. a trailing newline)
                    continue
                items = line.split('\t')
                code_point = items[0]
                char = items[1]
                pronunciation = items[2]
                decompositions = items[3:]
                assert char not in data
                data[char] = {"code_point": code_point, "pronunciation": pronunciation,
                              "decompositions": decompositions}
        return data

    def _encode_safe(self, text):
        """Replace every character of *text* found in ``self.safe`` by its ASCII stand-in."""
        return ''.join(self.safe.get(c, c) for c in text)

    def _decomposition_distance(self, decomp1, decomp2, safe, as_tree):
        """Edit distance between two decompositions, as strings or (``as_tree``) as trees."""
        if as_tree:
            tree1 = string_to_tree(decomp1)
            tree2 = string_to_tree(decomp2)
            if safe:
                tree1 = self._encode_safe(tree1)
                tree2 = self._encode_safe(tree2)
            return tree_edit_distance(tree1, tree2)
        if safe:
            decomp1 = self._encode_safe(decomp1)
            decomp2 = self._encode_safe(decomp2)
        return edit_distance(decomp1, decomp2)

    def _comparable_pronunciations(self, char1, char2):
        """
        Pair up the per-language readings of two characters.

        Returns a list with one ``(readings1, readings2)`` tuple for every
        language position in which neither character is 'null'. The list is
        empty when either whole pronunciation field is 'null'.
        """
        assert char1 in self.data
        assert char2 in self.data
        field1 = self.data[char1]["pronunciation"]
        field2 = self.data[char2]["pronunciation"]
        # Compare the whole field: indexing [0] would yield a single character,
        # which can never equal 'null'.
        if field1 == 'null' or field2 == 'null':
            return []
        pairs = []
        for pron_lan1, pron_lan2 in zip(field1.split(';'), field2.split(';')):  # separate by lan
            if pron_lan1 != 'null' and pron_lan2 != 'null':
                pairs.append((pron_lan1.split(','), pron_lan2.split(',')))
        return pairs

    def shape_distance(self, char1, char2, safe=True, as_tree=False):
        """
        Smallest edit distance over all pairs of decompositions of the two characters.

        :param char1: first character; must be a key of ``self.data``
        :param char2: second character; must be a key of ``self.data``
        :param safe: replace IDCs by ASCII stand-ins before comparing
        :param as_tree: use tree edit distance (needs java) instead of string edit distance
        :return: the minimum distance, or 1e5 if either character has no decomposition
        :raises AssertionError: if either character is unknown
        """
        assert char1 in self.data
        assert char2 in self.data
        distance = 1e5
        for decomp1 in self.data[char1]["decompositions"]:
            for decomp2 in self.data[char2]["decompositions"]:
                distance = min(distance,
                               self._decomposition_distance(decomp1, decomp2, safe, as_tree))
        return distance

    def pronunciation_distance(self, char1, char2):
        """
        Average, over the comparable languages, of the smallest edit distance
        between any reading of ``char1`` and any reading of ``char2``.

        :return: the average distance; 0.0 when no language can be compared
            (a pronunciation field is 'null', or no language has readings for both)
        :raises AssertionError: if either character is unknown
        """
        pairs = self._comparable_pronunciations(char1, char2)
        if not pairs:
            # Nothing to average; matches the value used for a 'null' field
            # and avoids dividing by zero.
            return 0.0
        distance = 0.0
        for readings1, readings2 in pairs:
            distance += min(edit_distance(p1, p2) for p1 in readings1 for p2 in readings2)
        return distance / len(pairs)

    @staticmethod
    def load_dict(fname):
        """
        Read a tab-separated "character<TAB>frequency" file.

        :param fname: path of the UTF-8 encoded file
        :return: dict mapping character -> frequency, kept as the string read from the file
        :raises AssertionError: if a character appears on more than one line
        """
        data = {}
        with open(fname, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:  # tolerate blank lines (e.g. a trailing newline)
                    continue
                char, freq = line.split('\t')
                assert char not in data
                data[char] = freq
        return data

    def similarity(self, char1, char2, weights=(0.8, 0.2, 0.0), as_tree=False):
        """
        this function returns weighted similarity. When used in FASPell, each weight can only be 0 or 1.

        :param weights: (shape weight, sound weight, frequency weight)
        :param as_tree: forwarded to ``shape_similarity``
        :return: the weighted sum, or 0.0 if either character is not in ``self.char_dict``
        """
        shape_w, sound_w, freq_w = weights
        if char1 not in self.char_dict or char2 not in self.char_dict:
            return 0.0
        shape_sim = self.shape_similarity(char1, char2, as_tree=as_tree)
        sound_sim = self.pronunciation_similarity(char1, char2)
        freq_sim = 1.0 - self.char_dict[char2] / len(self.char_dict)
        return shape_sim * shape_w + sound_sim * sound_w + freq_sim * freq_w

    def shape_similarity(self, char1, char2, safe=True, as_tree=False):
        """
        Largest normalized similarity over all pairs of decompositions of the two characters.

        For string comparison the distance is divided by the longer decomposition's
        length; for tree comparison it is normalized as ``2 * d / (len1 + len2 + d)``.

        :param char1: first character; must be a key of ``self.data``
        :param char2: second character; must be a key of ``self.data``
        :param safe: replace IDCs by ASCII stand-ins before comparing
        :param as_tree: use tree edit distance (needs java) instead of string edit distance
        :return: the best similarity found, 0.0 if there is nothing to compare
        :raises AssertionError: if either character is unknown
        """
        assert char1 in self.data
        assert char2 in self.data
        similarity = 0.0
        for decomp1 in self.data[char1]["decompositions"]:
            for decomp2 in self.data[char2]["decompositions"]:
                dist = self._decomposition_distance(decomp1, decomp2, safe, as_tree)
                if as_tree:
                    normalized = 2 * dist / (len(decomp1) + len(decomp2) + dist)
                else:
                    normalized = dist / max(len(decomp1), len(decomp2))
                similarity = max(similarity, 1 - normalized)
        return similarity

    def pronunciation_similarity(self, char1, char2):
        """
        Average, over the comparable languages, of the best normalized similarity
        between any reading of ``char1`` and any reading of ``char2``.

        :return: the average similarity; 0.0 when no language can be compared
        :raises AssertionError: if either character is unknown
        """
        pairs = self._comparable_pronunciations(char1, char2)
        if not pairs:
            return 0.0
        similarity = 0.0
        for readings1, readings2 in pairs:
            similarity += max(
                1 - edit_distance(p1, p2) / max(len(p1), len(p2))
                for p1 in readings1 for p2 in readings2
            )
        return similarity / len(pairs)
def string_to_tree(string):
    """
    Convert an IDS string into the bracket notation used as tree input to APTED.

    Every node is written as ``{label children...}``; an ideographic description
    character is followed by as many child nodes as ``IDCS`` assigns to it.
    Any Error raised by this function implies that the input string is invalid.

    >>> string_to_tree('⿰丿㇏')
    '{⿰{丿}{㇏}}'
    >>> string_to_tree('⿱⿰丿㇏一')
    '{⿱{⿰{丿}{㇏}}{一}}'
    >>> string_to_tree('丿')
    '{丿}'
    >>> string_to_tree('⿻')
    '{⿻}'
    """
    # A lone character (even an IDC) or the marker 'null' is a single leaf.
    if string[0] not in IDCS or len(string) == 1:
        assert len(string) == 1 or string == 'null'
        return '{' + string[0] + '}'

    pending = []        # brackets still owed to the output, consumed from the end
    pieces = []         # output fragments, joined once at the end
    last_bracket = '{'  # most recent bracket written (or '' if none could be)
    for symbol in string:
        pieces.append(symbol)
        if symbol in IDCS:
            # An operator may not directly follow a fully closed subtree.
            assert last_bracket != '}'
            if IDCS[symbol] == 2:
                pending.extend(('}', '{', '}'))
            else:
                pending.extend(('}', '{', '}', '{', '}'))
            pieces.append('{')  # open the operator's first child
            last_bracket = '{'
            continue
        # A leaf: close finished nodes until the next sibling is opened
        # or nothing is owed any more.
        bracket = ''
        while bracket != '{' and pending:
            bracket = pending.pop()
            pieces.append(bracket)
        last_bracket = bracket

    body = ''.join(pieces)
    assert body[-1] == '}'
    return '{' + body + '}'
def pinyin_map(standard_pinyin):
    """
    Rewrite tone-marked pinyin as plain letters followed by the tone number.

    Letters found in ``PINYIN`` are replaced by their base letter and the tone
    number is appended at the end; pinyin without a tone mark is returned as is.
    The input must not contain spaces and may carry at most one tone mark.

    >>> pinyin_map('xuě')
    'xue3'
    >>> pinyin_map('xue')
    'xue'
    >>> pinyin_map('lǜ')
    'lü4'
    >>> pinyin_map('fá')
    'fa2'
    """
    assert ' ' not in standard_pinyin
    letters = []
    tone_digit = ''
    for symbol in standard_pinyin:
        if symbol not in PINYIN:
            letters.append(symbol)
            continue
        base_letter, tone_number = PINYIN[symbol]
        assert tone_digit == ''  # a second tone mark is invalid
        tone_digit = str(tone_number)
        letters.append(base_letter)
    return ''.join(letters) + tone_digit
def parse_args(argv=None):
    """
    Parse the command line of the character-similarity script.

    :param argv: list of argument strings to parse; ``None`` (the default)
        makes argparse read ``sys.argv[1:]``
    :return: namespace with ``multiargs`` (the characters given on the command
        line) and ``ted`` (True when -t/--ted was passed)
    """
    usage = '\n1. You can compute character similarity by:\n' \
            'python char_sim.py 午 牛 年 千\n' \
            '\n' \
            '2. You can use ted in computing character similarity by:\n' \
            'python char_sim.py 午 牛 年 千 -t\n' \
            '\n'
    parser = argparse.ArgumentParser(
        description='A script to compute Chinese character (Kanji) similarity', usage=usage)
    parser.add_argument('multiargs', nargs='*', type=str, default=None,
                        help='Chinese characters in question')
    # The two halves of the help text are concatenated, so the separator has
    # to be part of the first literal.
    parser.add_argument('--ted', '-t', action="store_true", default=False,
                        help='True=to use tree edit distance (TED); '
                             'False=to use string edit distance')
    return parser.parse_args(argv)
if __name__ == '__main__':
    # Print shape (v-sim) and pronunciation (p-sim) similarity for every pair
    # of distinct characters given on the command line.
    args = parse_args()
    c = CharFuncs('data/char_meta.txt')
    for i, c1 in enumerate(args.multiargs):
        # The slice starts at c1 itself, so equal characters are skipped below.
        for c2 in args.multiargs[i:]:
            if c1 == c2:
                continue
            # -t/--ted switches the shape comparison to tree edit distance.
            v_sim = c.shape_similarity(c1, c2, as_tree=args.ted)
            print(f'For character pair ({c1}, {c2}):')
            print(f' v-sim = {v_sim}')
            print(f' p-sim = {c.pronunciation_similarity(c1, c2)}\n')