# OpenCompass/opencompass/datasets/korbench/korbench_utils.py
#
# (Source-viewer header: 700 lines, 23 KiB, Python.)

import json
import os
import re
import sympy as sp
import yaml
from sympy.parsing.latex import parse_latex
def load_yaml(yaml_path):
    """Parse the YAML document stored at ``yaml_path`` and return it.

    Raises:
        FileNotFoundError: if ``yaml_path`` does not exist.
    """
    path_is_missing = not os.path.exists(yaml_path)
    if path_is_missing:
        raise FileNotFoundError(f'YAML file not found: {yaml_path}')
    with open(yaml_path, mode='r', encoding='utf-8') as stream:
        parsed = yaml.safe_load(stream)
    return parsed
def load_json_or_jsonl(file_path):
    """Load data from a JSON or JSONL file.

    Args:
        file_path: Path to a ``.json`` or ``.jsonl`` file.

    Returns:
        The parsed JSON document for ``.json`` files, a list with one parsed
        object per non-blank line for ``.jsonl`` files, or ``None`` when the
        path does not exist or has another extension.
    """
    if not os.path.exists(file_path):
        return None
    with open(file_path, 'r', encoding='utf-8') as file:
        if file_path.endswith('.json'):
            return json.load(file)
        if file_path.endswith('.jsonl'):
            # Blank lines (e.g. a trailing empty line) are skipped instead of
            # being handed to json.loads, which would raise on them.
            return [json.loads(line) for line in file if line.strip()]
    return None
def find_file(base_path, sub_path, extensions=('json', 'jsonl')):
    """Return the first existing ``<base_path>/<sub_path>.<ext>`` path.

    Extensions are tried in the order given; ``None`` is returned when none
    of the candidate paths exists.
    """
    candidates = (os.path.join(base_path, f'{sub_path}.{suffix}')
                  for suffix in extensions)
    return next((path for path in candidates if os.path.exists(path)), None)
def load_json_or_jsonl_with_idx(data_path, split='', idx=None):
    """Load a JSON/JSONL file and optionally pick the record with ``idx``.

    The file is looked up as ``<data_path>/<split>.json``, then
    ``<data_path>/<split>.jsonl``; if neither exists but the joined path
    itself ends in ``.json``/``.jsonl`` it is used as given.

    Args:
        data_path: Directory (or path prefix) containing the data.
        split: Name joined onto ``data_path`` to form the file stem.
        idx: When not ``None``, return only the first record whose ``'idx'``
            field equals this value.

    Returns:
        The whole parsed content when ``idx`` is ``None``, otherwise the
        matching record.

    Raises:
        FileNotFoundError: if no candidate file can be determined.
        ValueError: if ``idx`` is given but no record carries it.
    """
    base_path = os.path.join(data_path, split)
    if os.path.exists(f'{base_path}.json'):
        file_path = f'{base_path}.json'
    elif os.path.exists(f'{base_path}.jsonl'):
        file_path = f'{base_path}.jsonl'
    elif base_path.endswith(('.json', '.jsonl')):
        file_path = base_path
    else:
        raise FileNotFoundError('No JSON or JSONL file found.')

    with open(file_path, 'r', encoding='utf-8') as file:
        if file_path.endswith('.json'):
            data = json.load(file)
        else:
            # file_path can only end in '.jsonl' here. Blank lines are
            # skipped because json.loads would raise on them.
            data = [json.loads(line) for line in file if line.strip()]

    if idx is None:
        return data
    for item in data:
        if item.get('idx') == idx:
            return item
    raise ValueError(f'No entry found for idx {idx}')
def load_split_data(base_path, split_name):
    """Collect the ``rule`` and ``sample`` files of one split.

    Returns a dict with keys ``'rules'`` and ``'samples'``; a key maps to an
    empty list when the corresponding file cannot be found.
    """
    split_dir = os.path.join(base_path, split_name)
    loaded = {}
    for result_key, file_stem in (('rules', 'rule'), ('samples', 'sample')):
        located = find_file(split_dir, file_stem)
        loaded[result_key] = load_json_or_jsonl(located) if located else []
    return loaded
def process_mixed_data(base_path, mode):
    """Build prompts for every record of the ``mixed`` split in ``mode``.

    Each record gets a ``'prompt'`` entry produced by formatting the first
    ``prompt_format`` template of ``config/prompt/mixed.yaml`` with the
    newline-joined ``rule_list`` and ``question_list`` of that record.
    Returns ``[]`` (after printing a warning) when the mode file is missing.
    """
    located = find_file(os.path.join(base_path, 'mixed'), mode)
    if not located:
        print(f'[WARNING] Missing file for mixed mode: {mode}')
        return []

    records = load_json_or_jsonl(located)
    template = load_yaml(os.path.join(base_path, 'config/prompt/mixed.yaml'))

    def _attach_prompt(record):
        # Records are updated in place and passed through unchanged otherwise.
        rule_block = '\n'.join(record.get('rule_list', []))
        question_block = '\n'.join(record.get('question_list', []))
        record['prompt'] = template['prompt_format'][0].format(
            rule_block, question_block)
        return record

    return [_attach_prompt(record) for record in records]
class ConfigWrapper:
    """Attribute-style access to the key/value pairs of a YAML config file.

    Every top-level key of the file is available both as an instance
    attribute and as an entry of the private ``_config`` mapping. Public
    attributes assigned later are mirrored into ``_config`` as well.
    """

    def __init__(self, config_path):
        """Load ``config_path`` and expose its top-level keys as attributes."""
        # Assigned first: __setattr__ writes into it for public names.
        self._config = {}
        with open(config_path, 'r', encoding='utf-8') as file:
            # An empty YAML document parses to None; treat it as "no keys".
            self._config = yaml.safe_load(file) or {}
        for key, value in list(self._config.items()):
            setattr(self, key, value)

    def __setattr__(self, key, value):
        """Set an attribute; public names are also stored in ``_config``."""
        if not key.startswith('_'):
            self._config[key] = value
        super().__setattr__(key, value)

    def __getattr__(self, key):
        """Fall back to ``_config`` when normal attribute lookup fails."""
        # Private names are never served from the mapping. Failing fast here
        # also prevents infinite recursion when ``_config`` itself has not
        # been set yet (instance created without running __init__).
        if key.startswith('_'):
            raise AttributeError(
                f"'ConfigWrapper' object has no attribute '{key}'")
        if key in self._config:
            return self._config[key]
        raise AttributeError(
            f"'ConfigWrapper' object has no attribute '{key}'")

    def get_id(self, data):
        """Return the identifier of ``data`` according to ``id_key``.

        A string ``id_key`` is looked up directly (``None`` if absent). A
        list ``id_key`` yields the ``'_'``-joined string values of those keys
        that are present in ``data``. Any other ``id_key`` gives ``None``.
        """
        id_key = self._config.get('id_key')
        if isinstance(id_key, str):
            return data.get(id_key, None)
        if isinstance(id_key, list):
            return '_'.join(str(data[key]) for key in id_key if key in data)
        return None

    def print_all_keys(self):
        """Print every configuration key with its value."""
        print('config keys:')
        for key, value in self._config.items():
            print(f'  - {key}: {value}')
# Module-wide ConfigWrapper instance; stays None until initialize_config()
# has been called.
config_wrapper = None


def initialize_config(config_path):
    """Create the module-wide ConfigWrapper from ``config_path``."""
    global config_wrapper
    config_wrapper = ConfigWrapper(config_path)


def get_config_wrapper():
    """Return the module-wide ConfigWrapper.

    Raises:
        RuntimeError: if initialize_config() has not been called yet.
    """
    if config_wrapper is not None:
        return config_wrapper
    raise RuntimeError(
        'ConfigWrapper not initialized. Call initialize_config first.')
if __name__ == '__main__':
    # Manual smoke test: load the default config and print the id derived
    # from one hand-written sample record.
    config_path = 'config/config.yaml'
    initialize_config(config_path)
    data = dict(
        idx='50',
        step=21,
        question=(
            'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"\n\n'
            'Please provide the decrypted answer, encapsulated in double '
            'square brackets. '
            'For example, the format should be: [[decrypted answer]].'),
        answer='[[P]]',
        category='Decryption',
        rule_id='23',
        input='Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"',
        steps_num=23,
        description=(
            'For a number c=228 in the ciphertext:\n'
            'Calculate z = c^e mod n. Here ^ means multiplication.\n'
            'z is 80.\nBased on the decimal number represented by z, '
            'use the ascii code to find the corresponding letter '
            'as the plaintext letter p.\n'
            'Please give the letter p in [[...]] format.\n'),
        atom=80,
    )
    print(config_wrapper.get_id(data))
def read_yaml(config='default'):
    """Load a prompt configuration.

    Args:
        config: Either the name of a file under ``config/prompt/`` (without
            the ``.yaml`` suffix) or, when no such file exists, a path that
            is opened as given.

    Returns:
        The parsed YAML content.

    Raises:
        FileNotFoundError: if neither interpretation names an existing file
            (raised by ``open``).
    """
    bundled_path = f'config/prompt/{config}.yaml'
    yaml_path = bundled_path if os.path.exists(bundled_path) else config
    # Explicit UTF-8 keeps parsing independent of the platform's default
    # encoding; the handle gets its own name instead of rebinding the path.
    with open(yaml_path, 'r', encoding='utf-8') as stream:
        return yaml.safe_load(stream)
def write_jsonl_lines(file, data):
    """Append ``data`` to ``file`` as one JSON line and flush.

    When the active configuration has ``save_prompt`` disabled, the entry
    stored under the configured ``prompt_key`` is removed from ``data``
    (in place) before it is written.

    Args:
        file: Writable text file object.
        data: Dict to serialise.
    """
    config_wrapper = get_config_wrapper()
    if not config_wrapper.save_prompt:
        # A record without a prompt entry is written as-is rather than
        # failing with KeyError.
        data.pop(config_wrapper.prompt_key, None)
    json.dump(data, file, ensure_ascii=False)
    file.write('\n')
    file.flush()
def print_info(info):
    """Print the run settings in ``info`` between two 100-dash rulers.

    ``info['infer_limit']`` is shown as ``No limit`` when it is ``None``.
    """
    ruler = '-' * 100
    # (label printed, key read from ``info``), in output order.
    fields = (
        ('model_name', 'model_name'),
        ('splits', 'splits'),
        ('modes', 'modes'),
        ('output_dir', 'output_dir'),
        ('Infer Limit', 'infer_limit'),
        ('Number of Workers', 'num_workers'),
        ('Batch Size', 'batch_size'),
        ('Use Accel', 'use_accel'),
    )
    print(ruler)
    for label, key in fields:
        shown = info[key]
        if key == 'infer_limit' and shown is None:
            shown = 'No limit'
        print(f'[INFO] {label}:', shown)
    print(ruler)
def read_json_or_jsonl(data_path, split='', mapping_key=None):
    """Read a JSON/JSONL file, optionally indexed by one of its fields.

    The file is looked up as ``<data_path>/<split>.json``, then
    ``<data_path>/<split>.jsonl``; if neither exists but the joined path
    itself ends in ``.json``/``.jsonl`` it is used as given.

    Args:
        data_path: Directory (or path prefix) containing the data.
        split: Name joined onto ``data_path`` to form the file stem.
        mapping_key: When truthy, return a dict mapping each record's
            ``mapping_key`` value to the record; records lacking that key
            are left out.

    Returns:
        The parsed content, or the mapping described above.

    Raises:
        FileNotFoundError: if no candidate file can be determined.
    """
    base_path = os.path.join(data_path, split)
    if os.path.exists(f'{base_path}.json'):
        file_path = f'{base_path}.json'
    elif os.path.exists(f'{base_path}.jsonl'):
        file_path = f'{base_path}.jsonl'
    elif base_path.endswith(('.json', '.jsonl')):
        file_path = base_path
    else:
        raise FileNotFoundError('No JSON or JSONL file found.')

    # UTF-8 is stated explicitly so reading does not depend on the platform
    # default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        if file_path.endswith('.json'):
            data = json.load(file)
        else:
            # file_path can only end in '.jsonl' here; blank lines are
            # skipped because json.loads would raise on them.
            data = [json.loads(line) for line in file if line.strip()]

    if mapping_key:
        return {
            item[mapping_key]: item
            for item in data if mapping_key in item
        }
    return data
def read_json_or_jsonl_with_idx(data_path, split='', idx=None):
    """Read a JSON/JSONL file and optionally pick the record with ``idx``.

    The file is looked up as ``<data_path>/<split>.json``, then
    ``<data_path>/<split>.jsonl``; if neither exists but the joined path
    itself ends in ``.json``/``.jsonl`` it is used as given.

    Args:
        data_path: Directory (or path prefix) containing the data.
        split: Name joined onto ``data_path`` to form the file stem.
        idx: When not ``None``, return only the first record whose ``'idx'``
            field equals this value.

    Returns:
        The whole parsed content when ``idx`` is ``None``, otherwise the
        matching record.

    Raises:
        FileNotFoundError: if no candidate file can be determined.
        ValueError: if ``idx`` is given but no record carries it.
    """
    base_path = os.path.join(data_path, split)
    if os.path.exists(f'{base_path}.json'):
        file_path = f'{base_path}.json'
    elif os.path.exists(f'{base_path}.jsonl'):
        file_path = f'{base_path}.jsonl'
    elif base_path.endswith(('.json', '.jsonl')):
        file_path = base_path
    else:
        raise FileNotFoundError('No JSON or JSONL file found.')

    with open(file_path, 'r', encoding='utf-8') as file:
        if file_path.endswith('.json'):
            data = json.load(file)
        else:
            # Only '.jsonl' can reach this branch; blank lines are skipped
            # because json.loads would raise on them.
            data = [json.loads(line) for line in file if line.strip()]

    if idx is None:
        return data
    try:
        return next(item for item in data if item.get('idx') == idx)
    except StopIteration:
        # The StopIteration is an implementation detail; keep it out of the
        # traceback shown to callers.
        raise ValueError(f'No entry found for idx {idx}') from None
# Groups of question indices that is_in_idx_ranges() is checked against;
# evaluate_response_vs_answer() routes matching 'operation' questions to
# compare_math_expressions().
idx_ranges = [
    [18],
    [73, 74, 77],
    [94],
    [115, 116, 117],
    [121, 122, 123, 125],
    [131, 132, 134, 135, 136],
    [141, 143, 149],
    [145, 146, 147],
    [151, 152, 153, 154, 155, 156],
    [160, 161, 162],
    [164, 165, 166],
    [170],
    [206, 209],
    [211, 212, 213, 214, 215],
    [217, 218],
]
def clean_json_string(json_str):
    """Return ``json_str`` without ASCII control characters.

    Removes every character in the ranges 0x00-0x1F and 0x7F, which includes
    newlines and tabs.
    """
    control_chars = re.compile(r'[\x00-\x1F\x7F]')
    return control_chars.sub('', json_str)
def is_in_idx_ranges(idx, idx_ranges):
    """Tell whether ``int(idx)`` occurs in any group of ``idx_ranges``."""
    return any(int(idx) in group for group in idx_ranges)
def extract_json(text):
    """Parse the brace-delimited span of ``text`` as JSON.

    The span runs from the first ``{`` to the last ``}`` (greedy match across
    lines) and has control characters stripped before parsing. Returns the
    parsed value, or the string ``'NULL'`` when there is no such span or it
    is not valid JSON (the decode error is printed).
    """
    candidates = re.findall(r'{.*}', text, re.DOTALL)
    if not candidates:
        return 'NULL'
    payload = clean_json_string(candidates[-1])
    try:
        return json.loads(payload)
    except json.JSONDecodeError as e:
        print(f'Error decoding JSON: {e}')
    return 'NULL'
def extract_all_responses_from_json(response_json):
    """Return the values of ``response_json`` as strings, in mapping order."""
    return [str(entry) for entry in response_json.values()]
def clean_latex(latex_expr):
    """Reduce a LaTeX answer to the bare expression after the last ``=``.

    After keeping only the text following the final ``=`` (if any), this
    removes the delimiters ``\\(``, ``\\)``, ``\\[``, ``\\]``, every
    ``\\text{...}`` group, and the commands ``\\left``, ``\\right`` and
    ``\\displaystyle``; finally doubled backslashes become single ones.
    """
    # rpartition leaves the whole string in slot 2 when there is no '='.
    expression = latex_expr.rpartition('=')[2]
    removal_patterns = (
        r'\\[()\[\]]',
        r'\\text\{.*?\}',
        r'\\(left|right|displaystyle)',
    )
    for pattern in removal_patterns:
        expression = re.sub(pattern, '', expression)
    return expression.replace('\\\\', '\\')
def extract_text_from_brackets(text, clean_level='basic'):
    """Extract the first bracketed answer from ``text``.

    Patterns are tried in order: ``[[...]]``, ``$\\boxed{...}$``, then
    ``[...]``. The first hit of the first pattern that matches is stripped
    and post-processed according to ``clean_level``:

    * ``'clean'``: drop ``"``, newlines, spaces, ``[`` and ``]``;
    * ``'logic'``: drop ``"``, newlines, spaces and ``.``;
    * ``'math'``: drop ``"``, newlines, ``[``, ``]`` and ``$``, then return
      the result of ``clean_latex`` without re-wrapping;
    * anything else: no further cleaning.

    Except for ``'math'``, the result is wrapped as ``[[...]]``. Returns the
    string ``'NULL'`` when no pattern matches.
    """
    search_order = (
        r'\[\[\s*(.*?)\s*\]\]',
        r'\$\\boxed\{(.*?)\}\$',
        r'\[\s*(.*?)\s*\]',
    )
    found = []
    for pattern in search_order:
        found = re.findall(pattern, text, re.DOTALL)
        if found:
            break
    if not found:
        return 'NULL'

    inner = found[0].strip()
    if clean_level == 'math':
        for unwanted in ('"', '\n', '[', ']', '$'):
            inner = inner.replace(unwanted, '')
        return f'{clean_latex(inner)}'

    if clean_level == 'clean':
        unwanted_chars = ('"', '\n', ' ', '[', ']')
    elif clean_level == 'logic':
        unwanted_chars = ('"', '\n', ' ', '.')
    else:
        unwanted_chars = ()
    for unwanted in unwanted_chars:
        inner = inner.replace(unwanted, '')
    return f'[[{inner}]]'
def extract_inner_text_from_brackets(text):
    """Return the raw text inside the first ``[[...]]`` of ``text``.

    Returns ``'NULL'`` when there is no such pair, or when ``text`` is not a
    string (its type and value are printed in that case).
    """
    if isinstance(text, str):
        hit = re.search(r'\[\[(.*?)\]\]', text, re.DOTALL)
        if hit is None:
            return 'NULL'
        return hit.group(1)
    print(f'text type: {type(text)}, text value: {text}')
    return 'NULL'
def extract_numbers(str):
    """Return every run of decimal digits in ``str`` as an ``int``, in order.

    Signs and decimal points are not part of a run, so ``'-3.5'`` yields
    ``[3, 5]``.
    """
    return [int(digits) for digits in re.findall(r'\d+', str)]
def extract_and_sort_inequalities(latex_expr):
    """Collect ``≥``/``≤`` bounds from ``latex_expr`` as a sorted list.

    Each bound is the operator immediately followed by its number (any
    whitespace between them is dropped), e.g. ``'≤-2.5'``.
    """
    bounds = [
        operator + number
        for operator, number in re.findall(r'(≥|≤)\s*([-]?\d+\.?\d*)',
                                           latex_expr)
    ]
    bounds.sort()
    return bounds
def rule5_normalize_content(content):
    """Split ``content`` on ``;`` and return the pieces sorted."""
    return sorted(content.split(';'))
def normalize_string(s):
    """Strip every non-digit from ``s`` and return the sorted ``,``-split.

    NOTE(review): commas are removed together with all other non-digits
    before the split, so the result is always a one-element list holding the
    concatenated digits. Confirm whether commas were meant to survive.
    """
    digits_only = re.sub(r'[^0-9]', '', s)
    return sorted(digits_only.split(','))
def remove_commas_and_spaces(s):
    """Return ``s`` without commas, whitespace and square brackets."""
    separators = re.compile(r'[,\s\[\]]+')
    return separators.sub('', s)
def remove_non_alphanumeric(s):
    """Return ``s`` with every non-word character removed.

    "Word" follows the regex ``\\w`` class, so underscores are kept.
    """
    non_word = re.compile(r'\W+')
    return non_word.sub('', s)
def contains_or(answer):
    """Tell whether the substring ``'or'`` occurs anywhere in ``answer``.

    This is a plain containment test, so words such as ``'for'`` match too.
    """
    has_or = 'or' in answer
    return has_or
def compare_multi_results(response, answer):
    """Compare answers that list alternatives separated by ``or``.

    Both sides are extracted with the ``'clean'`` level, split on ``'or'``
    and compared as sorted lists of stripped pieces; ``\\text{or}`` in the
    response counts as ``or``. Returns ``False`` when the response has no
    bracketed answer or when any step raises (the error is printed).
    """
    try:
        extracted = extract_text_from_brackets(response, 'clean')
        extracted = re.sub(r'\\text\{or\}', 'or', extracted)
        if extracted == 'NULL':
            return False
        expected = extract_text_from_brackets(answer, 'clean')

        def _alternatives(wrapped):
            # strip() removes any leading/trailing '[' and ']' characters.
            pieces = wrapped.strip('[[]]').split('or')
            return sorted(piece.strip() for piece in pieces)

        return _alternatives(extracted) == _alternatives(expected)
    except Exception as e:
        print(f'Error during comparison: {e}')
        return False
def split_or_expression(expression):
    """Split ``expression`` on the substring ``'or'`` and strip each piece."""
    return list(map(str.strip, expression.split('or')))
def compare_math_expressions(response, answer):
    """Compare two LaTeX answers after symbolic simplification.

    Both sides are extracted with the ``'math'`` level. When the answer text
    contains ``'or'``, each side is split on it and the sets of simplified
    parts are compared; otherwise the two simplified expressions are
    compared directly. If parsing or simplification raises, the error is
    printed and the extracted texts are compared as plain strings. Returns
    ``False`` when the response has no bracketed answer.
    """
    response_text = extract_text_from_brackets(response, 'math')
    answer_text = extract_text_from_brackets(answer, 'math')
    if response_text == 'NULL':
        return False

    def _simplified(latex):
        return sp.simplify(parse_latex(latex))

    has_alternatives = contains_or(answer_text)
    try:
        if has_alternatives:
            response_side = {
                _simplified(part)
                for part in split_or_expression(response_text)
            }
            answer_side = {
                _simplified(part)
                for part in split_or_expression(answer_text)
            }
        else:
            response_side = _simplified(response_text)
            answer_side = _simplified(answer_text)
        return response_side == answer_side
    except Exception as e:
        print(f'Error during simplification or parsing: {e}')
        return response_text == answer_text
def method_equal(response_text, answer):
    """Return whether ``response_text`` equals ``answer`` exactly."""
    is_same = response_text == answer
    return is_same
def method_1(response_text, answer):
    """Compare both strings by their ASCII letters only, ignoring case."""

    def _letters(text):
        return re.sub(r'[^A-Za-z]', '', text).lower()

    return _letters(response_text) == _letters(answer)
def method_2(response_text, answer):
    """Check the response against a comma-separated list of accepted answers.

    The response is reduced to its lower-cased ASCII letters; the entries of
    ``answer`` are used exactly as written between the commas.
    """
    letters_only = re.sub(r'[^A-Za-z]', '', response_text).lower()
    accepted = answer.split(',')
    return letters_only in accepted
def method_3(response_text, answer):
    """Compare the words of the response with those of the answer, unordered.

    The response is lower-cased and split on runs of non-word characters
    (empty pieces dropped); the answer is split on single spaces as written.
    """
    response_words = sorted(
        token for token in re.split(r'\W+', response_text.lower()) if token)
    answer_words = sorted(answer.split(' '))
    return response_words == answer_words
def method_4(response_text, answer):
    """Tell whether the response's lower-cased letters occur in ``answer``.

    Non-letters are removed from the response first; the test is a plain
    ``in`` check against ``answer``.
    """
    letters_only = re.sub(r'[^A-Za-z]', '', response_text).lower()
    return letters_only in answer
def method_5(response_text, answer):
    """Compare two comma-separated lists regardless of item order.

    All whitespace is removed from the response before splitting; the answer
    is split as written.
    """
    response_items = sorted(re.sub(r'\s+', '', response_text).split(','))
    answer_items = sorted(answer.split(','))
    return response_items == answer_items
def method_9(response_text, answer):
    """Check an arithmetic equation given as a response.

    The response is accepted when the sequence of ``+ - * /`` operators on
    its left-hand side equals that of the answer's left-hand side and that
    left-hand side evaluates to the integer following ``=`` in the answer.

    Args:
        response_text: Equation proposed by the model, e.g. ``'3+4=7'``.
        answer: Reference equation; must contain ``= <integer>``.

    Returns:
        ``True`` on a match; ``False`` when the operators differ, the answer
        carries no integer result, or the left-hand side cannot be evaluated.
    """

    def to_ascii_operators(text):
        # U+00D7 (multiplication sign) and U+2212 (minus sign) are mapped to
        # their ASCII forms. The minus sign is written as an escape: an empty
        # search string would make str.replace insert '-' between every
        # character and corrupt the expression.
        return text.replace('\u00d7', '*').replace('\u2212', '-')

    def extract_operators(s):
        return re.findall(r'[+\-*/]', s)

    response_text = to_ascii_operators(response_text)
    answer = to_ascii_operators(answer)

    left_side = response_text.split('=')[0]
    if extract_operators(left_side) != extract_operators(
            answer.split('=')[0]):
        return False

    match = re.search(r'=\s*(-?\d+)', answer)
    if match is None:
        # Nothing to compare against.
        return False
    expected_result = int(match.group(1))

    try:
        # SECURITY: eval() executes model-generated text. Only run this on
        # trusted or sandboxed input.
        result = eval(left_side)
    except Exception as e:
        print(f'Error during evaluation: {e}')
        return False
    return result == expected_result
def method_10(response_text, answer):
    """Check a "make 24" style expression.

    The part of the response before ``=`` must use exactly the same word
    characters (as a multiset) as the part of the answer's first line before
    ``=``, and must evaluate to 24.

    Args:
        response_text: Expression proposed by the model.
        answer: Reference expression; only its first line up to ``=`` is
            used.

    Returns:
        ``True`` when both conditions hold, ``False`` otherwise (including
        when the expression cannot be evaluated).
    """
    # U+00D7 (multiplication sign) and U+2212 (minus sign) are mapped to
    # ASCII. The minus sign is written as an escape: an empty search string
    # would make str.replace insert '-' between every character.
    response_text = response_text.replace('\u00d7', '*').replace(
        '\u2212', '-')
    response_text = response_text.split('=')[0]
    answer = answer.split('\n')[0].split('=')[0]

    # Same normalisation as remove_non_alphanumeric(): drop non-word
    # characters, then compare the remaining characters order-insensitively.
    response_chars = sorted(re.sub(r'\W+', '', response_text))
    answer_chars = sorted(re.sub(r'\W+', '', answer))
    if response_chars != answer_chars:
        return False

    try:
        # SECURITY: eval() executes model-generated text. Only run this on
        # trusted or sandboxed input.
        result = eval(response_text)
    except Exception as e:
        print(f'Error during evaluation: {e}')
        return False
    return result == 24
def method_18(response_text, answer):
    """Compare both strings after remove_commas_and_spaces() on each."""
    return (remove_commas_and_spaces(response_text) ==
            remove_commas_and_spaces(answer))
def method_general(response_text, answer):
    """Compare both strings after remove_non_alphanumeric() on each."""
    return (remove_non_alphanumeric(response_text) ==
            remove_non_alphanumeric(answer))
# Comparison function per rule id (as a string). evaluate_response_vs_answer()
# consults this table for 'puzzle' questions and falls back to
# method_general() for rule ids that are not listed.
question_methods = {
    str(rule_number): comparator
    for rule_number, comparator in (
        (1, method_1),
        (2, method_2),
        (3, method_3),
        (4, method_4),
        (5, method_5),
        (9, method_9),
        (10, method_10),
        (18, method_18),
    )
}
def evaluate_response_vs_answer(response, answer, question_type, rule_id, idx):
    """Decide whether ``response`` matches ``answer`` for one question.

    The comparison strategy is selected, in this order, by:

    * ``logic`` + rule ``'5'``: ``;``-separated parts compared unordered;
    * other ``logic``: equality of the ``'logic'``-cleaned extractions;
    * ``operation`` with ``idx`` ``'178'``/``'179'``: sorted ≥/≤ bounds;
    * ``operation`` + rule ``'18'``: same multiset of word characters;
    * ``operation`` + rule ``'23'``/``'24'``/``'25'``: same integer sequence;
    * ``operation`` with ``idx`` in ``idx_ranges``: symbolic math comparison;
    * ``operation`` whose answer contains ``'or'``: alternatives comparison;
    * ``puzzle``: the rule's entry in ``question_methods`` or
      ``method_general``;
    * anything else: the ``'clean'`` extraction must equal ``answer``.

    Args:
        response: Raw model output.
        answer: Reference answer.
        question_type: Category name of the question.
        rule_id: Rule identifier as a string.
        idx: Question index.

    Returns:
        ``True`` when the response is judged correct, else ``False``.
    """
    if question_type == 'logic' and rule_id == '5':
        response_text = extract_text_from_brackets(response, 'logic')
        # Extraction reports "nothing found" with the string 'NULL', never
        # with None.
        if response_text == 'NULL':
            return False
        # NOTE(review): the response is cleaned with the 'logic' level but is
        # compared against the raw answer, as before. Confirm that the
        # reference answers are already in cleaned form.
        normalized_response = rule5_normalize_content(response_text)
        normalized_answer = rule5_normalize_content(answer)
        return normalized_response == normalized_answer
    elif question_type == 'logic':
        response_text = extract_text_from_brackets(response, 'logic')
        answer_text = extract_text_from_brackets(answer, 'logic')
        return response_text == answer_text
    elif question_type == 'operation' and (idx == '178' or idx == '179'):
        # NOTE(review): this is a string comparison, so an integer idx never
        # takes this branch. Confirm the type callers pass.
        response_text = extract_text_from_brackets(response, 'clean')
        response_text = extract_and_sort_inequalities(response_text)
        answer_text = extract_and_sort_inequalities(answer)
        return response_text == answer_text
    elif question_type == 'operation' and rule_id == '18':
        response_text = extract_text_from_brackets(response, 'clean')
        answer = extract_inner_text_from_brackets(answer)
        response_text = ''.join(sorted(re.sub(r'\W+', '', response_text)))
        answer = ''.join(sorted(re.sub(r'\W+', '', answer)))
        return response_text == answer
    elif question_type == 'operation' and rule_id in {'23', '24', '25'}:
        response_text = extract_text_from_brackets(response, 'clean')
        # Without this guard a missing response would yield an empty number
        # list and could match an answer that has no digits.
        if response_text == 'NULL':
            return False
        response_text = extract_numbers(response_text)
        answer_text = extract_numbers(answer)
        return response_text == answer_text
    elif question_type == 'operation' and is_in_idx_ranges(idx, idx_ranges):
        return compare_math_expressions(response, answer)
    elif question_type == 'operation' and contains_or(answer):
        return compare_multi_results(response, answer)
    elif question_type == 'puzzle':
        response_text = extract_inner_text_from_brackets(response)
        answer = extract_inner_text_from_brackets(answer)
        method = question_methods.get(rule_id)
        if method:
            return method(response_text, answer)
        return method_general(response_text, answer)
    else:
        response_text = extract_text_from_brackets(response, 'clean')
        return response_text == answer
def compute_one_mixed_question_pass_rate(idx,
                                         question_list,
                                         response_json,
                                         base_path=None):
    """Score one "mixed" record that bundles several sub-questions.

    Args:
        idx: Identifier of the mixed record; copied into the result.
        question_list: Names of the form ``'<category>_<question idx>'``.
        response_json: Parsed JSON whose values are the per-question
            responses in order, or the string ``'NULL'`` when no JSON could
            be extracted.
        base_path: Data root under which ``<category>/sample`` is loaded.

    Returns:
        A dict with ``idx``, ``response``, per-question ``details``
        (``None`` for a ``'NULL'`` response), ``pass_rate`` (fraction of
        ``question_list`` answered correctly) and ``is_correct`` (``True``
        only when every question passed).
    """
    if response_json == 'NULL':
        return {
            'idx': idx,
            'response': response_json,
            'details': None,
            'pass_rate': 0,
            'is_correct': False
        }

    response_list = extract_all_responses_from_json(response_json)
    question_list = question_list or []
    correct_num = 0
    results = []
    # zip() stops at the shorter sequence, so questions without a response
    # are not loaded at all; they still count in the denominator below.
    for question, response in zip(question_list, response_list):
        category, question_idx = question.rsplit('_', 1)
        question_content = load_json_or_jsonl_with_idx(
            base_path, os.path.join(category, 'sample'), idx=question_idx)
        answer = question_content['answer']
        rule_id = question_content['rule_id']
        response_text = extract_text_from_brackets(response)
        # The idx-specific rules in evaluate_response_vs_answer() are keyed
        # on the question's own idx, not on its position in this list.
        is_correct = evaluate_response_vs_answer(response, answer, category,
                                                 rule_id, question_idx)
        if is_correct:
            correct_num += 1
        results.append({
            'question': question,
            'response_text': response_text,
            'answer': answer,
            'is_correct': is_correct
        })

    total = len(question_list)
    # An empty question list scores 0 instead of dividing by zero.
    pass_rate = correct_num / total if total else 0
    return {
        'idx': idx,
        'response': response_json,
        'details': results,
        'pass_rate': pass_rate,
        'is_correct': pass_rate == 1.0
    }
def evaluate_responses(data, mode, base_path=None):
    """Score every record of ``data`` and return one result dict per record.

    ``data`` maps an index to a record. In ``'mixed'`` mode each record is
    scored by compute_one_mixed_question_pass_rate() on the JSON extracted
    from its ``prediction``. In every other mode the ``prediction`` is
    compared with ``gold``; ``counterfactual`` records are additionally
    compared with ``real_life_answer``, and ``cipher`` records in
    ``'subquestions'`` mode carry their ``type`` through.
    """
    outcomes = []
    for idx, record in data.items():  # the mapping key serves as "idx"
        prediction = record.get('prediction', '')
        category = record.get('category', '')

        if mode == 'mixed':
            outcomes.append(
                compute_one_mixed_question_pass_rate(
                    idx, record.get('question_list'),
                    extract_json(prediction), base_path))
            continue

        response_text = extract_text_from_brackets(prediction)
        gold = record.get('gold', '')
        rule_id = record.get('rule_id', '')
        entry = {
            'idx': idx,
            'response': prediction,
            'response_text': response_text,
            'answer': gold,
            'is_correct': evaluate_response_vs_answer(
                prediction, gold, category, rule_id, idx),
        }
        if category == 'counterfactual':
            real_life_answer = record.get('real_life_answer', '')
            entry['real_life_answer'] = real_life_answer
            entry['is_real_life'] = evaluate_response_vs_answer(
                prediction, real_life_answer, category, rule_id, idx)
        if category == 'cipher' and mode == 'subquestions':
            entry['type'] = record.get('type', '')
        outcomes.append(entry)
    return outcomes