mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

import os

from datasets import Dataset

from opencompass.datasets.korbench.korbench_utils import (
    evaluate_responses, find_file, load_json_or_jsonl,
    load_json_or_jsonl_with_idx, load_yaml)
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset
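
# Note on the expected data layout (inferred from the path handling in
# ``load`` below; concrete file extensions are resolved by ``find_file`` and
# are not fixed here):
#
#   <base_path>/<category>/rule        # rule definitions carrying an 'idx'
#   <base_path>/<category>/sample      # questions/answers with a 'rule_id'
#   <base_path>/<category>/three-shot  # few-shot exemplars for 3_shot mode
#   <base_path>/mixed/<category>       # pre-mixed rule/question id bundles
#
# The prompt templates live next to this file under
# korbench_dataset_config/prompt/{0_shot,3_shot,mixed}.yaml.
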
@LOAD_DATASET.register_module()
class korbenchDataset(BaseDataset):
    """Dataset loader for KOR-Bench tasks."""

    @staticmethod
    def load(path, prompt_mode, category, **kwargs):
        """Load the dataset using the shared KOR-Bench utilities."""
        base_path = get_data_path(path)
        rule_file = None
        sample_file = None
        mixed_file = None
        mixed_data = None
        if '0_shot' in prompt_mode or '3_shot' in prompt_mode:
            rule_file = find_file(base_path, os.path.join(category, 'rule'))
            sample_file = find_file(base_path,
                                    os.path.join(category, 'sample'))
        elif prompt_mode == 'mixed':
            mixed_file = find_file(base_path, os.path.join('mixed', category))
            mixed_data = load_json_or_jsonl(mixed_file) or []
        else:
            raise ValueError(f'Unsupported prompt_mode: {prompt_mode}')
        three_shot_file = None
        if prompt_mode == '3_shot':
            ts_path = os.path.join(category, 'three-shot')
            three_shot_file = find_file(base_path, ts_path)
        # Load data
        if prompt_mode in ['0_shot', '3_shot']:
            rules = load_json_or_jsonl(rule_file) or []
            samples = load_json_or_jsonl(sample_file) or []
        template_path = None
        if prompt_mode == '0_shot':
            template_path = os.path.join(
                os.path.dirname(__file__),
                'korbench_dataset_config/prompt/0_shot.yaml')
        elif prompt_mode == '3_shot':
            template_path = os.path.join(
                os.path.dirname(__file__),
                'korbench_dataset_config/prompt/3_shot.yaml')
        elif prompt_mode == 'mixed':
            template_path = os.path.join(
                os.path.dirname(__file__),
                'korbench_dataset_config/prompt/mixed.yaml')
        try:
            template = load_yaml(template_path)
        except FileNotFoundError:
            print(f'[ERROR] Missing prompt template: {template_path}')
            return Dataset.from_list([])

        # Process data
        data = []
        if prompt_mode == '0_shot':
            for sample in samples:
                rule_id = sample['rule_id']
                rule = next((r for r in rules if r['idx'] == rule_id), None)
                if not rule:
                    print(f"[WARNING] Rule ID {sample['rule_id']} not found."
                          ' Skipping...')
                    continue
                prompt_key = f'{category}_prompt_format'
                prompt = template[prompt_key][0].format(
                    rule['rule_content'], sample['question'])

                # Add processed item
                data.append({
                    'rule_content': rule['rule_content'],
                    'question': sample['question'],
                    'answer': sample['answer'],
                    'prompt': prompt,
                    'rule_id': rule['idx'],
                    'prompt_mode': '0_shot',
                    'category': category,
                })

            return Dataset.from_list(data)
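
        # The 3-shot branch below mirrors the 0-shot one, except that each
        # rule's exemplars are flattened into [q1, a1, q2, a2, ...] so the
        # template is filled with the rule, the exemplar Q/A pairs, and then
        # the new question.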
        if prompt_mode == '3_shot':
            data = []
            three_shot = load_json_or_jsonl(three_shot_file) or []
            for sample in samples:
                rule_id = sample['rule_id']
                rule = next((r for r in rules if r['idx'] == rule_id), None)
                three_shot_qa = [
                    item for fs in three_shot if fs['rule_id'] == rule_id
                    for item in [fs['question'], fs['answer']]
                ]
                if not rule:
                    print(f"[WARNING] Rule ID {sample['rule_id']} not found."
                          ' Skipping...')
                    continue
                prompt_key = f'{category}_prompt_format'
                prompt = template[prompt_key][0].format(
                    rule['rule_content'], *three_shot_qa, sample['question'])
                # Add processed item
                data.append({
                    'rule_content': rule['rule_content'],
                    'question': sample['question'],
                    'answer': sample['answer'],
                    'prompt': prompt,
                    'rule_id': rule['idx'],
                    'prompt_mode': '3_shot',
                    'category': category,
                })

            return Dataset.from_list(data)
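
        # In 'mixed' mode every item references rules and questions from other
        # categories by id (the part before the final '_' is the category, the
        # part after it the index); both are re-read from disk and numbered
        # into one combined prompt.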
        if prompt_mode == 'mixed':
            # Process data
            data = []
            for item in mixed_data:
                rule_list = item['rule_list']
                question_list = item['question_list']
                rule_content_list = []
                question_content_list = []

                # Fetch rules and questions
                for rule in rule_list:
                    category, rule_idx = rule.rsplit('_', 1)
                    rule_content = load_json_or_jsonl_with_idx(base_path,
                                                               os.path.join(
                                                                   category,
                                                                   'rule'),
                                                               idx=rule_idx)
                    rule_content_list.append(rule_content['rule_content'])

                for question in question_list:
                    category, question_idx = question.rsplit('_', 1)
                    question_content = load_json_or_jsonl_with_idx(
                        base_path,
                        os.path.join(category, 'sample'),
                        idx=question_idx)
                    question_content_list.append(question_content['question'])

                # Prepare prompt
                rules_str = '\n'.join(
                    f'Rule {i+1}: {content}'
                    for i, content in enumerate(rule_content_list))
                questions_str = '\n'.join(
                    f'Question {i+1}: {content}'
                    for i, content in enumerate(question_content_list))
                prompt_format = [rules_str, questions_str]
                prompt = template['prompt_format'][0].format(*prompt_format)

                # Add processed item
                data.append({
                    'rule_list': rule_list,
                    'question_list': question_list,
                    'prompt': prompt,
                    'prompt_mode': 'mixed',
                    'answer': '',
                    'base_path': base_path,
                })

            return Dataset.from_list(data)
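
# Illustrative usage of the loader (a sketch, not executed here; the concrete
# ``path`` and ``category`` values below are assumptions, not defined in this
# module):
#
#   dataset = korbenchDataset.load(path='opencompass/korbench',
#                                  prompt_mode='0_shot',
#                                  category='cipher')
#   print(dataset[0]['prompt'])
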
@ICL_EVALUATORS.register_module()
class korbenchEvaluator(BaseEvaluator):
    """Evaluator for KOR-Bench tasks."""

    def __init__(self):
        super().__init__()

    def sample_score(self, prediction, reference, test_item=None):
        """Evaluate a single sample.

        Args:
            prediction: The model's prediction
            reference: The reference answer
            test_item: Additional information about the test sample

        Returns:
            Dict: A dictionary containing evaluation results
        """
        if test_item is None:
            raise ValueError('Test item is required.')

        prompt_mode = test_item.get('prompt_mode')

        # Build data for a single sample
        entry = {
            'prediction': prediction,
            'gold': reference,
            'rule_id': test_item.get('rule_id', None),
            'category': test_item.get('category', None),
            'rule_list': test_item.get('rule_list', None),
            'question_list': test_item.get('question_list', None),
            'base_path': test_item.get('base_path', None),
        }

        # Evaluate the single sample
        data = {0: entry}

        # Evaluate based on different prompt_mode
        if prompt_mode == '0_shot':
            evaluation_results = evaluate_responses(data, '0_shot')
        elif prompt_mode == '3_shot':
            evaluation_results = evaluate_responses(data, '3_shot')
        elif prompt_mode in ['Multi-Q', 'Multi-R', 'Multi-RQ', 'mixed']:
            evaluation_results = evaluate_responses(data, 'mixed',
                                                    test_item.get('base_path'))
        else:
            return {
                'is_correct': False,
                'pred': prediction,
                'answer': reference
            }

        # Return evaluation results
        result = evaluation_results[0]
        result['correct'] = result['is_correct']
        result.update({'pred': prediction, 'answer': reference})
        return result
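
    # For reference: every dict returned by sample_score carries 'is_correct',
    # 'pred' and 'answer'; for supported prompt modes it additionally carries
    # 'correct' plus whatever fields evaluate_responses attaches.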

    def score(self, predictions, references, test_set):
        """Evaluate each sample using sample_score."""
        if not test_set:
            raise ValueError('Test set is empty.')

        details = []
        correct_count = 0

        # Call sample_score for each sample
        for i in range(len(predictions)):
            result = self.sample_score(predictions[i], references[i],
                                       test_set[i])
            details.append(result)
            if result.get('is_correct', False):
                correct_count += 1

        # Calculate accuracy
        accuracy = (correct_count /
                    len(predictions)) * 100 if predictions else 0

        # Return evaluation results
        return {'accuracy': accuracy, 'details': details}
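
# Illustrative end-to-end sketch (not executed here; assumes a dataset built by
# korbenchDataset.load as above and model outputs collected elsewhere):
#
#   evaluator = korbenchEvaluator()
#   results = evaluator.score(predictions=model_outputs,
#                             references=dataset['answer'],
#                             test_set=dataset)
#   print(results['accuracy'], len(results['details']))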