import os

from datasets import Dataset

from opencompass.datasets.korbench.korbench_utils import (
    evaluate_responses, find_file, load_json_or_jsonl,
    load_json_or_jsonl_with_idx, load_yaml)
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset


@LOAD_DATASET.register_module()
class korbenchDataset(BaseDataset):
    """Dataset loader for the tasks in KOR-Bench."""

    @staticmethod
    def load(path, prompt_mode, category, **kwargs):
        """Load the dataset using the shared KOR-Bench utilities."""
        base_path = get_data_path(path)
        rule_file = None
        sample_file = None
        mixed_file = None
        mixed_data = None
        if '0_shot' in prompt_mode or '3_shot' in prompt_mode:
            rule_file = find_file(base_path, os.path.join(category, 'rule'))
            sample_file = find_file(base_path,
                                    os.path.join(category, 'sample'))
        elif prompt_mode == 'mixed':
            mixed_file = find_file(base_path, os.path.join('mixed', category))
            mixed_data = load_json_or_jsonl(mixed_file) or []
        else:
            raise ValueError(f'Unsupported prompt_mode: {prompt_mode}')

        three_shot_file = None
        if prompt_mode == '3_shot':
            ts_path = os.path.join(category, 'three-shot')
            three_shot_file = find_file(base_path, ts_path)

        # Load data
        if prompt_mode in ['0_shot', '3_shot']:
            rules = load_json_or_jsonl(rule_file) or []
            samples = load_json_or_jsonl(sample_file) or []

        # Resolve the prompt template that matches the prompt mode
        template_path = None
        if prompt_mode == '0_shot':
            template_path = os.path.join(
                os.path.dirname(__file__),
                'korbench_dataset_config/prompt/0_shot.yaml')
        elif prompt_mode == '3_shot':
            template_path = os.path.join(
                os.path.dirname(__file__),
                'korbench_dataset_config/prompt/3_shot.yaml')
        elif prompt_mode == 'mixed':
            template_path = os.path.join(
                os.path.dirname(__file__),
                'korbench_dataset_config/prompt/mixed.yaml')
        try:
            template = load_yaml(template_path)
        except FileNotFoundError:
            print(f'[ERROR] Missing prompt template: {template_path}')
            return Dataset.from_list([])

        # Process data
        data = []
        if prompt_mode == '0_shot':
            for sample in samples:
                rule_id = sample['rule_id']
                rule = next((r for r in rules if r['idx'] == rule_id), None)
                if not rule:
                    print(f"[WARNING] Rule ID {sample['rule_id']} not found. "
                          'Skipping...')
                    continue
                prompt_key = f'{category}_prompt_format'
                prompt = template[prompt_key][0].format(
                    rule['rule_content'], sample['question'])
                # Add processed item
                data.append({
                    'rule_content': rule['rule_content'],
                    'question': sample['question'],
                    'answer': sample['answer'],
                    'prompt': prompt,
                    'rule_id': rule['idx'],
                    'prompt_mode': '0_shot',
                    'category': category,
                })
            return Dataset.from_list(data)

        if prompt_mode == '3_shot':
            data = []
            three_shot = load_json_or_jsonl(three_shot_file) or []

            for sample in samples:
                rule_id = sample['rule_id']
                rule = next((r for r in rules if r['idx'] == rule_id), None)
                # Interleave the question/answer pairs that share this rule
                three_shot_qa = [
                    item for fs in three_shot if fs['rule_id'] == rule_id
                    for item in [fs['question'], fs['answer']]
                ]
                if not rule:
                    print(f"[WARNING] Rule ID {sample['rule_id']} not found. "
                          'Skipping...')
                    continue
                prompt_key = f'{category}_prompt_format'
                prompt = template[prompt_key][0].format(
                    rule['rule_content'], *three_shot_qa, sample['question'])
                # Add processed item
                data.append({
                    'rule_content': rule['rule_content'],
                    'question': sample['question'],
                    'answer': sample['answer'],
                    'prompt': prompt,
                    'rule_id': rule['idx'],
                    'prompt_mode': '3_shot',
                    'category': category,
                })
            return Dataset.from_list(data)

        if prompt_mode == 'mixed':
            # Process data
            data = []
            for item in mixed_data:
                rule_list = item['rule_list']
                question_list = item['question_list']
                rule_content_list = []
                question_content_list = []

                # Fetch rules and questions; use local names so the
                # `category` argument is not shadowed inside the loops
                for rule in rule_list:
                    rule_category, rule_idx = rule.rsplit('_', 1)
                    rule_content = load_json_or_jsonl_with_idx(
                        base_path,
                        os.path.join(rule_category, 'rule'),
                        idx=rule_idx)
                    rule_content_list.append(rule_content['rule_content'])

                for question in question_list:
                    question_category, question_idx = question.rsplit('_', 1)
                    question_content = load_json_or_jsonl_with_idx(
                        base_path,
                        os.path.join(question_category, 'sample'),
                        idx=question_idx)
                    question_content_list.append(question_content['question'])

                # Prepare prompt
                rules_str = '\n'.join(
                    f'Rule {i + 1}: {content}'
                    for i, content in enumerate(rule_content_list))
                questions_str = '\n'.join(
                    f'Question {i + 1}: {content}'
                    for i, content in enumerate(question_content_list))
                prompt_format = [rules_str, questions_str]
                prompt = template['prompt_format'][0].format(*prompt_format)

                # Add processed item
                data.append({
                    'rule_list': rule_list,
                    'question_list': question_list,
                    'prompt': prompt,
                    'prompt_mode': 'mixed',
                    'answer': '',
                    'base_path': base_path,
                })

            return Dataset.from_list(data)


@ICL_EVALUATORS.register_module()
class korbenchEvaluator(BaseEvaluator):

    def __init__(self):
        super().__init__()

    def sample_score(self, prediction, reference, test_item=None):
        """Evaluate a single sample.
        Args:
            prediction: The model's prediction.
            reference: The reference answer.
            test_item: Additional information about the test sample.

        Returns:
            dict: A dictionary containing the evaluation result.
        """
        if test_item is None:
            raise ValueError('Test item is required.')

        prompt_mode = test_item.get('prompt_mode')

        # Build the payload for a single sample
        entry = {
            'prediction': prediction,
            'gold': reference,
            'rule_id': test_item.get('rule_id', None),
            'category': test_item.get('category', None),
            'rule_list': test_item.get('rule_list', None),
            'question_list': test_item.get('question_list', None),
            'base_path': test_item.get('base_path', None),
        }

        # Evaluate the single sample
        data = {0: entry}

        # Evaluate based on the prompt_mode
        if prompt_mode == '0_shot':
            evaluation_results = evaluate_responses(data, '0_shot')
        elif prompt_mode == '3_shot':
            evaluation_results = evaluate_responses(data, '3_shot')
        elif prompt_mode in ['Multi-Q', 'Multi-R', 'Multi-RQ', 'mixed']:
            evaluation_results = evaluate_responses(
                data, 'mixed', test_item.get('base_path'))
        else:
            return {
                'is_correct': False,
                'pred': prediction,
                'answer': reference,
            }

        # Return evaluation results
        result = evaluation_results[0]
        result['correct'] = result['is_correct']
        result.update({'pred': prediction, 'answer': reference})
        return result

    def score(self, predictions, references, test_set):
        """Evaluate each sample with sample_score and aggregate accuracy."""
        if not test_set:
            raise ValueError('Test set is empty.')

        details = []
        correct_count = 0

        # Score each sample individually
        for i in range(len(predictions)):
            result = self.sample_score(predictions[i], references[i],
                                       test_set[i])
            details.append(result)
            if result.get('is_correct', False):
                correct_count += 1

        # Accuracy as a percentage
        accuracy = (correct_count /
                    len(predictions)) * 100 if predictions else 0

        return {'accuracy': accuracy, 'details': details}
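
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the module above): it shows how the
# loader and evaluator fit together. The dataset path and the 'cipher'
# category below are illustrative assumptions, not values fixed by this
# module; the actual path must be resolvable by get_data_path.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    dataset = korbenchDataset.load(
        path='opencompass/korbench',  # hypothetical data path
        prompt_mode='0_shot',
        category='cipher')  # one of the KOR-Bench task categories
    print(f'Loaded {len(dataset)} samples')

    if len(dataset) > 0:
        evaluator = korbenchEvaluator()
        # A placeholder prediction; real predictions come from a model run.
        results = evaluator.score(predictions=['placeholder answer'],
                                  references=[dataset[0]['answer']],
                                  test_set=[dataset[0]])
        print(f"Accuracy: {results['accuracy']:.1f}%")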