diff --git a/opencompass/configs/datasets/CARDBiomedBench/CARDBiomedBench_llmjudge_gen_99a231.py b/opencompass/configs/datasets/CARDBiomedBench/CARDBiomedBench_llmjudge_gen_99a231.py
index cbe6cf90..cf28a37c 100644
--- a/opencompass/configs/datasets/CARDBiomedBench/CARDBiomedBench_llmjudge_gen_99a231.py
+++ b/opencompass/configs/datasets/CARDBiomedBench/CARDBiomedBench_llmjudge_gen_99a231.py
@@ -1,4 +1,4 @@
-from opencompass.datasets import CARDBiomedBenchDataset, CARDBiomedBench_llmjudge_postprocess
+from opencompass.datasets import CARDBiomedBenchDataset
 from opencompass.datasets import generic_llmjudge_postprocess
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_prompt_template import PromptTemplate
@@ -87,7 +87,7 @@ eval_cfg = dict(
             reader_cfg=reader_cfg,
         ),
         judge_cfg=dict(),
-        dict_postprocessor=dict(type=CARDBiomedBench_llmjudge_postprocess),
+        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
     ),
 )
 cardbiomedbench_dataset = dict(
diff --git a/opencompass/datasets/CARDBiomedBench.py b/opencompass/datasets/CARDBiomedBench.py
index 39ac433c..3fbc116b 100644
--- a/opencompass/datasets/CARDBiomedBench.py
+++ b/opencompass/datasets/CARDBiomedBench.py
@@ -32,76 +32,3 @@ class CARDBiomedBenchDataset(BaseDataset):
         elif prompt_mode == 'few-shot':
             pass  # TODO: Implement few-shot prompt
         return dataset
-
-
-def _generic_llmjudge_postprocess(judgement: str):
-    match = re.search(r'(A|B)', judgement)
-    grade_letter = (match.group(0) if match else 'B'
-                    )  # Default to "INCORRECT" if no match
-    return grade_letter
-
-
-def CARDBiomedBench_llmjudge_postprocess(
-    output: dict,
-    output_path: str,
-    dataset: Dataset,
-) -> dict:
-    # Get the original dataset
-    original_dataset = dataset.reader.dataset['test']
-
-    judged_answers = []
-    original_responses = []
-    references = []
-    details = []
-
-    total_correct = 0
-    total_count = 0
-
-    for k, v in output.items():
-        idx = int(k)  # Convert key to integer for indexing
-        original_responses.append(v['prediction'])
-        processed_judge = _generic_llmjudge_postprocess(v['prediction'])
-
-        sample = original_dataset[idx]
-        # Record the judgment
-        if processed_judge is not None:
-            judged_answers.append(processed_judge)
-            try:
-                gold = v['gold']
-                references.append(gold)
-            except KeyError:
-                get_logger().warning(
-                    f'No gold answer for {k}, use empty string as reference!')
-                gold = ''
-                references.append('')
-
-            # Check if the answer is correct (A means correct)
-            is_correct = processed_judge == 'A'
-            total_count += 1
-
-            if is_correct:
-                total_correct += 1
-
-            # Add to details
-            details.append({
-                'id': k,
-                'question': sample['question'],
-                'prediction': sample['prediction'],
-                'origin_prompt': v['origin_prompt'],
-                'llm_judge': processed_judge,
-                'gold': gold,
-                'is_correct': is_correct,
-            })
-
-    # Calculate overall accuracy with two decimal places
-    overall_accuracy = (round(
-        (total_correct / total_count * 100), 2) if total_count > 0 else 0.00)
-
-    # Initialize results dictionary
-    results = {
-        'accuracy': overall_accuracy,
-        'total_correct': total_correct,
-        'total_count': total_count,
-        'details': details,
-    }
-    return results
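The removed CARDBiomedBench_llmjudge_postprocess graded each judge response by extracting an A/B verdict (A = correct, B = incorrect, defaulting to B when no letter is found) and reporting accuracy over all judged samples; the shared generic_llmjudge_postprocess that the config now points to follows the same grading pattern. A minimal standalone sketch of that pattern, using hypothetical judge outputs rather than the OpenCompass implementation itself:

import re

def extract_grade(judgement: str) -> str:
    # First A/B letter in the judge response wins; default to 'B' (incorrect).
    match = re.search(r'(A|B)', judgement)
    return match.group(0) if match else 'B'

# Hypothetical judge responses keyed by sample index, shaped like the dict
# the removed postprocessor iterated over.
judge_outputs = {
    '0': {'prediction': 'A. The response matches the reference answer.'},
    '1': {'prediction': 'B. The response contradicts the reference.'},
}

grades = [extract_grade(v['prediction']) for v in judge_outputs.values()]
total_correct = sum(g == 'A' for g in grades)
accuracy = round(total_correct / len(grades) * 100, 2) if grades else 0.00
print({'accuracy': accuracy, 'total_correct': total_correct, 'total_count': len(grades)})
# {'accuracy': 50.0, 'total_correct': 1, 'total_count': 2}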