diff --git a/opencompass/configs/datasets/livemathbench/README.md b/opencompass/configs/datasets/livemathbench/README.md index fd506c0f..18f2cac1 100644 --- a/opencompass/configs/datasets/livemathbench/README.md +++ b/opencompass/configs/datasets/livemathbench/README.md @@ -4,12 +4,13 @@ | dataset | language | #single-choice | #multiple-choice | #fill-in-the-blank | #problem-solving | | -- | -- | -- | -- | -- | -- | -| AIMC | cn | 46 | 0 | 0 | 0 | -| AIMC | en | 46 | 0 | 0 | 0 | -| CEE | cn | 28 | 9 | 13 | 3 | -| CEE | en | 28 | 9 | 13 | 3 | +| AIMC | cn | 0 | 0 | 0 | 46 | +| AIMC | en | 0 | 0 | 0 | 46 | +| CEE | cn | 0 | 0 | 13 | 40 | +| CEE | en | 0 | 0 | 13 | 40 | | CMO | cn | 0 | 0 | 0 | 18 | | CMO | en | 0 | 0 | 0 | 18 | +| MATH500 | en | 0 | 0 | 0 | 500 | ## How to use @@ -23,6 +24,7 @@ with read_base(): livemathbench_datasets[0].update( { + 'abbr': 'livemathbench_${k}x${n}' 'path': '/path/to/data/dir', 'k': 'k@pass', # the max value of k in k@pass 'n': 'number of runs', # number of runs diff --git a/opencompass/datasets/livemathbench/livemathbench.py b/opencompass/datasets/livemathbench/livemathbench.py index 0a4cf4ce..edcefb72 100644 --- a/opencompass/datasets/livemathbench/livemathbench.py +++ b/opencompass/datasets/livemathbench/livemathbench.py @@ -20,7 +20,7 @@ from .prompts import (EXTRACT_PROMPT_CN, EXTRACT_PROMPT_EN, JUDGE_PROMPT_CN, @LOAD_DATASET.register_module() class LiveMathBenchDataset(BaseDataset): - dataset_splits = ['AIMC', 'CEE', 'CMO'] + dataset_splits = ['AIMC', 'CEE', 'CMO', 'MATH500'] dataset_languages = ['cn', 'en'] @staticmethod @@ -34,6 +34,8 @@ class LiveMathBenchDataset(BaseDataset): for split, language in product(LiveMathBenchDataset.dataset_splits, LiveMathBenchDataset.dataset_languages): file_path = os.path.join(path, f'{split}_{language}.jsonl') + if not os.path.exists(file_path): + continue dataset_info[f'{split}_{language}'] = { 'single-choice': 0, 'multiple-choice': 0, @@ -270,12 +272,15 @@ class LiveMathBenchEvaluator(BaseEvaluator): count = [] total_pass_num = [] details = [] + all_dataset = set() for key, examples in key2example.items(): detail = { 'question': examples[0][0]['question'], 'answer': examples[0][0]['answer'], - 'responses': [] + 'responses': [], + 'dataset': '_'.join(key.split('_')[:-1]) } + all_dataset.add('_'.join(key.split('_')[:-1])) if_pass_list = [] for single_run_examples in examples: detail['responses'].append([]) @@ -296,29 +301,99 @@ class LiveMathBenchEvaluator(BaseEvaluator): i = 1 while i <= K: detail.update({ - f'{i}@pass': + f'pass-rate@{i}': if_pass_list[:, :i].mean(axis=1).mean(axis=0).item(), - f'{i}@pass/std': - if_pass_list[:, :i].mean(axis=1).std(axis=0).item() + f'pass-rate@{i}/std': + if_pass_list[:, :i].mean(axis=1).std(axis=0).item(), + f'pass@{i}': + if_pass_list[:, :1].mean(axis=1).mean(axis=0).item(), + f'pass@{i}/std': + if_pass_list[:, :1].mean(axis=1).std(axis=0).item(), }) i = i * 2 + for threshold in [0.5, 0.75, 1.0]: + detail.update({ + f'{K}-pass@{threshold}': + np.floor( + np.where( + if_pass_list.mean(axis=1) >= threshold, 1.0, + 0.0).mean(axis=0)) + }) + count.append(np.ones_like(if_pass_list).sum(axis=1)) total_pass_num.append(if_pass_list.sum(axis=1)) details.append(detail) detailed_result = {'details': details} + i = 1 while i <= K: detailed_result.update({ - f'{i}@pass': - 100. * np.mean([detail[f'{i}@pass'] for detail in details]), - f'{i}@pass/std': - 100. * np.mean([detail[f'{i}@pass/std'] for detail in details]) + f'pass-rate@{i}': + 100. * + np.mean([detail[f'pass-rate@{i}'] for detail in details]), + f'pass-rate@{i}/std': + 100. * + np.mean([detail[f'pass-rate@{i}/std'] for detail in details]), + f'pass@{i}': + 100. * np.mean([detail[f'pass@{i}'] for detail in details]), + f'pass@{i}/std': + 100. * np.mean([detail[f'pass@{i}/std'] for detail in details]) }) + for d in sorted(list(all_dataset)): + detailed_result.update({ + f'{d}/pass-rate@{i}': + 100. * np.mean([ + detail[f'pass-rate@{i}'] + for detail in details if detail['dataset'] == d + ]), + f'{d}/pass-rate@{i}/std': + 100. * np.mean([ + detail[f'pass-rate@{i}/std'] + for detail in details if detail['dataset'] == d + ]), + f'{d}/pass@{i}': + 100. * np.mean([ + detail[f'pass@{i}'] + for detail in details if detail['dataset'] == d + ]), + f'{d}/pass@{i}/std': + 100. * np.mean([ + detail[f'pass@{i}/std'] + for detail in details if detail['dataset'] == d + ]) + }) i = i * 2 - detailed_result.update( - {'pass-rate': 100. * np.mean(sum(total_pass_num) / sum(count))}) + + for threshold in [0.5, 0.75, 1.0]: + detailed_result.update({ + f'{K}-pass@{threshold}': + 100. * np.mean([ + detail[f'{K}-pass@{threshold}'] for detail in details + ]) + }) + detailed_result.update({ + f'{K}-pass@{threshold}/std': + 100. * np.std([ + detail[f'{K}-pass@{threshold}'] for detail in details + ]) + }) + for d in sorted(list(all_dataset)): + detailed_result.update({ + f'{d}/{K}-pass@{threshold}': + 100. * np.mean([ + detail[f'{K}-pass@{threshold}'] + for detail in details if detail['dataset'] == d + ]) + }) + detailed_result.update({ + f'{d}/{K}-pass@{threshold}/std': + 100. * np.std([ + detail[f'{K}-pass@{threshold}'] + for detail in details if detail['dataset'] == d + ]) + }) return detailed_result