[Update] Update LiveMathBench Evaluation to Support Single Dataset Split Metric Computation (#1730)

* upload dataset definitions & configs

* add single dataset split specific metrics

* add k-pass@threshold & MATH500
This commit is contained in:
Junnan Liu 2024-12-05 16:54:16 +08:00 committed by GitHub
parent 4f317d1bd5
commit 6181ac1122
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 92 additions and 15 deletions

View File

@ -4,12 +4,13 @@
| dataset | language | #single-choice | #multiple-choice | #fill-in-the-blank | #problem-solving | | dataset | language | #single-choice | #multiple-choice | #fill-in-the-blank | #problem-solving |
| -- | -- | -- | -- | -- | -- | | -- | -- | -- | -- | -- | -- |
| AIMC | cn | 46 | 0 | 0 | 0 | | AIMC | cn | 0 | 0 | 0 | 46 |
| AIMC | en | 46 | 0 | 0 | 0 | | AIMC | en | 0 | 0 | 0 | 46 |
| CEE | cn | 28 | 9 | 13 | 3 | | CEE | cn | 0 | 0 | 13 | 40 |
| CEE | en | 28 | 9 | 13 | 3 | | CEE | en | 0 | 0 | 13 | 40 |
| CMO | cn | 0 | 0 | 0 | 18 | | CMO | cn | 0 | 0 | 0 | 18 |
| CMO | en | 0 | 0 | 0 | 18 | | CMO | en | 0 | 0 | 0 | 18 |
| MATH500 | en | 0 | 0 | 0 | 500 |
## How to use ## How to use
@ -23,6 +24,7 @@ with read_base():
livemathbench_datasets[0].update( livemathbench_datasets[0].update(
{ {
'abbr': 'livemathbench_${k}x${n}',
'path': '/path/to/data/dir', 'path': '/path/to/data/dir',
'k': 'k@pass', # the max value of k in k@pass 'k': 'k@pass', # the max value of k in k@pass
'n': 'number of runs', # number of runs 'n': 'number of runs', # number of runs

View File

@ -20,7 +20,7 @@ from .prompts import (EXTRACT_PROMPT_CN, EXTRACT_PROMPT_EN, JUDGE_PROMPT_CN,
@LOAD_DATASET.register_module() @LOAD_DATASET.register_module()
class LiveMathBenchDataset(BaseDataset): class LiveMathBenchDataset(BaseDataset):
dataset_splits = ['AIMC', 'CEE', 'CMO'] dataset_splits = ['AIMC', 'CEE', 'CMO', 'MATH500']
dataset_languages = ['cn', 'en'] dataset_languages = ['cn', 'en']
@staticmethod @staticmethod
@ -34,6 +34,8 @@ class LiveMathBenchDataset(BaseDataset):
for split, language in product(LiveMathBenchDataset.dataset_splits, for split, language in product(LiveMathBenchDataset.dataset_splits,
LiveMathBenchDataset.dataset_languages): LiveMathBenchDataset.dataset_languages):
file_path = os.path.join(path, f'{split}_{language}.jsonl') file_path = os.path.join(path, f'{split}_{language}.jsonl')
if not os.path.exists(file_path):
continue
dataset_info[f'{split}_{language}'] = { dataset_info[f'{split}_{language}'] = {
'single-choice': 0, 'single-choice': 0,
'multiple-choice': 0, 'multiple-choice': 0,
@ -270,12 +272,15 @@ class LiveMathBenchEvaluator(BaseEvaluator):
count = [] count = []
total_pass_num = [] total_pass_num = []
details = [] details = []
all_dataset = set()
for key, examples in key2example.items(): for key, examples in key2example.items():
detail = { detail = {
'question': examples[0][0]['question'], 'question': examples[0][0]['question'],
'answer': examples[0][0]['answer'], 'answer': examples[0][0]['answer'],
'responses': [] 'responses': [],
'dataset': '_'.join(key.split('_')[:-1])
} }
all_dataset.add('_'.join(key.split('_')[:-1]))
if_pass_list = [] if_pass_list = []
for single_run_examples in examples: for single_run_examples in examples:
detail['responses'].append([]) detail['responses'].append([])
@ -296,29 +301,99 @@ class LiveMathBenchEvaluator(BaseEvaluator):
i = 1 i = 1
while i <= K: while i <= K:
detail.update({ detail.update({
f'{i}@pass': f'pass-rate@{i}':
if_pass_list[:, :i].mean(axis=1).mean(axis=0).item(), if_pass_list[:, :i].mean(axis=1).mean(axis=0).item(),
f'{i}@pass/std': f'pass-rate@{i}/std':
if_pass_list[:, :i].mean(axis=1).std(axis=0).item() if_pass_list[:, :i].mean(axis=1).std(axis=0).item(),
f'pass@{i}':
if_pass_list[:, :1].mean(axis=1).mean(axis=0).item(),
f'pass@{i}/std':
if_pass_list[:, :1].mean(axis=1).std(axis=0).item(),
}) })
i = i * 2 i = i * 2
for threshold in [0.5, 0.75, 1.0]:
detail.update({
f'{K}-pass@{threshold}':
np.floor(
np.where(
if_pass_list.mean(axis=1) >= threshold, 1.0,
0.0).mean(axis=0))
})
count.append(np.ones_like(if_pass_list).sum(axis=1)) count.append(np.ones_like(if_pass_list).sum(axis=1))
total_pass_num.append(if_pass_list.sum(axis=1)) total_pass_num.append(if_pass_list.sum(axis=1))
details.append(detail) details.append(detail)
detailed_result = {'details': details} detailed_result = {'details': details}
i = 1 i = 1
while i <= K: while i <= K:
detailed_result.update({ detailed_result.update({
f'{i}@pass': f'pass-rate@{i}':
100. * np.mean([detail[f'{i}@pass'] for detail in details]), 100. *
f'{i}@pass/std': np.mean([detail[f'pass-rate@{i}'] for detail in details]),
100. * np.mean([detail[f'{i}@pass/std'] for detail in details]) f'pass-rate@{i}/std':
100. *
np.mean([detail[f'pass-rate@{i}/std'] for detail in details]),
f'pass@{i}':
100. * np.mean([detail[f'pass@{i}'] for detail in details]),
f'pass@{i}/std':
100. * np.mean([detail[f'pass@{i}/std'] for detail in details])
})
for d in sorted(list(all_dataset)):
detailed_result.update({
f'{d}/pass-rate@{i}':
100. * np.mean([
detail[f'pass-rate@{i}']
for detail in details if detail['dataset'] == d
]),
f'{d}/pass-rate@{i}/std':
100. * np.mean([
detail[f'pass-rate@{i}/std']
for detail in details if detail['dataset'] == d
]),
f'{d}/pass@{i}':
100. * np.mean([
detail[f'pass@{i}']
for detail in details if detail['dataset'] == d
]),
f'{d}/pass@{i}/std':
100. * np.mean([
detail[f'pass@{i}/std']
for detail in details if detail['dataset'] == d
])
}) })
i = i * 2 i = i * 2
detailed_result.update(
{'pass-rate': 100. * np.mean(sum(total_pass_num) / sum(count))}) for threshold in [0.5, 0.75, 1.0]:
detailed_result.update({
f'{K}-pass@{threshold}':
100. * np.mean([
detail[f'{K}-pass@{threshold}'] for detail in details
])
})
detailed_result.update({
f'{K}-pass@{threshold}/std':
100. * np.std([
detail[f'{K}-pass@{threshold}'] for detail in details
])
})
for d in sorted(list(all_dataset)):
detailed_result.update({
f'{d}/{K}-pass@{threshold}':
100. * np.mean([
detail[f'{K}-pass@{threshold}']
for detail in details if detail['dataset'] == d
])
})
detailed_result.update({
f'{d}/{K}-pass@{threshold}/std':
100. * np.std([
detail[f'{K}-pass@{threshold}']
for detail in details if detail['dataset'] == d
])
})
return detailed_result return detailed_result