From 046b6f75c6ee0ff2f583b30b6f39d73b52929f56 Mon Sep 17 00:00:00 2001 From: Junnan Liu Date: Thu, 20 Feb 2025 19:47:04 +0800 Subject: [PATCH] [Update] Update Greedy Config & README of LiveMathBench (#1862) * support omni-math * update config * upload README * Delete opencompass/configs/datasets/omni_math/__init__.py * update greedy config & README of LiveMathBench * update intro for max_out_len * rename livemathbench greedy config * delete greedy config --------- Co-authored-by: liushz --- .../configs/datasets/livemathbench/README.md | 71 +++++++++---------- .../livemathbench/livemathbench_greedy_gen.py | 4 ++ ....py => livemathbench_greedy_gen_9befbf.py} | 10 +-- 3 files changed, 43 insertions(+), 42 deletions(-) create mode 100644 opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py rename opencompass/configs/datasets/livemathbench/{livemathbench_greedy_gen_efb20d.py => livemathbench_greedy_gen_9befbf.py} (83%) diff --git a/opencompass/configs/datasets/livemathbench/README.md b/opencompass/configs/datasets/livemathbench/README.md index 84490c94..24949f20 100644 --- a/opencompass/configs/datasets/livemathbench/README.md +++ b/opencompass/configs/datasets/livemathbench/README.md @@ -1,36 +1,30 @@ # LiveMathBench -## Details of Datsets +## v202412 + +### Details of Datasets | dataset | language | #single-choice | #multiple-choice | #fill-in-the-blank | #problem-solving | | -- | -- | -- | -- | -- | -- | -| AIMC | cn | 0 | 0 | 0 | 46 | -| AIMC | en | 0 | 0 | 0 | 46 | -| CEE | cn | 0 | 0 | 13 | 40 | -| CEE | en | 0 | 0 | 13 | 40 | -| CMO | cn | 0 | 0 | 0 | 18 | -| CMO | en | 0 | 0 | 0 | 18 | -| MATH500 | en | 0 | 0 | 0 | 500 | -| AIME2024 | en | 0 | 0 | 0 | 44 | +| AMC | cn | 0 | 0 | 0 | 46 | +| AMC | en | 0 | 0 | 0 | 46 | +| CCEE | cn | 0 | 0 | 13 | 31 | +| CCEE | en | 0 | 0 | 13 | 31 | +| CNMO | cn | 0 | 0 | 0 | 18 | +| CNMO | en | 0 | 0 | 0 | 18 | +| WLPMC | cn | 0 | 0 | 0 | 11 | +| WLPMC | en | 0 | 0 | 0 | 11 | -## How to use - +### How to use 
+#### G-Pass@k ```python from mmengine.config import read_base with read_base(): - from opencompass.datasets.livemathbench import livemathbench_datasets + from opencompass.configs.datasets.livemathbench.livemathbench_gen import livemathbench_datasets -livemathbench_datasets[0].update( - { - 'abbr': 'livemathbench_${k}x${n}' - 'path': '/path/to/data/dir', - 'k': 'k@pass', # the max value of k in k@pass - 'n': 'number of runs', # number of runs - } -) livemathbench_datasets[0]['eval_cfg']['evaluator'].update( { 'model_name': 'Qwen/Qwen2.5-72B-Instruct', @@ -40,38 +34,41 @@ livemathbench_datasets[0]['eval_cfg']['evaluator'].update( ] # set url of evaluation models } ) +livemathbench_datasets[0]['infer_cfg']['inferencer'].update(dict( + max_out_len=32768 # for o1-like models you need to update max_out_len +)) ``` -> ❗️ At present, `extract_from_boxed` is used to extract answers from model responses, and one can also leverage LLM for extracting through the following parameters, but this part of the code has not been tested. - +#### Greedy ```python +from mmengine.config import read_base + +with read_base(): + from opencompass.configs.datasets.livemathbench.livemathbench_greedy_gen import livemathbench_datasets + livemathbench_datasets[0]['eval_cfg']['evaluator'].update( { 'model_name': 'Qwen/Qwen2.5-72B-Instruct', 'url': [ 'http://0.0.0.0:23333/v1', '...' - ], # set url of evaluation models - - # for LLM-based extraction - 'use_extract_model': True, - 'post_model_name': 'oc-extractor', - 'post_url': [ - 'http://0.0.0.0:21006/v1, - '...' 
- ] + ] # set url of evaluation models } ) +livemathbench_datasets[0]['infer_cfg']['inferencer'].update(dict( + max_out_len=32768 # for o1-like models you need to update max_out_len +)) + ``` -## Output Samples +### Output Samples | dataset | version | metric | mode | Qwen2.5-72B-Instruct | |----- | ----- | ----- | ----- | -----| -| LiveMathBench | caed8f | 1@pass | gen | 26.07 | -| LiveMathBench | caed8f | 1@pass/std | gen | xx.xx | -| LiveMathBench | caed8f | 2@pass | gen | xx.xx | -| LiveMathBench | caed8f | 2@pass/std | gen | xx.xx | -| LiveMathBench | caed8f | pass-rate | gen | xx.xx | +| LiveMathBench | 9befbf | G-Pass@16_0.0 | gen | xx.xx | +| LiveMathBench | caed8f | G-Pass@16_0.25 | gen | xx.xx | +| LiveMathBench | caed8f | G-Pass@16_0.5 | gen | xx.xx | +| LiveMathBench | caed8f | G-Pass@16_0.75 | gen | xx.xx | +| LiveMathBench | caed8f | G-Pass@16_1.0 | gen | xx.xx | diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py new file mode 100644 index 00000000..d311eeaf --- /dev/null +++ b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .livemathbench_greedy_gen_9befbf import livemathbench_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_efb20d.py b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py similarity index 83% rename from opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_efb20d.py rename to opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py index d6acd7c0..d8d8b79c 100644 --- a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_efb20d.py +++ b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py @@ -6,15 +6,15 @@ from 
opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBen livemathbench_dataset = dict( - abbr='LiveMathBench-v202412-greedy', # If you change the K and replication, you need to change the dataset name. type=LiveMathBenchDataset, - path='opencompass/LiveMathBench', + path='', k=1, replication=1, dataset_splits=['CNMO', 'CCEE', 'AMC', 'WLPMC'], dataset_languages=['cn', 'en'], - cot=False, + cot=True, version='202412', + abbr='LiveMathBench-v202412', reader_cfg=dict( input_columns=['prompt'], output_column='answer' @@ -31,7 +31,7 @@ livemathbench_dataset = dict( retriever=dict(type=ZeroRetriever), inferencer=dict( type=GenInferencer, - max_out_len=16384, + max_out_len=8192 ), ), eval_cfg=dict( @@ -44,7 +44,7 @@ livemathbench_dataset = dict( extract_model_name='', k=[1], replication=1, - thresholds=[0.0, 0.25, 0.5, 0.75, 1.0] + thresholds=[0.0] ) ) )