mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Update] Update Greedy Config & README of LiveMathBench (#1862)
* support omni-math * update config * upload README * Delete opencompass/configs/datasets/omni_math/__init__.py * update greedy config & README of LiveMathBench * update intro for max_out_len * rename livemathbench greedy config * delete greedy config --------- Co-authored-by: liushz <qq1791167085@163.com>
This commit is contained in:
parent
d7daee6e25
commit
046b6f75c6
@ -1,36 +1,30 @@
|
||||
# LiveMathBench
|
||||
|
||||
## Details of Datasets
|
||||
## v202412
|
||||
|
||||
### Details of Datasets
|
||||
|
||||
| dataset | language | #single-choice | #multiple-choice | #fill-in-the-blank | #problem-solving |
|
||||
| -- | -- | -- | -- | -- | -- |
|
||||
| AIMC | cn | 0 | 0 | 0 | 46 |
|
||||
| AIMC | en | 0 | 0 | 0 | 46 |
|
||||
| CEE | cn | 0 | 0 | 13 | 40 |
|
||||
| CEE | en | 0 | 0 | 13 | 40 |
|
||||
| CMO | cn | 0 | 0 | 0 | 18 |
|
||||
| CMO | en | 0 | 0 | 0 | 18 |
|
||||
| MATH500 | en | 0 | 0 | 0 | 500 |
|
||||
| AIME2024 | en | 0 | 0 | 0 | 44 |
|
||||
| AMC | cn | 0 | 0 | 0 | 46 |
|
||||
| AMC | en | 0 | 0 | 0 | 46 |
|
||||
| CCEE | cn | 0 | 0 | 13 | 31 |
|
||||
| CCEE | en | 0 | 0 | 13 | 31 |
|
||||
| CNMO | cn | 0 | 0 | 0 | 18 |
|
||||
| CNMO | en | 0 | 0 | 0 | 18 |
|
||||
| WLPMC | cn | 0 | 0 | 0 | 11 |
|
||||
| WLPMC | en | 0 | 0 | 0 | 11 |
|
||||
|
||||
|
||||
## How to use
|
||||
|
||||
### How to use
|
||||
|
||||
#### G-Pass@k
|
||||
```python
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from opencompass.datasets.livemathbench import livemathbench_datasets
|
||||
from opencompass.datasets.livemathbench_gen import livemathbench_datasets
|
||||
|
||||
livemathbench_datasets[0].update(
|
||||
{
|
||||
'abbr': 'livemathbench_${k}x${n}',
|
||||
'path': '/path/to/data/dir',
|
||||
'k': 'k@pass', # the max value of k in k@pass
|
||||
'n': 'number of runs', # number of runs
|
||||
}
|
||||
)
|
||||
livemathbench_datasets[0]['eval_cfg']['evaluator'].update(
|
||||
{
|
||||
'model_name': 'Qwen/Qwen2.5-72B-Instruct',
|
||||
@ -40,38 +34,41 @@ livemathbench_datasets[0]['eval_cfg']['evaluator'].update(
|
||||
] # set url of evaluation models
|
||||
}
|
||||
)
|
||||
livemathbench_datasets[0]['infer_cfg']['inferencer'].update(dict(
|
||||
max_out_len=32768 # for o1-like models you need to update max_out_len
|
||||
))
|
||||
|
||||
```
|
||||
|
||||
> ❗️ At present, `extract_from_boxed` is used to extract answers from model responses. An LLM can also be used for extraction via the following parameters, but this part of the code has not been tested.
|
||||
|
||||
#### Greedy
|
||||
```python
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from opencompass.datasets.livemathbench_greedy_gen import livemathbench_datasets
|
||||
|
||||
livemathbench_datasets[0]['eval_cfg']['evaluator'].update(
|
||||
{
|
||||
'model_name': 'Qwen/Qwen2.5-72B-Instruct',
|
||||
'url': [
|
||||
'http://0.0.0.0:23333/v1',
|
||||
'...'
|
||||
], # set url of evaluation models
|
||||
|
||||
# for LLM-based extraction
|
||||
'use_extract_model': True,
|
||||
'post_model_name': 'oc-extractor',
|
||||
'post_url': [
|
||||
'http://0.0.0.0:21006/v1',
|
||||
'...'
|
||||
]
|
||||
] # set url of evaluation models
|
||||
}
|
||||
)
|
||||
livemathbench_datasets[0]['infer_cfg']['inferencer'].update(dict(
|
||||
max_out_len=32768 # for o1-like models you need to update max_out_len
|
||||
))
|
||||
|
||||
```
|
||||
|
||||
## Output Samples
|
||||
### Output Samples
|
||||
|
||||
| dataset | version | metric | mode | Qwen2.5-72B-Instruct |
|
||||
|----- | ----- | ----- | ----- | -----|
|
||||
| LiveMathBench | caed8f | 1@pass | gen | 26.07 |
|
||||
| LiveMathBench | caed8f | 1@pass/std | gen | xx.xx |
|
||||
| LiveMathBench | caed8f | 2@pass | gen | xx.xx |
|
||||
| LiveMathBench | caed8f | 2@pass/std | gen | xx.xx |
|
||||
| LiveMathBench | caed8f | pass-rate | gen | xx.xx |
|
||||
| LiveMathBench | 9befbf | G-Pass@16_0.0 | gen | xx.xx |
|
||||
| LiveMathBench | caed8f | G-Pass@16_0.25 | gen | xx.xx |
|
||||
| LiveMathBench | caed8f | G-Pass@16_0.5 | gen | xx.xx |
|
||||
| LiveMathBench | caed8f | G-Pass@16_0.75 | gen | xx.xx |
|
||||
| LiveMathBench | caed8f | G-Pass@16_1.0 | gen | xx.xx |
|
||||
|
||||
|
@ -0,0 +1,4 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from .livemathbench_greedy_gen_efb20d import livemathbench_datasets # noqa: F401, F403
|
@ -6,15 +6,15 @@ from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBen
|
||||
|
||||
|
||||
livemathbench_dataset = dict(
|
||||
abbr='LiveMathBench-v202412-greedy', # If you change the K and replication, you need to change the dataset name.
|
||||
type=LiveMathBenchDataset,
|
||||
path='opencompass/LiveMathBench',
|
||||
path='',
|
||||
k=1,
|
||||
replication=1,
|
||||
dataset_splits=['CNMO', 'CCEE', 'AMC', 'WLPMC'],
|
||||
dataset_languages=['cn', 'en'],
|
||||
cot=False,
|
||||
cot=True,
|
||||
version='202412',
|
||||
abbr='LiveMathBench-v202412',
|
||||
reader_cfg=dict(
|
||||
input_columns=['prompt'],
|
||||
output_column='answer'
|
||||
@ -31,7 +31,7 @@ livemathbench_dataset = dict(
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(
|
||||
type=GenInferencer,
|
||||
max_out_len=16384,
|
||||
max_out_len=8192
|
||||
),
|
||||
),
|
||||
eval_cfg=dict(
|
||||
@ -44,7 +44,7 @@ livemathbench_dataset = dict(
|
||||
extract_model_name='',
|
||||
k=[1],
|
||||
replication=1,
|
||||
thresholds=[0.0, 0.25, 0.5, 0.75, 1.0]
|
||||
thresholds=[0.0]
|
||||
)
|
||||
)
|
||||
)
|
Loading…
Reference in New Issue
Block a user