Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

Compare commits: f9c4e27a7f ... b504576fc1 (72 commits)
SHA1 |
---|
b504576fc1 |
d572761cef |
408f5caff4 |
6f3c670b99 |
c3779ebfc1 |
aa2b89b6f8 |
7a7a4517ab |
8c0ccf9a6b |
6f3b6a5d12 |
3d1760aba2 |
b84518c656 |
d60f59dcab |
9eaa1f6fec |
d590f557bb |
c492e49e79 |
2c79dc5227 |
345674f700 |
8aa18df368 |
44a7024ed5 |
508e2b0cb2 |
7bdd3c1904 |
6097186a95 |
d72df59363 |
c5048bfec7 |
a7f3ac20b2 |
ff3275edf0 |
a685ed7daf |
9ec23c145b |
ba0e32292c |
43b2c4ed76 |
d62b69aaef |
af8432e1d6 |
ddc9cc0afb |
37cbaf8d92 |
b6148aa198 |
527a80947b |
8c74e6a39e |
e8bc8c1e8c |
97010dc4ce |
dcbf899369 |
bf74f26603 |
455bb05d1b |
c69110361b |
a2093a81ef |
b2da1c08a8 |
65ff602cf5 |
75e7834b59 |
6a6a1a5c0b |
3f50b1dc49 |
20660ab507 |
12213207b6 |
6ac9b06bc2 |
a05f9da134 |
fd82bea747 |
bb58cfc85d |
b564e608b1 |
828fb745c9 |
f982d6278e |
3a9a384173 |
9b489e9ea0 |
dc8deb6af0 |
32d6859679 |
97236c8e97 |
f66b0b347a |
330a6e5ca7 |
f71eb78c72 |
0f46c35211 |
6118596362 |
56fc5748d8 |
8ea13bde6a |
c6c4ffc180 |
707ef2fef9 |
**.github/scripts/eval_regression_api.py** (6 changed lines, vendored)

@@ -24,9 +24,9 @@ models = [
abbr='lmdeploy-api-test',
type=OpenAISDK,
key='EMPTY',
openai_api_base='http://0.0.0.0:23333/v1',
path='internlm2',
tokenizer_path='internlm/internlm2_5-7b-chat',
openai_api_base='http://localhost:23333/v1',
path='internlm3',
tokenizer_path='internlm/internlm3-8b-instruct',
rpm_verbose=True,
meta_template=api_meta_template,
query_per_second=128,
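For orientation, the hunk above only lists the fields that were touched. A minimal sketch of how such an entry is typically assembled in an OpenCompass API config follows; the `models = [dict(...)]` wrapping mirrors the hunk context, while the `api_meta_template` contents and the `max_out_len`/`batch_size` values are assumptions added for completeness.

```python
# Sketch only, not the repository's exact file: reconstructed around the
# fields visible in the hunk above. The meta_template contents, max_out_len
# and batch_size are assumptions.
from opencompass.models import OpenAISDK

api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
])

models = [
    dict(
        abbr='lmdeploy-api-test',
        type=OpenAISDK,
        key='EMPTY',
        openai_api_base='http://localhost:23333/v1',
        path='internlm3',
        tokenizer_path='internlm/internlm3-8b-instruct',
        rpm_verbose=True,
        meta_template=api_meta_template,
        query_per_second=128,
        max_out_len=1024,  # assumed value
        batch_size=8,      # assumed value
    ),
]
```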
**.github/scripts/eval_regression_base_models.py** (18 changed lines, vendored)

@@ -11,18 +11,10 @@ with read_base():
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
winogrande_datasets # noqa: F401, E501
# read hf models - chat models
from opencompass.configs.models.chatglm.hf_glm4_9b import \
models as hf_glm4_9b_model # noqa: F401, E501
from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \
models as lmdeploy_glm4_9b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \
models as hf_deepseek_7b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_67b_base import \
models as hf_deepseek_67b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
models as hf_deepseek_moe_16b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
models as hf_deepseek_v2_lite_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \

@@ -49,12 +41,6 @@ with read_base():
models as hf_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
models as hf_internlm2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
models as hf_internlm2_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \
models as hf_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_20b import \
models as hf_internlm2_base_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \

@@ -65,14 +51,14 @@ with read_base():
models as lmdeploy_internlm2_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_20b import \
models as lmdeploy_internlm2_base_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama2_7b import \
models as hf_llama2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \
models as hf_llama3_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_8b import \
models as hf_llama3_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_70b import \
models as hf_llama3_70b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
models as lmdeploy_llama3_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
**.github/scripts/eval_regression_chat_models.py** (41 changed lines, vendored)

@@ -15,14 +15,24 @@ with read_base():
models as vllm_glm4_9b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
models as hf_deepseek_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import \
models as hf_deepseek_67b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
models as hf_deepseek_moe_16b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \
models as hf_deepseek_v2_lite_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_chat import \
models as lmdeploy_deepseek_67b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_8b import \
models as \
lmdeploy_deepseek_r1_distill_llama_8b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_70b import \
models as \
lmdeploy_deepseek_r1_distill_llama_70b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_1_5b import \
models as \
lmdeploy_deepseek_r1_distill_qwen_1_5b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b import \
models as \
lmdeploy_deepseek_r1_distill_qwen_32b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \
models as lmdeploy_deepseek_v2_5_1210_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_lite import \
models as lmdeploy_deepseek_v2_lite_model # noqa: F401, E501
from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
models as vllm_deepseek_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma2_2b_it import \

@@ -45,6 +55,8 @@ with read_base():
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \
models as hf_internlm2_5_20b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm3_8b_instruct import \
models as hf_internlm3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \

@@ -57,6 +69,8 @@ with read_base():
models as lmdeploy_internlm2_chat_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
models as lmdeploy_internlm2_chat_7b_sft_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \
models as lmdeploy_internlm3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \
models as vllm_internlm2_chat_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \

@@ -83,10 +97,6 @@ with read_base():
models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
models as hf_mistral_small_instruct_2409_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
models as hf_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \
models as hf_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
models as \
lmdeploy_mistral_large_instruct_2411_model # noqa: F401, E501

@@ -95,14 +105,19 @@ with read_base():
from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
models as \
lmdeploy_mistral_small_instruct_2409_model # noqa: F401, E501
from opencompass.configs.models.mistral.lmdeploy_mixtral_8x22b_instruct_v0_1 import \
models as \
lmdeploy_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \
models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mixtral_8x22b_instruct_v0_1 import \
models as vllm_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
models as lmdeploy_nemotron_70b_instruct_hf_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_4 import \
models as hf_phi_4_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \
models as hf_qwen2_5_0_5b_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \

@@ -142,6 +157,8 @@ with read_base():

from ...volc import infer as volc_infer # noqa: F401, E501

hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf'

race_datasets = [race_datasets[1]]
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
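The base and chat scripts above follow OpenCompass's config-as-Python convention: every `models as *_model` import and every `*_datasets` import lands in the module namespace, and the closing `datasets = sum(...)` line sweeps them up from `locals()`. A minimal sketch of that pattern, using two config modules that appear in the diffs above, follows; `read_base` is assumed to come from `mmengine.config`, as in typical OpenCompass configs.

```python
# Minimal sketch of the aggregation pattern used by these regression scripts.
# The two imported config modules are ones named in the diffs above; any
# configs under opencompass.configs.* follow the same naming convention.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
        winogrande_datasets  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \
        models as lmdeploy_internlm3_8b_instruct_model  # noqa: F401, E501

# Every imported *_model / *_datasets symbol now sits in locals(), so the
# final model and dataset lists can be swept up by naming convention.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
```

Sweeping `locals()` keeps the script declarative: adding or removing a model under test is a one-line import change, with no hand-maintained list to update.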
**.github/scripts/oc_score_assert.py** (40 changed lines, vendored)

@@ -175,10 +175,11 @@ class TestApibench:
class TestVolcFullbench:
"""Test cases for chat model."""

@pytest.mark.parametrize(
'model, dataset',
[(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'objective')])
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
'internlm2_5-7b-chat-turbomind', 'qwen2.5-7b-instruct-turbomind',
'internlm2_5-7b-chat-pytorch', 'qwen2.5-7b-instruct-pytorch',
'internlm3-8b-instruct-turbomind', 'internlm3-8b-instruct-pytorch'
] for p2 in dataset_list(p1, 'objective')])
@pytest.mark.chat_objective
def test_chat_objective(self, baseline_scores_fullbench, result_scores,
model, dataset):

@@ -245,10 +246,7 @@ class TestCmdCase:
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-hf', 'race-middle_accuracy'),
('internlm2_5-7b-hf', 'race-high_accuracy'),
('internlm2_5-7b-hf', 'demo_gsm8k_accuracy'),
('internlm2-1.8b-hf', 'race-middle_accuracy'),
('internlm2-1.8b-hf', 'race-high_accuracy'),
('internlm2-1.8b-hf', 'demo_gsm8k_accuracy')])
('internlm2_5-7b-hf', 'demo_gsm8k_accuracy')])
def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)

@@ -260,9 +258,9 @@ class TestCmdCase:
[('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'),
('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'),
('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'),
('internlm2-chat-1.8b-lmdeploy', 'race-middle_accuracy'),
('internlm2-chat-1.8b-lmdeploy', 'race-high_accuracy'),
('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k_accuracy')])
('internlm3-8b-instruct-lmdeploy', 'race-middle_accuracy'),
('internlm3-8b-instruct-lmdeploy', 'race-high_accuracy'),
('internlm3-8b-instruct-lmdeploy', 'demo_gsm8k_accuracy')])
def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)

@@ -280,13 +278,25 @@ class TestCmdCase:

@pytest.mark.case4
@pytest.mark.parametrize(
'model, dataset', [('internlm2_5-7b-chat_hf', 'race-middle_accuracy'),
('internlm2_5-7b-chat_hf', 'race-high_accuracy'),
('internlm2_5-7b-chat_hf', 'demo_gsm8k_accuracy')])
'model, dataset',
[('internlm3-8b-instruct_hf-lmdeploy', 'race-middle_accuracy'),
('internlm3-8b-instruct_hf-lmdeploy', 'race-high_accuracy'),
('internlm3-8b-instruct_hf-lmdeploy', 'demo_gsm8k_accuracy')])
def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score, dataset)
assert_score(model + '_batch', result_score, base_score, dataset)

@pytest.mark.case5
@pytest.mark.parametrize(
'model, dataset',
[('internlm3-8b-instruct_hf-vllm', 'race-middle_accuracy'),
('internlm3-8b-instruct_hf-vllm', 'race-high_accuracy'),
('internlm3-8b-instruct_hf-vllm', 'demo_gsm8k_accuracy')])
def test_cmd_case5(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score, dataset)


def assert_score(model_type, score, baseline, dataset: str = ''):
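Every TestCmdCase variant above reduces to the same three steps: look up the baseline value keyed by model and metric, look up the fresh result, and hand both to `assert_score`. A self-contained sketch of that shape is below; the tolerance rule, the example result numbers, and the fixture-free wiring are assumptions for illustration, not the repository's actual thresholds (the baseline values are taken from the YAML shown in the next file).

```python
# Sketch of the baseline-comparison pattern used in oc_score_assert.py.
# The tolerance rule and the example result scores are assumptions.
import pytest


def assert_score(model_type, score, baseline, dataset: str = ''):
    """Fail if the new score drifts beyond an assumed margin of the baseline."""
    assert score is not None, f'no result for {model_type}/{dataset}'
    margin = max(2.0, 0.05 * baseline)  # assumed tolerance, not the repo's rule
    assert baseline - margin <= score <= baseline + margin, (
        f'{model_type} {dataset}: got {score}, baseline {baseline}')


@pytest.mark.parametrize('model, dataset, result, baseline', [
    # baselines from oc_score_baseline.yaml below; results are placeholders
    ('internlm3-8b-instruct-lmdeploy', 'race-middle_accuracy', 93.1, 93.38),
    ('internlm3-8b-instruct-lmdeploy', 'demo_gsm8k_accuracy', 72.9, 73.44),
])
def test_scores_stay_near_baseline(model, dataset, result, baseline):
    assert_score(model, result, baseline, dataset)
```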
**.github/scripts/oc_score_baseline.yaml** (27 changed lines, vendored)

@@ -8,20 +8,25 @@ internlm2_5-7b_hf:
race-middle_accuracy: 91.78
race-high_accuracy: 90.02

internlm2-1.8b-hf:
demo_gsm8k_accuracy: 15.62
race-middle_accuracy: 71.66
race-high_accuracy: 66.38

internlm2_5-7b-chat-lmdeploy:
demo_gsm8k_accuracy: 89.06
demo_gsm8k_accuracy: 84.38
race-middle_accuracy: 92.76
race-high_accuracy: 90.54

internlm2-chat-1.8b-lmdeploy:
demo_gsm8k_accuracy: 31
race-middle_accuracy: 81.34
race-high_accuracy: 73.96
internlm3-8b-instruct-lmdeploy:
demo_gsm8k_accuracy: 73.44
race-middle_accuracy: 93.38
race-high_accuracy: 90.34

internlm3-8b-instruct_hf-lmdeploy:
demo_gsm8k_accuracy: 73.44
race-middle_accuracy: 93.38
race-high_accuracy: 90.34

internlm3-8b-instruct_hf-vllm:
demo_gsm8k_accuracy: 78.12
race-middle_accuracy: 92.20
race-high_accuracy: 89.88

internlm2_5-7b-chat_hf:
demo_gsm8k_accuracy: 87.50

@@ -30,5 +35,5 @@ internlm2_5-7b-chat_hf:

lmdeploy-api-test:
gsm8k_accuracy: 68.75
race-middle_accuracy: 87.50
race-middle_accuracy: 93.75
race-high_accuracy: 93.75
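The baseline file itself is flat YAML: one mapping per model abbreviation, with metric names as keys. A minimal sketch of how such a file can be loaded and queried follows; the file path is an assumption, and the real tests load it through pytest fixtures rather than directly.

```python
# Sketch: reading one metric from the baseline YAML shown above.
# The path is an assumption about where the file lives in a checkout.
import yaml

with open('.github/scripts/oc_score_baseline.yaml') as f:
    baseline_scores = yaml.safe_load(f)

# Prints 93.75 with the updated baseline from this change.
print(baseline_scores['lmdeploy-api-test']['race-middle_accuracy'])
```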
**.github/scripts/oc_score_baseline_fullbench.yaml** (727 changed lines, vendored)
@@ -9,7 +9,7 @@ internlm2_5-7b-chat-hf_fullbench:
|
||||
drop_accuracy: 81.25
|
||||
GPQA_diamond_accuracy: 25
|
||||
hellaswag_accuracy: 87.5
|
||||
TheoremQA_score: 18.75
|
||||
TheoremQA_score: 12.50
|
||||
musr_average_naive_average: 39.58
|
||||
korbench_single_naive_average: 40
|
||||
gsm8k_accuracy: 62.50
|
||||
@@ -24,8 +24,8 @@ internlm2_5-7b-chat-hf_fullbench:
|
||||
lcb_test_output_pass@1: 18.75
|
||||
bbh-logical_deduction_seven_objects_score: 50
|
||||
bbh-multistep_arithmetic_two_score: 68.75
|
||||
mmlu-other_naive_average: 72.6
|
||||
cmmlu-china-specific_naive_average: 76.25
|
||||
mmlu-other_accuracy: 72.6
|
||||
cmmlu-china-specific_accuracy: 76.25
|
||||
mmlu_pro_math_accuracy: 25
|
||||
ds1000_Pandas_accuracy: 12.5
|
||||
ds1000_Numpy_accuracy: 0
|
||||
@@ -39,14 +39,14 @@ internlm2_5-7b-chat-hf_fullbench:
|
||||
college_knowledge_naive_average: 87.5
|
||||
subjective:
|
||||
alignment_bench_v1_1_总分: 0.66
|
||||
alpaca_eval_total: 20
|
||||
arenahard_score: 50
|
||||
alpaca_eval_total: 20.00
|
||||
arenahard_score: 56.82
|
||||
Followbench_naive_average: 1
|
||||
CompassArena_naive_average: 43
|
||||
mtbench101_avg: 7.8
|
||||
wildbench_average: -12.78
|
||||
simpleqa_accuracy_given_attempted: 0
|
||||
chinese_simpleqa_given_attempted_accuracy: 1
|
||||
mtbench101_avg: 7.60
|
||||
wildbench_average: -14.58
|
||||
simpleqa_accuracy_given_attempted: 1.00
|
||||
chinese_simpleqa_given_attempted_accuracy: 0.90
|
||||
alignment_bench_v1_1_专业能力: 7.90
|
||||
alignment_bench_v1_1_数学计算: 0
|
||||
alignment_bench_v1_1_基本任务: 0
|
||||
@@ -55,11 +55,11 @@ internlm2_5-7b-chat-hf_fullbench:
|
||||
alignment_bench_v1_1_文本写作: 0
|
||||
alignment_bench_v1_1_角色扮演: 0
|
||||
alignment_bench_v1_1_综合问答: 0
|
||||
alpaca_eval_helpful_base: 20
|
||||
alpaca_eval_helpful_base: 20.00
|
||||
compassarena_language_naive_average: 35
|
||||
compassarena_knowledge_naive_average: 55
|
||||
compassarena_knowledge_naive_average: 60.00
|
||||
compassarena_reason_v2_naive_average: 40
|
||||
compassarena_math_v2_naive_average: 55
|
||||
compassarena_math_v2_naive_average: 50.00
|
||||
compassarena_creationv2_zh_naive_average: 30
|
||||
followbench_llmeval_en_HSR_AVG: 1
|
||||
followbench_llmeval_en_SSR_AVG: 1
|
||||
@@ -73,58 +73,58 @@ internlm2_5-7b-chat-hf_fullbench:
|
||||
followbench_llmeval_en_SSR_L3: 1
|
||||
followbench_llmeval_en_SSR_L4: 1
|
||||
followbench_llmeval_en_SSR_L5: 1
|
||||
simpleqa_f1: 0
|
||||
simpleqa_f1: 0.12
|
||||
|
||||
internlm2_5-7b-chat-turbomind_fullbench:
|
||||
objective:
|
||||
race-high_accuracy: 93.75
|
||||
ARC-c_accuracy: 93.75
|
||||
BoolQ_accuracy: 68.75
|
||||
BoolQ_accuracy: 75.00
|
||||
triviaqa_wiki_1shot_score: 50
|
||||
nq_open_1shot_score: 25
|
||||
IFEval_Prompt-level-strict-accuracy: 56.25
|
||||
drop_accuracy: 81.25
|
||||
GPQA_diamond_accuracy: 31.25
|
||||
drop_accuracy: 75
|
||||
GPQA_diamond_accuracy: 37.50
|
||||
hellaswag_accuracy: 81.25
|
||||
TheoremQA_score: 6.25
|
||||
TheoremQA_score: 12.5
|
||||
musr_average_naive_average: 39.58
|
||||
korbench_single_naive_average: 37.50
|
||||
korbench_single_naive_average: 40
|
||||
gsm8k_accuracy: 68.75
|
||||
math_accuracy: 68.75
|
||||
cmo_fib_accuracy: 6.25
|
||||
aime2024_accuracy: 6.25
|
||||
wikibench-wiki-single_choice_cncircular_perf_4: 50.00
|
||||
wikibench-wiki-single_choice_cncircular_perf_4: 25
|
||||
sanitized_mbpp_score: 68.75
|
||||
ds1000_naive_average: 16.96
|
||||
ds1000_naive_average: 15.18
|
||||
lcb_code_generation_pass@1: 12.5
|
||||
lcb_code_execution_pass@1: 43.75
|
||||
lcb_test_output_pass@1: 25.00
|
||||
bbh-logical_deduction_seven_objects_score: 50.00
|
||||
bbh-multistep_arithmetic_two_score: 68.75
|
||||
mmlu-other_naive_average: 69.71
|
||||
cmmlu-china-specific_naive_average: 75.83
|
||||
mmlu_pro_math_accuracy: 31.25
|
||||
ds1000_Pandas_accuracy: 0
|
||||
lcb_test_output_pass@1: 0.00
|
||||
bbh-logical_deduction_seven_objects_score: 62.50
|
||||
bbh-multistep_arithmetic_two_score: 62.50
|
||||
mmlu-other_accuracy: 73.08
|
||||
cmmlu-china-specific_accuracy: 75.42
|
||||
mmlu_pro_math_accuracy: 25.00
|
||||
ds1000_Pandas_accuracy: 0.00
|
||||
ds1000_Numpy_accuracy: 0
|
||||
ds1000_Tensorflow_accuracy: 12.5
|
||||
ds1000_Scipy_accuracy: 18.75
|
||||
ds1000_Sklearn_accuracy: 18.75
|
||||
ds1000_Pytorch_accuracy: 18.75
|
||||
ds1000_Matplotlib_accuracy: 50.00
|
||||
ds1000_Pytorch_accuracy: 12.50
|
||||
ds1000_Matplotlib_accuracy: 43.75
|
||||
openai_mmmlu_lite_AR-XY_accuracy: 37.5
|
||||
college_naive_average: 12.50
|
||||
college_knowledge_naive_average: 87.5
|
||||
subjective:
|
||||
alignment_bench_v1_1_总分: 0.70
|
||||
alpaca_eval_total: 0
|
||||
arenahard_score: 50
|
||||
alignment_bench_v1_1_总分: 0.72
|
||||
alpaca_eval_total: 20.00
|
||||
arenahard_score: 55.77
|
||||
Followbench_naive_average: 1
|
||||
CompassArena_naive_average: 38
|
||||
mtbench101_avg: 7.80
|
||||
wildbench_average: -4.86
|
||||
simpleqa_accuracy_given_attempted: 0
|
||||
CompassArena_naive_average: 39.00
|
||||
mtbench101_avg: 7.90
|
||||
wildbench_average: 0.00
|
||||
simpleqa_accuracy_given_attempted: 1.00
|
||||
chinese_simpleqa_given_attempted_accuracy: 1
|
||||
alignment_bench_v1_1_专业能力: 8.4
|
||||
alignment_bench_v1_1_专业能力: 8.70
|
||||
alignment_bench_v1_1_数学计算: 0
|
||||
alignment_bench_v1_1_基本任务: 0
|
||||
alignment_bench_v1_1_逻辑推理: 0
|
||||
@@ -132,12 +132,12 @@ internlm2_5-7b-chat-turbomind_fullbench:
|
||||
alignment_bench_v1_1_文本写作: 0
|
||||
alignment_bench_v1_1_角色扮演: 0
|
||||
alignment_bench_v1_1_综合问答: 0
|
||||
alpaca_eval_helpful_base: 0
|
||||
compassarena_language_naive_average: 35
|
||||
compassarena_knowledge_naive_average: 50
|
||||
compassarena_reason_v2_naive_average: 30
|
||||
compassarena_math_v2_naive_average: 50
|
||||
compassarena_creationv2_zh_naive_average: 25
|
||||
alpaca_eval_helpful_base: 20.00
|
||||
compassarena_language_naive_average: 25.00
|
||||
compassarena_knowledge_naive_average: 55.00
|
||||
compassarena_reason_v2_naive_average: 35.00
|
||||
compassarena_math_v2_naive_average: 55.00
|
||||
compassarena_creationv2_zh_naive_average: 25.00
|
||||
followbench_llmeval_en_HSR_AVG: 1
|
||||
followbench_llmeval_en_SSR_AVG: 1
|
||||
followbench_llmeval_en_HSR_L1: 1
|
||||
@@ -150,7 +150,7 @@ internlm2_5-7b-chat-turbomind_fullbench:
|
||||
followbench_llmeval_en_SSR_L3: 1
|
||||
followbench_llmeval_en_SSR_L4: 1
|
||||
followbench_llmeval_en_SSR_L5: 1
|
||||
simpleqa_f1: 0
|
||||
simpleqa_f1: 0.12
|
||||
|
||||
internlm2_5-7b-hf_fullbench:
|
||||
objective:
|
||||
@@ -162,7 +162,7 @@ internlm2_5-7b-hf_fullbench:
|
||||
drop_accuracy: 62.5
|
||||
GPQA_diamond_accuracy: 62.5
|
||||
hellaswag_accuracy: 93.75
|
||||
TheoremQA_score: 25
|
||||
TheoremQA_score: 18.75
|
||||
winogrande_accuracy: 75
|
||||
gsm8k_accuracy: 37.5
|
||||
GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
|
||||
@@ -188,23 +188,23 @@ internlm2_5-7b-turbomind_fullbench:
|
||||
triviaqa_wiki_1shot_score: 43.75
|
||||
nq_open_1shot_score: 43.75
|
||||
drop_accuracy: 62.5
|
||||
GPQA_diamond_accuracy: 62.5
|
||||
GPQA_diamond_accuracy: 68.75
|
||||
hellaswag_accuracy: 93.75
|
||||
TheoremQA_score: 25.00
|
||||
TheoremQA_score: 18.75
|
||||
winogrande_accuracy: 87.5
|
||||
gsm8k_accuracy: 62.50
|
||||
GaokaoBench_2010-2022_Math_II_MCQs_score: 81.25
|
||||
GaokaoBench_2010-2022_Math_II_MCQs_score: 93.75
|
||||
GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
|
||||
math_accuracy: 18.75
|
||||
wikibench-wiki-single_choice_cncircular_perf_4: 25
|
||||
math_accuracy: 6.25
|
||||
wikibench-wiki-single_choice_cncircular_perf_4: 0.00
|
||||
sanitized_mbpp_score: 62.50
|
||||
dingo_en_192_score: 31.25
|
||||
dingo_zh_170_score: 93.75
|
||||
mmlu-other_accuracy: 76.92
|
||||
cmmlu-china-specific_accuracy: 84.17
|
||||
dingo_en_192_score: 37.50
|
||||
dingo_zh_170_score: 100.00
|
||||
mmlu-other_accuracy: 78.37
|
||||
cmmlu-china-specific_accuracy: 83.33
|
||||
mmlu_pro_math_accuracy: 18.75
|
||||
bbh-logical_deduction_seven_objects_score: 50
|
||||
bbh-multistep_arithmetic_two_score: 56.25
|
||||
bbh-logical_deduction_seven_objects_score: 62.50
|
||||
bbh-multistep_arithmetic_two_score: 50.00
|
||||
college_naive_average: 12.5
|
||||
college_knowledge_naive_average: 87.5
|
||||
|
||||
@@ -230,19 +230,19 @@ internlm2_5-7b-turbomind:
|
||||
mmlu_naive_average: 71.44
|
||||
mmlu_pro_naive_average: 38.18
|
||||
openai_humaneval_humaneval_pass@1: 59.76
|
||||
openai_humaneval_v2_humaneval_pass@1: 51.22
|
||||
openai_humaneval_v2_humaneval_pass@1: 57.93
|
||||
sanitized_mbpp_score: 55.25
|
||||
dingo_en_192_score: 60.94
|
||||
dingo_zh_170_score: 67.65
|
||||
mmlu-stem_naive_average: 63.72
|
||||
mmlu-social-science_naive_average: 80.15
|
||||
mmlu-humanities_naive_average: 74.27
|
||||
mmlu-other_naive_average: 71.85
|
||||
cmmlu-stem_naive_average: 67.07
|
||||
cmmlu-social-science_naive_average: 81.49
|
||||
cmmlu-humanities_naive_average: 85.84
|
||||
cmmlu-other_naive_average: 82.69
|
||||
cmmlu-china-specific_naive_average: 79.88
|
||||
mmlu-stem_accuracy: 63.72
|
||||
mmlu-social-science_accuracy: 80.15
|
||||
mmlu-humanities_accuracy: 74.27
|
||||
mmlu-other_accuracy: 71.85
|
||||
cmmlu-stem_accuracy: 67.07
|
||||
cmmlu-social-science_accuracy: 81.49
|
||||
cmmlu-humanities_accuracy: 85.84
|
||||
cmmlu-other_accuracy: 82.69
|
||||
cmmlu-china-specific_accuracy: 79.88
|
||||
mmlu_pro_biology_accuracy: 58.58
|
||||
mmlu_pro_business_accuracy: 28.01
|
||||
mmlu_pro_chemistry_accuracy: 22.79
|
||||
@@ -257,17 +257,17 @@ internlm2_5-7b-turbomind:
|
||||
mmlu_pro_physics_accuracy: 26.02
|
||||
mmlu_pro_psychology_accuracy: 52.76
|
||||
mmlu_pro_other_accuracy: 42.21
|
||||
college_naive_average: 10.67
|
||||
college_naive_average: 7.00
|
||||
high_naive_average: 6.67
|
||||
middle_naive_average: 26.67
|
||||
primary_naive_average: 60
|
||||
primary_naive_average: 64.00
|
||||
arithmetic_naive_average: 55
|
||||
mathbench-a (average)_naive_average: 31.8
|
||||
college_knowledge_naive_average: 62.34
|
||||
high_knowledge_naive_average: 59.83
|
||||
college_knowledge_naive_average: 58.23
|
||||
high_knowledge_naive_average: 52.51
|
||||
middle_knowledge_naive_average: 71.15
|
||||
primary_knowledge_naive_average: 66.55
|
||||
mathbench-t (average)_naive_average: 64.97
|
||||
primary_knowledge_naive_average: 60.48
|
||||
mathbench-t (average)_naive_average: 60.19
|
||||
long_context:
|
||||
Single-Needle-Retrieval(S-RT)-32000_naive_average: 100
|
||||
Single-Needle-Retrieval-EN-32000_naive_average: 100
|
||||
@@ -281,12 +281,12 @@ internlm2_5-7b-turbomind:
|
||||
longbench_naive_average: 46.19
|
||||
longbench_zh_naive_average: 49.3
|
||||
longbench_en_naive_average: 43.97
|
||||
longbench_single-document-qa_naive_average: 42.84
|
||||
longbench_multi-document-qa_naive_average: 37.29
|
||||
longbench_summarization_naive_average: 23.21
|
||||
longbench_few-shot-learning_naive_average: 61.67
|
||||
longbench_synthetic-tasks_naive_average: 60.05
|
||||
longbench_code-completion_naive_average: 52.09
|
||||
longbench_single-document-qa_score: 42.84
|
||||
longbench_multi-document-qa_score: 41.25
|
||||
longbench_summarization_score: 23.21
|
||||
longbench_few-shot-learning_score: 61.67
|
||||
longbench_synthetic-tasks_score: 60.05
|
||||
longbench_code-completion_score: 52.09
|
||||
|
||||
internlm2_5-7b-chat-turbomind:
|
||||
objective:
|
||||
@@ -309,7 +309,7 @@ internlm2_5-7b-chat-turbomind:
|
||||
GaokaoBench_weighted_average: 78.6
|
||||
math_accuracy: 61
|
||||
cmo_fib_accuracy: 11
|
||||
aime2024_accuracy: 6.67
|
||||
aime2024_accuracy: 3.33
|
||||
Mathbench_naive_average: 64.23
|
||||
wikibench-wiki-single_choice_cncircular_perf_4: 31.32
|
||||
cmmlu_naive_average: 74.3
|
||||
@@ -322,20 +322,20 @@ internlm2_5-7b-chat-turbomind:
|
||||
lcb_code_generation_pass@1: 17.75
|
||||
lcb_code_execution_pass@1: 32.57
|
||||
lcb_test_output_pass@1: 26.13
|
||||
bigcodebench_hard_instruct_pass@1: 8.45
|
||||
bigcodebench_hard_instruct_pass@1: 3.38
|
||||
bigcodebench_hard_complete_pass@1: 5.06
|
||||
teval_naive_average: 80
|
||||
SciCode_sub_accuracy: 5.56
|
||||
qa_dingo_cn_score: 99.01
|
||||
mmlu-stem_naive_average: 68.2
|
||||
mmlu-social-science_naive_average: 75.8
|
||||
mmlu-humanities_naive_average: 69.3
|
||||
mmlu-other_naive_average: 71.3
|
||||
cmmlu-stem_naive_average: 66.64
|
||||
cmmlu-social-science_naive_average: 76
|
||||
cmmlu-humanities_naive_average: 77.9
|
||||
cmmlu-other_naive_average: 77.25
|
||||
cmmlu-china-specific_naive_average: 73.6
|
||||
mmlu-stem_accuracy: 68.2
|
||||
mmlu-social-science_accuracy: 75.8
|
||||
mmlu-humanities_accuracy: 69.3
|
||||
mmlu-other_accuracy: 71.3
|
||||
cmmlu-stem_accuracy: 66.64
|
||||
cmmlu-social-science_accuracy: 76
|
||||
cmmlu-humanities_accuracy: 77.9
|
||||
cmmlu-other_accuracy: 77.25
|
||||
cmmlu-china-specific_accuracy: 73.6
|
||||
mmlu_pro_biology_accuracy: 66.67
|
||||
mmlu_pro_business_accuracy: 47.91
|
||||
mmlu_pro_chemistry_accuracy: 35
|
||||
@@ -384,14 +384,14 @@ internlm2_5-7b-chat-turbomind:
|
||||
college_knowledge_naive_average: 67.1
|
||||
high_knowledge_naive_average: 70
|
||||
middle_knowledge_naive_average: 80
|
||||
primary_knowledge_naive_average: 87
|
||||
primary_knowledge_naive_average: 90.12
|
||||
mathbench-t (average)_naive_average: 76
|
||||
subjective:
|
||||
alignment_bench_v1_1_总分: 5.68
|
||||
alpaca_eval_total: 25.96
|
||||
arenahard_score: 17.15
|
||||
Followbench_naive_average: 0.81
|
||||
CompassArena_naive_average: 34.61
|
||||
CompassArena_naive_average: 39.49
|
||||
FoFo_naive_average: 0.38
|
||||
mtbench101_avg: 8.01
|
||||
wildbench_average: -10.49
|
||||
@@ -409,11 +409,11 @@ internlm2_5-7b-chat-turbomind:
|
||||
alpaca_eval_koala: 28.21
|
||||
alpaca_eval_oasst: 23.4
|
||||
alpaca_eval_selfinstruct: 30.95
|
||||
alpaca_eval_vicuna: 25
|
||||
compassarena_language_naive_average: 52.5
|
||||
alpaca_eval_vicuna: 25.00
|
||||
compassarena_language_naive_average: 53.00
|
||||
compassarena_knowledge_naive_average: 36
|
||||
compassarena_reason_v2_naive_average: 35
|
||||
compassarena_math_v2_naive_average: 19.91
|
||||
compassarena_math_v2_naive_average: 16.07
|
||||
compassarena_creationv2_zh_naive_average: 43.64
|
||||
fofo_test_prompts_overall: 0.35
|
||||
fofo_test_prompts_cn_overall: 0.41
|
||||
@@ -448,9 +448,536 @@ internlm2_5-7b-chat-1m-turbomind:
|
||||
babilong_32k_naive_average: 48.9
|
||||
babilong_128k_naive_average: 40.8
|
||||
babilong_256k_naive_average: 23.5
|
||||
longbench_single-document-qa_naive_average: 43.56
|
||||
longbench_multi-document-qa_naive_average: 46.24
|
||||
longbench_summarization_naive_average: 24.32
|
||||
longbench_few-shot-learning_naive_average: 51.67
|
||||
longbench_synthetic-tasks_naive_average: 66.83
|
||||
longbench_code-completion_naive_average: 45.99
|
||||
longbench_single-document-qa_score: 43.56
|
||||
longbench_multi-document-qa_score: 46.24
|
||||
longbench_summarization_score: 24.32
|
||||
longbench_few-shot-learning_score: 51.67
|
||||
longbench_synthetic-tasks_score: 66.83
|
||||
longbench_code-completion_score: 45.99
|
||||
|
||||
|
||||
qwen2.5-7b-instruct-turbomind:
|
||||
objective:
|
||||
race-high_accuracy: 84.99
|
||||
ARC-c_accuracy: 92.2
|
||||
BoolQ_accuracy: 86.7
|
||||
triviaqa_wiki_1shot_score: 53.06
|
||||
nq_open_1shot_score: 17.51
|
||||
mmmlu_lite_naive_average: 54.96
|
||||
IFEval_Prompt-level-strict-accuracy: 71.53
|
||||
drop_accuracy: 80.07
|
||||
bbh_naive_average: 68.81
|
||||
GPQA_diamond_accuracy: 34.34
|
||||
hellaswag_accuracy: 85.42
|
||||
TheoremQA_score: 18.38
|
||||
musr_average_naive_average: 43.44
|
||||
korbench_single_naive_average: 39.44
|
||||
ARC_Prize_Public_Evaluation_accuracy: 0
|
||||
gsm8k_accuracy: 92.57
|
||||
GaokaoBench_weighted_average: 80.14
|
||||
math_accuracy: 73.58
|
||||
cmo_fib_accuracy: 25
|
||||
aime2024_accuracy: 16.67
|
||||
Mathbench_naive_average: 77.33
|
||||
wikibench-wiki-single_choice_cncircular_perf_4: 34.9
|
||||
cmmlu_naive_average: 75.97
|
||||
mmlu_naive_average: 76.01
|
||||
mmlu_pro_naive_average: 56.12
|
||||
openai_humaneval_humaneval_pass@1: 83.54
|
||||
sanitized_mbpp_score: 74.71
|
||||
humanevalx_naive_average: 48.29
|
||||
ds1000_naive_average: 18.66
|
||||
lcb_code_generation_pass@1: 39.5
|
||||
lcb_code_execution_pass@1: 42.38
|
||||
lcb_test_output_pass@1: 50.68
|
||||
bigcodebench_hard_instruct_pass@1: 16.22
|
||||
bigcodebench_hard_complete_pass@1: 11.49
|
||||
teval_naive_average: 79.72
|
||||
SciCode_sub_accuracy: 10.76
|
||||
qa_dingo_cn_score: 99.01
|
||||
mmlu_accuracy: 76.01
|
||||
mmlu-stem_accuracy: 77.59
|
||||
mmlu-social-science_accuracy: 79.02
|
||||
mmlu-humanities_accuracy: 72.07
|
||||
mmlu-other_accuracy: 74.86
|
||||
cmmlu_accuracy: 75.97
|
||||
cmmlu-stem_accuracy: 73.09
|
||||
cmmlu-social-science_accuracy: 75.95
|
||||
cmmlu-humanities_accuracy: 76.53
|
||||
cmmlu-other_accuracy: 78.79
|
||||
cmmlu-china-specific_accuracy: 73.17
|
||||
mmlu_pro_accuracy: 56.12
|
||||
mmlu_pro_biology_accuracy: 71.41
|
||||
mmlu_pro_business_accuracy: 67.68
|
||||
mmlu_pro_chemistry_accuracy: 54.59
|
||||
mmlu_pro_computer_science_accuracy: 58.29
|
||||
mmlu_pro_economics_accuracy: 66.82
|
||||
mmlu_pro_engineering_accuracy: 42.41
|
||||
mmlu_pro_health_accuracy: 55.87
|
||||
mmlu_pro_history_accuracy: 46.46
|
||||
mmlu_pro_law_accuracy: 28.97
|
||||
mmlu_pro_math_accuracy: 73.13
|
||||
mmlu_pro_philosophy_accuracy: 44.89
|
||||
mmlu_pro_physics_accuracy: 58.43
|
||||
mmlu_pro_psychology_accuracy: 63.16
|
||||
mmlu_pro_other_accuracy: 53.57
|
||||
humanevalx-python_pass@1: 50
|
||||
humanevalx-cpp_pass@1: 42.07
|
||||
humanevalx-go_pass@1: 0
|
||||
humanevalx-java_pass@1: 53.05
|
||||
humanevalx-js_pass@1: 75
|
||||
ds1000_Pandas_accuracy: 14.09
|
||||
ds1000_Numpy_accuracy: 8.18
|
||||
ds1000_Tensorflow_accuracy: 17.78
|
||||
ds1000_Scipy_accuracy: 15.09
|
||||
ds1000_Sklearn_accuracy: 10.43
|
||||
ds1000_Pytorch_accuracy: 4.41
|
||||
ds1000_Matplotlib_accuracy: 60.65
|
||||
mmmlu_lite_accuracy: 54.96
|
||||
openai_mmmlu_lite_AR-XY_accuracy: 42.32
|
||||
openai_mmmlu_lite_BN-BD_accuracy: 42.25
|
||||
openai_mmmlu_lite_DE-DE_accuracy: 59.93
|
||||
openai_mmmlu_lite_ES-LA_accuracy: 66.53
|
||||
openai_mmmlu_lite_FR-FR_accuracy: 66.88
|
||||
openai_mmmlu_lite_HI-IN_accuracy: 49.26
|
||||
openai_mmmlu_lite_ID-ID_accuracy: 61.26
|
||||
openai_mmmlu_lite_IT-IT_accuracy: 65.47
|
||||
openai_mmmlu_lite_JA-JP_accuracy: 61.54
|
||||
openai_mmmlu_lite_KO-KR_accuracy: 60.28
|
||||
openai_mmmlu_lite_PT-BR_accuracy: 55.51
|
||||
openai_mmmlu_lite_SW-KE_accuracy: 36.42
|
||||
openai_mmmlu_lite_YO-NG_accuracy: 32.14
|
||||
openai_mmmlu_lite_ZH-CN_accuracy: 69.61
|
||||
college_naive_average: 44.33
|
||||
high_naive_average: 59
|
||||
middle_naive_average: 78
|
||||
primary_naive_average: 85.67
|
||||
arithmetic_naive_average: 75.67
|
||||
mathbench-a (average)_naive_average: 69.27
|
||||
college_knowledge_naive_average: 83.86
|
||||
high_knowledge_naive_average: 80.29
|
||||
middle_knowledge_naive_average: 84.26
|
||||
primary_knowledge_naive_average: 93.16
|
||||
mathbench-t (average)_naive_average: 85.39
|
||||
|
||||
|
||||
|
||||
|
||||
internlm2_5-7b-chat-pytorch:
|
||||
objective:
|
||||
race-high_accuracy: 86.39
|
||||
ARC-c_accuracy: 90.51
|
||||
BoolQ_accuracy: 88.01
|
||||
triviaqa_wiki_1shot_score: 64.77
|
||||
nq_open_1shot_score: 22.71
|
||||
mmmlu_lite_naive_average: 45.02
|
||||
IFEval_Prompt-level-strict-accuracy: 56.56
|
||||
drop_accuracy: 75.46
|
||||
bbh_naive_average: 73.34
|
||||
GPQA_diamond_accuracy: 32.83
|
||||
hellaswag_accuracy: 94.81
|
||||
TheoremQA_score: 23.88
|
||||
musr_average_naive_average: 51.31
|
||||
korbench_single_naive_average: 32
|
||||
ARC_Prize_Public_Evaluation_accuracy: 0.01
|
||||
gsm8k_accuracy: 86.96
|
||||
GaokaoBench_weighted_average: 78.05
|
||||
math_accuracy: 60.34
|
||||
cmo_fib_accuracy: 12.98
|
||||
aime2024_accuracy: 3.33
|
||||
Mathbench_naive_average: 64.82
|
||||
wikibench-wiki-single_choice_cncircular_perf_4: 31.7
|
||||
cmmlu_naive_average: 74.24
|
||||
mmlu_naive_average: 70.2
|
||||
mmlu_pro_naive_average: 45.39
|
||||
openai_humaneval_humaneval_pass@1: 70.12
|
||||
sanitized_mbpp_score: 64.59
|
||||
humanevalx_naive_average: 38.78
|
||||
ds1000_naive_average: 14.19
|
||||
lcb_code_generation_pass@1: 16.5
|
||||
lcb_code_execution_pass@1: 33.82
|
||||
lcb_test_output_pass@1: 22.62
|
||||
bigcodebench_hard_instruct_pass@1: 6.08
|
||||
bigcodebench_hard_complete_pass@1: 6.76
|
||||
teval_naive_average: 79.73
|
||||
SciCode_sub_accuracy: 3.47
|
||||
qa_dingo_cn_score: 100
|
||||
mmlu_accuracy: 70.2
|
||||
mmlu-stem_accuracy: 67.73
|
||||
mmlu-social-science_accuracy: 75.49
|
||||
mmlu-humanities_accuracy: 68.56
|
||||
mmlu-other_accuracy: 70.58
|
||||
cmmlu_accuracy: 74.24
|
||||
cmmlu-stem_accuracy: 66.7
|
||||
cmmlu-social-science_accuracy: 75.88
|
||||
cmmlu-humanities_accuracy: 77.56
|
||||
cmmlu-other_accuracy: 77.52
|
||||
cmmlu-china-specific_accuracy: 73.46
|
||||
mmlu_pro_accuracy: 45.39
|
||||
mmlu_pro_biology_accuracy: 65.83
|
||||
mmlu_pro_business_accuracy: 51.96
|
||||
mmlu_pro_chemistry_accuracy: 36.84
|
||||
mmlu_pro_computer_science_accuracy: 48.29
|
||||
mmlu_pro_economics_accuracy: 56.16
|
||||
mmlu_pro_engineering_accuracy: 29.1
|
||||
mmlu_pro_health_accuracy: 44.5
|
||||
mmlu_pro_history_accuracy: 42.26
|
||||
mmlu_pro_law_accuracy: 24.98
|
||||
mmlu_pro_math_accuracy: 54.85
|
||||
mmlu_pro_philosophy_accuracy: 39.28
|
||||
mmlu_pro_physics_accuracy: 37.41
|
||||
mmlu_pro_psychology_accuracy: 58.27
|
||||
mmlu_pro_other_accuracy: 45.78
|
||||
humanevalx-python_pass@1: 56.1
|
||||
humanevalx-cpp_pass@1: 20.73
|
||||
humanevalx-go_pass@1: 0
|
||||
humanevalx-java_pass@1: 59.15
|
||||
humanevalx-js_pass@1: 57.93
|
||||
ds1000_Pandas_accuracy: 8.93
|
||||
ds1000_Numpy_accuracy: 4.09
|
||||
ds1000_Tensorflow_accuracy: 11.11
|
||||
ds1000_Scipy_accuracy: 7.55
|
||||
ds1000_Sklearn_accuracy: 7.83
|
||||
ds1000_Pytorch_accuracy: 8.82
|
||||
ds1000_Matplotlib_accuracy: 50.97
|
||||
mmmlu_lite_accuracy: 45.02
|
||||
openai_mmmlu_lite_AR-XY_accuracy: 18.6
|
||||
openai_mmmlu_lite_BN-BD_accuracy: 27.58
|
||||
openai_mmmlu_lite_DE-DE_accuracy: 51.23
|
||||
openai_mmmlu_lite_ES-LA_accuracy: 56.63
|
||||
openai_mmmlu_lite_FR-FR_accuracy: 58.11
|
||||
openai_mmmlu_lite_HI-IN_accuracy: 33.82
|
||||
openai_mmmlu_lite_ID-ID_accuracy: 50.39
|
||||
openai_mmmlu_lite_IT-IT_accuracy: 50.39
|
||||
openai_mmmlu_lite_JA-JP_accuracy: 50.95
|
||||
openai_mmmlu_lite_KO-KR_accuracy: 45.05
|
||||
openai_mmmlu_lite_PT-BR_accuracy: 57.89
|
||||
openai_mmmlu_lite_SW-KE_accuracy: 32.14
|
||||
openai_mmmlu_lite_YO-NG_accuracy: 32.14
|
||||
openai_mmmlu_lite_ZH-CN_accuracy: 65.33
|
||||
college_naive_average: 21
|
||||
high_naive_average: 47
|
||||
middle_naive_average: 59.67
|
||||
primary_naive_average: 72.33
|
||||
arithmetic_naive_average: 62
|
||||
mathbench-a (average)_naive_average: 53.13
|
||||
college_knowledge_naive_average: 68.99
|
||||
high_knowledge_naive_average: 70.06
|
||||
middle_knowledge_naive_average: 78.53
|
||||
primary_knowledge_naive_average: 88.49
|
||||
mathbench-t (average)_naive_average: 76.51
|
||||
|
||||
|
||||
qwen2.5-7b-instruct-pytorch:
|
||||
objective:
|
||||
race-high_accuracy: 85.16
|
||||
ARC-c_accuracy: 90.85
|
||||
BoolQ_accuracy: 86.61
|
||||
triviaqa_wiki_1shot_score: 52.96
|
||||
nq_open_1shot_score: 17.62
|
||||
mmmlu_lite_naive_average: 54.7
|
||||
IFEval_Prompt-level-strict-accuracy: 71.35
|
||||
drop_accuracy: 80.23
|
||||
bbh_naive_average: 68.88
|
||||
GPQA_diamond_accuracy: 36.36
|
||||
hellaswag_accuracy: 85.49
|
||||
TheoremQA_score: 18.38
|
||||
musr_average_naive_average: 43.3
|
||||
korbench_single_naive_average: 39.44
|
||||
ARC_Prize_Public_Evaluation_accuracy: 0
|
||||
gsm8k_accuracy: 91.66
|
||||
GaokaoBench_weighted_average: 80.02
|
||||
math_accuracy: 73.74
|
||||
cmo_fib_accuracy: 22.60
|
||||
aime2024_accuracy: 13.33
|
||||
Mathbench_naive_average: 77.08
|
||||
wikibench-wiki-single_choice_cncircular_perf_4: 34
|
||||
cmmlu_naive_average: 75.9
|
||||
mmlu_naive_average: 76.27
|
||||
mmlu_pro_naive_average: 56.14
|
||||
openai_humaneval_humaneval_pass@1: 84.76
|
||||
sanitized_mbpp_score: 74.71
|
||||
humanevalx_naive_average: 48.17
|
||||
ds1000_naive_average: 18.57
|
||||
lcb_code_generation_pass@1: 38.75
|
||||
lcb_code_execution_pass@1: 42.38
|
||||
lcb_test_output_pass@1: 50.45
|
||||
bigcodebench_hard_instruct_pass@1: 16.89
|
||||
bigcodebench_hard_complete_pass@1: 12.16
|
||||
teval_naive_average: 79.46
|
||||
SciCode_sub_accuracy: 10.42
|
||||
qa_dingo_cn_score: 100
|
||||
mmlu_accuracy: 76.27
|
||||
mmlu-stem_accuracy: 77.75
|
||||
mmlu-social-science_accuracy: 78.65
|
||||
mmlu-humanities_accuracy: 73.12
|
||||
mmlu-other_accuracy: 75.05
|
||||
cmmlu_accuracy: 75.9
|
||||
cmmlu-stem_accuracy: 73.41
|
||||
cmmlu-social-science_accuracy: 75.97
|
||||
cmmlu-humanities_accuracy: 76.42
|
||||
cmmlu-other_accuracy: 78.15
|
||||
cmmlu-china-specific_accuracy: 73.27
|
||||
mmlu_pro_accuracy: 56.14
|
||||
mmlu_pro_biology_accuracy: 72.25
|
||||
mmlu_pro_business_accuracy: 66.16
|
||||
mmlu_pro_chemistry_accuracy: 55.65
|
||||
mmlu_pro_computer_science_accuracy: 60.24
|
||||
mmlu_pro_economics_accuracy: 66.82
|
||||
mmlu_pro_engineering_accuracy: 41.38
|
||||
mmlu_pro_health_accuracy: 54.89
|
||||
mmlu_pro_history_accuracy: 46.46
|
||||
mmlu_pro_law_accuracy: 29.06
|
||||
mmlu_pro_math_accuracy: 73.58
|
||||
mmlu_pro_philosophy_accuracy: 44.89
|
||||
mmlu_pro_physics_accuracy: 60.05
|
||||
mmlu_pro_psychology_accuracy: 61.9
|
||||
mmlu_pro_other_accuracy: 52.6
|
||||
humanevalx-python_pass@1: 51.83
|
||||
humanevalx-cpp_pass@1: 42.68
|
||||
humanevalx-go_pass@1: 0
|
||||
humanevalx-java_pass@1: 73.78
|
||||
humanevalx-js_pass@1: 72.56
|
||||
ds1000_Pandas_accuracy: 14.09
|
||||
ds1000_Numpy_accuracy: 8.64
|
||||
ds1000_Tensorflow_accuracy: 17.78
|
||||
ds1000_Scipy_accuracy: 15.09
|
||||
ds1000_Sklearn_accuracy: 8.7
|
||||
ds1000_Pytorch_accuracy: 4.41
|
||||
ds1000_Matplotlib_accuracy: 61.29
|
||||
mmmlu_lite_accuracy: 54.7
|
||||
openai_mmmlu_lite_AR-XY_accuracy: 42.32
|
||||
openai_mmmlu_lite_BN-BD_accuracy: 42.18
|
||||
openai_mmmlu_lite_DE-DE_accuracy: 60
|
||||
openai_mmmlu_lite_ES-LA_accuracy: 66.18
|
||||
openai_mmmlu_lite_FR-FR_accuracy: 66.88
|
||||
openai_mmmlu_lite_HI-IN_accuracy: 48.63
|
||||
openai_mmmlu_lite_ID-ID_accuracy: 61.26
|
||||
openai_mmmlu_lite_IT-IT_accuracy: 65.26
|
||||
openai_mmmlu_lite_JA-JP_accuracy: 60.7
|
||||
openai_mmmlu_lite_KO-KR_accuracy: 60.63
|
||||
openai_mmmlu_lite_PT-BR_accuracy: 54.46
|
||||
openai_mmmlu_lite_SW-KE_accuracy: 36
|
||||
openai_mmmlu_lite_YO-NG_accuracy: 31.86
|
||||
openai_mmmlu_lite_ZH-CN_accuracy: 69.4
|
||||
college_naive_average: 48.33
|
||||
high_naive_average: 59.33
|
||||
middle_naive_average: 76.67
|
||||
primary_naive_average: 86.67
|
||||
arithmetic_naive_average: 74.33
|
||||
mathbench-a (average)_naive_average: 69.07
|
||||
college_knowledge_naive_average: 83.54
|
||||
high_knowledge_naive_average: 80.82
|
||||
middle_knowledge_naive_average: 83.79
|
||||
primary_knowledge_naive_average: 92.22
|
||||
mathbench-t (average)_naive_average: 85.1
|
||||
|
||||
|
||||
internlm3-8b-instruct-turbomind:
|
||||
objective:
|
||||
race-high_accuracy: 89.22
|
||||
ARC-c_accuracy: 92.54
|
||||
BoolQ_accuracy: 86.45
|
||||
triviaqa_wiki_1shot_score: 60.72
|
||||
nq_open_1shot_score: 20.25
|
||||
mmmlu_lite_naive_average: 41.82
|
||||
IFEval_Prompt-level-strict-accuracy: 77.45
|
||||
drop_accuracy: 83.27
|
||||
bbh_naive_average: 55.22
|
||||
GPQA_diamond_accuracy: 37.88
|
||||
hellaswag_accuracy: 91.28
|
||||
TheoremQA_score: 20.12
|
||||
musr_average_naive_average: 36.86
|
||||
korbench_single_naive_average: 41.2
|
||||
ARC_Prize_Public_Evaluation_accuracy: 0.06
|
||||
gsm8k_accuracy: 91.28
|
||||
GaokaoBench_weighted_average: 86.59
|
||||
math_accuracy: 76.96
|
||||
cmo_fib_accuracy: 38.46
|
||||
aime2024_accuracy: 13.33
|
||||
Mathbench_naive_average: 78.96
|
||||
wikibench-wiki-single_choice_cncircular_perf_4: 37.45
|
||||
cmmlu_naive_average: 83.33
|
||||
mmlu_naive_average: 76.21
|
||||
mmlu_pro_naive_average: 57.96
|
||||
openai_humaneval_humaneval_pass@1: 81.71
|
||||
sanitized_mbpp_score: 69.65
|
||||
humanevalx_naive_average: 40.73
|
||||
ds1000_naive_average: 27.23
|
||||
lcb_code_generation_pass@1: 34.75
|
||||
lcb_code_execution_pass@1: 49.9
|
||||
lcb_test_output_pass@1: 48.19
|
||||
bigcodebench_hard_instruct_pass@1: 13.51
|
||||
bigcodebench_hard_complete_pass@1: 15.54
|
||||
teval_naive_average: 82.86
|
||||
SciCode_sub_accuracy: 11.11
|
||||
qa_dingo_cn_score: 100
|
||||
mmlu_accuracy: 76.21
|
||||
mmlu-stem_accuracy: 77.7
|
||||
mmlu-social-science_accuracy: 80.98
|
||||
mmlu-humanities_accuracy: 70.83
|
||||
mmlu-other_accuracy: 75.01
|
||||
cmmlu_accuracy: 83.33
|
||||
cmmlu-stem_accuracy: 79.66
|
||||
cmmlu-social-science_accuracy: 83.39
|
||||
cmmlu-humanities_accuracy: 84.73
|
||||
cmmlu-other_accuracy: 86.2
|
||||
cmmlu-china-specific_accuracy: 81.77
|
||||
mmlu_pro_accuracy: 57.96
|
||||
mmlu_pro_biology_accuracy: 75.45
|
||||
mmlu_pro_business_accuracy: 64.64
|
||||
mmlu_pro_chemistry_accuracy: 59.81
|
||||
mmlu_pro_computer_science_accuracy: 60.24
|
||||
mmlu_pro_economics_accuracy: 68.6
|
||||
mmlu_pro_engineering_accuracy: 44.79
|
||||
mmlu_pro_health_accuracy: 58.31
|
||||
mmlu_pro_history_accuracy: 49.87
|
||||
mmlu_pro_law_accuracy: 32.43
|
||||
mmlu_pro_math_accuracy: 70.17
|
||||
mmlu_pro_philosophy_accuracy: 46.89
|
||||
mmlu_pro_physics_accuracy: 59.58
|
||||
mmlu_pro_psychology_accuracy: 66.29
|
||||
mmlu_pro_other_accuracy: 54.33
|
||||
humanevalx-python_pass@1: 43.9
|
||||
humanevalx-cpp_pass@1: 20.12
|
||||
humanevalx-go_pass@1: 0
|
||||
humanevalx-java_pass@1: 40.85
|
||||
humanevalx-js_pass@1: 65.24
|
||||
ds1000_Pandas_accuracy: 16.49
|
||||
ds1000_Numpy_accuracy: 34.09
|
||||
ds1000_Tensorflow_accuracy: 26.67
|
||||
ds1000_Scipy_accuracy: 17.92
|
||||
ds1000_Sklearn_accuracy: 20.87
|
||||
ds1000_Pytorch_accuracy: 19.12
|
||||
ds1000_Matplotlib_accuracy: 55.48
|
||||
mmmlu_lite_accuracy: 41.82
|
||||
openai_mmmlu_lite_AR-XY_accuracy: 32.56
|
||||
openai_mmmlu_lite_BN-BD_accuracy: 4.56
|
||||
openai_mmmlu_lite_DE-DE_accuracy: 24.91
|
||||
openai_mmmlu_lite_ES-LA_accuracy: 51.09
|
||||
openai_mmmlu_lite_FR-FR_accuracy: 61.68
|
||||
openai_mmmlu_lite_HI-IN_accuracy: 24.98
|
||||
openai_mmmlu_lite_ID-ID_accuracy: 44.56
|
||||
openai_mmmlu_lite_IT-IT_accuracy: 52.35
|
||||
openai_mmmlu_lite_JA-JP_accuracy: 51.02
|
||||
openai_mmmlu_lite_KO-KR_accuracy: 47.93
|
||||
openai_mmmlu_lite_PT-BR_accuracy: 53.89
|
||||
openai_mmmlu_lite_SW-KE_accuracy: 33.47
|
||||
openai_mmmlu_lite_YO-NG_accuracy: 33.47
|
||||
openai_mmmlu_lite_ZH-CN_accuracy: 69.05
|
||||
college_naive_average: 45.67
|
||||
high_naive_average: 64.67
|
||||
middle_naive_average: 82.33
|
||||
primary_naive_average: 90.33
|
||||
arithmetic_naive_average: 74
|
||||
mathbench-a (average)_naive_average: 71.4
|
||||
college_knowledge_naive_average: 85.28
|
||||
high_knowledge_naive_average: 79.43
|
||||
middle_knowledge_naive_average: 87.9
|
||||
primary_knowledge_naive_average: 93.42
|
||||
mathbench-t (average)_naive_average: 86.51
|
||||
|
||||
|
||||
internlm3-8b-instruct-pytorch:
|
||||
objective:
|
||||
race-high_accuracy: 89.02
|
||||
ARC-c_accuracy: 93.56
|
||||
BoolQ_accuracy: 86.67
|
||||
triviaqa_wiki_1shot_score: 60.54
|
||||
nq_open_1shot_score: 20.3
|
||||
mmmlu_lite_naive_average: 42.6
|
||||
IFEval_Prompt-level-strict-accuracy: 79.11
|
||||
drop_accuracy: 83.32
|
||||
bbh_naive_average: 54.76
|
||||
GPQA_diamond_accuracy: 33.84
|
||||
hellaswag_accuracy: 91.31
|
||||
TheoremQA_score: 18
|
||||
musr_average_naive_average: 36.62
|
||||
korbench_single_naive_average: 41.84
|
||||
ARC_Prize_Public_Evaluation_accuracy: 0.06
|
||||
gsm8k_accuracy: 90.67
|
||||
GaokaoBench_weighted_average: 86.27
|
||||
math_accuracy: 76.68
|
||||
cmo_fib_accuracy: 33.65
|
||||
aime2024_accuracy: 10
|
||||
Mathbench_naive_average: 78.92
|
||||
wikibench-wiki-single_choice_cncircular_perf_4: 37.35
|
||||
cmmlu_naive_average: 83.11
|
||||
mmlu_naive_average: 76.23
|
||||
mmlu_pro_naive_average: 58.16
|
||||
openai_humaneval_humaneval_pass@1: 82.32
|
||||
sanitized_mbpp_score: 70.04
|
||||
humanevalx_naive_average: 25.49
|
||||
ds1000_naive_average: 27.84
|
||||
lcb_code_generation_pass@1: 34.5
|
||||
lcb_code_execution_pass@1: 48.02
|
||||
lcb_test_output_pass@1: 47.74
|
||||
bigcodebench_hard_instruct_pass@1: 12.84
|
||||
bigcodebench_hard_complete_pass@1: 15.54
|
||||
teval_naive_average: 82.86
|
||||
SciCode_sub_accuracy: 9.38
|
||||
qa_dingo_cn_score: 100
|
||||
mmlu_accuracy: 76.23
|
||||
mmlu-stem_accuracy: 78.08
|
||||
mmlu-social-science_accuracy: 80.31
|
||||
mmlu-humanities_accuracy: 71.38
|
||||
mmlu-other_accuracy: 74.63
|
||||
cmmlu_accuracy: 83.11
|
||||
cmmlu-stem_accuracy: 79.42
|
||||
cmmlu-social-science_accuracy: 83.34
|
||||
cmmlu-humanities_accuracy: 83.95
|
||||
cmmlu-other_accuracy: 86.22
|
||||
cmmlu-china-specific_accuracy: 81.5
|
||||
mmlu_pro_accuracy: 58.16
|
||||
mmlu_pro_biology_accuracy: 74.62
|
||||
mmlu_pro_business_accuracy: 65.02
|
||||
mmlu_pro_chemistry_accuracy: 60.69
|
||||
mmlu_pro_computer_science_accuracy: 61.46
|
||||
mmlu_pro_economics_accuracy: 68.25
|
||||
mmlu_pro_engineering_accuracy: 45.3
|
||||
mmlu_pro_health_accuracy: 60.15
|
||||
mmlu_pro_history_accuracy: 50.66
|
||||
mmlu_pro_law_accuracy: 31.7
|
||||
mmlu_pro_math_accuracy: 70.32
|
||||
mmlu_pro_philosophy_accuracy: 47.7
|
||||
mmlu_pro_physics_accuracy: 59.51
|
||||
mmlu_pro_psychology_accuracy: 65.41
|
||||
mmlu_pro_other_accuracy: 53.46
|
||||
humanevalx-python_pass@1: 42.68
|
||||
humanevalx-cpp_pass@1: 19.51
|
||||
humanevalx-go_pass@1: 0
|
||||
humanevalx-java_pass@1: 0.00
|
||||
humanevalx-js_pass@1: 64.02
|
||||
ds1000_Pandas_accuracy: 14.09
|
||||
ds1000_Numpy_accuracy: 35
|
||||
ds1000_Tensorflow_accuracy: 24.44
|
||||
ds1000_Scipy_accuracy: 20.75
|
||||
ds1000_Sklearn_accuracy: 21.74
|
||||
ds1000_Pytorch_accuracy: 22.06
|
||||
ds1000_Matplotlib_accuracy: 56.77
|
||||
mmmlu_lite_accuracy: 42.6
|
||||
openai_mmmlu_lite_AR-XY_accuracy: 32.84
|
||||
openai_mmmlu_lite_BN-BD_accuracy: 10.46
|
||||
openai_mmmlu_lite_DE-DE_accuracy: 24.56
|
||||
openai_mmmlu_lite_ES-LA_accuracy: 50.95
|
||||
openai_mmmlu_lite_FR-FR_accuracy: 61.05
|
||||
openai_mmmlu_lite_HI-IN_accuracy: 30.6
|
||||
openai_mmmlu_lite_ID-ID_accuracy: 45.89
|
||||
openai_mmmlu_lite_IT-IT_accuracy: 51.79
|
||||
openai_mmmlu_lite_JA-JP_accuracy: 51.65
|
||||
openai_mmmlu_lite_KO-KR_accuracy: 48.77
|
||||
openai_mmmlu_lite_PT-BR_accuracy: 52.7
|
||||
openai_mmmlu_lite_SW-KE_accuracy: 32.91
|
||||
openai_mmmlu_lite_YO-NG_accuracy: 32.84
|
||||
openai_mmmlu_lite_ZH-CN_accuracy: 69.33
|
||||
college_naive_average: 47
|
||||
high_naive_average: 66.67
|
||||
middle_naive_average: 81.67
|
||||
primary_naive_average: 89.33
|
||||
arithmetic_naive_average: 73.67
|
||||
mathbench-a (average)_naive_average: 71.67
|
||||
college_knowledge_naive_average: 82.91
|
||||
high_knowledge_naive_average: 79.86
|
||||
middle_knowledge_naive_average: 88.92
|
||||
primary_knowledge_naive_average: 92.96
|
||||
mathbench-t (average)_naive_average: 86.16
|
||||
|
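The fullbench baseline above adds one more level of nesting: each model maps to sections such as objective, subjective or long_context, and the tests iterate over `dataset_list(model, 'objective')` when parametrizing. A plausible sketch of such a helper follows; its behavior is an assumption inferred from how it is called in oc_score_assert.py, not the repository's actual implementation, and the file path is likewise an assumption.

```python
# Hypothetical helper in the spirit of dataset_list(model, section): return
# the metric keys recorded for one model under one section of the fullbench
# baseline. Path and behavior are assumptions for illustration.
import yaml


def dataset_list(model, section,
                 path='.github/scripts/oc_score_baseline_fullbench.yaml'):
    with open(path) as f:
        baseline = yaml.safe_load(f)
    return list(baseline.get(model, {}).get(section, {}).keys())


# Would yield names like 'race-high_accuracy', 'GPQA_diamond_accuracy', ...
print(dataset_list('internlm3-8b-instruct-turbomind', 'objective'))
```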
**.github/scripts/oc_score_baseline_testrange.yaml** (272 changed lines, vendored)
@@ -1,7 +1,7 @@
chat:
glm-4-9b-chat-hf:
gsm8k_accuracy: 68.75
race-high_accuracy: 90.62
gsm8k_accuracy: 56.25
race-high_accuracy: 84.38
glm-4-9b-chat-turbomind:
gsm8k_accuracy: 71.88
race-high_accuracy: 90.62

@@ -11,11 +11,14 @@ chat:
deepseek-7b-chat-hf:
gsm8k_accuracy: 46.88
race-high_accuracy: 81.25
deepseek-moe-16b-chat-hf:
gsm8k_accuracy: 50
race-high_accuracy: 68.75
deepseek-r1-distill-llama-8b-turbomind:
gsm8k_accuracy: 34.38
race-high_accuracy: 81.25
deepseek-r1-distill-qwen-1_5b-turbomind:
gsm8k_accuracy: 28.12
race-high_accuracy: 53.12
deepseek-7b-chat-vllm:
gsm8k_accuracy: 50
gsm8k_accuracy: 56.25
race-high_accuracy: 78.12
gemma2-2b-it-hf:
gsm8k_accuracy: 50
@@ -30,40 +33,46 @@ chat:
gsm8k_accuracy: 40.62
race-high_accuracy: 68.75
gemma-2-9b-it-turbomind:
gsm8k_accuracy: 71.88
gsm8k_accuracy: 68.75
race-high_accuracy: 84.38
gemma-2-27b-it-turbomind:
gsm8k_accuracy: 78.12
race-high_accuracy: 93.75
gemma-7b-it-vllm:
gsm8k_accuracy: 46.88
gsm8k_accuracy: 28.12
race-high_accuracy: 68.75
internlm2_5-7b-chat-hf:
gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
internlm3-8b-instruct-hf:
gsm8k_accuracy: 65.62
race-high_accuracy: 87.5
internlm2_5-7b-chat-turbomind:
gsm8k_accuracy: 87.50
gsm8k_accuracy: 81.25
race-high_accuracy: 90.62
internlm2-chat-1.8b-turbomind:
gsm8k_accuracy: 28.12
gsm8k_accuracy: 25.00
race-high_accuracy: 84.38
internlm2-chat-1.8b-sft-turbomind:
gsm8k_accuracy: 21.88
gsm8k_accuracy: 34.38
race-high_accuracy: 84.38
internlm2-chat-7b-lmdeploy:
gsm8k_accuracy: 53.12
race-high_accuracy: 84.38
gsm8k_accuracy: 59.38
race-high_accuracy: 87.50
internlm2-chat-7b-sft-turbomind:
gsm8k_accuracy: 53.12
race-high_accuracy: 90.62
gsm8k_accuracy: 56.25
race-high_accuracy: 87.50
internlm3-8b-instruct-turbomind:
gsm8k_accuracy: 65.62
race-high_accuracy: 87.5
internlm2-chat-7b-vllm:
gsm8k_accuracy: 43.75
race-high_accuracy: 84.38
gsm8k_accuracy: 53.12
race-high_accuracy: 87.50
llama-3_1-8b-instruct-hf:
gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
llama-3_2-3b-instruct-hf:
gsm8k_accuracy: 68.75
gsm8k_accuracy: 71.88
race-high_accuracy: 81.25
llama-3-8b-instruct-hf:
gsm8k_accuracy: 68.75
@@ -72,14 +81,14 @@ chat:
gsm8k_accuracy: 18.75
race-high_accuracy: 46.88
llama-3_1-8b-instruct-turbomind:
gsm8k_accuracy: 78.12
gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
llama-3_2-3b-instruct-turbomind:
gsm8k_accuracy: 65.62
race-high_accuracy: 81.25
llama-3-8b-instruct-turbomind:
gsm8k_accuracy: 71.88
race-high_accuracy: 87.5
gsm8k_accuracy: 65.62
race-high_accuracy: 84.38
mistral-7b-instruct-v0.2-hf:
gsm8k_accuracy: 40.62
race-high_accuracy: 75

@@ -91,16 +100,13 @@ chat:
race-high_accuracy: 81.25
mistral-nemo-instruct-2407-turbomind:
gsm8k_accuracy: 71.88
race-high_accuracy: 78.12
race-high_accuracy: 75
mistral-7b-instruct-v0.1-vllm:
gsm8k_accuracy: 34.38
race-high_accuracy: 68.75
race-high_accuracy: 65.62
mistral-7b-instruct-v0.2-vllm:
gsm8k_accuracy: 31.25
race-high_accuracy: 75
phi-3-mini-4k-instruct-hf:
gsm8k_accuracy: 81.25
race-high_accuracy: 87.50
gsm8k_accuracy: 28.12
race-high_accuracy: 78.12
qwen2.5-0.5b-instruct-hf:
gsm8k_accuracy: 34.38
race-high_accuracy: 46.88
@ -109,9 +115,9 @@ chat:
|
||||
race-high_accuracy: 90.62
|
||||
qwen2.5-0.5b-instruct-turbomind:
|
||||
gsm8k_accuracy: 28.12
|
||||
race-high_accuracy: 50
|
||||
race-high_accuracy: 43.75
|
||||
qwen2.5-3b-instruct-turbomind:
|
||||
gsm8k_accuracy: 59.38
|
||||
gsm8k_accuracy: 56.25
|
||||
race-high_accuracy: 90.62
|
||||
qwen1.5-0.5b-chat-hf:
|
||||
gsm8k_accuracy: 0
|
||||
@ -123,13 +129,13 @@ chat:
|
||||
gsm8k_accuracy: 68.75
|
||||
race-high_accuracy: 90.62
|
||||
qwen2-1.5b-instruct-turbomind:
|
||||
gsm8k_accuracy: 53.12
|
||||
gsm8k_accuracy: 56.25
|
||||
race-high_accuracy: 84.38
|
||||
qwen2-7b-instruct-turbomind:
|
||||
gsm8k_accuracy: 81.25
|
||||
race-high_accuracy: 90.62
|
||||
gsm8k_accuracy: 75.00
|
||||
race-high_accuracy: 87.50
|
||||
qwen1.5-0.5b-chat-vllm:
|
||||
gsm8k_accuracy: 3.12
|
||||
gsm8k_accuracy: 6.25
|
||||
race-high_accuracy: 53.12
|
||||
yi-1.5-6b-chat-hf:
|
||||
gsm8k_accuracy: 65.62
|
||||
@ -138,16 +144,16 @@ chat:
|
||||
gsm8k_accuracy: 75
|
||||
race-high_accuracy: 93.75
|
||||
yi-1.5-6b-chat-turbomind:
|
||||
gsm8k_accuracy: 62.5
|
||||
gsm8k_accuracy: 59.38
|
||||
race-high_accuracy: 84.38
|
||||
yi-1.5-9b-chat-turbomind:
|
||||
gsm8k_accuracy: 71.88
|
||||
gsm8k_accuracy: 78.12
|
||||
race-high_accuracy: 93.75
|
||||
deepseek-v2-lite-chat-hf:
|
||||
gsm8k_accuracy: 46.88
|
||||
deepseek-v2_lite-chat-turbomind:
|
||||
gsm8k_accuracy: 43.75
|
||||
race-high_accuracy: 71.88
|
||||
gemma2-27b-it-hf:
|
||||
gsm8k_accuracy: 75
|
||||
gsm8k_accuracy: 71.88
|
||||
race-high_accuracy: 93.75
|
||||
internlm2_5-20b-chat-hf:
|
||||
gsm8k_accuracy: 84.38
|
||||
@ -159,49 +165,53 @@ chat:
|
||||
gsm8k_accuracy: 81.25
|
||||
race-high_accuracy: 87.50
|
||||
mistral-small-instruct-2409-turbomind:
|
||||
gsm8k_accuracy: 78.12
|
||||
race-high_accuracy: 87.50
|
||||
phi-4:
|
||||
gsm8k_accuracy: 81.25
|
||||
race-high_accuracy: 87.50
|
||||
qwen2.5-14b-instruct-hf:
|
||||
gsm8k_accuracy: 71.88
|
||||
race-high_accuracy: 96.88
|
||||
qwen2.5-14b-instruct-turbomind:
|
||||
gsm8k_accuracy: 68.75
|
||||
race-high_accuracy: 93.75
|
||||
yi-1.5-34b-chat-turbomind:
|
||||
gsm8k_accuracy: 78.12
|
||||
race-high_accuracy: 93.75
|
||||
deepseek-67b-chat-hf:
|
||||
gsm8k_accuracy: 71.88
|
||||
race-high_accuracy: 78.12
|
||||
race-high_accuracy: 96.88
|
||||
yi-1.5-34b-chat-turbomind:
|
||||
gsm8k_accuracy: 71.88
|
||||
race-high_accuracy: 93.75
|
||||
deepseek-67b-chat-turbomind:
|
||||
gsm8k_accuracy: 71.88
|
||||
race-high_accuracy: 75.00
|
||||
deepseek-r1-distill-qwen-32b-turbomind:
|
||||
gsm8k_accuracy: 31.25
|
||||
race-high_accuracy: 90.62
|
||||
llama-3_3-70b-instruct-turbomind:
|
||||
gsm8k_accuracy: 93.75
|
||||
race-high_accuracy: 87.5
|
||||
mixtral-8x7b-instruct-v0.1-hf:
|
||||
gsm8k_accuracy: 59.38
|
||||
race-high_accuracy: 81.25
|
||||
mixtral-large-instruct-2411-turbomind:
|
||||
gsm8k_accuracy: 90.62
|
||||
gsm8k_accuracy: 87.50
|
||||
race-high_accuracy: 93.75
|
||||
nvidia-3_1-Nemotron-70b-instruct-HF-turbomind:
|
||||
gsm8k_accuracy: 87.5
|
||||
race-high_accuracy: 46.88
|
||||
gsm8k_accuracy: 90.62
|
||||
race-high_accuracy: 53.12
|
||||
qwen2.5-72b-instruct-turbomind:
|
||||
gsm8k_accuracy: 75
|
||||
race-high_accuracy: 93.75
|
||||
gsm8k_accuracy: 78.12
|
||||
race-high_accuracy: 90.62
|
||||
deepseek-r1-distill-llama-70b-turbomind:
|
||||
gsm8k_accuracy: 50.00
|
||||
race-high_accuracy: 87.50
|
||||
deepseek-v2_5-1210-turbomind:
|
||||
gsm8k_accuracy: 90.62
|
||||
race-high_accuracy: 84.38
|
||||
mixtral-8x22b-instruct-v0.1-hf:
|
||||
gsm8k_accuracy: 81.25
|
||||
race-high_accuracy: 81.25
|
||||
mixtral-8x22b-instruct-v0.1-turbomind:
|
||||
gsm8k_accuracy: 75.00
|
||||
race-high_accuracy: 78.12
|
||||
mixtral-8x22b-instruct-v0.1-vllm:
|
||||
gsm8k_accuracy: 78.12
|
||||
race-high_accuracy: 78.12
|
||||
base:
|
||||
glm-4-9b-hf:
|
||||
gsm8k_accuracy: 68.75
|
||||
GPQA_diamond_accuracy: 31.25
|
||||
race-high_accuracy: 93.75
|
||||
winogrande_accuracy: 84.38
|
||||
glm-4-9b-turbomind:
|
||||
gsm8k_accuracy: 62.5
|
||||
gsm8k_accuracy: 59.38
|
||||
GPQA_diamond_accuracy: 28.12
|
||||
race-high_accuracy: 93.75
|
||||
winogrande_accuracy: 84.38
|
||||
@ -210,18 +220,13 @@ base:
|
||||
GPQA_diamond_accuracy: 0
|
||||
race-high_accuracy: 46.88
|
||||
winogrande_accuracy: 71.88
|
||||
deepseek-moe-16b-base-hf:
|
||||
gsm8k_accuracy: 21.88
|
||||
GPQA_diamond_accuracy: 0
|
||||
race-high_accuracy: 21.88
|
||||
winogrande_accuracy: 65.62
|
||||
deepseek-7b-base-turbomind:
|
||||
gsm8k_accuracy: 21.88
|
||||
GPQA_diamond_accuracy: 0
|
||||
race-high_accuracy: 46.88
|
||||
gsm8k_accuracy: 18.75
|
||||
GPQA_diamond_accuracy: 3.12
|
||||
race-high_accuracy: 50.00
|
||||
winogrande_accuracy: 84.38
|
||||
deepseek-moe-16b-base-vllm:
|
||||
gsm8k_accuracy: 21.88
|
||||
gsm8k_accuracy: 25.00
|
||||
GPQA_diamond_accuracy: 0
|
||||
race-high_accuracy: 25
|
||||
winogrande_accuracy: 68.75
|
||||
@ -245,16 +250,21 @@ base:
|
||||
GPQA_diamond_accuracy: 3.12
|
||||
race-high_accuracy: 65.62
|
||||
winogrande_accuracy: 71.88
|
||||
gemma-2-9b-turbomind:
|
||||
gsm8k_accuracy: 68.75
|
||||
GPQA_diamond_accuracy: 0
|
||||
race-high_accuracy: 84.38
|
||||
winogrande_accuracy: 81.25
|
||||
gemma-2b-vllm:
|
||||
gsm8k_accuracy: 15.62
|
||||
GPQA_diamond_accuracy: 3.12
|
||||
race-high_accuracy:
|
||||
winogrande_accuracy:
|
||||
race-high_accuracy: 28.12
|
||||
winogrande_accuracy: 68.75
|
||||
gemma-7b-vllm:
|
||||
gsm8k_accuracy: 53.12
|
||||
GPQA_diamond_accuracy: 9.38
|
||||
race-high_accuracy:
|
||||
winogrande_accuracy:
|
||||
gsm8k_accuracy: 59.38
|
||||
GPQA_diamond_accuracy: 6.25
|
||||
race-high_accuracy: 81.25
|
||||
winogrande_accuracy: 81.25
|
||||
internlm2_5-7b-hf:
|
||||
gsm8k_accuracy: 37.5
|
||||
GPQA_diamond_accuracy: 25
|
||||
@ -265,31 +275,26 @@ base:
|
||||
GPQA_diamond_accuracy: 18.75
|
||||
race-high_accuracy: 62.5
|
||||
winogrande_accuracy: 78.12
|
||||
internlm2-base-7b-hf:
|
||||
gsm8k_accuracy: 3.12
|
||||
GPQA_diamond_accuracy: 21.88
|
||||
race-high_accuracy: 75
|
||||
winogrande_accuracy: 65.62
|
||||
internlm2-1.8b-turbomind:
|
||||
gsm8k_accuracy: 12.5
|
||||
gsm8k_accuracy: 12.50
|
||||
GPQA_diamond_accuracy: 9.38
|
||||
race-high_accuracy: 71.88
|
||||
winogrande_accuracy: 78.12
|
||||
winogrande_accuracy: 75
|
||||
internlm2_5-7b-turbomind:
|
||||
gsm8k_accuracy: 62.50
|
||||
GPQA_diamond_accuracy: 34.38
|
||||
gsm8k_accuracy: 62.5
|
||||
GPQA_diamond_accuracy: 31.25
|
||||
race-high_accuracy: 93.75
|
||||
winogrande_accuracy: 87.50
|
||||
winogrande_accuracy: 87.5
|
||||
internlm2-7b-turbomind:
|
||||
gsm8k_accuracy: 53.12
|
||||
GPQA_diamond_accuracy: 21.88
|
||||
race-high_accuracy: 71.88
|
||||
winogrande_accuracy: 84.38
|
||||
GPQA_diamond_accuracy: 25.00
|
||||
race-high_accuracy: 78.12
|
||||
winogrande_accuracy: 71.88
|
||||
internlm2-base-7b-turbomind:
|
||||
gsm8k_accuracy: 37.50
|
||||
GPQA_diamond_accuracy: 28.12
|
||||
race-high_accuracy: 81.25
|
||||
winogrande_accuracy: 75
|
||||
gsm8k_accuracy: 25.00
|
||||
GPQA_diamond_accuracy: 34.38
|
||||
race-high_accuracy: 71.88
|
||||
winogrande_accuracy: 62.50
|
||||
llama-2-7b-hf:
|
||||
gsm8k_accuracy: 21.88
|
||||
GPQA_diamond_accuracy: 21.88
|
||||
@ -311,10 +316,10 @@ base:
|
||||
race-high_accuracy: 78.12
|
||||
winogrande_accuracy: 78.12
|
||||
llama-3-8b-turbomind:
|
||||
gsm8k_accuracy: 50
|
||||
gsm8k_accuracy: 46.88
|
||||
GPQA_diamond_accuracy: 12.50
|
||||
race-high_accuracy: 65.62
|
||||
winogrande_accuracy: 78.12
|
||||
winogrande_accuracy: 81.25
|
||||
mistral-7b-v0.3-hf:
|
||||
gsm8k_accuracy: 31.25
|
||||
GPQA_diamond_accuracy: 6.25
|
||||
@ -326,15 +331,15 @@ base:
|
||||
race-high_accuracy: 87.5
|
||||
winogrande_accuracy: 71.88
|
||||
qwen2.5-1.5b-turbomind:
|
||||
gsm8k_accuracy: 62.50
|
||||
GPQA_diamond_accuracy: 12.50
|
||||
gsm8k_accuracy: 59.38
|
||||
GPQA_diamond_accuracy: 21.88
|
||||
race-high_accuracy: 78.12
|
||||
winogrande_accuracy: 68.75
|
||||
qwen2.5-7b-turbomind:
|
||||
gsm8k_accuracy: 75.00
|
||||
GPQA_diamond_accuracy: 25
|
||||
race-high_accuracy: 87.5
|
||||
winogrande_accuracy: 71.88
|
||||
qwen2.5-7b-turbomind:
|
||||
gsm8k_accuracy: 78.12
|
||||
GPQA_diamond_accuracy: 21.88
|
||||
race-high_accuracy: 87.5
|
||||
winogrande_accuracy: 75.00
|
||||
qwen1.5-moe-a2.7b-hf:
|
||||
gsm8k_accuracy: 62.5
|
||||
GPQA_diamond_accuracy: 18.75
|
||||
@ -357,19 +362,19 @@ base:
|
||||
winogrande_accuracy: 68.75
|
||||
qwen2-1.5b-turbomind:
|
||||
gsm8k_accuracy: 56.25
|
||||
GPQA_diamond_accuracy: 9.38
|
||||
GPQA_diamond_accuracy: 12.50
|
||||
race-high_accuracy: 81.25
|
||||
winogrande_accuracy: 75
|
||||
qwen2-7b-turbomind:
|
||||
gsm8k_accuracy: 75.00
|
||||
gsm8k_accuracy: 65.62
|
||||
GPQA_diamond_accuracy: 12.5
|
||||
race-high_accuracy: 87.5
|
||||
winogrande_accuracy: 71.88
|
||||
winogrande_accuracy: 75
|
||||
qwen1.5-0.5b-vllm:
|
||||
gsm8k_accuracy: 9.38
|
||||
GPQA_diamond_accuracy: 0
|
||||
GPQA_diamond_accuracy: 3.12
|
||||
race-high_accuracy: 56.25
|
||||
winogrande_accuracy: 62.5
|
||||
winogrande_accuracy: 59.38
|
||||
yi-1.5-6b-hf:
|
||||
gsm8k_accuracy: 62.5
|
||||
GPQA_diamond_accuracy: 3.12
|
||||
@ -381,28 +386,13 @@ base:
|
||||
race-high_accuracy: 87.5
|
||||
winogrande_accuracy: 59.38
|
||||
yi-1.5-9b-turbomind:
|
||||
gsm8k_accuracy: 78.12
|
||||
gsm8k_accuracy: 75.00
|
||||
GPQA_diamond_accuracy: 40.62
|
||||
race-high_accuracy: 87.5
|
||||
winogrande_accuracy: 71.88
|
||||
deepseek-v2-lite-hf:
|
||||
gsm8k_accuracy: 31.25
|
||||
GPQA_diamond_accuracy: 28.12
|
||||
race-high_accuracy: 59.38
|
||||
winogrande_accuracy: 71.88
|
||||
internlm2-20b-hf:
|
||||
gsm8k_accuracy: 56.25
|
||||
GPQA_diamond_accuracy: 15.62
|
||||
race-high_accuracy: 68.75
|
||||
winogrande_accuracy: 75
|
||||
internlm2-base-20b-hf:
|
||||
gsm8k_accuracy: 12.5
|
||||
GPQA_diamond_accuracy: 9.38
|
||||
race-high_accuracy: 84.38
|
||||
winogrande_accuracy: 65.62
|
||||
internlm2-20b-turbomind:
|
||||
gsm8k_accuracy: 71.88
|
||||
GPQA_diamond_accuracy: 15.62
|
||||
GPQA_diamond_accuracy: 18.75
|
||||
race-high_accuracy: 68.75
|
||||
winogrande_accuracy: 81.25
|
||||
qwen2.5-14b-hf:
|
||||
@ -416,37 +406,27 @@ base:
|
||||
race-high_accuracy: 93.75
|
||||
winogrande_accuracy: 78.12
|
||||
qwen2.5-32b-turbomind:
|
||||
gsm8k_accuracy: 84.38
|
||||
GPQA_diamond_accuracy: 28.12
|
||||
gsm8k_accuracy: 90.62
|
||||
GPQA_diamond_accuracy: 31.25
|
||||
race-high_accuracy: 93.75
|
||||
winogrande_accuracy: 81.25
|
||||
deepseek-67b-base-hf:
|
||||
gsm8k_accuracy: 59.38
|
||||
GPQA_diamond_accuracy: 31.25
|
||||
race-high_accuracy: 81.25
|
||||
winogrande_accuracy: 90.62
|
||||
deepseek-67b-base-turbomind:
|
||||
gsm8k_accuracy: 56.25
|
||||
GPQA_diamond_accuracy: 28.12
|
||||
race-high_accuracy: 81.25
|
||||
winogrande_accuracy: 84.38
|
||||
gsm8k_accuracy: 62.50
|
||||
GPQA_diamond_accuracy: 31.25
|
||||
race-high_accuracy: 78.12
|
||||
winogrande_accuracy: 81.25
|
||||
llama-3-70b-turbomind:
|
||||
gsm8k_accuracy: 59.38
|
||||
GPQA_diamond_accuracy: 9.38
|
||||
gsm8k_accuracy: 56.25
|
||||
GPQA_diamond_accuracy: 15.62
|
||||
race-high_accuracy: 93.75
|
||||
winogrande_accuracy: 84.38
|
||||
qwen2.5-72b-turbomind:
|
||||
gsm8k_accuracy: 84.38
|
||||
GPQA_diamond_accuracy: 34.38
|
||||
GPQA_diamond_accuracy: 40.62
|
||||
race-high_accuracy: 93.75
|
||||
winogrande_accuracy: 87.5
|
||||
deepseek-v2-turbomind:
|
||||
gsm8k_accuracy: 65.62
|
||||
GPQA_diamond_accuracy: 15.62
|
||||
race-high_accuracy: 93.75
|
||||
winogrande_accuracy: 84.38
|
||||
llama-3-70b-hf:
|
||||
gsm8k_accuracy: 62.5
|
||||
GPQA_diamond_accuracy: 3.12
|
||||
race-high_accuracy: 93.75
|
||||
winogrande_accuracy: 84.38
|
||||
winogrande_accuracy: 81.25
|
||||
|
69  .github/workflows/daily-run-test.yml (vendored)
@@ -17,7 +17,7 @@ on:
|
||||
required: false
|
||||
description: 'whether to build lmdeploy'
|
||||
type: boolean
|
||||
default: true
|
||||
default: false
|
||||
repo_org_lmdeploy:
|
||||
required: false
|
||||
description: 'Tested repository organization name. Default is internlm/lmdeploy'
|
||||
@ -44,7 +44,7 @@ on:
|
||||
type: string
|
||||
default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']"
|
||||
schedule:
|
||||
- cron: '15 14 * * 0,2'
|
||||
- cron: '15 14 * * 0,3'
|
||||
|
||||
env:
|
||||
HF_DATASETS_OFFLINE: 1
|
||||
@ -54,13 +54,16 @@ env:
|
||||
LMDEPLOY_USE_MODELSCOPE: false
|
||||
HF_HUB_OFFLINE: 1
|
||||
OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
|
||||
CONDA_PATH: /fs-computility/llm/qa-llm-cicd/miniconda3
|
||||
PIP_CACHE_PATH: /fs-computility/llm/qa-llm-cicd/.cache/pip
|
||||
REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/regression
|
||||
COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache
|
||||
HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
|
||||
HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
|
||||
CONDA_PATH: ${{ secrets.WORKSPACE_PREFIX }}/miniconda3
|
||||
PIP_CACHE_PATH: ${{ secrets.WORKSPACE_PREFIX }}/.cache/pip
|
||||
REPORT_ROOT: ${{ secrets.WORKSPACE_PREFIX }}/eval_report/regression
|
||||
COMPASS_DATA_CACHE: ${{ secrets.SHARESPACE_PREFIX }}/datasets/compass_data_cache
|
||||
HUGGINGFACE_HUB_CACHE: ${{ secrets.SHARESPACE_PREFIX }}/models/opencompass_hf_hub
|
||||
HF_HUB_CACHE: ${{ secrets.SHARESPACE_PREFIX }}/models/opencompass_hf_hub
|
||||
HF_DATASETS_CACHE: ${{ secrets.SHARESPACE_PREFIX }}/datasets/hf_datasets_cache
|
||||
HF_ENDPOINT: https://hf-mirror.com
|
||||
CONDA_ENV: regression_test
|
||||
export VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||
|
||||
jobs:
|
||||
build-pypi:
|
||||
@ -92,7 +95,6 @@ jobs:
|
||||
matrix:
|
||||
pyver: [py310]
|
||||
runs-on: ubuntu-latest
|
||||
environment: 'prod'
|
||||
env:
|
||||
PYTHON_VERSION: ${{ matrix.pyver }}
|
||||
PLAT_NAME: manylinux2014_x86_64
|
||||
@ -126,7 +128,6 @@ jobs:
|
||||
if: ${{!cancelled()}}
|
||||
needs: ['build-pypi', 'build-pypi-lmdeploy']
|
||||
runs-on: volc_cu12
|
||||
environment: 'prod'
|
||||
timeout-minutes: 120 #2hours
|
||||
steps:
|
||||
- name: Clone repository
|
||||
@ -141,24 +142,26 @@ jobs:
|
||||
- name: Remove Conda Env
|
||||
if: always()
|
||||
run: |
|
||||
. /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
|
||||
. ${{ secrets.WORKSPACE_PREFIX }}/miniconda3/bin/activate
|
||||
conda env remove -y --name ${{env.CONDA_ENV}}
|
||||
conda info --envs
|
||||
- name: Prepare - create conda env and install torch - cu12
|
||||
uses: nick-fields/retry@v3
|
||||
with:
|
||||
max_attempts: 1
|
||||
max_attempts: 3
|
||||
timeout_minutes: 120
|
||||
command: |
|
||||
. ${{env.CONDA_PATH}}/bin/activate
|
||||
conda create -y --name ${{env.CONDA_ENV}} python=3.10
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
pip install -r /fs-computility/llm/qa-llm-cicd/config/requirements.txt --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install -r ${{ secrets.WORKSPACE_PREFIX }}/config/requirements.txt --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /fs-computility/llm/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
|
||||
pip install opencompass[full] --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install opencompass[api] --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install ${{ secrets.WORKSPACE_PREFIX }}/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
|
||||
pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
|
||||
- name: Prepare - reinstall lmdeploy - cu12
|
||||
@ -181,14 +184,13 @@ jobs:
|
||||
pip list
|
||||
|
||||
daily_run_test_volc:
|
||||
if: ${{!cancelled()}}
|
||||
if: ${{!cancelled() && contains(needs.prepare_env.result, 'success')}}
|
||||
needs: prepare_env
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
|
||||
runs-on: volc_cu12_daily
|
||||
environment: 'prod'
|
||||
timeout-minutes: 180 #3hours
|
||||
steps:
|
||||
- name: Clone repository
|
||||
@ -205,8 +207,8 @@ jobs:
|
||||
- name: modify config
|
||||
if: matrix.regression_func != 'chat_sub_fullbench'
|
||||
run: |
|
||||
cp -r /fs-computility/llm/qa-llm-cicd/ocplayground/template/configs_cluster/volc.py .
|
||||
cat /fs-computility/llm/qa-llm-cicd/config/test_config.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
|
||||
cp -r ${{ secrets.WORKSPACE_PREFIX }}/ocplayground/template/configs_cluster/volc.py .
|
||||
cat ${{ secrets.WORKSPACE_PREFIX }}/config/test_config.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
|
||||
- name: Run test
|
||||
uses: nick-fields/retry@v3
|
||||
with:
|
||||
@ -222,14 +224,13 @@ jobs:
|
||||
|
||||
|
||||
daily_run_test_local:
|
||||
if: ${{!cancelled()}}
|
||||
if: ${{!cancelled() && contains(needs.prepare_env.result, 'success')}}
|
||||
needs: prepare_env
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}}
|
||||
runs-on: volc_cu12_local
|
||||
environment: 'prod'
|
||||
timeout-minutes: 480 #6hours
|
||||
steps:
|
||||
- name: Clone repository
|
||||
@ -246,8 +247,8 @@ jobs:
|
||||
- name: modify config
|
||||
if: matrix.regression_func == 'chat_sub_fullbench'
|
||||
run: |
|
||||
cp -r /fs-computility/llm/qa-llm-cicd/ocplayground/template/configs_cluster/volc.py .
|
||||
cat /fs-computility/llm/qa-llm-cicd/config/test_config_sub.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
|
||||
cp -r ${{ secrets.WORKSPACE_PREFIX }}/ocplayground/template/configs_cluster/volc.py .
|
||||
cat ${{ secrets.WORKSPACE_PREFIX }}/config/test_config_sub.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
|
||||
- name: Run command testcase
|
||||
if: matrix.regression_func == 'cmd'
|
||||
run: |
|
||||
@ -256,27 +257,33 @@ jobs:
|
||||
conda info --envs
|
||||
export from_tf=TRUE
|
||||
python tools/list_configs.py internlm2_5 mmlu
|
||||
opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
|
||||
opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily
|
||||
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
|
||||
opencompass --models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily
|
||||
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily
|
||||
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
|
||||
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily
|
||||
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a vllm --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily
|
||||
python -m pytest -m case5 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
- name: Run model test - api
|
||||
if: matrix.regression_func == 'api'
|
||||
run: |
|
||||
. ${{env.CONDA_PATH}}/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
conda info --envs
|
||||
lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
|
||||
lmdeploy serve api_server internlm/internlm3-8b-instruct --max-batch-size 256 --model-name internlm3 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
|
||||
echo "restful_pid=$!" >> "$GITHUB_ENV"
|
||||
sleep 180s
|
||||
env | grep PROXY
|
||||
env | grep proxy
|
||||
unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy;
|
||||
opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
|
||||
python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
@ -287,7 +294,7 @@ jobs:
|
||||
- name: Run testcase
|
||||
if: matrix.regression_func == 'chat_sub_fullbench'
|
||||
env:
|
||||
COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache_subset
|
||||
COMPASS_DATA_CACHE: ${{ secrets.SHARESPACE_PREFIX }}/datasets/compass_data_cache_subset
|
||||
run: |
|
||||
. ${{env.CONDA_PATH}}/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
@ -298,14 +305,13 @@ jobs:
|
||||
python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
|
||||
fullbench_run_test:
|
||||
if: ${{!cancelled()}}
|
||||
if: ${{!cancelled() && contains(needs.prepare_env.result, 'success')}}
|
||||
needs: prepare_env
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}}
|
||||
runs-on: volc_cu12
|
||||
environment: 'prod'
|
||||
timeout-minutes: 480 #6hours
|
||||
steps:
|
||||
- name: Clone repository
|
||||
@ -329,7 +335,7 @@ jobs:
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
conda info --envs
|
||||
export from_tf=TRUE
|
||||
opencompass /fs-computility/llm/qa-llm-cicd/ocplayground/template/regression/eval_${{ matrix.function_type }}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }} --reuse
|
||||
opencompass ${{ secrets.WORKSPACE_PREFIX }}/ocplayground/template/regression/eval_${{ matrix.function_type }}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }} --reuse
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }}/*/summary regression_result_daily
|
||||
python -m pytest -m ${{ matrix.function_type }} -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
|
||||
@ -339,7 +345,6 @@ jobs:
|
||||
needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
|
||||
timeout-minutes: 5
|
||||
runs-on: self-hosted
|
||||
environment: 'prod'
|
||||
steps:
|
||||
- name: notify
|
||||
run: |
|
||||
|
2  .github/workflows/pr-run-test.yml (vendored)
@@ -45,7 +45,7 @@ jobs:
|
||||
. ${{env.CONDA_PATH}}/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
python3 -m pip uninstall opencompass -y
|
||||
python3 -m pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
python3 -m pip install -e ".[full]" --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
conda info --envs
|
||||
- name: conda env
|
||||
run: |
|
||||
|
@ -115,6 +115,12 @@ repos:
|
||||
args:
|
||||
- --root_folder
|
||||
- opencompass/configs/datasets
|
||||
- repo: https://gitee.com/mirrors/gitleaks
|
||||
rev: v8.23.1
|
||||
hooks:
|
||||
- id: gitleaks
|
||||
entry: "gitleaks dir"
|
||||
args: ["--verbose", "--redact=50"]
|
||||
# - repo: https://github.com/open-mmlab/pre-commit-hooks
|
||||
# rev: v0.2.0 # Use the ref you want to point at
|
||||
# hooks:
|
||||
|
@ -8,6 +8,7 @@ exclude: |
|
||||
opencompass/datasets/lawbench/utils|
|
||||
opencompass/datasets/lawbench/evaluation_functions/|
|
||||
opencompass/datasets/medbench/|
|
||||
opencompass/datasets/matbench/|
|
||||
opencompass/datasets/teval/|
|
||||
opencompass/datasets/NPHardEval/|
|
||||
opencompass/datasets/TheoremQA|
|
||||
@ -115,6 +116,12 @@ repos:
|
||||
args:
|
||||
- --root_folder
|
||||
- opencompass/configs/datasets
|
||||
- repo: https://github.com/gitleaks/gitleaks
|
||||
rev: v8.23.1
|
||||
hooks:
|
||||
- id: gitleaks
|
||||
entry: "gitleaks dir"
|
||||
args: ["--verbose", "--redact=50"]
|
||||
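The gitleaks hook added above can be exercised locally before pushing. A minimal sketch, assuming `pre-commit` itself is already installed in the environment:

```bash
# Install the configured git hooks, then run only the gitleaks hook across the repo
pre-commit install
pre-commit run gitleaks --all-files --verbose
```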
# - repo: https://github.com/open-mmlab/pre-commit-hooks
|
||||
# rev: v0.2.0 # Use the ref you want to point at
|
||||
# hooks:
|
||||
|
107  README.md
@@ -57,9 +57,10 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
|
||||
|
||||
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
|
||||
|
||||
- **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, a flexible evaluation mechanism that allows multiple evaluators to work in sequence. This enables creating customized evaluation pipelines for complex assessment scenarios. Check out the [documentation](docs/en/advanced_guides/llm_judge.md) for more details! 🔥🔥🔥
|
||||
- **\[2025.03.11\]** We have supported evaluation for `SuperGPQA` which is a great benchmark for measuring LLM knowledge ability 🔥🔥🔥
|
||||
- **\[2025.02.28\]** We have added a tutorial for `DeepSeek-R1` series model, please check [Evaluating Reasoning Model](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
|
||||
- **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥
|
||||
- **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHVerifyEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥
|
||||
- **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model which has enhanced performance on reasoning and knowledge-intensive tasks.
|
||||
- **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](examples/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it.
|
||||
- **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](examples/eval_musr.py) and give it a spin! 🔥🔥🔥
|
||||
@ -176,69 +177,83 @@ Some third-party features, like Humaneval and Llama, may require additional step
|
||||
|
||||
After ensuring that OpenCompass is installed correctly according to the above steps and the datasets are prepared. Now you can start your first evaluation using OpenCompass!
|
||||
|
||||
- Your first evaluation with OpenCompass!
|
||||
### Your first evaluation with OpenCompass!
|
||||
|
||||
OpenCompass support setting your configs via CLI or a python script. For simple evaluation settings we recommend using CLI, for more complex evaluation, it is suggested using the script way. You can find more example scripts under the configs folder.
|
||||
OpenCompass support setting your configs via CLI or a python script. For simple evaluation settings we recommend using CLI, for more complex evaluation, it is suggested using the script way. You can find more example scripts under the configs folder.
|
||||
|
||||
```bash
|
||||
# CLI
|
||||
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen
|
||||
```bash
|
||||
# CLI
|
||||
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen
|
||||
|
||||
# Python scripts
|
||||
opencompass examples/eval_chat_demo.py
|
||||
```
|
||||
# Python scripts
|
||||
opencompass examples/eval_chat_demo.py
|
||||
```
|
||||
|
||||
You can find more script examples under [examples](./examples) folder.
|
||||
You can find more script examples under [examples](./examples) folder.
|
||||
|
||||
- API evaluation
|
||||
### API evaluation
|
||||
|
||||
OpenCompass, by its design, does not really discriminate between open-source models and API models. You can evaluate both model types in the same way or even in one settings.
|
||||
OpenCompass, by its design, does not really discriminate between open-source models and API models. You can evaluate both model types in the same way or even in one settings.
|
||||
|
||||
```bash
|
||||
export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
|
||||
# CLI
|
||||
opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen
|
||||
```bash
|
||||
export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
|
||||
# CLI
|
||||
opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen
|
||||
|
||||
# Python scripts
|
||||
opencompass examples/eval_api_demo.py
|
||||
# Python scripts
|
||||
opencompass examples/eval_api_demo.py
|
||||
|
||||
# You can use o1_mini_2024_09_12/o1_preview_2024_09_12 for o1 models, we set max_completion_tokens=8192 as default.
|
||||
```
|
||||
# You can use o1_mini_2024_09_12/o1_preview_2024_09_12 for o1 models, we set max_completion_tokens=8192 as default.
|
||||
```
|
||||
|
||||
- Accelerated Evaluation
|
||||
### Accelerated Evaluation
|
||||
|
||||
Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
|
||||
Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
|
||||
|
||||
```bash
|
||||
# CLI
|
||||
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
|
||||
```bash
|
||||
# CLI
|
||||
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
|
||||
|
||||
# Python scripts
|
||||
opencompass examples/eval_lmdeploy_demo.py
|
||||
```
|
||||
# Python scripts
|
||||
opencompass examples/eval_lmdeploy_demo.py
|
||||
```
|
||||
|
||||
- Supported Models
|
||||
### Supported Models and Datasets
|
||||
|
||||
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
|
||||
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
|
||||
|
||||
```bash
|
||||
# List all configurations
|
||||
python tools/list_configs.py
|
||||
# List all configurations related to llama and mmlu
|
||||
python tools/list_configs.py llama mmlu
|
||||
```
|
||||
```bash
|
||||
# List all configurations
|
||||
python tools/list_configs.py
|
||||
# List all configurations related to llama and mmlu
|
||||
python tools/list_configs.py llama mmlu
|
||||
```
|
||||
|
||||
If the model is not on the list but supported by Huggingface AutoModel class, you can also evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists.
|
||||
#### Supported Models
|
||||
|
||||
```bash
|
||||
opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
|
||||
```
|
||||
If the model is not on the list but supported by Huggingface AutoModel class or encapsulation of inference engine based on OpenAI interface (see [docs](https://opencompass.readthedocs.io/en/latest/advanced_guides/new_model.html) for details), you can also evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists.
|
||||
|
||||
If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`.
|
||||
```bash
|
||||
opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
|
||||
```
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
|
||||
```
|
||||
#### Supported Datasets
|
||||
|
||||
Currently, OpenCompass has provided standard recommended configurations for datasets. Generally, config files ending with `_gen.py` or `_llm_judge_gen.py` will point to the recommended config we provide for this dataset. You can refer to [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for more details.
|
||||
|
||||
```bash
|
||||
# Recommended Evaluation Config based on Rules
|
||||
opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat
|
||||
|
||||
# Recommended Evaluation Config based on LLM Judge
|
||||
opencompass --datasets aime2024_llmjudge_gen --models hf_internlm2_5_1_8b_chat
|
||||
```
|
||||
|
||||
If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`.
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
|
||||
```
|
||||
|
||||
> \[!TIP\]
|
||||
>
|
||||
@ -286,7 +301,9 @@ We have supported a statistical list of all datasets that can be used on this pl
|
||||
|
||||
You can quickly find the dataset you need from the list through sorting, filtering, and searching functions.
|
||||
|
||||
Please refer to the dataset statistics chapter of [official document](https://opencompass.org.cn/doc) for details.
|
||||
In addition, we provide a recommended configuration for each dataset, and some datasets also support LLM Judge-based configurations.
|
||||
|
||||
Please refer to the dataset statistics chapter of [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for details.
|
||||
|
||||
<p align="right"><a href="#top">🔝Back to top</a></p>
|
||||
|
||||
|
@ -57,9 +57,10 @@
|
||||
|
||||
## 🚀 最新进展 <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
|
||||
|
||||
- **\[2025.04.01\]** OpenCompass 现已支持 `CascadeEvaluator`,允许多个评估器按顺序工作,可以为更复杂的评估场景创建自定义评估流程,查看[文档](docs/zh_cn/advanced_guides/llm_judge.md)了解具体用法!🔥🔥🔥
|
||||
- **\[2025.03.11\]** 现已支持 `SuperGPQA` 覆盖285 个研究生学科的知识能力评测,欢迎尝试!🔥🔥🔥
|
||||
- **\[2025.02.28\]** 我们为 `DeepSeek-R1` 系列模型添加了教程,请查看 [评估推理模型](docs/en/user_guides/deepseek_r1.md) 了解更多详情!🔥🔥🔥
|
||||
- **\[2025.02.15\]** 我们新增了两个实用的评测工具:用于LLM作为评判器的`GenericLLMEvaluator`和用于数学推理评估的`MATHEvaluator`。查看[LLM评判器](docs/zh_cn/advanced_guides/llm_judge.md)和[数学能力评测](docs/zh_cn/advanced_guides/general_math.md)文档了解更多详情!🔥🔥🔥
|
||||
- **\[2025.02.28\]** 我们为 `DeepSeek-R1` 系列模型添加了教程,请查看 [评估推理模型](docs/zh_cn/user_guides/deepseek_r1.md) 了解更多详情!🔥🔥🔥
|
||||
- **\[2025.02.15\]** 我们新增了两个实用的评测工具:用于LLM作为评判器的`GenericLLMEvaluator`和用于数学推理评估的`MATHVerifyEvaluator`。查看[LLM评判器](docs/zh_cn/advanced_guides/llm_judge.md)和[数学能力评测](docs/zh_cn/advanced_guides/general_math.md)文档了解更多详情!🔥🔥🔥
|
||||
- **\[2025.01.16\]** 我们现已支持 [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) 模型,该模型在推理、知识类任务上取得同量级最优性能,欢迎尝试。
|
||||
- **\[2024.12.17\]** 我们提供了12月CompassAcademic学术榜单评估脚本 [CompassAcademic](configs/eval_academic_leaderboard_202412.py),你可以通过简单地配置复现官方评测结果。
|
||||
- **\[2024.10.14\]** 现已支持OpenAI多语言问答数据集[MMMLU](https://huggingface.co/datasets/openai/MMMLU),欢迎尝试! 🔥🔥🔥
|
||||
@ -208,9 +209,9 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce
|
||||
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
|
||||
```
|
||||
|
||||
OpenCompass 预定义了许多模型和数据集的配置,你可以通过 [工具](./docs/zh_cn/tools.md#ListConfigs) 列出所有可用的模型和数据集配置。
|
||||
- ### 支持的模型与数据集
|
||||
|
||||
- ### 支持的模型
|
||||
OpenCompass 预定义了许多模型和数据集的配置,你可以通过 [工具](./docs/zh_cn/tools.md#ListConfigs) 列出所有可用的模型和数据集配置。
|
||||
|
||||
```bash
|
||||
# 列出所有配置
|
||||
@ -219,13 +220,27 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce
|
||||
python tools/list_configs.py llama mmlu
|
||||
```
|
||||
|
||||
如果模型不在列表中但支持 Huggingface AutoModel 类,您仍然可以使用 OpenCompass 对其进行评估。欢迎您贡献维护 OpenCompass 支持的模型和数据集列表。
|
||||
#### 支持的模型
|
||||
|
||||
如果模型不在列表中,但支持 Huggingface AutoModel 类或支持针对 OpenAI 接口的推理引擎封装(详见[官方文档](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/new_model.html)),您仍然可以使用 OpenCompass 对其进行评估。欢迎您贡献维护 OpenCompass 支持的模型和数据集列表。
|
||||
|
||||
```bash
|
||||
opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
|
||||
```
|
||||
|
||||
如果你想在多块 GPU 上使用模型进行推理,您可以使用 `--max-num-worker` 参数。
|
||||
#### 支持的数据集
|
||||
|
||||
目前,OpenCompass针对数据集给出了标准的推荐配置。通常,`_gen.py`或`_llm_judge_gen.py`为结尾的配置文件将指向我们为该数据集提供的推荐配置。您可以参阅[官方文档](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html) 的数据集统计章节来获取详细信息。
|
||||
|
||||
```bash
|
||||
# 基于规则的推荐配置
|
||||
opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat
|
||||
|
||||
# 基于LLM Judge的推荐配置
|
||||
opencompass --datasets aime2024_llmjudge_gen --models hf_internlm2_5_1_8b_chat
|
||||
```
|
||||
|
||||
此外,如果你想在多块 GPU 上使用模型进行推理,您可以使用 `--max-num-worker` 参数。
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
|
||||
@ -281,7 +296,7 @@ OpenCompass 是面向大模型评测的一站式平台。其主要特点如下
|
||||
|
||||
您可以通过排序、筛选和搜索等功能从列表中快速找到您需要的数据集。
|
||||
|
||||
详情请参阅 [官方文档](https://opencompass.org.cn/doc) 的数据集统计章节。
|
||||
详情请参阅 [官方文档](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html) 的数据集统计章节。
|
||||
|
||||
<p align="right"><a href="#top">🔝返回顶部</a></p>
|
||||
|
||||
|
(File diff suppressed because it is too large)
@@ -49,7 +49,7 @@ export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
|
||||
|
||||
Note that by default, OpenCompass will use these three environment variables, but if you use configuration files to configure the evaluation service, these environment variables will not take effect.
|
||||
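The three variables referenced in the note are only partially visible in this hunk (`OC_JUDGE_API_BASE` appears in the hunk header). A minimal sketch of configuring the judge purely through the environment, where the companion variable names `OC_JUDGE_MODEL` and `OC_JUDGE_API_KEY` and all values are assumptions for illustration:

```bash
# Point the default LLM judge at an OpenAI-compatible endpoint (placeholder values)
export OC_JUDGE_MODEL=Qwen2.5-32B-Instruct      # assumed variable name
export OC_JUDGE_API_KEY=sk-your-key-here        # assumed variable name
export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
```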
|
||||
### ### Using LLM for Evaluation via Configuration Files
|
||||
### Using LLM for Evaluation via Configuration Files
|
||||
|
||||
To set up an LLM judge evaluation, you'll need to configure three main components:
|
||||
|
||||
@ -264,6 +264,107 @@ Example evaluation output:
|
||||
}
|
||||
```
|
||||
|
||||
## CascadeEvaluator
|
||||
|
||||
OpenCompass also provides a CascadeEvaluator that combines the strengths of rule-based evaluation and LLM-based evaluation. The cascade evaluator has two modes:
|
||||
|
||||
1. **Cascade Mode (parallel=False)**: First evaluates all samples with a rule-based evaluator, then only sends samples that were deemed incorrect by the rule-based evaluation to an LLM judge for re-evaluation. This approach reduces reliance on LLM judgments while maintaining accuracy, thus lowering evaluation costs and time.
|
||||
|
||||
2. **Parallel Mode (parallel=True)**: Evaluates all samples with both the rule-based evaluator and LLM judge, then considers a sample correct if either method marks it as correct. This approach can increase the leniency of evaluation but may result in higher costs since all samples require LLM evaluation.
|
||||
|
||||
### Configuring CascadeEvaluator
|
||||
|
||||
Here's an example of how to configure the CascadeEvaluator:
|
||||
|
||||
```python
|
||||
# Define a rule-based evaluator
|
||||
rule_evaluator = dict(type=MATHVerifyEvaluator)
|
||||
|
||||
# Define an LLM judge evaluator
|
||||
llm_judge_evaluator = dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=YourDataset,
|
||||
path='path/to/your/dataset',
|
||||
reader_cfg=reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(), # Can use environment variables to configure the judge model
|
||||
)
|
||||
|
||||
# Configure cascade evaluator (cascade mode)
|
||||
cascade_evaluator = dict(
|
||||
type=CascadeEvaluator,
|
||||
llm_evaluator=llm_judge_evaluator,
|
||||
rule_evaluator=rule_evaluator,
|
||||
parallel=False # Cascade mode
|
||||
)
|
||||
|
||||
# For parallel mode, set parallel=True
|
||||
parallel_evaluator = dict(
|
||||
type=CascadeEvaluator,
|
||||
llm_evaluator=llm_judge_evaluator,
|
||||
rule_evaluator=rule_evaluator,
|
||||
parallel=True # Parallel mode
|
||||
)
|
||||
|
||||
# Use the cascade evaluator in your dataset evaluation config
|
||||
eval_cfg = dict(evaluator=cascade_evaluator)
|
||||
```
|
||||
|
||||
### Evaluation Results
|
||||
|
||||
The cascade evaluator outputs detailed evaluation statistics including:
|
||||
|
||||
- Accuracy of the rule-based evaluation
|
||||
- Accuracy of the LLM evaluation (for samples that failed rule-based evaluation in cascade mode)
|
||||
- Final combined accuracy
|
||||
|
||||
Example output:
|
||||
|
||||
```python
|
||||
{
|
||||
'accuracy': 85.0, # Final accuracy
|
||||
'cascade_stats': {
|
||||
'total_samples': 100,
|
||||
'rule_correct': 70, # Number of samples correct by rule evaluation
|
||||
'rule_accuracy': 70.0, # Accuracy of rule evaluation
|
||||
'llm_evaluated': 30, # Number of samples evaluated by LLM (failed samples in cascade mode)
|
||||
'llm_correct': 15, # Number of samples correct by LLM evaluation
|
||||
'llm_accuracy': 50.0, # Accuracy of LLM evaluation
|
||||
'final_correct': 85, # Total correct samples
|
||||
'final_accuracy': 85.0, # Final accuracy
|
||||
'parallel_mode': False, # Whether parallel mode was used
|
||||
},
|
||||
'details': [
|
||||
# Detailed evaluation results for each sample
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
The cascade evaluator is particularly useful for:
|
||||
|
||||
1. Scenarios that require balancing evaluation cost and accuracy
|
||||
2. Cases where rule-based evaluators are available but might not be comprehensive
|
||||
3. Evaluation tasks that need more nuanced judgment for edge cases
|
||||
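As a quick way to try either mode, a config built around the cascade evaluator is launched like any other OpenCompass script. This is only a sketch: the config path matches the example referenced in the next section, and the work directory is arbitrary:

```bash
# Run the cascade-evaluator example config; --work-dir is just an illustration
opencompass examples/eval_cascade_evaluator.py --work-dir outputs/cascade_demo --dump-eval-details
```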
|
||||
## Complete Example
|
||||
|
||||
For a complete working example, refer to the `eval_llm_judge.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving using an LLM judge.
|
||||
For a complete working example using GenericLLMEvaluator, refer to the `eval_llm_judge.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving.
|
||||
|
||||
For a complete working example using CascadeEvaluator, refer to the `eval_cascade_evaluator.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving.
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
## Introduction
|
||||
|
||||
Mathematical reasoning is a crucial capability for large language models (LLMs). To evaluate a model's mathematical abilities, we need to test its capability to solve mathematical problems step by step and provide accurate final answers. OpenCompass provides a convenient way to evaluate mathematical reasoning through the CustomDataset and MATHEvaluator components.
|
||||
Mathematical reasoning is a crucial capability for large language models (LLMs). To evaluate a model's mathematical abilities, we need to test its capability to solve mathematical problems step by step and provide accurate final answers. OpenCompass provides a convenient way to evaluate mathematical reasoning through the CustomDataset and MATHVerifyEvaluator components.
|
||||
|
||||
## Dataset Format
|
||||
|
||||
@ -61,7 +61,7 @@ math_infer_cfg = dict(
|
||||
|
||||
```python
|
||||
math_eval_cfg = dict(
|
||||
evaluator=dict(type=MATHEvaluator),
|
||||
evaluator=dict(type=MATHVerifyEvaluator),
|
||||
)
|
||||
```
|
||||
|
||||
@ -86,11 +86,11 @@ math_datasets = [
|
||||
]
|
||||
```
|
||||
|
||||
## MATHEvaluator
|
||||
## MATHVerifyEvaluator
|
||||
|
||||
The MATHEvaluator is specifically designed to evaluate mathematical answers. It is developed based on the math_verify library, which provides mathematical expression parsing and verification capabilities, supporting extraction and equivalence verification for both LaTeX and general expressions.
|
||||
The MATHVerifyEvaluator is specifically designed to evaluate mathematical answers. It is developed based on the math_verify library, which provides mathematical expression parsing and verification capabilities, supporting extraction and equivalence verification for both LaTeX and general expressions.
|
||||
|
||||
The MATHEvaluator implements:
|
||||
The MATHVerifyEvaluator implements:
|
||||
|
||||
1. Extracts answers from both predictions and references using LaTeX extraction
|
||||
2. Handles various LaTeX formats and environments
|
||||
@ -133,7 +133,7 @@ Here's a complete example of how to set up math evaluation:
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
from opencompass.datasets import CustomDataset
|
||||
from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator
|
||||
from opencompass.openicl.icl_evaluator.math_evaluator import MATHVerifyEvaluator
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
@ -160,7 +160,7 @@ math_infer_cfg = dict(
|
||||
|
||||
# Evaluation configuration
|
||||
math_eval_cfg = dict(
|
||||
evaluator=dict(type=MATHEvaluator),
|
||||
evaluator=dict(type=MATHVerifyEvaluator),
|
||||
)
|
||||
|
||||
# Dataset configuration
|
||||
|
@ -117,6 +117,10 @@ html_js_files = [
|
||||
'js/custom.js'
|
||||
]
|
||||
|
||||
html_context = {
|
||||
'github_version': 'main',
|
||||
}
|
||||
|
||||
# -- Options for HTMLHelp output ---------------------------------------------
|
||||
|
||||
# Output file base name for HTML help builder.
|
||||
|
@ -14,6 +14,12 @@ On this page, we have listed all the datasets supported by OpenCompass.
|
||||
|
||||
You can use sorting and search functions to find the dataset you need.
|
||||
|
||||
We provide recommended running configurations for each dataset,
|
||||
and in some datasets also offer recommended configurations based on LLM Judge.
|
||||
|
||||
You can quickly start evaluation tasks based on the recommended configurations.
|
||||
However, please note that these configurations may be updated over time.
|
||||
|
||||
"""
|
||||
|
||||
with open('dataset_statistics.md', 'w') as f:
|
||||
@ -24,7 +30,13 @@ load_path = str(OC_ROOT / 'dataset-index.yml')
|
||||
with open(load_path, 'r') as f2:
|
||||
data_list = yaml.load(f2, Loader=yaml.FullLoader)
|
||||
|
||||
HEADER = ['name', 'category', 'paper', 'configpath']
|
||||
HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
|
||||
|
||||
recommanded_dataset_list = [
|
||||
'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
|
||||
'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
|
||||
'mmlu_pro', 'musr', 'math500'
|
||||
]
|
||||
|
||||
|
||||
def table_format(data_list):
|
||||
@ -32,18 +44,30 @@ def table_format(data_list):
|
||||
for i in data_list:
|
||||
table_format_list_sub = []
|
||||
for j in i:
|
||||
if j in recommanded_dataset_list:
|
||||
link_token = '[link]('
|
||||
else:
|
||||
link_token = '[link(TBD)]('
|
||||
|
||||
for index in HEADER:
|
||||
if index == 'paper':
|
||||
table_format_list_sub.append('[link](' + i[j][index] + ')')
|
||||
elif index == 'configpath_llmjudge':
|
||||
if i[j][index] == '':
|
||||
table_format_list_sub.append(i[j][index])
|
||||
else:
|
||||
table_format_list_sub.append(link_token +
|
||||
GITHUB_PREFIX +
|
||||
i[j][index] + ')')
|
||||
elif index == 'configpath':
|
||||
if isinstance(i[j][index], list):
|
||||
sub_list_text = ''
|
||||
for k in i[j][index]:
|
||||
sub_list_text += ('[link](' + GITHUB_PREFIX + k +
|
||||
sub_list_text += (link_token + GITHUB_PREFIX + k +
|
||||
') / ')
|
||||
table_format_list_sub.append(sub_list_text[:-2])
|
||||
else:
|
||||
table_format_list_sub.append('[link](' +
|
||||
table_format_list_sub.append(link_token +
|
||||
GITHUB_PREFIX +
|
||||
i[j][index] + ')')
|
||||
else:
|
||||
@ -61,7 +85,10 @@ def generate_table(data_list, title=None):
|
||||
if title is not None:
|
||||
f.write(f'\n{title}')
|
||||
f.write("""\n```{table}\n:class: dataset\n""")
|
||||
header = ['Name', 'Category', 'Paper or Repository', 'Config File']
|
||||
header = [
|
||||
'Name', 'Category', 'Paper or Repository', 'Recommended Config',
|
||||
'Recommended Config (LLM Judge)'
|
||||
]
|
||||
table_cfg = dict(tablefmt='pipe',
|
||||
floatfmt='.2f',
|
||||
numalign='right',
|
||||
|
@ -57,7 +57,7 @@ The parameter explanation is as follows:
|
||||
- `-w`: Specify the working path, default is `./outputs/default`.
|
||||
- `-l`: Enable status reporting via Lark bot.
|
||||
- `--dry-run`: When enabled, inference and evaluation tasks will be dispatched but won't actually run for debugging.
|
||||
- `--dump-eval-details`: When enabled,evaluation under the `results` folder will include more details, such as the correctness of each sample.
|
||||
- `--dump-eval-details`: Enabled by default; evaluation results under the `results` folder will include more details, such as the correctness of each sample. Set `--dump-eval-details False` to disable it.
|
||||
|
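A minimal sketch of toggling this flag on the command line, reusing the demo model and dataset names used elsewhere in these docs:

```bash
# Details are dumped by default; pass False explicitly to turn them off
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen --dump-eval-details False
```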
||||
Using run mode `-m all` as an example, the overall execution flow is as follows:
|
||||
|
||||
|
@ -263,6 +263,106 @@ GenericLLMEvaluator专为使用LLM作为评判器评估模型输出而设计。
|
||||
}
|
||||
```
|
||||
|
||||
## 级联评估器 (CascadeEvaluator)
|
||||
|
||||
OpenCompass还提供了级联评估器`CascadeEvaluator`,它结合了规则式评估和LLM评估的优势。级联评估器有两种模式:
|
||||
|
||||
1. **级联模式(Cascade Mode, parallel=False)**:首先使用规则式评估器评估所有样本,然后只将规则式评估认为不正确的样本发送给LLM评判器进行重新评估。这种方式可以在保持准确性的同时减少对LLM评判的依赖,从而降低评估成本和时间。
|
||||
|
||||
2. **并行模式(Parallel Mode, parallel=True)**:使用规则式评估器和LLM评判器同时评估所有样本,如果任何一个评估器认为样本是正确的,则将该样本视为正确。这种方式可以提高评估的宽容度,但可能会导致更高的成本,因为所有样本都需要LLM评估。
|
||||
|
||||
### 配置CascadeEvaluator
|
||||
|
||||
以下是配置`CascadeEvaluator`的示例:
|
||||
|
||||
```python
|
||||
# 定义规则式评估器
|
||||
rule_evaluator = dict(type=MATHVerifyEvaluator)
|
||||
|
||||
# 定义LLM评判器
|
||||
llm_judge_evaluator = dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="你是一个负责评估模型输出正确性和质量的助手。",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=YourDataset,
|
||||
path='path/to/your/dataset',
|
||||
reader_cfg=reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(), # 可以使用环境变量配置评判模型
|
||||
)
|
||||
|
||||
# 配置级联评估器(级联模式)
|
||||
cascade_evaluator = dict(
|
||||
type=CascadeEvaluator,
|
||||
llm_evaluator=llm_judge_evaluator,
|
||||
rule_evaluator=rule_evaluator,
|
||||
parallel=False # 级联模式
|
||||
)
|
||||
|
||||
# 如果需要并行模式,可以设置parallel=True
|
||||
parallel_evaluator = dict(
|
||||
type=CascadeEvaluator,
|
||||
llm_evaluator=llm_judge_evaluator,
|
||||
rule_evaluator=rule_evaluator,
|
||||
parallel=True # 并行模式
|
||||
)
|
||||
|
||||
# 在数据集评估配置中使用级联评估器
|
||||
eval_cfg = dict(evaluator=cascade_evaluator)
|
||||
```
|
||||
|
||||
### 评估结果
|
||||
|
||||
级联评估器会输出详细的评估统计信息,包括:
|
||||
|
||||
- 规则评估的准确率
|
||||
- LLM评估的准确率(针对规则评估失败的样本)
|
||||
- 最终的综合准确率
|
||||
|
||||
输出示例:
|
||||
|
||||
```python
|
||||
{
|
||||
'accuracy': 85.0, # 最终准确率
|
||||
'cascade_stats': {
|
||||
'total_samples': 100,
|
||||
'rule_correct': 70, # 规则评估认为正确的样本数
|
||||
'rule_accuracy': 70.0, # 规则评估的准确率
|
||||
'llm_evaluated': 30, # LLM评估的样本数(级联模式下为规则评估失败的样本数)
|
||||
'llm_correct': 15, # LLM评估认为正确的样本数
|
||||
'llm_accuracy': 50.0, # LLM评估的准确率
|
||||
'final_correct': 85, # 最终正确的样本数
|
||||
'final_accuracy': 85.0, # 最终准确率
|
||||
'parallel_mode': False, # 是否是并行模式
|
||||
},
|
||||
'details': [
|
||||
# 每个样本的详细评估结果
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
级联评估器特别适用于:
|
||||
|
||||
1. 需要平衡评估成本和准确性的场景
|
||||
2. 有可用的规则式评估器但可能不够完善的情况
|
||||
3. 需要对边界情况进行更精确判断的评估任务
|
||||
|
||||
## 完整示例
|
||||
|
||||
有关完整的工作示例,请参考examples目录中的`eval_llm_judge.py`文件,该文件演示了如何使用LLM评判器评估数学问题解决能力。
|
||||
如果希望了解通用LLM评判器,请参考examples目录中的`eval_llm_judge.py`文件,该示例展示了如何使用LLM评判器评估数学问题。
|
||||
|
||||
如果希望了解级联评估器请参考examples目录中的`eval_cascade_evaluator.py`文件,该示例展示了如何使用级联评估器评估数学问题。
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
## 简介
|
||||
|
||||
数学推理能力是大语言模型(LLMs)的一项关键能力。为了评估模型的数学能力,我们需要测试其逐步解决数学问题并提供准确最终答案的能力。OpenCompass 通过 CustomDataset 和 MATHEvaluator 组件提供了一种便捷的数学推理评测方式。
|
||||
数学推理能力是大语言模型(LLMs)的一项关键能力。为了评估模型的数学能力,我们需要测试其逐步解决数学问题并提供准确最终答案的能力。OpenCompass 通过 CustomDataset 和 MATHVerifyEvaluator 组件提供了一种便捷的数学推理评测方式。
|
||||
|
||||
## 数据集格式
|
||||
|
||||
@ -61,7 +61,7 @@ math_infer_cfg = dict(
|
||||
|
||||
```python
|
||||
math_eval_cfg = dict(
|
||||
evaluator=dict(type=MATHEvaluator),
|
||||
evaluator=dict(type=MATHVerifyEvaluator),
|
||||
)
|
||||
```
|
||||
|
||||
@ -86,11 +86,11 @@ math_datasets = [
|
||||
]
|
||||
```
|
||||
|
||||
## MATHEvaluator
|
||||
## MATHVerifyEvaluator
|
||||
|
||||
MATHEvaluator 是专门设计用于评估数学答案的评测器。它基于 math_verify 库进行开发,该库提供了数学表达式解析和验证功能,支持 LaTeX 和一般表达式的提取与等价性验证。
|
||||
MATHVerifyEvaluator 是专门设计用于评估数学答案的评测器。它基于 math_verify 库进行开发,该库提供了数学表达式解析和验证功能,支持 LaTeX 和一般表达式的提取与等价性验证。
|
||||
|
||||
MATHEvaluator 具有以下功能:
|
||||
MATHVerifyEvaluator 具有以下功能:
|
||||
|
||||
1. 使用 LaTeX 提取器从预测和参考答案中提取答案
|
||||
2. 处理各种 LaTeX 格式和环境
|
||||
@ -133,7 +133,7 @@ MATHEvaluator 具有以下功能:
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
from opencompass.datasets import CustomDataset
|
||||
from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator
|
||||
from opencompass.evaluator import MATHVerifyEvaluator
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
@ -160,7 +160,7 @@ math_infer_cfg = dict(
|
||||
|
||||
# 评测配置
|
||||
math_eval_cfg = dict(
|
||||
evaluator=dict(type=MATHEvaluator),
|
||||
evaluator=dict(type=MATHVerifyEvaluator),
|
||||
)
|
||||
|
||||
# 数据集配置
|
||||
|
@ -117,6 +117,10 @@ html_js_files = [
|
||||
'js/custom.js'
|
||||
]
|
||||
|
||||
html_context = {
|
||||
'github_version': 'main',
|
||||
}
|
||||
|
||||
# -- Options for HTMLHelp output ---------------------------------------------
|
||||
|
||||
# Output file base name for HTML help builder.
|
||||
|
@ -14,6 +14,10 @@ DATASETZOO_TEMPLATE = """\
|
||||
|
||||
你可以使用排序和搜索功能找到需要的数据集。
|
||||
|
||||
我们对每一个数据集都给出了推荐的运行配置,部分数据集中还提供了基于LLM Judge的推荐配置。
|
||||
|
||||
你可以基于推荐配置快速启动评测。但请注意,推荐配置可能随时间推移被更新。
|
||||
|
||||
"""
|
||||
|
||||
with open('dataset_statistics.md', 'w') as f:
|
||||
@ -24,7 +28,13 @@ load_path = str(OC_ROOT / 'dataset-index.yml')
|
||||
with open(load_path, 'r') as f2:
|
||||
data_list = yaml.load(f2, Loader=yaml.FullLoader)
|
||||
|
||||
HEADER = ['name', 'category', 'paper', 'configpath']
|
||||
HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
|
||||
|
||||
recommanded_dataset_list = [
|
||||
'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
|
||||
'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
|
||||
'mmlu_pro', 'musr', 'math500'
|
||||
]
|
||||
|
||||
|
||||
def table_format(data_list):
|
||||
@ -32,18 +42,31 @@ def table_format(data_list):
|
||||
for i in data_list:
|
||||
table_format_list_sub = []
|
||||
for j in i:
|
||||
if j in recommanded_dataset_list:
|
||||
link_token = '[链接]('
|
||||
else:
|
||||
link_token = '[链接(TBD)]('
|
||||
|
||||
for index in HEADER:
|
||||
if index == 'paper':
|
||||
table_format_list_sub.append('[链接](' + i[j][index] + ')')
|
||||
elif index == 'configpath_llmjudge':
|
||||
if i[j][index] == '':
|
||||
table_format_list_sub.append(i[j][index])
|
||||
else:
|
||||
table_format_list_sub.append(link_token +
|
||||
GITHUB_PREFIX +
|
||||
i[j][index] + ')')
|
||||
elif index == 'configpath':
|
||||
if isinstance(i[j][index], list):
|
||||
sub_list_text = ''
|
||||
for k in i[j][index]:
|
||||
sub_list_text += ('[链接](' + GITHUB_PREFIX + k +
|
||||
sub_list_text += (link_token + GITHUB_PREFIX + k +
|
||||
') / ')
|
||||
table_format_list_sub.append(sub_list_text[:-2])
|
||||
else:
|
||||
table_format_list_sub.append('[链接](' + GITHUB_PREFIX +
|
||||
table_format_list_sub.append(link_token +
|
||||
GITHUB_PREFIX +
|
||||
i[j][index] + ')')
|
||||
else:
|
||||
table_format_list_sub.append(i[j][index])
|
||||
@ -60,7 +83,7 @@ def generate_table(data_list, title=None):
|
||||
if title is not None:
|
||||
f.write(f'\n{title}')
|
||||
f.write("""\n```{table}\n:class: dataset\n""")
|
||||
header = ['数据集名称', '数据集类型', '原文或资源地址', '配置文件链接']
|
||||
header = ['数据集名称', '数据集类型', '原文或资源地址', '推荐配置', '推荐配置(基于LLM评估)']
|
||||
table_cfg = dict(tablefmt='pipe',
|
||||
floatfmt='.2f',
|
||||
numalign='right',
|
||||
|
@ -57,7 +57,7 @@ python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--deb
|
||||
- `-w`: 指定工作路径,默认为 `./outputs/default`
|
||||
- `-l`: 打开飞书机器人状态上报。
|
||||
- `--dry-run`: 开启时,推理和评测任务仅会分发但不会真正运行,便于调试;
|
||||
- `--dump-eval-details`: 开启时,`results` 下的评测结果中将会包含更加详细的评测结果信息,例如每条样本是否正确等。
|
||||
- `--dump-eval-details`: 默认开启,`results` 下的评测结果中将会包含更加详细的评测结果信息,例如每条样本是否正确等。如不需要开启,需设置`--dump-eval-details False`。
|
||||
|
||||
以运行模式 `-m all` 为例,整体运行流如下:
|
||||
|
||||
|
20
examples/eval_benchmax.py
Normal file
@ -0,0 +1,20 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from opencompass.configs.datasets.xlivecodebench.xlivecodebench_gen import \
|
||||
LCB_datasets
|
||||
from opencompass.configs.datasets.xgpqa.xgpqa_gen import \
|
||||
gpqa_datasets
|
||||
from opencompass.configs.datasets.xIFEval.xIFeval_gen import \
|
||||
xifeval_datasets
|
||||
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
|
||||
models as hf_llama3_8b_instruct_models
|
||||
|
||||
datasets = [
|
||||
*LCB_datasets,
|
||||
*gpqa_datasets,
|
||||
*xifeval_datasets
|
||||
]
|
||||
models = [
|
||||
*hf_llama3_8b_instruct_models
|
||||
]
|
130
examples/eval_cascade_evaluator.py
Normal file
@ -0,0 +1,130 @@
|
||||
|
||||
from mmengine.config import read_base
|
||||
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.evaluator import (
|
||||
GenericLLMEvaluator,
|
||||
CascadeEvaluator,
|
||||
MATHVerifyEvaluator,
|
||||
)
|
||||
from opencompass.datasets import generic_llmjudge_postprocess
|
||||
from opencompass.datasets import (
|
||||
MATHDataset,
|
||||
math_postprocess_v2,
|
||||
normalize_final_answer,
|
||||
)
|
||||
#######################################################################
|
||||
# PART 0 Essential Configs #
|
||||
#######################################################################
|
||||
|
||||
with read_base():
|
||||
# Datasets, Summarizer
|
||||
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
|
||||
models as lmdeploy_qwen2_5_7b_instruct_model,
|
||||
)
|
||||
|
||||
reader_cfg = dict(input_columns=['problem'], output_column='solution')
|
||||
|
||||
infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
|
||||
),
|
||||
]
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
########################## Evaluator #################################
|
||||
GRADER_TEMPLATE = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
|
||||
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
llm_judge_evaluator = dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=MATHDataset,
|
||||
path='opencompass/math',
|
||||
file_name='test_prm800k_500.json',
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
)
|
||||
|
||||
rule_evaluator = dict(type=MATHVerifyEvaluator)
|
||||
cascade_evaluator = dict(type=CascadeEvaluator,
|
||||
llm_evaluator=llm_judge_evaluator,
|
||||
rule_evaluator=rule_evaluator,
|
||||
parallel=False
|
||||
)
|
||||
########################## #################################
|
||||
eval_cfg = dict()
|
||||
|
||||
# eval_cfg['evaluator'] = rule_evaluator
|
||||
# eval_cfg['evaluator'] = llm_judge_evaluator
|
||||
eval_cfg['evaluator'] = cascade_evaluator
|
||||
|
||||
math_datasets = [
|
||||
dict(
|
||||
abbr='math_prm800k_500',
|
||||
type=MATHDataset,
|
||||
path='opencompass/math',
|
||||
file_name='test_prm800k_500.json',
|
||||
reader_cfg=reader_cfg,
|
||||
infer_cfg=infer_cfg,
|
||||
eval_cfg=eval_cfg,
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
datasets = math_datasets
|
||||
models = lmdeploy_qwen2_5_7b_instruct_model
|
||||
|
||||
|
||||
work_dir = 'math_prm800k_500_cascade_evaluator'
|
155
examples/eval_codebench_full.py
Normal file
@ -0,0 +1,155 @@
|
||||
# This config is used to test all the code benchmarks
|
||||
from mmengine.config import read_base
|
||||
import os.path as osp
|
||||
from opencompass.runners import LocalRunner, VOLCRunner
|
||||
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
||||
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
||||
|
||||
with read_base():
|
||||
# Datasets Part
|
||||
# bigcodebench
|
||||
from opencompass.configs.datasets.bigcodebench.bigcodebench_full_instruct_gen import (
|
||||
bigcodebench_full_instruct_datasets
|
||||
)
|
||||
from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen import (
|
||||
bigcodebench_hard_instruct_datasets
|
||||
)
|
||||
# livecodebench code generation lite v5
|
||||
from opencompass.configs.datasets.livecodebench.livecodebench_time_split_gen_a4f90b import (
|
||||
LCB_datasets
|
||||
)
|
||||
# humaneval series
|
||||
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import (
|
||||
humaneval_datasets
|
||||
)
|
||||
from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import (
|
||||
humanevalpro_datasets
|
||||
)
|
||||
from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import (
|
||||
humanevalx_datasets
|
||||
)
|
||||
from opencompass.configs.datasets.humaneval_plus.humaneval_plus_gen import (
|
||||
humaneval_plus_datasets
|
||||
)
|
||||
# mbpp series
|
||||
from opencompass.configs.datasets.mbpp.mbpp_gen import (
|
||||
mbpp_datasets
|
||||
)
|
||||
from opencompass.configs.datasets.mbpp_pro.mbpp_pro_gen import (
|
||||
mbpppro_datasets
|
||||
)
|
||||
# multipl-e
|
||||
from opencompass.configs.datasets.multipl_e.multiple_gen import (
|
||||
multiple_datasets
|
||||
)
|
||||
# ds1000
|
||||
from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import (
|
||||
ds1000_datasets
|
||||
)
|
||||
|
||||
# Models Part
|
||||
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
|
||||
models as lmdeploy_qwen2_5_7b_instruct_model,
|
||||
)
|
||||
|
||||
# Summary Groups
|
||||
from opencompass.configs.summarizers.groups.ds1000 import (
|
||||
ds1000_summary_groups,
|
||||
)
|
||||
from opencompass.configs.summarizers.groups.multipl_e import (
|
||||
multiple_summary_groups,
|
||||
)
|
||||
from opencompass.configs.summarizers.groups.humanevalx import (
|
||||
humanevalx_summary_groups,
|
||||
)
|
||||
|
||||
# models config
|
||||
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
||||
|
||||
for model in models:
|
||||
model['max_seq_len'] = 16384
|
||||
model['max_out_len'] = 8192
|
||||
|
||||
# datasets config
|
||||
datasets = sum(
|
||||
(v for k, v in locals().items() if k.endswith('_datasets')),
|
||||
[],
|
||||
)
|
||||
|
||||
for item in humanevalx_datasets:
|
||||
item['eval_cfg']['evaluator'][
|
||||
'ip_address'
|
||||
] = 'codeeval.opencompass.org.cn/humanevalx'
|
||||
item['eval_cfg']['evaluator']['port'] = ''
|
||||
for item in ds1000_datasets:
|
||||
item['eval_cfg']['evaluator'][
|
||||
'ip_address'
|
||||
] = 'codeeval.opencompass.org.cn/ds1000'
|
||||
item['eval_cfg']['evaluator']['port'] = ''
|
||||
|
||||
|
||||
for dataset in datasets:
|
||||
dataset['infer_cfg']['inferencer']['max_out_len'] = 8192
|
||||
|
||||
|
||||
# summary
|
||||
summary_groups = sum(
|
||||
[v for k, v in locals().items() if k.endswith('_summary_groups')], []
|
||||
)
|
||||
summary_groups.append(
|
||||
{'name': 'humanevalx',
|
||||
'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-java', 'humanevalx-js']}
|
||||
)
|
||||
summarizer = dict(
|
||||
dataset_abbrs = [
|
||||
['bigcodebench_hard_instruct', 'pass@1'],
|
||||
['bigcodebench_full_instruct', 'pass@1'],
|
||||
['lcb_code_generation', 'pass@1'],
|
||||
['openai_humaneval', 'humaneval_pass@1'],
|
||||
['mbpp', 'score'],
|
||||
['humaneval_pro', 'pass@1'],
|
||||
['mbpp_pro', 'pass@1'],
|
||||
['humaneval_plus', 'humaneval_plus_pass@1'],
|
||||
['multiple', 'naive_average'],
|
||||
['humanevalx', 'naive_average'],
|
||||
['ds1000', 'naive_average'],
|
||||
'',
|
||||
'humanevalx-python',
|
||||
'humanevalx-cpp',
|
||||
'humanevalx-java',
|
||||
'humanevalx-js',
|
||||
'',
|
||||
'ds1000_Pandas',
|
||||
'ds1000_Numpy',
|
||||
'ds1000_Tensorflow',
|
||||
'ds1000_Scipy',
|
||||
'ds1000_Sklearn',
|
||||
'ds1000_Pytorch',
|
||||
'ds1000_Matplotlib',
|
||||
'',
|
||||
'humaneval-multiple-cpp',
|
||||
'humaneval-multiple-cs',
|
||||
'humaneval-multiple-go',
|
||||
'humaneval-multiple-java',
|
||||
'humaneval-multiple-rb',
|
||||
'humaneval-multiple-js',
|
||||
'humaneval-multiple-php',
|
||||
'humaneval-multiple-r',
|
||||
'humaneval-multiple-rs',
|
||||
'humaneval-multiple-sh',
|
||||
'',
|
||||
'mbpp-multiple-cpp',
|
||||
'mbpp-multiple-cs',
|
||||
'mbpp-multiple-go',
|
||||
'mbpp-multiple-java',
|
||||
'mbpp-multiple-rb',
|
||||
'mbpp-multiple-js',
|
||||
'mbpp-multiple-php',
|
||||
'mbpp-multiple-r',
|
||||
'mbpp-multiple-rs',
|
||||
'mbpp-multiple-sh'
|
||||
],
|
||||
summary_groups=summary_groups,
|
||||
)
|
||||
|
||||
work_dir = 'outputs/code'
|
@ -1,7 +1,7 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from .datasets.dingo.dingo_gen import datasets
|
||||
from .models.hf_internlm.hf_internlm_7b import models
|
||||
from opencompass.configs.datasets.dingo.dingo_gen import datasets
|
||||
from opencompass.configs.models.hf_internlm.hf_internlm_7b import models
|
||||
|
||||
work_dir = './outputs/eval_dingo'
|
||||
|
61
examples/eval_judge_dataset_all.py
Normal file
@ -0,0 +1,61 @@
|
||||
from mmengine.config import read_base
|
||||
with read_base():
|
||||
from opencompass.configs.datasets.judge.judgerbenchv2 import get_judgerbenchv2_dataset as get_judgerbenchv2_datasets
|
||||
from opencompass.configs.datasets.judge.rmb import get_rmb_dataset as get_rmb_datasets
|
||||
from opencompass.configs.datasets.judge.rewardbench import get_rewardbench_datasets
|
||||
from opencompass.configs.datasets.judge.judgebench import get_judgebench_datasets
|
||||
|
||||
from opencompass.configs.summarizers.judgedataset_all import summarizer
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
|
||||
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
||||
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
|
||||
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
|
||||
from opencompass.runners import SlurmSequentialRunner
|
||||
from opencompass.tasks import OpenICLInferTask
|
||||
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
||||
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
|
||||
|
||||
api_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=True),
|
||||
]
|
||||
)
|
||||
datasets = sum(
|
||||
(v for k, v in locals().items() if k.endswith('_datasets')),
|
||||
[],
|
||||
)
|
||||
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModelwithChatTemplate,
|
||||
abbr='qwen-7b-hf',
|
||||
path='Qwen/Qwen-7B',
|
||||
engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
|
||||
max_seq_len=16384,
|
||||
max_out_len=2048,
|
||||
batch_size=16,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
|
||||
infer = dict(
|
||||
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
|
||||
runner=dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=72,
|
||||
task=dict(type=OpenICLInferTask),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
|
||||
work_dir = './outputs/judge_dataset_all/'
|
52
examples/eval_judgebench.py
Normal file
@ -0,0 +1,52 @@
|
||||
from mmengine.config import read_base
|
||||
with read_base():
|
||||
from opencompass.configs.datasets.judge.judgebench import get_judgebench_datasets
|
||||
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
|
||||
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
||||
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
|
||||
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
|
||||
from opencompass.runners import SlurmSequentialRunner
|
||||
from opencompass.tasks import OpenICLInferTask
|
||||
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
||||
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
||||
|
||||
api_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=True),
|
||||
]
|
||||
)
|
||||
datasets = [*get_judgebench_datasets]
|
||||
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModelwithChatTemplate,
|
||||
abbr='qwen-7b-hf',
|
||||
path='Qwen/Qwen-7B',
|
||||
engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
|
||||
max_seq_len=16384,
|
||||
max_out_len=2048,
|
||||
batch_size=16,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
infer = dict(
|
||||
partitioner=dict(type=NaivePartitioner),
|
||||
runner=dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=72,
|
||||
task=dict(type=OpenICLInferTask),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
|
||||
work_dir = './outputs/judgebench/'
|
53
examples/eval_judgerbenchv2.py
Normal file
@ -0,0 +1,53 @@
|
||||
from mmengine.config import read_base
|
||||
with read_base():
|
||||
from opencompass.configs.datasets.judge.judgerbenchv2 import get_judgerbenchv2_dataset
|
||||
from opencompass.configs.summarizers.judgerbenchv2 import summarizer
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
|
||||
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
||||
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
|
||||
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
|
||||
from opencompass.runners import SlurmSequentialRunner
|
||||
from opencompass.tasks import OpenICLInferTask
|
||||
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
||||
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
||||
|
||||
api_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=True),
|
||||
]
|
||||
)
|
||||
datasets = [*get_judgerbenchv2_dataset]
|
||||
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModelwithChatTemplate,
|
||||
abbr='qwen-7b-hf',
|
||||
path='Qwen/Qwen-7B',
|
||||
engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
|
||||
max_seq_len=16384,
|
||||
max_out_len=2048,
|
||||
batch_size=16,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
infer = dict(
|
||||
# partitioner=dict(type=NaivePartitioner),
|
||||
partitioner=dict(type=NumWorkerPartitioner, num_worker=2),
|
||||
runner=dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=72,
|
||||
task=dict(type=OpenICLInferTask),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
|
||||
work_dir = './outputs/judgerbenchv2/'
|
142
examples/eval_qwen3.py
Normal file
@ -0,0 +1,142 @@
|
||||
|
||||
import os.path as osp
|
||||
from opencompass.models import OpenAISDK
|
||||
from mmengine.config import read_base
|
||||
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
|
||||
from opencompass.runners import LocalRunner
|
||||
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
||||
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
||||
|
||||
with read_base():
|
||||
from opencompass.configs.datasets.aime2024.aime2024_cascade_eval_gen_5e9f4f import aime2024_datasets
|
||||
from opencompass.configs.datasets.aime2025.aime2025_cascade_eval_gen_5e9f4f import aime2025_datasets
|
||||
from opencompass.configs.datasets.math.math_500_cascade_eval_gen_6ff468 import math_datasets
|
||||
|
||||
#######################################################################
|
||||
# PART 0 Meta Info #
|
||||
#######################################################################
|
||||
|
||||
|
||||
api_meta_template = dict(round=[
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=True),
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
judge_cfg = dict(
|
||||
abbr='qwen2-5-32B-Instruct',
|
||||
type=OpenAISDK,
|
||||
path='Qwen/Qwen2.5-32B-Instruct',
|
||||
key='sk-1234',
|
||||
openai_api_base=[
|
||||
'http://x.x.x.x:4000/v1',
|
||||
],
|
||||
meta_template=api_meta_template,
|
||||
query_per_second=8,
|
||||
batch_size=256,
|
||||
temperature=0.001,
|
||||
# max_completion_tokens=32768,
|
||||
tokenizer_path='gpt-4o-2024-05-13',
|
||||
# verbose=True,
|
||||
max_out_len=16384,
|
||||
max_seq_len=32768,
|
||||
# max_seq_len=49152,
|
||||
mode='mid',
|
||||
retry=10
|
||||
)
|
||||
|
||||
#######################################################################
|
||||
# PART 1 Datasets List #
|
||||
#######################################################################
|
||||
|
||||
repeated_info = [
|
||||
(math_datasets, 4),
|
||||
(aime2024_datasets, 32),
|
||||
(aime2025_datasets, 32),
|
||||
]
|
||||
|
||||
for datasets_, num in repeated_info:
|
||||
for dataset_ in datasets_:
|
||||
dataset_['n'] = num
|
||||
|
||||
datasets = sum(
|
||||
(v for k, v in locals().items() if k.endswith('_datasets')),
|
||||
[],
|
||||
)
|
||||
|
||||
for item in datasets:
|
||||
item['infer_cfg']['inferencer']['max_out_len'] = 32768
|
||||
try:
|
||||
if 'judge_cfg' in item['eval_cfg']['evaluator']:
|
||||
item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
|
||||
elif 'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
|
||||
item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
|
||||
except:
|
||||
pass
|
||||
#######################################################################
|
||||
# PART 2 Dataset Summarizer #
|
||||
#######################################################################
|
||||
|
||||
summarizer = dict(
|
||||
dataset_abbrs=[
|
||||
'MATH',
|
||||
['math_prm800k_500', 'accuracy (4 runs average)'],
|
||||
['aime2024', 'accuracy (32 runs average)'],
|
||||
['aime2025', 'accuracy (32 runs average)'],
|
||||
['livemathbench_hard', 'naive_average'],
|
||||
['OlympiadBenchMath', 'accuracy'],
|
||||
['olymmath', 'naive_average'],
|
||||
],
|
||||
summary_groups = sum(
|
||||
[v for k, v in locals().items() if k.endswith('_summary_groups')], []
|
||||
),
|
||||
)
|
||||
|
||||
#######################################################################
|
||||
# PART 3 Models List #
|
||||
#######################################################################
|
||||
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
||||
models += [
|
||||
|
||||
dict(
|
||||
abbr='Qwen_Qwen3-235B-A22B',
|
||||
type=OpenAISDK,
|
||||
path='Qwen/Qwen3-235B-A22B',
|
||||
key='sk-admin',
|
||||
openai_api_base=[
|
||||
'http://106.15.231.215:40007/v1/',
|
||||
],
|
||||
meta_template=dict(
|
||||
# begin=dict(role='SYSTEM', api_role='SYSTEM', prompt=''),
|
||||
round=[
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
# XXX: all system roles are mapped to human in purpose
|
||||
dict(role='BOT', api_role='BOT', generate=True),
|
||||
]
|
||||
),
|
||||
query_per_second=16,
|
||||
batch_size=128,
|
||||
# batch_size=1,
|
||||
temperature=0.6,
|
||||
# max_completion_tokens=32768,
|
||||
tokenizer_path='gpt-4',
|
||||
# verbose=True,
|
||||
max_out_len=32768,
|
||||
max_seq_len=32768,
|
||||
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||
),
|
||||
]
|
||||
|
||||
infer = dict(
|
||||
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
|
||||
runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
|
||||
)
|
||||
|
||||
eval = dict(
|
||||
partitioner=dict(type=NaivePartitioner, n=8),
|
||||
runner=dict(type=LocalRunner, task=dict(type=OpenICLEvalTask)),
|
||||
)
|
||||
|
||||
base_exp_dir = 'outputs/qwen3_reasoning'
|
||||
work_dir = osp.join(base_exp_dir, 'chat_objective')
|
53
examples/eval_rewardbench.py
Normal file
@ -0,0 +1,53 @@
|
||||
from mmengine.config import read_base
|
||||
with read_base():
|
||||
from opencompass.configs.datasets.judge.rewardbench import get_rewardbench_datasets
|
||||
from opencompass.configs.summarizers.rewardbench import summarizer
|
||||
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
|
||||
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
||||
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
|
||||
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
|
||||
from opencompass.runners import SlurmSequentialRunner
|
||||
from opencompass.tasks import OpenICLInferTask
|
||||
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
||||
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
||||
|
||||
api_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=True),
|
||||
]
|
||||
)
|
||||
datasets = [*get_rewardbench_datasets]
|
||||
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModelwithChatTemplate,
|
||||
abbr='qwen-7b-hf',
|
||||
path='Qwen/Qwen-7B',
|
||||
engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
|
||||
max_seq_len=16384,
|
||||
max_out_len=2048,
|
||||
batch_size=16,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
infer = dict(
|
||||
partitioner=dict(type=NaivePartitioner),
|
||||
runner=dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=72,
|
||||
task=dict(type=OpenICLInferTask),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
|
||||
work_dir = './outputs/rewardbench/'
|
53
examples/eval_rmb.py
Normal file
@ -0,0 +1,53 @@
|
||||
from mmengine.config import read_base
|
||||
with read_base():
|
||||
from opencompass.configs.datasets.judge.rmb import get_rmb_dataset
|
||||
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
|
||||
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
||||
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
|
||||
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
|
||||
from opencompass.runners import SlurmSequentialRunner
|
||||
from opencompass.tasks import OpenICLInferTask
|
||||
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
||||
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
||||
|
||||
api_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=True),
|
||||
]
|
||||
)
|
||||
datasets = [*get_rmb_dataset]
|
||||
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModelwithChatTemplate,
|
||||
abbr='qwen-7b-hf',
|
||||
path='Qwen/Qwen-7B',
|
||||
engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
|
||||
max_seq_len=16384,
|
||||
max_out_len=2048,
|
||||
batch_size=16,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
infer = dict(
|
||||
# partitioner=dict(type=NaivePartitioner),
|
||||
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
|
||||
runner=dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=72,
|
||||
task=dict(type=OpenICLInferTask),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
|
||||
work_dir = './outputs/rmb/'
|
@ -50,8 +50,9 @@ for m in _origin_models:
|
||||
|
||||
datasets = teval_en_datasets + teval_zh_datasets
|
||||
work_dir = './outputs/teval'
|
||||
'''
|
||||
dataset version metric mode qwen-7b-chat-hf internlm2-chat-7b-hf llama-2-7b-chat-hf
|
||||
"""Dataset version metric mode
|
||||
qwen-7b-chat-hf internlm2-chat-7b-hf llama-2-7b-chat-hf.
|
||||
|
||||
------------------------------------------- --------- -------------- ------- ----------------- ---------------------- --------------------
|
||||
teval - naive_average unknown 57.69 78.18 36.63
|
||||
teval-instruct_v1 10482d string_metric unknown 28.83 98.08 50.27
|
||||
@ -77,4 +78,4 @@ teval-reason_retrieve_understand_json_v1_zh 10482d name unknown
|
||||
teval-understand_str_v1_zh 10482d args unknown 84.39 88.62 77.29
|
||||
teval-reason_retrieve_understand_json_v1_zh 10482d args unknown 48.71 72.71 28.83
|
||||
teval-review_str_v1_zh 10482d review_quality unknown 56.67 60.57 27.1
|
||||
'''
|
||||
"""
|
||||
|
@ -1 +1 @@
|
||||
__version__ = '0.4.1'
|
||||
__version__ = '0.4.2'
|
||||
|
@ -12,8 +12,8 @@ from mmengine.config import Config, DictAction
|
||||
from opencompass.registry import PARTITIONERS, RUNNERS, build_from_cfg
|
||||
from opencompass.runners import SlurmRunner
|
||||
from opencompass.summarizers import DefaultSummarizer
|
||||
from opencompass.utils import (LarkReporter, get_logger, read_from_station,
|
||||
save_to_station)
|
||||
from opencompass.utils import (LarkReporter, get_logger, pretty_print_config,
|
||||
read_from_station, save_to_station)
|
||||
from opencompass.utils.run import (fill_eval_cfg, fill_infer_cfg,
|
||||
get_config_from_arg)
|
||||
|
||||
@ -94,6 +94,11 @@ def parse_args():
|
||||
help='Use the custom config directory instead of config/ to '
|
||||
'search the configs for datasets, models and summarizers',
|
||||
type=str)
|
||||
parser.add_argument(
|
||||
'--config-verbose',
|
||||
default=False,
|
||||
action='store_true',
|
||||
help='Whether to print the config in verbose mode.')
|
||||
parser.add_argument('-l',
|
||||
'--lark',
|
||||
help='Report the running status to lark bot',
|
||||
@ -119,8 +124,11 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
'--dump-eval-details',
|
||||
help='Whether to dump the evaluation details, including the '
|
||||
'correctness of each sample, bpb, etc.',
|
||||
action='store_true',
|
||||
'correctness of each sample, bpb, etc. Defaults to True.',
|
||||
nargs='?',
|
||||
const=True,
|
||||
default=True,
|
||||
type=lambda x: False if x and x.lower() == 'false' else True
|
||||
)
|
||||
parser.add_argument(
|
||||
'--dump-extract-rate',
|
||||
@ -128,7 +136,7 @@ def parse_args():
|
||||
'correctness of each sample, bpb, etc.',
|
||||
action='store_true',
|
||||
)
|
||||
|
||||
# for the results persistence
|
||||
parser.add_argument('-sp',
|
||||
'--station-path',
|
||||
help='Path to your results station.',
|
||||
@ -147,7 +155,12 @@ def parse_args():
|
||||
'data station.',
|
||||
action='store_true',
|
||||
)
|
||||
|
||||
# for evaluation with multiple runs
|
||||
parser.add_argument('--dataset-num-runs',
|
||||
help='How many runs for one dataset',
|
||||
type=int,
|
||||
default=1,
|
||||
)
|
||||
|
||||
# set srun args
|
||||
slurm_parser = parser.add_argument_group('slurm_args')
|
||||
@ -233,7 +246,6 @@ def parse_custom_dataset_args(custom_dataset_parser):
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
if args.num_gpus is not None:
|
||||
raise ValueError('The `--num-gpus` argument is deprecated, please use '
|
||||
'`--hf-num-gpus` to describe number of gpus used for '
|
||||
@ -297,6 +309,11 @@ def main():
|
||||
content = f'{getpass.getuser()}\'s task has been launched!'
|
||||
LarkReporter(cfg['lark_bot_url']).post(content)
|
||||
|
||||
|
||||
# print config if specified --config-verbose
|
||||
if args.config_verbose:
|
||||
pretty_print_config(cfg)
|
||||
|
||||
# infer
|
||||
if args.mode in ['all', 'infer']:
|
||||
# When user have specified --slurm or --dlc, or have not set
|
||||
@ -350,6 +367,9 @@ def main():
|
||||
if args.dlc or args.slurm or cfg.get('eval', None) is None:
|
||||
fill_eval_cfg(cfg, args)
|
||||
if args.dump_eval_details:
|
||||
logger.warning('Default to dump eval details, it might take extra'
|
||||
'space to save all the evaluation details. '
|
||||
'Set --dump-eval-details False to skip the details dump')
|
||||
cfg.eval.runner.task.dump_details = True
|
||||
if args.dump_extract_rate:
|
||||
cfg.eval.runner.task.cal_extract_rate = True
|
||||
|
@ -0,0 +1,101 @@
|
||||
from opencompass.datasets import CARDBiomedBenchDataset
|
||||
from opencompass.datasets import generic_llmjudge_postprocess
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.evaluator import GenericLLMEvaluator
|
||||
ZERO_SHOT_PROMPT = 'You are an expert in {expert}.\n{question}\n'
|
||||
|
||||
GRADER_TEMPLATE = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
<Original Question Begin>: Q: You are an expert in {expert}.\n{question}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
|
||||
# Reader configuration
|
||||
reader_cfg = dict(
|
||||
input_columns=[
|
||||
'question',
|
||||
'answer',
|
||||
'Bio_Category',
|
||||
'SQL_Category',
|
||||
'uuid',
|
||||
'template uuid',
|
||||
'expert',
|
||||
],
|
||||
output_column='answer',
|
||||
)
|
||||
# Inference configuration
|
||||
infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(
|
||||
|
||||
role='HUMAN',
|
||||
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
|
||||
),
|
||||
],
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
# Evaluation configuration
|
||||
eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=CARDBiomedBenchDataset,
|
||||
path='NIH-CARD/CARDBiomedBench',
|
||||
prompt_mode='zero-shot',
|
||||
reader_cfg=reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
)
|
||||
cardbiomedbench_dataset = dict(
|
||||
type=CARDBiomedBenchDataset,
|
||||
abbr='cardbiomedbench',
|
||||
path='NIH-CARD/CARDBiomedBench',
|
||||
prompt_mode='zero-shot',
|
||||
reader_cfg=reader_cfg,
|
||||
infer_cfg=infer_cfg,
|
||||
eval_cfg=eval_cfg,
|
||||
)
|
||||
cardbiomedbench_datasets = [cardbiomedbench_dataset]
|
@ -1,77 +1,4 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import FixKRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.openicl.icl_evaluator import AccEvaluator
|
||||
from opencompass.datasets import ChemBenchDataset
|
||||
from opencompass.utils.text_postprocessors import first_capital_postprocess
|
||||
from mmengine.config import read_base
|
||||
|
||||
|
||||
chembench_reader_cfg = dict(
|
||||
input_columns=['input', 'A', 'B', 'C', 'D'],
|
||||
output_column='target',
|
||||
train_split='dev')
|
||||
|
||||
chembench_all_sets = [
|
||||
'Name_Conversion',
|
||||
'Property_Prediction',
|
||||
'Mol2caption',
|
||||
'Caption2mol',
|
||||
'Product_Prediction',
|
||||
'Retrosynthesis',
|
||||
'Yield_Prediction',
|
||||
'Temperature_Prediction',
|
||||
'Solvent_Prediction'
|
||||
]
|
||||
|
||||
|
||||
chembench_datasets = []
|
||||
for _name in chembench_all_sets:
|
||||
# _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
|
||||
_hint = f'There is a single choice question about chemistry. Answer the question by replying A, B, C or D.'
|
||||
|
||||
chembench_infer_cfg = dict(
|
||||
ice_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=
|
||||
f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
|
||||
),
|
||||
dict(role='BOT', prompt='{target}\n')
|
||||
]),
|
||||
),
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin='</E>',
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=
|
||||
f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
|
||||
),
|
||||
],
|
||||
),
|
||||
ice_token='</E>',
|
||||
),
|
||||
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
chembench_eval_cfg = dict(
|
||||
evaluator=dict(type=AccEvaluator),
|
||||
pred_postprocessor=dict(type=first_capital_postprocess))
|
||||
|
||||
chembench_datasets.append(
|
||||
dict(
|
||||
abbr=f'ChemBench_{_name}',
|
||||
type=ChemBenchDataset,
|
||||
path='opencompass/ChemBench',
|
||||
name=_name,
|
||||
reader_cfg=chembench_reader_cfg,
|
||||
infer_cfg=chembench_infer_cfg,
|
||||
eval_cfg=chembench_eval_cfg,
|
||||
))
|
||||
|
||||
del _name, _hint
|
||||
with read_base():
|
||||
from .ChemBench_gen_a9f753 import chembench_datasets # noqa: F401, F403
|
@ -0,0 +1,77 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import FixKRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.openicl.icl_evaluator import AccEvaluator
|
||||
from opencompass.datasets import ChemBenchDataset
|
||||
from opencompass.utils.text_postprocessors import first_capital_postprocess
|
||||
|
||||
|
||||
chembench_reader_cfg = dict(
|
||||
input_columns=['input', 'A', 'B', 'C', 'D'],
|
||||
output_column='target',
|
||||
train_split='dev')
|
||||
|
||||
chembench_all_sets = [
|
||||
'Name_Conversion',
|
||||
'Property_Prediction',
|
||||
'Mol2caption',
|
||||
'Caption2mol',
|
||||
'Product_Prediction',
|
||||
'Retrosynthesis',
|
||||
'Yield_Prediction',
|
||||
'Temperature_Prediction',
|
||||
'Solvent_Prediction'
|
||||
]
|
||||
|
||||
|
||||
chembench_datasets = []
|
||||
for _name in chembench_all_sets:
|
||||
# _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
|
||||
_hint = f'There is a single choice question about chemistry. Answer the question by replying A, B, C or D.'
|
||||
|
||||
chembench_infer_cfg = dict(
|
||||
ice_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=
|
||||
f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
|
||||
),
|
||||
dict(role='BOT', prompt='{target}\n')
|
||||
]),
|
||||
),
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin='</E>',
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=
|
||||
f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
|
||||
),
|
||||
],
|
||||
),
|
||||
ice_token='</E>',
|
||||
),
|
||||
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
chembench_eval_cfg = dict(
|
||||
evaluator=dict(type=AccEvaluator),
|
||||
pred_postprocessor=dict(type=first_capital_postprocess))
|
||||
|
||||
chembench_datasets.append(
|
||||
dict(
|
||||
abbr=f'ChemBench_{_name}',
|
||||
type=ChemBenchDataset,
|
||||
path='opencompass/ChemBench4K',
|
||||
name=_name,
|
||||
reader_cfg=chembench_reader_cfg,
|
||||
infer_cfg=chembench_infer_cfg,
|
||||
eval_cfg=chembench_eval_cfg,
|
||||
))
|
||||
|
||||
del _name, _hint
|
@ -0,0 +1,4 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from .ChemBench_llmjudge_gen_c584cf import chembench_datasets # noqa: F401, F403
|
@ -0,0 +1,108 @@
|
||||
from opencompass.datasets.math import MATHDataset
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.evaluator import GenericLLMEvaluator
|
||||
from opencompass.datasets import generic_llmjudge_postprocess
|
||||
from opencompass.datasets import ChemBenchDataset
|
||||
|
||||
|
||||
chembench_reader_cfg = dict(
|
||||
input_columns=['input', 'A', 'B', 'C', 'D'],
|
||||
output_column='target',
|
||||
train_split='dev')
|
||||
|
||||
|
||||
|
||||
GRADER_TEMPLATE = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
|
||||
<Original Question Begin>: \n {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
chembench_all_sets = [
|
||||
'Name_Conversion',
|
||||
'Property_Prediction',
|
||||
'Mol2caption',
|
||||
'Caption2mol',
|
||||
'Product_Prediction',
|
||||
'Retrosynthesis',
|
||||
'Yield_Prediction',
|
||||
'Temperature_Prediction',
|
||||
'Solvent_Prediction'
|
||||
]
|
||||
_hint = f'There is a single choice question about chemistry. Answer the question by replying A, B, C or D.'
|
||||
|
||||
chembench_datasets = []
|
||||
for _name in chembench_all_sets:
|
||||
chembench_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(round=[
|
||||
dict(role='HUMAN', prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ')
|
||||
])),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer)
|
||||
)
|
||||
|
||||
# Evaluation configuration
|
||||
chembench_eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
||||
],
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt = GRADER_TEMPLATE
|
||||
),
|
||||
]),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=ChemBenchDataset,
|
||||
path='opencompass/ChemBench4K',
|
||||
name=_name,
|
||||
reader_cfg=chembench_reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
pred_role='BOT',
|
||||
)
|
||||
|
||||
chembench_datasets.append(
|
||||
dict(
|
||||
abbr=f'ChemBench_{_name}',
|
||||
type=ChemBenchDataset,
|
||||
path='opencompass/ChemBench4K',
|
||||
name=_name,
|
||||
reader_cfg=chembench_reader_cfg,
|
||||
infer_cfg=chembench_infer_cfg,
|
||||
eval_cfg=chembench_eval_cfg,
|
||||
))
|
@ -0,0 +1,4 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from .ClimaQA_Gold_llm_judge_gen_f15343 import climaqa_datasets # noqa: F401, F403
|
@ -0,0 +1,164 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets import ClimaQADataset, generic_llmjudge_postprocess
|
||||
|
||||
from opencompass.evaluator import GenericLLMEvaluator
|
||||
|
||||
climaqa_gold_sets = [
|
||||
'mcq',
|
||||
'cloze',
|
||||
'ffq'
|
||||
]
|
||||
|
||||
GRADER_TEMPLATE_mcq = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. The answer may be one of the four options: a, b, c, or d. Only when the options given by prediction are strictly consistent with the answer, the prediction can be considered correct.
|
||||
3. If the prediction is given with 'The answer is:', please ignore the 'The answer is:', and only judge whether the candidate's answer is consistent with the standard answer.
|
||||
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
|
||||
<Original Question Begin>: \n{input}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
|
||||
GRADER_TEMPLATE_cloze = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. The form of the answer is a word or a phrase. Please strictly compare the prediction and the answer. Only when the prediction and the answer are exactly the same, will the prediction be considered correct; otherwise, it will be considered incorrect.
|
||||
3. If the prediction is given with 'The answer is:', please ignore the 'The answer is:' and only judge whether the candidate's answer is consistent with the standard answer.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
|
||||
<Original Question Begin>: \n{input}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
|
||||
GRADER_TEMPLATE_ffq = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. The type of question is open-ended Q&A. Please compare whether the prediction is close enough to the meaning of the answer and whether the prediction covers each key point in the answer. If the prediction meets the above requirements, it can be considered very close to the answer.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
5. If the prediction is given with 'The answer is:', please ignore the 'The answer is:' and only judge whether the candidate's answer is very close to the standard answer.
|
||||
|
||||
Please judge whether the following answers are close to the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: very close to the answer
|
||||
B: not very close to the answer
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either A or B. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
|
||||
<Original Question Begin>: \n{input}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
|
||||
climaqa_reader_cfg = dict(input_columns=['input'], output_column='target')
|
||||
|
||||
climaqa_datasets = []
|
||||
|
||||
for _task in climaqa_gold_sets:
|
||||
|
||||
if _task == 'mcq':
|
||||
GRADER_TEMPLATE = GRADER_TEMPLATE_mcq
|
||||
infer_prompt = f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification. The question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: a\"\n\nQ: {{input}}\nA: "
|
||||
if _task == 'ffq':
|
||||
GRADER_TEMPLATE = GRADER_TEMPLATE_ffq
|
||||
infer_prompt = f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\".\n\nQ: {{input}}\nA: "
|
||||
if _task == 'cloze':
|
||||
GRADER_TEMPLATE = GRADER_TEMPLATE_cloze
|
||||
infer_prompt = f"Fill the <Mask> in the sentence. Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\".\n\nQ: {{input}}\nA: "
|
||||
|
||||
climaqa_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=infer_prompt,
|
||||
)
|
||||
]
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
climaqa_eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=ClimaQADataset,
|
||||
path='opencompass/ClimaQA-Gold',
|
||||
task=_task,
|
||||
abbr='ClimaQA_Gold_' + _task,
|
||||
reader_cfg=climaqa_reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
pred_role='BOT',
|
||||
)
|
||||
|
||||
climaqa_datasets.append(
|
||||
dict(
|
||||
abbr='ClimaQA_Gold_' + _task,
|
||||
type=ClimaQADataset,
|
||||
path='opencompass/ClimaQA-Gold',
|
||||
task=_task,
|
||||
reader_cfg=climaqa_reader_cfg,
|
||||
infer_cfg=climaqa_infer_cfg,
|
||||
eval_cfg=climaqa_eval_cfg,
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .ClimaQA_Silver_llm_judge_gen_f15343 import climaqa_datasets  # noqa: F401, F403
@ -0,0 +1,160 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets import ClimaQADataset, generic_llmjudge_postprocess
|
||||
|
||||
from opencompass.evaluator import GenericLLMEvaluator
|
||||
|
||||
climaqa_silver_sets = [
|
||||
'mcq',
|
||||
'cloze',
|
||||
'ffq'
|
||||
]
|
||||
|
||||
GRADER_TEMPLATE_mcq = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. The answer may be one of the four options: a, b, c, or d. Only when the options given by prediction are strictly consistent with the answer, the prediction can be considered correct.
|
||||
3. If the prediction is given with 'The answer is:', please ignore the 'The answer is:', and only judge whether the candidate's answer is consistent with the standard answer.
|
||||
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
|
||||
<Original Question Begin>: \n{input}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
GRADER_TEMPLATE_cloze = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. The form of the answer is a word or a phrase. Please strictly compare the prediction and the answer. Only when the prediction and the answer are exactly the same, will the prediction be considered correct; otherwise, it will be considered incorrect.
|
||||
3. If the prediction is given with 'The answer is:', please ignore the 'The answer is:' and only judge whether the candidate's answer is consistent with the standard answer.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
|
||||
<Original Question Begin>: \n{input}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
GRADER_TEMPLATE_ffq = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. The type of question is open-ended Q&A. Please compare whether the prediction is close enough to the meaning of the answer and whether the prediction covers each key point in the answer. If the prediction meets the above requirements, it can be considered very close to the answer.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
5. If the prediction is given with 'The answer is:', please ignore the 'The answer is:' and only judge whether the candidate's answer is very close to the standard answer.
|
||||
|
||||
Please judge whether the following answers are close to the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: very close to the answer
|
||||
B: not very close to the answer
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either A or B. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
|
||||
<Original Question Begin>: \n{input}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
climaqa_reader_cfg = dict(input_columns=['input'], output_column='target')
|
||||
|
||||
climaqa_datasets = []
|
||||
|
||||
for _task in climaqa_silver_sets:
|
||||
|
||||
if _task == 'mcq':
|
||||
GRADER_TEMPLATE = GRADER_TEMPLATE_mcq
|
||||
infer_prompt = f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification. The question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: a\"\n\nQ: {{input}}\nA: "
|
||||
if _task == 'ffq':
|
||||
GRADER_TEMPLATE = GRADER_TEMPLATE_ffq
|
||||
infer_prompt = f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\".\n\nQ: {{input}}\nA: "
|
||||
if _task == 'cloze':
|
||||
GRADER_TEMPLATE = GRADER_TEMPLATE_cloze
|
||||
infer_prompt = f"Fill the <Mask> in the sentence. Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\".\n\nQ: {{input}}\nA: "
|
||||
|
||||
climaqa_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=infer_prompt,
|
||||
)
|
||||
]
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
climaqa_eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=ClimaQADataset,
|
||||
path='opencompass/ClimaQA-Silver',
|
||||
task=_task,
|
||||
abbr='ClimaQA_Silver_' + _task,
|
||||
reader_cfg=climaqa_reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
pred_role='BOT',
|
||||
)
|
||||
|
||||
climaqa_datasets.append(
|
||||
dict(
|
||||
abbr='ClimaQA_Silver_' + _task,
|
||||
type=ClimaQADataset,
|
||||
path='opencompass/ClimaQA-Silver',
|
||||
task=_task,
|
||||
reader_cfg=climaqa_reader_cfg,
|
||||
infer_cfg=climaqa_infer_cfg,
|
||||
eval_cfg=climaqa_eval_cfg,
|
||||
)
|
||||
)
|
||||
|
@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .ClinicBench_llmjudge_gen_d09668 import ClinicBench_datasets
@ -0,0 +1,100 @@
|
||||
from mmengine.config import read_base
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.evaluator import GenericLLMEvaluator
|
||||
from opencompass.datasets import generic_llmjudge_postprocess
|
||||
from opencompass.datasets.ClinicBench import ClinicBenchDataset
|
||||
|
||||
|
||||
QUERY_TEMPLATE = """
|
||||
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
|
||||
|
||||
Question:\n
|
||||
{question}
|
||||
|
||||
Options:\n
|
||||
{choices}
|
||||
|
||||
""".strip()
|
||||
|
||||
GRADER_TEMPLATE = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
<Original Question Begin>: {question}\n {choices} \n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
ClinicBench_datasets = []
|
||||
|
||||
ClinicBench_reader_cfg = dict(
|
||||
input_columns=['question', 'choices'],
|
||||
output_column='label',
|
||||
)
|
||||
|
||||
ClinicBench_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
ClinicBench_eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=ClinicBenchDataset,
|
||||
path='xuxuxuxuxu/Pharmacology-QA',
|
||||
reader_cfg=ClinicBench_reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
)
|
||||
|
||||
ClinicBench_datasets.append(
|
||||
dict(
|
||||
abbr=f'ClinicBench',
|
||||
type=ClinicBenchDataset,
|
||||
path='xuxuxuxuxu/Pharmacology-QA',
|
||||
reader_cfg=ClinicBench_reader_cfg,
|
||||
infer_cfg=ClinicBench_infer_cfg,
|
||||
eval_cfg=ClinicBench_eval_cfg,
|
||||
)
|
||||
)
|
@ -1,31 +1,27 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2
|
||||
from opencompass.evaluator import GenericLLMEvaluator
|
||||
from opencompass.datasets import generic_llmjudge_postprocess
|
||||
from opencompass.utils import xml_tag_postprocessor
|
||||
from opencompass.datasets import HLEDataset
|
||||
|
||||
aime2024_reader_cfg = dict(
|
||||
input_columns=['question'],
|
||||
output_column='answer'
|
||||
)
|
||||
# ----------------------------- Detailed Config -----------------------------
|
||||
|
||||
math_reader_cfg = dict(input_columns=['problem'], output_column='answer')
|
||||
|
||||
aime2024_infer_cfg = dict(
|
||||
math_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(role='HUMAN', prompt='{question}\nRemember to put your final answer within \\boxed{}.'),
|
||||
],
|
||||
)
|
||||
dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'),
|
||||
]
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=2048)
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
|
||||
GRADER_TEMPLATE = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
@ -35,23 +31,20 @@ GRADER_TEMPLATE = """
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
|
||||
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
|
||||
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
aime2024_eval_cfg = dict(
|
||||
# Evaluation configuration
|
||||
math_eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
@ -71,25 +64,25 @@ aime2024_eval_cfg = dict(
|
||||
]),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=Aime2024Dataset,
|
||||
path='opencompass/aime2024',
|
||||
reader_cfg=aime2024_reader_cfg,
|
||||
type=HLEDataset,
|
||||
path='cais/hle',
|
||||
reader_cfg=math_reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
pred_postprocessor=dict(type=xml_tag_postprocessor, tag='<conclude>'),
|
||||
),
|
||||
pred_role='BOT',
|
||||
)
|
||||
|
||||
aime2024_datasets = [
|
||||
|
||||
hle_datasets = [
|
||||
dict(
|
||||
abbr='aime2024',
|
||||
type=Aime2024Dataset,
|
||||
path='opencompass/aime2024',
|
||||
reader_cfg=aime2024_reader_cfg,
|
||||
infer_cfg=aime2024_infer_cfg,
|
||||
eval_cfg=aime2024_eval_cfg,
|
||||
mode='singlescore',
|
||||
type=HLEDataset,
|
||||
abbr='hle_llmjudge',
|
||||
path='cais/hle',
|
||||
category='Biology/Medicine',
|
||||
reader_cfg=math_reader_cfg,
|
||||
infer_cfg=math_infer_cfg,
|
||||
eval_cfg=math_eval_cfg,
|
||||
)
|
||||
]
|
@ -1,4 +1,4 @@
 from mmengine.config import read_base
 
 with read_base():
-    from .IFEval_gen_3321a3 import ifeval_datasets  # noqa: F401, F403
+    from .IFEval_gen_353ae7 import ifeval_datasets  # noqa: F401, F403
@ -0,0 +1,57 @@
from opencompass.datasets import MedCalc_BenchDataset, MedCalcOfficial_Evaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever

ZERO_SHOT_PROMPT = 'You are a helpful assistant for calculating a score for a given patient note. Please think step-by-step to solve the question and then generate the required score. Your output should only contain a JSON dict formatted as {"step_by_step_thinking": str(your_step_by_step_thinking_procress_to_solve_the_question), "answer": str(short_and_direct_answer_of_the_question)}. \n Here is the patient note:\n{patient_note}\n\nHere is the task:\n{question}\n\nPlease directly output the JSON dict formatted as {"step_by_step_thinking": str(your_step_by_step_thinking_procress_to_solve_the_question), "answer": str(short_and_direct_answer_of_the_question)}:'

# Reader configuration
reader_cfg = dict(
    input_columns=[
        'row_number',
        'calculator_id',
        'calculator_name',
        'category',
        'note_id',
        'output_type',
        'note_type',
        'patient_note',
        'question',
        'relevant_entities',
        'ground_truth_answer',
        'lower_limit',
        'upper_limit',
        'ground_truth_explanation'
    ],
    output_column='ground_truth_answer',
)

# Inference configuration
infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=ZERO_SHOT_PROMPT),
            ])
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# Evaluation configuration
eval_cfg = dict(
    evaluator=dict(type=MedCalcOfficial_Evaluator),
    pred_role='BOT',
)

medcal_bench_dataset = dict(
    type=MedCalc_BenchDataset,
    abbr='medcal_bench_official_zero_shot_eval',
    path='ncbi/MedCalc-Bench-v1.0',
    prompt_mode='zero-shot',
    reader_cfg=reader_cfg,
    infer_cfg=infer_cfg,
    eval_cfg=eval_cfg,
)

medcal_bench_datasets = [medcal_bench_dataset]
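As an aside, the `ZERO_SHOT_PROMPT` above asks the model to emit a JSON dict with `step_by_step_thinking` and `answer` fields. The minimal sketch below is illustrative only: `extract_answer` is a name made up here and is not the logic used by `MedCalcOfficial_Evaluator`; it simply shows one way such a reply could be parsed before scoring.

```python
# Illustrative sketch only -- not part of MedCalcOfficial_Evaluator.
import json
import re


def extract_answer(reply):
    """Return the 'answer' field from a reply that follows ZERO_SHOT_PROMPT."""
    # Models often wrap the JSON dict in extra prose, so grab the first {...} span.
    match = re.search(r'\{.*\}', reply, flags=re.DOTALL)
    if match is None:
        return None
    try:
        return json.loads(match.group(0)).get('answer')
    except json.JSONDecodeError:
        return None


# Example: a well-formed reply yields the short answer string.
print(extract_answer('{"step_by_step_thinking": "apply the formula step by step", "answer": "72 mL/min"}'))
```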
opencompass/configs/datasets/MedQA/MedQA_gen_3bf756.py (new file, 63 lines)
@ -0,0 +1,63 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.utils.text_postprocessors import first_option_postprocess
from opencompass.datasets.MedQA import MedQADataset


QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.

Question:\n
{question}

Options:\n
{choices}

""".strip()


MedQA_datasets = []

MedQA_reader_cfg = dict(
    input_columns=['question', 'choices'],
    output_column='label',
)

MedQA_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

MedQA_subsets = {
    'US': 'xuxuxuxuxu/MedQA_US_test',
    'Mainland': 'xuxuxuxuxu/MedQA_Mainland_test',
    'Taiwan': 'xuxuxuxuxu/MedQA_Taiwan_test',
}

for split in list(MedQA_subsets.keys()):

    MedQA_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')
    )

    MedQA_datasets.append(
        dict(
            abbr=f'MedQA_{split}',
            type=MedQADataset,
            path=MedQA_subsets[split],
            reader_cfg=MedQA_reader_cfg,
            infer_cfg=MedQA_infer_cfg,
            eval_cfg=MedQA_eval_cfg,
        )
    )
opencompass/configs/datasets/MedQA/MedQA_llmjudge_gen_3bf756.py (new file, 108 lines)
@ -0,0 +1,108 @@
|
||||
from mmengine.config import read_base
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.evaluator import GenericLLMEvaluator
|
||||
from opencompass.datasets import generic_llmjudge_postprocess
|
||||
from opencompass.datasets.MedQA import MedQADataset
|
||||
|
||||
|
||||
QUERY_TEMPLATE = """
|
||||
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
|
||||
|
||||
Question:\n
|
||||
{question}
|
||||
|
||||
Options:\n
|
||||
{choices}
|
||||
|
||||
""".strip()
|
||||
|
||||
GRADER_TEMPLATE = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
<Original Question Begin>: {question}\n {choices} \n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
MedQA_datasets = []
|
||||
|
||||
MedQA_reader_cfg = dict(
|
||||
input_columns=['question', 'choices'],
|
||||
output_column='label',
|
||||
)
|
||||
|
||||
MedQA_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
MedQA_subsets = {
|
||||
'US': 'xuxuxuxuxu/MedQA_US_test',
|
||||
'Mainland': 'xuxuxuxuxu/MedQA_Mainland_test',
|
||||
'Taiwan': 'xuxuxuxuxu/MedQA_Taiwan_test',
|
||||
}
|
||||
|
||||
for split in list(MedQA_subsets.keys()):
|
||||
|
||||
MedQA_eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=MedQADataset,
|
||||
path=MedQA_subsets[split],
|
||||
reader_cfg=MedQA_reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
)
|
||||
|
||||
MedQA_datasets.append(
|
||||
dict(
|
||||
abbr=f'MedQA_{split}',
|
||||
type=MedQADataset,
|
||||
path=MedQA_subsets[split],
|
||||
reader_cfg=MedQA_reader_cfg,
|
||||
infer_cfg=MedQA_infer_cfg,
|
||||
eval_cfg=MedQA_eval_cfg,
|
||||
)
|
||||
)
|
opencompass/configs/datasets/MedXpertQA/MedXpertQA_gen.py (new file, 57 lines)
@ -0,0 +1,57 @@
|
||||
from opencompass.datasets import MedXpertQADataset, MedXpertQAEvaluator
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
|
||||
SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this?
|
||||
ZERO_SHOT_PROMPT = 'Q: {question}\nA: Among {start} through {end}, the answer is'
|
||||
|
||||
# Reader configuration
|
||||
reader_cfg = dict(
|
||||
input_columns=[
|
||||
'question',
|
||||
'options',
|
||||
'medical_task',
|
||||
'body_system',
|
||||
'question_type',
|
||||
'prompt_mode',
|
||||
],
|
||||
output_column='label',
|
||||
)
|
||||
|
||||
# Inference configuration
|
||||
infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
|
||||
],
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
|
||||
),
|
||||
],
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
# Evaluation configuration
|
||||
eval_cfg = dict(
|
||||
evaluator=dict(type=MedXpertQAEvaluator),
|
||||
pred_role='BOT',
|
||||
)
|
||||
medxpertqa_dataset = dict(
|
||||
type=MedXpertQADataset,
|
||||
abbr='medxpertqa',
|
||||
path='TsinghuaC3I/MedXpertQA',
|
||||
prompt_mode='zero-shot',
|
||||
reader_cfg=reader_cfg,
|
||||
infer_cfg=infer_cfg,
|
||||
eval_cfg=eval_cfg,
|
||||
)
|
||||
|
||||
medxpertqa_datasets = [medxpertqa_dataset]
|
@ -0,0 +1,104 @@
|
||||
from opencompass.datasets import MedXpertQADataset, MedXpertQA_llmjudge_postprocess
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.evaluator import GenericLLMEvaluator
|
||||
|
||||
SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this?
|
||||
ZERO_SHOT_PROMPT = 'Q: {question}\nA: Among {start} through {end}, the answer is'
|
||||
GRADER_TEMPLATE = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
<Original Question Begin>: Q: {question}\nA: Among {start} through {end}, the answer is\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
|
||||
# Reader configuration
|
||||
reader_cfg = dict(
|
||||
input_columns=[
|
||||
'question',
|
||||
'options',
|
||||
'medical_task',
|
||||
'body_system',
|
||||
'question_type',
|
||||
'prompt_mode',
|
||||
],
|
||||
output_column='label',
|
||||
)
|
||||
|
||||
# Inference configuration
|
||||
infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
|
||||
],
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
|
||||
),
|
||||
],
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
# Evaluation configuration
|
||||
eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=MedXpertQADataset,
|
||||
path='TsinghuaC3I/MedXpertQA',
|
||||
prompt_mode='zero-shot',
|
||||
reader_cfg=reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=MedXpertQA_llmjudge_postprocess),
|
||||
),
|
||||
)
|
||||
medxpertqa_dataset = dict(
|
||||
type=MedXpertQADataset,
|
||||
abbr='medxpertqa',
|
||||
path='TsinghuaC3I/MedXpertQA',
|
||||
prompt_mode='zero-shot',
|
||||
reader_cfg=reader_cfg,
|
||||
infer_cfg=infer_cfg,
|
||||
eval_cfg=eval_cfg,
|
||||
)
|
||||
|
||||
medxpertqa_datasets = [medxpertqa_dataset]
|
@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .medbullets_gen_60c8f5 import medbullets_datasets  # noqa: F401, F403
@ -0,0 +1,59 @@
|
||||
from opencompass.datasets import MedbulletsDataset, MedbulletsEvaluator
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
|
||||
import os
|
||||
|
||||
SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this?
|
||||
ZERO_SHOT_PROMPT = 'Q: {question}\n Please select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n'
|
||||
|
||||
# Reader configuration
|
||||
reader_cfg = dict(
|
||||
input_columns=[
|
||||
'question',
|
||||
'options',
|
||||
'question_type',
|
||||
'prompt_mode',
|
||||
|
||||
],
|
||||
output_column='label',
|
||||
)
|
||||
|
||||
# Inference configuration
|
||||
infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
|
||||
],
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
|
||||
),
|
||||
],
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
# Evaluation configuration
|
||||
eval_cfg = dict(
|
||||
evaluator=dict(type=MedbulletsEvaluator),
|
||||
pred_role='BOT',
|
||||
)
|
||||
medbullets_dataset = dict(
|
||||
type=MedbulletsDataset,
|
||||
abbr='medbullets',
|
||||
path='opencompass/medbullets',
|
||||
prompt_mode='zero-shot',
|
||||
reader_cfg=reader_cfg,
|
||||
infer_cfg=infer_cfg,
|
||||
eval_cfg=eval_cfg,
|
||||
|
||||
)
|
||||
|
||||
medbullets_datasets = [medbullets_dataset]
|
@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .medbullets_llmjudge_gen_60c8f5 import medbullets_datasets  # noqa: F401, F403
@ -0,0 +1,106 @@
|
||||
from opencompass.datasets import MedbulletsDataset, medbullets_llmjudge_postprocess
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.evaluator import GenericLLMEvaluator
|
||||
import os
|
||||
|
||||
SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this?
|
||||
ZERO_SHOT_PROMPT = 'Q: {question}\n Please select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n'
|
||||
GRADER_TEMPLATE = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
<Original Question Begin>: Q: {question}\nPlease select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
# Reader configuration
|
||||
reader_cfg = dict(
|
||||
input_columns=[
|
||||
'question',
|
||||
'options',
|
||||
'question_type',
|
||||
'prompt_mode',
|
||||
|
||||
],
|
||||
output_column='label',
|
||||
)
|
||||
|
||||
# Inference configuration
|
||||
infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
|
||||
],
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
|
||||
),
|
||||
],
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
# Evaluation configuration
|
||||
eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=MedbulletsDataset,
|
||||
path='opencompass/medbullets',
|
||||
prompt_mode='zero-shot',
|
||||
reader_cfg=reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=medbullets_llmjudge_postprocess),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
medbullets_dataset = dict(
|
||||
type=MedbulletsDataset,
|
||||
abbr='medbullets',
|
||||
path='opencompass/medbullets',
|
||||
prompt_mode='zero-shot',
|
||||
reader_cfg=reader_cfg,
|
||||
infer_cfg=infer_cfg,
|
||||
eval_cfg=eval_cfg,
|
||||
|
||||
)
|
||||
|
||||
medbullets_datasets = [medbullets_dataset]
|
opencompass/configs/datasets/OlymMATH/README.md (new file, 60 lines)
@ -0,0 +1,60 @@
# OlymMATH

[GitHub Link](https://github.com/RUCAIBox/OlymMATH)

For the OlymMATH dataset, please refer to the paper:
"Challenging the Boundaries of Reasoning: An Olympiad-Level Math Benchmark for Large Language Models" by Haoxiang Sun, Yingqian Min, Zhipeng Chen, Wayne Xin Zhao, Zheng Liu, Zhongyuan Wang, Lei Fang, and Ji-Rong Wen.

## How to eval OlymMATH with a model judge

This is a simple example:

```python
from opencompass.models import OpenAISDK, OpenAI
from mmengine.config import read_base

with read_base():
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as qwen2_5_7b_instruct_model
    from opencompass.configs.datasets.OlymMATH.olymmath_gen import olymmath_datasets

################## Judge Config ##################
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
], )

judge_cfg = dict(
    # An API model with OpenAI API format is required for the judge
    abbr='qwen2-5-32B-Instruct',
    type=OpenAISDK,
    path='Qwen/Qwen2.5-32B-Instruct',
    key='sk-1234',
    openai_api_base=[
        'http://172.30.56.1:4000/v1',
    ],
    meta_template=api_meta_template,
    query_per_second=16,
    batch_size=1024,
    temperature=0.001,
    max_completion_tokens=32768,
    tokenizer_path='gpt-4o-2024-05-13',
    verbose=True,
    max_out_len=16384,
    max_seq_len=32768,
)

################## Model Config ##################
models = [*qwen2_5_7b_instruct_model]

################## Dataset Config ##################
datasets = [*olymmath_datasets]

# Set judge_cfg for evaluation
for item in datasets:
    item['infer_cfg']['inferencer']['max_out_len'] = 32768
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg

work_dir = './outputs/olymmath_llm_eval'
```
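Assuming the snippet above is saved as a standalone config (for example `eval_olymmath_llm_judge.py`, a name chosen here for illustration), it can typically be launched with OpenCompass's usual entry point, e.g. `python run.py eval_olymmath_llm_judge.py`. Note that `key`, `openai_api_base`, and the judge model path are placeholders for your own OpenAI-compatible judge deployment and must be replaced before running.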
@ -0,0 +1,109 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets import generic_llmjudge_postprocess
|
||||
from opencompass.datasets import OlymMATHDataset
|
||||
from opencompass.evaluator import (
|
||||
CascadeEvaluator,
|
||||
GenericLLMEvaluator,
|
||||
MATHVerifyEvaluator
|
||||
)
|
||||
|
||||
|
||||
# ----------------------------- Detailed Config -----------------------------
|
||||
|
||||
math_reader_cfg = dict(input_columns=['problem'], output_column='answer', train_split='test')
|
||||
|
||||
math_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'),
|
||||
]
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
sub_sets = ['en-hard', 'zh-hard', 'en-easy', 'zh-easy']
|
||||
|
||||
GRADER_TEMPLATE = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
|
||||
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
# Evaluation configuration
|
||||
|
||||
olymmath_datasets = []
|
||||
|
||||
for sub_set in sub_sets:
|
||||
math_eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=CascadeEvaluator,
|
||||
rule_evaluator=dict(
|
||||
type=MATHVerifyEvaluator,
|
||||
),
|
||||
llm_evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
||||
],
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt = GRADER_TEMPLATE
|
||||
),
|
||||
]),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=OlymMATHDataset,
|
||||
path='RUC-AIBOX/OlymMATH',
|
||||
reader_cfg=math_reader_cfg,
|
||||
subset=sub_set,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
parallel=False,
|
||||
),
|
||||
)
|
||||
olymmath_datasets.append(
|
||||
dict(
|
||||
type=OlymMATHDataset,
|
||||
abbr=f'olymmath_{sub_set}',
|
||||
path='RUC-AIBOX/OlymMATH',
|
||||
reader_cfg=math_reader_cfg,
|
||||
infer_cfg=math_infer_cfg,
|
||||
eval_cfg=math_eval_cfg,
|
||||
subset=sub_set,
|
||||
n=1
|
||||
)
|
||||
)
|
@ -0,0 +1,5 @@
from mmengine.config import read_base

with read_base():
    # Default use LLM as a judge
    from .olymmath_llmverify_gen_97b203 import olymmath_datasets  # noqa: F401, F403
@ -0,0 +1,99 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.evaluator import GenericLLMEvaluator
|
||||
from opencompass.datasets import generic_llmjudge_postprocess
|
||||
from opencompass.datasets import OlymMATHDataset
|
||||
|
||||
|
||||
# ----------------------------- Detailed Config -----------------------------
|
||||
|
||||
math_reader_cfg = dict(input_columns=['problem'], output_column='answer', train_split='test')
|
||||
|
||||
math_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'),
|
||||
]
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
sub_sets = ['en-hard', 'zh-hard', 'en-easy', 'zh-easy']
|
||||
|
||||
GRADER_TEMPLATE = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
|
||||
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
# Evaluation configuration
|
||||
|
||||
olymmath_datasets = []
|
||||
|
||||
for sub_set in sub_sets:
|
||||
math_eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
||||
],
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt = GRADER_TEMPLATE
|
||||
),
|
||||
]),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=OlymMATHDataset,
|
||||
path='RUC-AIBOX/OlymMATH',
|
||||
reader_cfg=math_reader_cfg,
|
||||
subset=sub_set,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
pred_role='BOT',
|
||||
)
|
||||
|
||||
olymmath_datasets.append(
|
||||
dict(
|
||||
type=OlymMATHDataset,
|
||||
abbr=f'olymmath_llmjudge_{sub_set}',
|
||||
path='RUC-AIBOX/OlymMATH',
|
||||
reader_cfg=math_reader_cfg,
|
||||
infer_cfg=math_infer_cfg,
|
||||
eval_cfg=math_eval_cfg,
|
||||
subset=sub_set,
|
||||
)
|
||||
)
|
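
To make the judge prompt above concrete, here is a small illustration of how the {problem}, {answer} and {prediction} placeholders end up filled for a single sample. OpenCompass performs this substitution through its own prompt-template machinery; the snippet only mimics the effect with plain string replacement (str.format is avoided because the template also contains literal \boxed{} braces), and it assumes GRADER_TEMPLATE from the config above is in scope.

# Minimal illustration only; not how OpenCompass fills templates internally.
sample = {
    'problem': 'Compute the sum of the first 10 positive integers.',
    'answer': '55',
    'prediction': 'The sum is \\boxed{55}.',
}

judge_prompt = GRADER_TEMPLATE
for key, value in sample.items():
    judge_prompt = judge_prompt.replace('{' + key + '}', value)

print(judge_prompt)  # the text sent to the judge model for this sample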
@ -0,0 +1,114 @@
|
||||
from mmengine.config import read_base
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets import OlympiadBenchDataset, OlympiadBenchEvaluator, olympiadbench_postprocess_v2
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.evaluator import (
|
||||
GenericLLMEvaluator,
|
||||
CascadeEvaluator,
|
||||
MATHVerifyEvaluator
|
||||
)
|
||||
from opencompass.datasets import generic_llmjudge_postprocess
|
||||
|
||||
with read_base():
|
||||
from .OlympiadBench_categories import categories
|
||||
|
||||
# Create prompter instance for problems
|
||||
olympiadbench_prompter_cfg = dict(
|
||||
type='OlympiadBenchPrompter'
|
||||
)
|
||||
|
||||
olympiadbench_reader_cfg = dict(
|
||||
input_columns=[
|
||||
'problem', 'language', 'subject', 'question_type',
|
||||
'answer_type', 'is_multiple_answer', 'unit', 'questions'
|
||||
],
|
||||
output_column='solution'
|
||||
)
|
||||
|
||||
GRADER_TEMPLATE = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
|
||||
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
|
||||
olympiadbench_datasets = []
|
||||
for _name in categories:
|
||||
olympiadbench_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type='OlympiadBenchTemplate'
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
# Evaluation configuration
|
||||
olympiadbench_eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=CascadeEvaluator,
|
||||
rule_evaluator=dict(
|
||||
type=MATHVerifyEvaluator,
|
||||
),
|
||||
llm_evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
||||
],
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt = GRADER_TEMPLATE
|
||||
),
|
||||
]),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=OlympiadBenchDataset,
|
||||
path='opencompass/OlympiadBench',
|
||||
name=_name,
|
||||
reader_cfg=olympiadbench_reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
parallel=False
|
||||
)
|
||||
)
|
||||
|
||||
olympiadbench_datasets.append(
|
||||
dict(
|
||||
type=OlympiadBenchDataset,
|
||||
abbr=f'OlympiadBench_{_name}',
|
||||
path='opencompass/OlympiadBench',
|
||||
name=_name,
|
||||
reader_cfg=olympiadbench_reader_cfg,
|
||||
infer_cfg=olympiadbench_infer_cfg,
|
||||
eval_cfg=olympiadbench_eval_cfg,
|
||||
n=1,
|
||||
)
|
||||
)
|
@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .PHYSICS_llm_judge_gen_a133a2 import physics_datasets  # noqa: F401, F403
@ -0,0 +1,131 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets import (
|
||||
PHYSICSDataset,
|
||||
generic_llmjudge_postprocess,
|
||||
)
|
||||
from opencompass.evaluator import GenericLLMEvaluator
|
||||
|
||||
physics_sets = [
|
||||
'atomic_dataset_textonly',
|
||||
'electro_dataset_textonly',
|
||||
'mechanics_dataset_textonly',
|
||||
'optics_dataset_textonly',
|
||||
'quantum_dataset_textonly',
|
||||
'statistics_dataset_textonly',
|
||||
]
|
||||
|
||||
GRADER_TEMPLATE = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some questions may include multiple sub questions and sub answers. Each sub answer is given after a guide character in the form of <Answer 1:> or <Answer 2:>, etc. Please note that only when all sub predictions given in prediction correspond one-to-one with the answer and are all correct, will the prediction be considered correct; otherwise, it will be considered incorrect.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
5. The final answers in the prediction are generally given with \\boxed{}. If you cannot find sufficient \\boxed{} in the prediction, please try to find matching answers from other places within the prediction as much as possible.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: All Sub Predictions Are Correct
|
||||
B: Not Every Sub Prediction Is Correct
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either A, B. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
<Original Question Begin>: \n{input}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
# GRADER_TEMPLATE = """
|
||||
# Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
#
|
||||
# Here are some evaluation criteria:
|
||||
# 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
# 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
# 3. Some questions may include multiple sub questions and sub answers. Each sub answer is given after a guide character in the form of <Answer 1:> or <Answer 2:>, etc. Please note that as long as at least one correct answer appears in the prediction, the prediction is considered correct.
|
||||
# 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
# 5. The final answers in the prediction are generally given with \\boxed{}. If you cannot find sufficient \\boxed{} in the prediction, please try to find matching answers from other places within the prediction as much as possible.
|
||||
#
|
||||
# Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
# A: At Least One Sub Prediction is Correct
|
||||
# B: All Sub Predictions are Incorrect
|
||||
# Just return the letters "A" or "B", with no text around it.
|
||||
#
|
||||
# Here is your task. Simply reply with either A, B. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
#
|
||||
# <Original Question Begin>: \n{input}\n<Original Question End>\n\n
|
||||
# <Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
|
||||
# <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
#
|
||||
# Judging the correctness of candidates' answers:
|
||||
# """.strip()
|
||||
|
||||
physics_reader_cfg = dict(input_columns=['input'], output_column='target')
|
||||
|
||||
physics_datasets = []
|
||||
|
||||
for _name in physics_sets:
|
||||
|
||||
physics_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=f'Answer the given question step by step. Begin by explaining your reasoning process clearly. Conclude by providing the final answers at the end in LaTeX boxed format. Think step by step before answering. It should be noted that the question may include multiple sub questions, please ensure that each question is answered in order.\n\nQ: {{input}}\nA: ',
|
||||
)
|
||||
]
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
physics_eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=PHYSICSDataset,
|
||||
path='opencompass/PHYSICS-textonly',
|
||||
abbr='PHYSICS_' + _name,
|
||||
name=_name,
|
||||
reader_cfg=physics_reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
pred_role='BOT',
|
||||
)
|
||||
|
||||
physics_datasets.append(
|
||||
dict(
|
||||
abbr='PHYSICS_' + _name,
|
||||
type=PHYSICSDataset,
|
||||
path='opencompass/PHYSICS-textonly',
|
||||
name=_name,
|
||||
reader_cfg=physics_reader_cfg,
|
||||
infer_cfg=physics_infer_cfg,
|
||||
eval_cfg=physics_eval_cfg,
|
||||
)
|
||||
)
|
||||
|
@ -0,0 +1,46 @@
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets.ProteinLMBench import ProteinLMBenchDataset, ProteinLMBenchEvaluator

QUERY_TEMPLATE = "Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is the letter among {start} through {end}.\n{question}"


# Reader configuration
reader_cfg = dict(
    input_columns=['question', 'start', 'end', 'options'],
    output_column='label',
)

# Inference configuration
infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=QUERY_TEMPLATE
                )
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# Evaluation configuration
eval_cfg = dict(
    evaluator=dict(type=ProteinLMBenchEvaluator),
)

proteinlmbench_dataset = dict(
    abbr='ProteinLMBench',
    type=ProteinLMBenchDataset,
    path='tsynbio/ProteinLMBench',
    reader_cfg=reader_cfg,
    infer_cfg=infer_cfg,
    eval_cfg=eval_cfg
)

proteinlmbench_datasets = [proteinlmbench_dataset]
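
As a quick sanity check of the prompt above, the snippet below renders QUERY_TEMPLATE for one made-up row. The field values are invented for illustration; real rows are read from the 'question', 'start', 'end' and 'options' columns of the dataset, and QUERY_TEMPLATE is assumed to be in scope from the config above.

# Illustrative values only; actual rows come from the tsynbio/ProteinLMBench loader.
example = {
    'start': 'A',
    'end': 'D',
    'question': ('Which amino acid contains a sulfhydryl group?\n'
                 'A. Glycine\nB. Cysteine\nC. Alanine\nD. Proline'),
}
print(QUERY_TEMPLATE.format(**example))  # the exact text the model is prompted with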
@ -0,0 +1,89 @@
|
||||
from mmengine.config import read_base
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.evaluator import GenericLLMEvaluator
|
||||
from opencompass.datasets import generic_llmjudge_postprocess
|
||||
from opencompass.datasets.ProteinLMBench import ProteinLMBenchDataset
|
||||
|
||||
QUERY_TEMPLATE = "Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is the letter among {start} through {end}.\n{question}"
|
||||
|
||||
GRADER_TEMPLATE = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
<Original Question Begin>: {question}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
|
||||
reader_cfg = dict(
|
||||
input_columns=['question', 'start', 'end', 'options'],
|
||||
output_column='label',
|
||||
)
|
||||
|
||||
infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=ProteinLMBenchDataset,
|
||||
path='tsynbio/ProteinLMBench',
|
||||
reader_cfg=reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
)
|
||||
|
||||
proteinlmbench_dataset = dict(
|
||||
abbr='ProteinLMBench',
|
||||
type=ProteinLMBenchDataset,
|
||||
path='tsynbio/ProteinLMBench',
|
||||
reader_cfg=reader_cfg,
|
||||
infer_cfg=infer_cfg,
|
||||
eval_cfg=eval_cfg
|
||||
)
|
||||
|
||||
proteinlmbench_datasets = [proteinlmbench_dataset]
|
@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .PubMedQA_llmjudge_gen_f00302 import PubMedQA_datasets  # noqa: F401, F403
@ -0,0 +1,94 @@
|
||||
from mmengine.config import read_base
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.evaluator import GenericLLMEvaluator
|
||||
from opencompass.datasets import generic_llmjudge_postprocess
|
||||
from opencompass.datasets.PubMedQA import PubMedQADataset
|
||||
|
||||
|
||||
QUERY_TEMPLATE = """
|
||||
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes), where LETTER is one of the option letters (e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
|
||||
Question:\n
|
||||
{question}
|
||||
Options:\n
|
||||
{choices}
|
||||
""".strip()
|
||||
|
||||
GRADER_TEMPLATE = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
<Original Question Begin>: {question}\n {choices} \n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
PubMedQA_datasets = []
|
||||
|
||||
PubMedQA_reader_cfg = dict(
|
||||
input_columns=['question', 'choices'],
|
||||
output_column='label',
|
||||
)
|
||||
|
||||
PubMedQA_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
PubMedQA_eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=PubMedQADataset,
|
||||
path='qiaojin/PubMedQA',
|
||||
reader_cfg=PubMedQA_reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
)
|
||||
|
||||
PubMedQA_datasets.append(
|
||||
dict(
|
||||
abbr='PubMedQA',
|
||||
type=PubMedQADataset,
|
||||
path='qiaojin/PubMedQA',
|
||||
reader_cfg=PubMedQA_reader_cfg,
|
||||
infer_cfg=PubMedQA_infer_cfg,
|
||||
eval_cfg=PubMedQA_eval_cfg,
|
||||
)
|
||||
)
|
@ -0,0 +1,65 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.utils.text_postprocessors import first_option_postprocess
from opencompass.datasets import SciEvalDataset

# Only evaluate the test split of the biology multiple-choice subset
_hint = ('Given a question and four options, please select the right answer. '
         "Your answer should be 'A', 'B', 'C' or 'D'.")
category = [
    'biology',
]

scieval_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='test',
)

scieval_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
            ),
            dict(role='BOT', prompt='{target}\n')
        ]),
    ),
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin='</E>',
            round=[
                dict(
                    role='HUMAN',
                    prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
                ),
            ],
        ),
        ice_token='</E>',
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
    inferencer=dict(type=GenInferencer),
)

scieval_eval_cfg = dict(
    evaluator=dict(type=AccwithDetailsEvaluator),
    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)

scieval_datasets = [
    dict(
        abbr='scieval_biology',
        type=SciEvalDataset,
        path='OpenDFM/SciEval',
        name='default',
        category=category,
        reader_cfg=scieval_reader_cfg,
        infer_cfg=scieval_infer_cfg,
        eval_cfg=scieval_eval_cfg,
    )
]
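
The rule-based config above relies on first_option_postprocess to pull the chosen letter out of a free-form completion before accuracy is computed. The helper below is a rough approximation of that idea, not the actual OpenCompass postprocessor; it simply returns the first standalone A-D it can find.

import re

def naive_first_option(text: str, options: str = 'ABCD') -> str:
    """Return the first standalone option letter found in the model output, else ''."""
    match = re.search(rf'\b([{options}])\b', text)
    return match.group(1) if match else ''

print(naive_first_option('The correct choice is B because ...'))  # -> 'B'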
@ -0,0 +1,130 @@
|
||||
# SciEval_lifescience_llmjudge_gen.py
|
||||
|
||||
from mmengine.config import read_base
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import FixKRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.utils.text_postprocessors import match_answer_pattern
|
||||
from opencompass.evaluator import GenericLLMEvaluator
|
||||
from opencompass.datasets import generic_llmjudge_postprocess
|
||||
from opencompass.datasets import SciEvalDataset
|
||||
|
||||
with read_base():
|
||||
from .SciEval_lifescience_sets import SciEval_lifescience_subsets
|
||||
|
||||
category = [
|
||||
'biology',
|
||||
]
|
||||
|
||||
QUERY_TEMPLATE = """
|
||||
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD.
|
||||
|
||||
{input}
|
||||
|
||||
A) {A}
|
||||
B) {B}
|
||||
C) {C}
|
||||
D) {D}
|
||||
""".strip()
|
||||
|
||||
GRADER_TEMPLATE = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
<Original Question Begin>: {input}
|
||||
A) {A}
|
||||
B) {B}
|
||||
C) {C}
|
||||
D) {D}
|
||||
<Original Question End>
|
||||
|
||||
<Gold Target Begin>:
|
||||
{target}
|
||||
<Gold Target End>
|
||||
|
||||
<Predicted Answer Begin>:
|
||||
{prediction}
|
||||
<Predicted End>
|
||||
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
scieval_reader_cfg = dict(
|
||||
input_columns=['input', 'A', 'B', 'C', 'D'],
|
||||
output_column='target',
|
||||
train_split='test',
|
||||
)
|
||||
|
||||
scieval_datasets = []
|
||||
for name in SciEval_lifescience_subsets:
|
||||
scieval_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
|
||||
]
|
||||
)
|
||||
),
|
||||
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
scieval_eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt=(
|
||||
'You are a helpful assistant who evaluates the correctness '
|
||||
"and quality of models' outputs."
|
||||
),
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=SciEvalDataset,
|
||||
path='OpenDFM/SciEval',
|
||||
name='default',
|
||||
reader_cfg=scieval_reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
pred_role='BOT',
|
||||
)
|
||||
|
||||
scieval_datasets.append(
|
||||
dict(
|
||||
abbr=f'scieval_lifescience_{name}_llmjudge',
|
||||
type=SciEvalDataset,
|
||||
path='OpenDFM/SciEval',
|
||||
name='default',
|
||||
category=category,
|
||||
reader_cfg=scieval_reader_cfg,
|
||||
infer_cfg=scieval_infer_cfg,
|
||||
eval_cfg=scieval_eval_cfg,
|
||||
mode='singlescore',
|
||||
)
|
||||
)
|
@ -0,0 +1,6 @@
SciEval_lifescience_subsets = [
    'biology',  # college-level biology
    'physics',
    'chemistry'
]
@ -0,0 +1,92 @@
|
||||
from opencompass.datasets import SciKnowEvalDataset, SciKnowEvalEvaluator
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
|
||||
ZERO_SHOT_PROMPT = '{q4}'
|
||||
|
||||
# Reader configuration
|
||||
reader_cfg = dict(
|
||||
input_columns=[
|
||||
'prompt',
|
||||
'question',
|
||||
'choices',
|
||||
'label',
|
||||
'answerKey',
|
||||
'type',
|
||||
'domain',
|
||||
'details',
|
||||
'answer',
|
||||
'q4'
|
||||
],
|
||||
output_column='answerKey',
|
||||
)
|
||||
|
||||
# Inference configuration
|
||||
infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
|
||||
),
|
||||
],
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
# Evaluation configuration
|
||||
eval_cfg = dict(
|
||||
evaluator=dict(type=SciKnowEvalEvaluator),
|
||||
pred_role='BOT',
|
||||
)
|
||||
sciknoweval_dataset_biology = dict(
|
||||
type=SciKnowEvalDataset,
|
||||
abbr='sciknoweval_biology',
|
||||
path='hicai-zju/SciKnowEval',
|
||||
prompt_mode='zero-shot',
|
||||
subset='biology',
|
||||
reader_cfg=reader_cfg,
|
||||
infer_cfg=infer_cfg,
|
||||
eval_cfg=eval_cfg,
|
||||
)
|
||||
|
||||
sciknoweval_dataset_chemistry = dict(
|
||||
type=SciKnowEvalDataset,
|
||||
abbr='sciknoweval_chemistry',
|
||||
path='hicai-zju/SciKnowEval',
|
||||
subset='chemistry',
|
||||
prompt_mode='zero-shot',
|
||||
reader_cfg=reader_cfg,
|
||||
infer_cfg=infer_cfg,
|
||||
eval_cfg=eval_cfg,
|
||||
)
|
||||
|
||||
sciknoweval_dataset_material = dict(
|
||||
type=SciKnowEvalDataset,
|
||||
abbr='sciknoweval_material',
|
||||
path='hicai-zju/SciKnowEval',
|
||||
subset='material',
|
||||
prompt_mode='zero-shot',
|
||||
reader_cfg=reader_cfg,
|
||||
infer_cfg=infer_cfg,
|
||||
eval_cfg=eval_cfg,
|
||||
)
|
||||
|
||||
sciknoweval_dataset_physics = dict(
|
||||
type=SciKnowEvalDataset,
|
||||
abbr='sciknoweval_physics',
|
||||
path='hicai-zju/SciKnowEval',
|
||||
prompt_mode='zero-shot',
|
||||
subset='physics',
|
||||
reader_cfg=reader_cfg,
|
||||
infer_cfg=infer_cfg,
|
||||
eval_cfg=eval_cfg,
|
||||
)
|
||||
|
||||
|
||||
sciknoweval_datasets = [sciknoweval_dataset_biology, sciknoweval_dataset_chemistry, sciknoweval_dataset_physics, sciknoweval_dataset_material]
|
@ -0,0 +1,232 @@
|
||||
from opencompass.datasets import SciKnowEvalDataset
|
||||
from opencompass.datasets import generic_llmjudge_postprocess
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.evaluator import GenericLLMEvaluator
|
||||
|
||||
ZERO_SHOT_PROMPT = '{q4}'
|
||||
|
||||
GRADER_TEMPLATE = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
<Original Question Begin>: Q: {q4}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{answerKey}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
|
||||
# Reader configuration
|
||||
reader_cfg = dict(
|
||||
input_columns=[
|
||||
'prompt',
|
||||
'question',
|
||||
'choices',
|
||||
'label',
|
||||
'answerKey',
|
||||
'type',
|
||||
'domain',
|
||||
'details',
|
||||
'answer',
|
||||
'q4'
|
||||
],
|
||||
output_column='answerKey',
|
||||
)
|
||||
|
||||
# Inference configuration
|
||||
infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(
|
||||
|
||||
role='HUMAN',
|
||||
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
|
||||
),
|
||||
],
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
# Evaluation configuration
|
||||
eval_cfg_biology = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=SciKnowEvalDataset,
|
||||
path='hicai-zju/SciKnowEval',
|
||||
prompt_mode='zero-shot',
|
||||
subset='biology',
|
||||
reader_cfg=reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
)
|
||||
|
||||
eval_cfg_chemistry = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=SciKnowEvalDataset,
|
||||
path='hicai-zju/SciKnowEval',
|
||||
prompt_mode='zero-shot',
|
||||
reader_cfg=reader_cfg,
|
||||
subset='chemistry',
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
)
|
||||
|
||||
eval_cfg_material = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=SciKnowEvalDataset,
|
||||
path='hicai-zju/SciKnowEval',
|
||||
prompt_mode='zero-shot',
|
||||
reader_cfg=reader_cfg,
|
||||
subset='material',
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
)
|
||||
|
||||
eval_cfg_physics = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=SciKnowEvalDataset,
|
||||
path='hicai-zju/SciKnowEval',
|
||||
prompt_mode='zero-shot',
|
||||
reader_cfg=reader_cfg,
|
||||
subset='physics',
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
)
|
||||
|
||||
sciknoweval_dataset_biology = dict(
|
||||
type=SciKnowEvalDataset,
|
||||
abbr='sciknoweval_biology_llmjudge',
|
||||
path='hicai-zju/SciKnowEval',
|
||||
prompt_mode='zero-shot',
|
||||
subset='biology',
|
||||
reader_cfg=reader_cfg,
|
||||
infer_cfg=infer_cfg,
|
||||
eval_cfg=eval_cfg_biology,
|
||||
)
|
||||
|
||||
sciknoweval_dataset_chemistry = dict(
|
||||
type=SciKnowEvalDataset,
|
||||
abbr='sciknoweval_chemistry_llmjudge',
|
||||
path='hicai-zju/SciKnowEval',
|
||||
subset='chemistry',
|
||||
prompt_mode='zero-shot',
|
||||
reader_cfg=reader_cfg,
|
||||
infer_cfg=infer_cfg,
|
||||
eval_cfg=eval_cfg_chemistry,
|
||||
)
|
||||
sciknoweval_dataset_material = dict(
|
||||
type=SciKnowEvalDataset,
|
||||
abbr='sciknoweval_material_llmjudge',
|
||||
path='hicai-zju/SciKnowEval',
|
||||
subset='material',
|
||||
prompt_mode='zero-shot',
|
||||
reader_cfg=reader_cfg,
|
||||
infer_cfg=infer_cfg,
|
||||
eval_cfg=eval_cfg_material,
|
||||
)
|
||||
|
||||
|
||||
sciknoweval_dataset_physics = dict(
|
||||
type=SciKnowEvalDataset,
|
||||
abbr='sciknoweval_physics_llmjudge',
|
||||
path='hicai-zju/SciKnowEval',
|
||||
prompt_mode='zero-shot',
|
||||
subset='physics',
|
||||
reader_cfg=reader_cfg,
|
||||
infer_cfg=infer_cfg,
|
||||
eval_cfg=eval_cfg_physics,
|
||||
)
|
||||
sciknoweval_datasets = [sciknoweval_dataset_biology, sciknoweval_dataset_chemistry, sciknoweval_dataset_physics, sciknoweval_dataset_material]
|
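
The four eval_cfg_* blocks above differ only in the `subset` field. If this file is revisited, one low-risk cleanup would be to build them with a small factory function, as sketched below. This is a suggestion only, not part of the committed config; it reuses the names already defined in the file above (PromptTemplate, GenericLLMEvaluator, SciKnowEvalDataset, GRADER_TEMPLATE, reader_cfg, generic_llmjudge_postprocess).

def build_sciknoweval_eval_cfg(subset):
    """Hypothetical helper: one LLM-judge eval_cfg per SciKnowEval subset."""
    return dict(
        evaluator=dict(
            type=GenericLLMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                        )
                    ],
                    round=[
                        dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                    ],
                ),
            ),
            dataset_cfg=dict(
                type=SciKnowEvalDataset,
                path='hicai-zju/SciKnowEval',
                prompt_mode='zero-shot',
                subset=subset,
                reader_cfg=reader_cfg,
            ),
            judge_cfg=dict(),
            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
        ),
    )

# e.g. eval_cfg_biology = build_sciknoweval_eval_cfg('biology')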
@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .ScienceQA_llmjudge_gen_f00302 import ScienceQA_datasets  # noqa: F401, F403
@ -0,0 +1,94 @@
|
||||
from mmengine.config import read_base
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.evaluator import GenericLLMEvaluator
|
||||
from opencompass.datasets import generic_llmjudge_postprocess
|
||||
from opencompass.datasets.ScienceQA import ScienceQADataset
|
||||
|
||||
|
||||
QUERY_TEMPLATE = """
|
||||
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes), where LETTER is one of the option letters (e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
|
||||
Question:\n
|
||||
{question}
|
||||
Options:\n
|
||||
{choices}
|
||||
""".strip()
|
||||
|
||||
GRADER_TEMPLATE = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
<Original Question Begin>: {question}\n {choices} \n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
ScienceQA_datasets = []
|
||||
|
||||
ScienceQA_reader_cfg = dict(
|
||||
input_columns=['question', 'choices'],
|
||||
output_column='label',
|
||||
)
|
||||
|
||||
ScienceQA_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
ScienceQA_eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=ScienceQADataset,
|
||||
path='derek-thomas/ScienceQA',
|
||||
reader_cfg=ScienceQA_reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
)
|
||||
|
||||
ScienceQA_datasets.append(
|
||||
dict(
|
||||
abbr='ScienceQA',
|
||||
type=ScienceQADataset,
|
||||
path='derek-thomas/ScienceQA',
|
||||
reader_cfg=ScienceQA_reader_cfg,
|
||||
infer_cfg=ScienceQA_infer_cfg,
|
||||
eval_cfg=ScienceQA_eval_cfg,
|
||||
)
|
||||
)
|
@ -0,0 +1,10 @@
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_nc_0shot_instruct import nc_0shot_instruct_datasets
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_pp_acc_0_shot_instruct import pp_acc_datasets_0shot_instruct
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_rmse_0shot_instruct import pp_rmse_0shot_instruct_datasets
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_fts_0shot_instruct import fts_0shot_instruct_datasets
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_meteor_0shot_instruct import meteor_0shot_instruct_datasets

smolinstruct_datasets_0shot_instruct = nc_0shot_instruct_datasets + pp_rmse_0shot_instruct_datasets + pp_acc_datasets_0shot_instruct + meteor_0shot_instruct_datasets + fts_0shot_instruct_datasets
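
For context, a combined list like smolinstruct_datasets_0shot_instruct is normally consumed from a top-level run config. The sketch below shows the usual pattern; both import paths are illustrative placeholders (the wrapper module name and the model config are assumptions, not part of this commit).

from mmengine.config import read_base

with read_base():
    # Placeholder path for the wrapper file defined above.
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_0shot_instruct_gen import \
        smolinstruct_datasets_0shot_instruct  # noqa: F401
    # Placeholder model config; substitute any chat-model config shipped with OpenCompass.
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \
        models as lmdeploy_internlm3_8b_instruct_model  # noqa: F401

datasets = smolinstruct_datasets_0shot_instruct
models = lmdeploy_internlm3_8b_instruct_model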
@ -0,0 +1,55 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.smolinstruct import FTSEvaluator
from opencompass.datasets import SmolInstructDataset

fts_0shot_reader_cfg = dict(
    input_columns=['input'],
    output_column='output',
    train_split='validation')

fts_hint_dict = {
    'MG': """You are an expert chemist. Given the description of a molecule, your task is to generate the potential SMILES representation of the molecule.
The input contains the description of the molecule. Your reply should contain the potential SMILES representation of the molecule wrapped in <SMILES> and </SMILES> tags. Your reply must be valid and chemically reasonable.""",
    'FS': """You are an expert chemist. Given the SMILES representation of reactants and reagents, your task is to predict the potential product using your chemical reaction knowledge.
The input contains both reactants and reagents, and different reactants and reagents are separated by ".". Your reply should contain the SMILES representation of the predicted product wrapped in <SMILES> and </SMILES> tags. Your reply must be valid and chemically reasonable.""",
    'RS': """You are an expert chemist. Given the SMILES representation of the product, your task is to predict the potential reactants and reagents using your chemical reaction knowledge.
The input contains the SMILES representation of the product. Your reply should contain the SMILES representation of both reactants and reagents, and all reactants and reagents should be enclosed **together** within a single pair of <SMILES> and </SMILES> tags, separated by ".". Your reply must be valid and chemically reasonable.""",
}

name_dict = {
    'MG': 'molecule_generation',
    'FS': 'forward_synthesis',
    'RS': 'retrosynthesis'
}

fts_0shot_instruct_datasets = []
for _name in name_dict:
    _hint = fts_hint_dict[_name]
    fts_0shot_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
            # template=f'<s>[INST] {{input}} [/INST]',
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    fts_0shot_eval_cfg = dict(
        evaluator=dict(type=FTSEvaluator),
    )

    fts_0shot_instruct_datasets.append(
        dict(
            abbr=f'{_name}-0shot-instruct',
            type=SmolInstructDataset,
            path='osunlp/SMolInstruct',
            name=name_dict[_name],
            reader_cfg=fts_0shot_reader_cfg,
            infer_cfg=fts_0shot_infer_cfg,
            eval_cfg=fts_0shot_eval_cfg,
        ))

del _name
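
FTSEvaluator scores fingerprint Tanimoto similarity between the predicted and reference molecules. The snippet below is an independent sketch of that metric, assuming RDKit is installed; it extracts the SMILES string from the tagged reply and compares Morgan fingerprints. It is not the FTSEvaluator implementation itself.

import re
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

def extract_smiles(reply: str) -> str:
    """Pull the SMILES string out of a '<SMILES> ... </SMILES>' tagged reply."""
    match = re.search(r'<SMILES>(.*?)</SMILES>', reply, re.DOTALL)
    return match.group(1).strip() if match else reply.strip()

def tanimoto(pred_reply: str, gold_smiles: str) -> float:
    """Morgan-fingerprint Tanimoto similarity; 0.0 if the prediction does not parse."""
    pred_mol = Chem.MolFromSmiles(extract_smiles(pred_reply))
    gold_mol = Chem.MolFromSmiles(gold_smiles)
    if pred_mol is None or gold_mol is None:
        return 0.0
    pred_fp = AllChem.GetMorganFingerprintAsBitVect(pred_mol, 2, nBits=2048)
    gold_fp = AllChem.GetMorganFingerprintAsBitVect(gold_mol, 2, nBits=2048)
    return DataStructs.TanimotoSimilarity(pred_fp, gold_fp)

print(tanimoto('<SMILES> CCO </SMILES>', 'CCO'))  # identical molecules -> 1.0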
@ -0,0 +1,73 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets.smolinstruct import FTSEvaluator
|
||||
from opencompass.datasets import SmolInstructDataset
|
||||
|
||||
fts_reader_cfg = dict(
|
||||
input_columns=['input'],
|
||||
output_column='output',
|
||||
train_split='validation')
|
||||
|
||||
fts_hint_dict = {
|
||||
'MG': """You are an expert chemist. Given the description of a molecule, your task is to generate the potential SMILES representation of the molecule.
|
||||
The input contains the description of the molecule. Your reply should contain the potential SMILES representation of the molecule wrapped in <SMILES> and </SMILES> tags. Your reply must be valid and chemically reasonable.""",
|
||||
'FS': """You are an expert chemist. Given the SMILES representation of reactants and reagents, your task is to predict the potential product using your chemical reaction knowledge.
|
||||
The input contains both reactants and reagents, and different reactants and reagents are separated by ".". Your reply should contain only the SMILES representation of the predicted product and no other text. Your reply must be valid and chemically reasonable.""",
|
||||
'RS': """You are an expert chemist. Given the SMILES representation of the product, your task is to predict the potential reactants and reagents using your chemical reaction knowledge.
|
||||
The input contains the SMILES representation of the product. Your reply should contain the SMILES representation of both reactants and reagents, and different reactants and reagents should be separated by ".". Your reply must be valid and chemically reasonable.""",
|
||||
}
|
||||
|
||||
name_dict = {
|
||||
'MG': 'molecule_generation',
|
||||
'FS': 'forward_synthesis',
|
||||
'RS': 'retrosynthesis'
|
||||
}
|
||||
|
||||
fts_datasets = []
|
||||
for _name in fts_hint_dict:
|
||||
_hint = fts_hint_dict[_name]
|
||||
fts_infer_cfg = dict(
|
||||
ice_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
|
||||
),
|
||||
dict(role='BOT', prompt='{output}\n')
|
||||
]),
|
||||
),
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin='</E>',
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
|
||||
),
|
||||
],
|
||||
),
|
||||
ice_token='</E>',
|
||||
),
|
||||
retriever=dict(type=FixKRetriever, fix_id_list=[0]),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
fts_eval_cfg = dict(
|
||||
evaluator=dict(type=FTSEvaluator),
|
||||
)
|
||||
|
||||
fts_datasets.append(
|
||||
dict(
|
||||
abbr=f'{_name}',
|
||||
type=SmolInstructDataset,
|
||||
path='osunlp/SMolInstruct',
|
||||
name=name_dict[_name],
|
||||
reader_cfg=fts_reader_cfg,
|
||||
infer_cfg=fts_infer_cfg,
|
||||
eval_cfg=fts_eval_cfg,
|
||||
))
|
||||
|
||||
del _name, _hint
|
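A note on the metric: FTSEvaluator scores a generated SMILES string against the reference by molecular fingerprint Tanimoto similarity, so chemically equivalent but textually different SMILES can still score highly. The snippet below is a minimal illustrative sketch of that kind of metric using RDKit; the function name and fingerprint settings are assumptions, not the evaluator's actual internals.

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

def fingerprint_tanimoto(pred_smiles, ref_smiles):
    """Illustrative: Morgan-fingerprint Tanimoto similarity; 0.0 for invalid SMILES."""
    pred_mol = Chem.MolFromSmiles(pred_smiles)
    ref_mol = Chem.MolFromSmiles(ref_smiles)
    if pred_mol is None or ref_mol is None:
        return 0.0
    pred_fp = AllChem.GetMorganFingerprintAsBitVect(pred_mol, 2, nBits=2048)
    ref_fp = AllChem.GetMorganFingerprintAsBitVect(ref_mol, 2, nBits=2048)
    return DataStructs.TanimotoSimilarity(pred_fp, ref_fp)

# 'CCO' and 'OCC' denote the same molecule (ethanol), so the score is 1.0.
print(fingerprint_tanimoto('CCO', 'OCC'))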
@ -0,0 +1,10 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from opencompass.configs.datasets.SmolInstruct.smolinstruct_nc_gen_c84c18 import nc_datasets
|
||||
from opencompass.configs.datasets.SmolInstruct.smolinstruct_pp_acc_gen_8607a3 import pp_acc_datasets
|
||||
from opencompass.configs.datasets.SmolInstruct.smolinstruct_rmse_gen_0fcc6b import pp_rmse_datasets
|
||||
from opencompass.configs.datasets.SmolInstruct.smolinstruct_fts_gen_5774b5 import fts_datasets
|
||||
from opencompass.configs.datasets.SmolInstruct.smolinstruct_meteor_gen_065150 import meteor_datasets
|
||||
|
||||
smolinstruct_datasets = nc_datasets + pp_rmse_datasets + pp_acc_datasets + meteor_datasets + fts_datasets
|
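For context, an aggregate list like smolinstruct_datasets above is normally consumed from a top-level run config. A minimal sketch, assuming the aggregate file is exposed as opencompass.configs.datasets.SmolInstruct.smolinstruct_gen and that a packaged chat-model config is imported alongside it (both import paths are assumptions):

from mmengine.config import read_base

with read_base():
    # dataset collection defined above (module name assumed)
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_gen import \
        smolinstruct_datasets
    # any packaged chat-model config works here; this one is only an example
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
        models as hf_internlm2_5_7b_chat_model

datasets = smolinstruct_datasets
models = hf_internlm2_5_7b_chat_model
# launch e.g. with:  python run.py path/to/this_config.py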
@ -0,0 +1,49 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets.smolinstruct import MeteorEvaluator
|
||||
from opencompass.datasets import SmolInstructDataset
|
||||
|
||||
meteor_0shot_reader_cfg = dict(
|
||||
input_columns=['input'],
|
||||
output_column='output',
|
||||
train_split='validation')
|
||||
|
||||
meteor_hint_dict = {
|
||||
'MC': """You are an expert chemist. Given the SMILES representation of a molecule, your task is to describe the molecule in natural language.
|
||||
The input contains the SMILES representation of the molecule. Your reply should contain a natural language description of the molecule. Your reply must be valid and chemically reasonable.""",
|
||||
}
|
||||
|
||||
name_dict = {
|
||||
'MC': 'molecule_captioning',
|
||||
}
|
||||
|
||||
meteor_0shot_instruct_datasets = []
|
||||
for _name in name_dict:
|
||||
_hint = meteor_hint_dict[_name]
|
||||
meteor_0shot_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
|
||||
# template=f'<s>[INST] {{input}} [/INST]',
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
meteor_0shot_eval_cfg = dict(
|
||||
evaluator=dict(type=MeteorEvaluator),
|
||||
)
|
||||
|
||||
meteor_0shot_instruct_datasets.append(
|
||||
dict(
|
||||
abbr=f'{_name}-0shot-instruct',
|
||||
type=SmolInstructDataset,
|
||||
path='osunlp/SMolInstruct',
|
||||
name=name_dict[_name],
|
||||
reader_cfg=meteor_0shot_reader_cfg,
|
||||
infer_cfg=meteor_0shot_infer_cfg,
|
||||
eval_cfg=meteor_0shot_eval_cfg,
|
||||
))
|
||||
|
||||
del _name
|
@ -0,0 +1,67 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets.smolinstruct import MeteorEvaluator
|
||||
from opencompass.datasets import SmolInstructDataset
|
||||
|
||||
meteor_reader_cfg = dict(
|
||||
input_columns=['input'],
|
||||
output_column='output',
|
||||
train_split='validation')
|
||||
|
||||
meteor_hint_dict = {
|
||||
'MC': """You are an expert chemist. Given the SMILES representation of a molecule, your task is to describe the molecule in natural language.
|
||||
The input contains the SMILES representation of the molecule. Your reply should contain a natural language description of the molecule. Your reply must be valid and chemically reasonable.""",
|
||||
}
|
||||
|
||||
name_dict = {
|
||||
'MC': 'molecule_captioning',
|
||||
}
|
||||
|
||||
meteor_datasets = []
|
||||
for _name in meteor_hint_dict:
|
||||
_hint = meteor_hint_dict[_name]
|
||||
meteor_infer_cfg = dict(
|
||||
ice_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
|
||||
),
|
||||
dict(role='BOT', prompt='{output}\n')
|
||||
]),
|
||||
),
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin='</E>',
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
|
||||
),
|
||||
],
|
||||
),
|
||||
ice_token='</E>',
|
||||
),
|
||||
retriever=dict(type=FixKRetriever, fix_id_list=[0]),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
meteor_eval_cfg = dict(
|
||||
evaluator=dict(type=MeteorEvaluator),
|
||||
)
|
||||
|
||||
meteor_datasets.append(
|
||||
dict(
|
||||
abbr=f'{_name}',
|
||||
type=SmolInstructDataset,
|
||||
path='osunlp/SMolInstruct',
|
||||
name=name_dict[_name],
|
||||
reader_cfg=meteor_reader_cfg,
|
||||
infer_cfg=meteor_infer_cfg,
|
||||
eval_cfg=meteor_eval_cfg,
|
||||
))
|
||||
|
||||
del _name, _hint
|
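MeteorEvaluator scores the generated molecule description against the reference caption with METEOR. Whether the evaluator wraps NLTK or ships its own implementation is not shown here; the following is only a minimal sketch of computing METEOR with NLTK for intuition.

import nltk
from nltk.translate.meteor_score import meteor_score

nltk.download('wordnet', quiet=True)  # METEOR uses WordNet for synonym matching

reference = 'The molecule is a primary alcohol and a member of ethanols.'
prediction = 'This molecule is a primary alcohol.'

# Recent NLTK releases expect pre-tokenized inputs.
score = meteor_score([reference.split()], prediction.split())
print(round(score, 3))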
@ -0,0 +1,63 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets.smolinstruct import NCExactMatchEvaluator, NCElementMatchEvaluator
|
||||
from opencompass.datasets import SmolInstructDataset
|
||||
from opencompass.utils.text_postprocessors import first_capital_postprocess
|
||||
|
||||
nc_0shot_reader_cfg = dict(
|
||||
input_columns=['input'],
|
||||
output_column='output',
|
||||
train_split='validation')
|
||||
|
||||
nc_hint_dict = {
|
||||
'I2F': """You are an expert chemist. Given the IUPAC representation of compounds, your task is to predict the molecular formula of the compound.
|
||||
The input contains the IUPAC representation of the compound. Your reply should contain only the molecular formula of the compound wrapped in <MOLFORMULA> and </MOLFORMULA> tags and no other text. Your reply must be valid and chemically reasonable.""",
|
||||
'I2S': """You are an expert chemist. Given the IUPAC representation of compounds, your task is to predict the SMILES representation of the compound.
|
||||
The input contains the IUPAC representation of the compound. Your reply should contain only the SMILES representation of the compound wrapped in <SMILES> and </SMILES> tags and no other text. Your reply must be valid and chemically reasonable.""",
|
||||
'S2F': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the molecular formula of the compound.
|
||||
The input contains the SMILES representation of the compound. Your reply should contain only the molecular formula of the compound wrapped in <MOLFORMULA> and </MOLFORMULA> tags and no other text. Your reply must be valid and chemically reasonable.""",
|
||||
'S2I': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the IUPAC representation of the compound.
|
||||
The input contains the SMILES representation of the compound. Your reply should contain only the IUPAC representation of the compound wrapped in <IUPAC> and </IUPAC> tags and no other text. Your reply must be valid and chemically reasonable.""",
|
||||
}
|
||||
|
||||
name_dict = {
|
||||
'I2F': 'name_conversion-i2f',
|
||||
'I2S': 'name_conversion-i2s',
|
||||
'S2F': 'name_conversion-s2f',
|
||||
'S2I': 'name_conversion-s2i',
|
||||
}
|
||||
|
||||
nc_0shot_instruct_datasets = []
|
||||
for _name in name_dict:
|
||||
_hint = nc_hint_dict[_name]
|
||||
nc_0shot_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
|
||||
# template=f'<s>[INST] {{input}} [/INST]',
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
if _name in ['I2F', 'S2F']:
|
||||
nc_0shot_eval_cfg = dict(
|
||||
evaluator=dict(type=NCElementMatchEvaluator),
|
||||
)
|
||||
else:
|
||||
nc_0shot_eval_cfg = dict(
|
||||
evaluator=dict(type=NCExactMatchEvaluator),
|
||||
)
|
||||
|
||||
nc_0shot_instruct_datasets.append(
|
||||
dict(
|
||||
abbr=f'NC-{_name}-0shot-instruct',
|
||||
type=SmolInstructDataset,
|
||||
path='osunlp/SMolInstruct',
|
||||
name=name_dict[_name],
|
||||
reader_cfg=nc_0shot_reader_cfg,
|
||||
infer_cfg=nc_0shot_infer_cfg,
|
||||
eval_cfg=nc_0shot_eval_cfg,
|
||||
))
|
||||
|
||||
del _name
|
@ -0,0 +1,93 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets.smolinstruct import NCExactMatchEvaluator, NCElementMatchEvaluator
|
||||
from opencompass.datasets import SmolInstructDataset
|
||||
from opencompass.utils.text_postprocessors import first_capital_postprocess
|
||||
|
||||
nc_reader_cfg = dict(
|
||||
input_columns=['input'],
|
||||
output_column='output',
|
||||
train_split='validation')
|
||||
|
||||
nc_hint_dict = {
|
||||
'I2F': """You are an expert chemist. Given the IUPAC representation of compounds, your task is to predict the molecular formula of the compound.
|
||||
The input contains the IUPAC representation of the compound. Your reply should contain only the molecular formula of the compound wrapped in <MOLFORMULA> and </MOLFORMULA> tags and no other text. Your reply must be valid and chemically reasonable.""",
|
||||
'I2S': """You are an expert chemist. Given the IUPAC representation of compounds, your task is to predict the SMILES representation of the compound.
|
||||
The input contains the IUPAC representation of the compound. Your reply should contain only the SMILES representation of the compound wrapped in <SMILES> and </SMILES> tags and no other text. Your reply must be valid and chemically reasonable.""",
|
||||
'S2F': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the molecular formula of the compound.
|
||||
The input contains the SMILES representation of the compound. Your reply should contain only the molecular formula of the compound wrapped in <MOLFORMULA> and </MOLFORMULA> tags and no other text. Your reply must be valid and chemically reasonable.""",
|
||||
'S2I': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the IUPAC representation of the compound.
|
||||
The input contains the SMILES representation of the compound. Your reply should contain only the IUPAC representation of the compound wrapped in <IUPAC> and </IUPAC> tags and no other text. Your reply must be valid and chemically reasonable.""",
|
||||
}
|
||||
|
||||
name_dict = {
|
||||
'I2F': 'name_conversion-i2f',
|
||||
'I2S': 'name_conversion-i2s',
|
||||
'S2F': 'name_conversion-s2f',
|
||||
'S2I': 'name_conversion-s2i',
|
||||
}
|
||||
|
||||
nc_datasets = []
|
||||
for _name in nc_hint_dict:
|
||||
_hint = nc_hint_dict[_name]
|
||||
nc_infer_cfg = dict(
|
||||
ice_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
|
||||
),
|
||||
dict(role='BOT', prompt='{output}\n')
|
||||
]),
|
||||
),
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin='</E>',
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
|
||||
),
|
||||
],
|
||||
),
|
||||
ice_token='</E>',
|
||||
),
|
||||
retriever=dict(type=FixKRetriever, fix_id_list=[0]),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
# nc_infer_cfg = dict(
|
||||
# prompt_template=dict(
|
||||
# type=PromptTemplate,
|
||||
# template=dict(
|
||||
# round=[
|
||||
# dict(role='HUMAN', prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '),
|
||||
# ],
|
||||
# ),
|
||||
# ),
|
||||
# retriever=dict(type=ZeroRetriever),
|
||||
# inferencer=dict(type=GenInferencer),
|
||||
# )
|
||||
if _name in ['I2F', 'S2F']:
|
||||
nc_eval_cfg = dict(
|
||||
evaluator=dict(type=NCElementMatchEvaluator),
|
||||
)
|
||||
else:
|
||||
nc_eval_cfg = dict(
|
||||
evaluator=dict(type=NCExactMatchEvaluator),
|
||||
)
|
||||
|
||||
nc_datasets.append(
|
||||
dict(
|
||||
abbr=f'NC-{_name}',
|
||||
type=SmolInstructDataset,
|
||||
path='osunlp/SMolInstruct',
|
||||
name=name_dict[_name],
|
||||
reader_cfg=nc_reader_cfg,
|
||||
infer_cfg=nc_infer_cfg,
|
||||
eval_cfg=nc_eval_cfg,
|
||||
))
|
||||
|
||||
del _name, _hint
|
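Note the evaluator split above: I2F and S2F (formula targets) use NCElementMatchEvaluator, while I2S and S2I fall back to exact match, because the same molecular formula can list its elements in different orders. The actual evaluators live in opencompass.datasets.smolinstruct; the sketch below is only a hypothetical illustration of element-level formula comparison.

import re
from collections import Counter

def formula_to_counts(formula):
    """Parse e.g. 'C6H12O6' into Counter({'H': 12, 'C': 6, 'O': 6})."""
    counts = Counter()
    for element, num in re.findall(r'([A-Z][a-z]?)(\d*)', formula):
        counts[element] += int(num) if num else 1
    return counts

def element_match(pred, ref):
    # Order-insensitive comparison: C6H12O6 == H12C6O6, but != C6H12O5.
    return formula_to_counts(pred) == formula_to_counts(ref)

assert element_match('C6H12O6', 'H12C6O6')
assert not element_match('C6H12O6', 'C6H12O5')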
@ -0,0 +1,61 @@
|
||||
from opencompass.openicl import AccEvaluator
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets import SmolInstructDataset
|
||||
from opencompass.datasets.smolinstruct import smolinstruct_acc_0shot_postprocess
|
||||
|
||||
pp_acc_reader_cfg = dict(
|
||||
input_columns=['input'],
|
||||
output_column='output',
|
||||
train_split='validation')
|
||||
|
||||
pp_acc_hint_dict = {
|
||||
'BBBP': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether blood-brain barrier permeability (BBBP) is a property of the compound.
|
||||
The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
|
||||
'ClinTox': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound is toxic.
|
||||
The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
|
||||
'HIV': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound serve as an inhibitor of HIV replication.
|
||||
The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
|
||||
'SIDER': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound has any side effects.
|
||||
The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
|
||||
}
|
||||
|
||||
name_dict = {
|
||||
'BBBP': 'property_prediction-bbbp',
|
||||
'ClinTox': 'property_prediction-clintox',
|
||||
'HIV': 'property_prediction-hiv',
|
||||
'SIDER': 'property_prediction-sider',
|
||||
}
|
||||
|
||||
pp_acc_datasets_0shot_instruct = []
|
||||
for _name in pp_acc_hint_dict:
|
||||
_hint = pp_acc_hint_dict[_name]
|
||||
|
||||
pp_acc_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
|
||||
# template=f'<s>[INST] {{input}} [/INST]',
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
pp_acc_eval_cfg = dict(
|
||||
evaluator=dict(type=AccEvaluator),
|
||||
pred_postprocessor=dict(type=smolinstruct_acc_0shot_postprocess)
|
||||
)
|
||||
|
||||
pp_acc_datasets_0shot_instruct.append(
|
||||
dict(
|
||||
abbr=f'PP-{_name}-0shot-instruct',
|
||||
type=SmolInstructDataset,
|
||||
path='osunlp/SMolInstruct',
|
||||
name=name_dict[_name],
|
||||
reader_cfg=pp_acc_reader_cfg,
|
||||
infer_cfg=pp_acc_infer_cfg,
|
||||
eval_cfg=pp_acc_eval_cfg,
|
||||
))
|
||||
|
||||
del _name, _hint
|
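smolinstruct_acc_0shot_postprocess normalizes the model's free-form reply into a Yes/No label before AccEvaluator compares it with the gold answer; its real logic is in opencompass.datasets.smolinstruct. The function below is a hypothetical stand-in that only shows the general shape of such a postprocessor.

import re

def yes_no_postprocess(text):
    """Hypothetical: map a free-form reply to 'Yes'/'No' ('' if neither appears)."""
    match = re.search(r'\b(yes|no)\b', text, flags=re.IGNORECASE)
    return match.group(1).capitalize() if match else ''

print(yes_no_postprocess('Answer: yes, the compound crosses the blood-brain barrier.'))  # Yes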
@ -0,0 +1,79 @@
|
||||
from opencompass.openicl import AccEvaluator
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets import SmolInstructDataset
|
||||
from opencompass.datasets.smolinstruct import smolinstruct_acc_postprocess
|
||||
|
||||
pp_acc_reader_cfg = dict(
|
||||
input_columns=['input'],
|
||||
output_column='output',
|
||||
train_split='validation')
|
||||
|
||||
pp_acc_hint_dict = {
|
||||
'BBBP': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether blood-brain barrier permeability (BBBP) is a property of the compound.
|
||||
The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
|
||||
'ClinTox': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound is toxic.
|
||||
The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
|
||||
'HIV': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound serve as an inhibitor of HIV replication.
|
||||
The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
|
||||
'SIDER': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound has any side effects.
|
||||
The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
|
||||
}
|
||||
|
||||
name_dict = {
|
||||
'BBBP': 'property_prediction-bbbp',
|
||||
'ClinTox': 'property_prediction-clintox',
|
||||
'HIV': 'property_prediction-hiv',
|
||||
'SIDER': 'property_prediction-sider',
|
||||
}
|
||||
|
||||
pp_acc_datasets = []
|
||||
for _name in pp_acc_hint_dict:
|
||||
_hint = pp_acc_hint_dict[_name]
|
||||
|
||||
pp_acc_infer_cfg = dict(
|
||||
ice_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
|
||||
),
|
||||
dict(role='BOT', prompt='{output}\n')
|
||||
]),
|
||||
),
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin='</E>',
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
|
||||
),
|
||||
],
|
||||
),
|
||||
ice_token='</E>',
|
||||
),
|
||||
retriever=dict(type=FixKRetriever, fix_id_list=[0]),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
pp_acc_eval_cfg = dict(
|
||||
evaluator=dict(type=AccEvaluator),
|
||||
pred_postprocessor=dict(type=smolinstruct_acc_postprocess)
|
||||
)
|
||||
|
||||
pp_acc_datasets.append(
|
||||
dict(
|
||||
abbr=f'PP-{_name}',
|
||||
type=SmolInstructDataset,
|
||||
path='osunlp/SMolInstruct',
|
||||
name=name_dict[_name],
|
||||
reader_cfg=pp_acc_reader_cfg,
|
||||
infer_cfg=pp_acc_infer_cfg,
|
||||
eval_cfg=pp_acc_eval_cfg,
|
||||
))
|
||||
|
||||
del _name, _hint
|
@ -0,0 +1,52 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets.smolinstruct import RMSEEvaluator
|
||||
from opencompass.datasets import SmolInstructDataset
|
||||
|
||||
pp_rmse_0shot_reader_cfg = dict(
|
||||
input_columns=['input'],
|
||||
output_column='output',
|
||||
train_split='validation')
|
||||
|
||||
pp_rmse_hint_dict = {
|
||||
'ESOL': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the log solubility of the compound.
|
||||
The input contains the SMILES representation of the compound. Your reply should contain the log solubility of the compound wrapped in \\boxed{}. Your reply must be valid and chemically reasonable.""",
|
||||
'Lipo': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the octanol/water partition coefficient of the compound.
|
||||
The input contains the SMILES representation of the compound. Your reply should contain the octanol/water partition coefficient of the compound wrapped in \\boxed{}. Your reply must be valid and chemically reasonable."""
|
||||
}
|
||||
|
||||
name_dict = {
|
||||
'ESOL': 'property_prediction-esol',
|
||||
'Lipo': 'property_prediction-lipo'
|
||||
}
|
||||
|
||||
pp_rmse_0shot_instruct_datasets = []
|
||||
for _name in name_dict:
|
||||
_hint = pp_rmse_hint_dict[_name]
|
||||
pp_rmse_0shot_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
|
||||
# template=f'<s>[INST] {{input}} [/INST]',
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
pp_rmse_0shot_eval_cfg = dict(
|
||||
evaluator=dict(type=RMSEEvaluator),
|
||||
)
|
||||
|
||||
pp_rmse_0shot_instruct_datasets.append(
|
||||
dict(
|
||||
abbr=f'PP-{_name}-0shot-instruct',
|
||||
type=SmolInstructDataset,
|
||||
path='osunlp/SMolInstruct',
|
||||
name=name_dict[_name],
|
||||
reader_cfg=pp_rmse_0shot_reader_cfg,
|
||||
infer_cfg=pp_rmse_0shot_infer_cfg,
|
||||
eval_cfg=pp_rmse_0shot_eval_cfg,
|
||||
))
|
||||
|
||||
del _name
|
@ -0,0 +1,70 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets.smolinstruct import RMSEEvaluator
|
||||
from opencompass.datasets import SmolInstructDataset
|
||||
|
||||
pp_rmse_reader_cfg = dict(
|
||||
input_columns=['input'],
|
||||
output_column='output',
|
||||
train_split='validation')
|
||||
|
||||
pp_rmse_hint_dict = {
|
||||
'ESOL': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the log solubility of the compound.
|
||||
The input contains the SMILES representation of the compound. Your reply should contain the log solubility of the compound wrapped in <NUMBER> and </NUMBER> tags. Your reply must be valid and chemically reasonable.""",
|
||||
'Lipo': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the octanol/water partition coefficient of the compound.
|
||||
The input contains the SMILES representation of the compound. Your reply should contain the octanol/water partition coefficient of the compound wrapped in <NUMBER> and </NUMBER> tags. Your reply must be valid and chemically reasonable."""
|
||||
}
|
||||
|
||||
name_dict = {
|
||||
'ESOL': 'property_prediction-esol',
|
||||
'Lipo': 'property_prediction-lipo'
|
||||
}
|
||||
|
||||
pp_rmse_datasets = []
|
||||
for _name in pp_rmse_hint_dict:
|
||||
_hint = pp_rmse_hint_dict[_name]
|
||||
pp_rmse_infer_cfg = dict(
|
||||
ice_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
|
||||
),
|
||||
dict(role='BOT', prompt='{output}\n')
|
||||
]),
|
||||
),
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin='</E>',
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
|
||||
),
|
||||
],
|
||||
),
|
||||
ice_token='</E>',
|
||||
),
|
||||
retriever=dict(type=FixKRetriever, fix_id_list=[0]),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
pp_rmse_eval_cfg = dict(
|
||||
evaluator=dict(type=RMSEEvaluator),
|
||||
)
|
||||
|
||||
pp_rmse_datasets.append(
|
||||
dict(
|
||||
abbr=f'PP-{_name}',
|
||||
type=SmolInstructDataset,
|
||||
path='osunlp/SMolInstruct',
|
||||
name=name_dict[_name],
|
||||
reader_cfg=pp_rmse_reader_cfg,
|
||||
infer_cfg=pp_rmse_infer_cfg,
|
||||
eval_cfg=pp_rmse_eval_cfg,
|
||||
))
|
||||
|
||||
del _name, _hint
|
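RMSEEvaluator compares the numeric value extracted from each prediction with the reference value. A minimal illustrative sketch, assuming values arrive wrapped in <NUMBER> tags as the few-shot prompt above requests (the fixed penalty for unparsable predictions is an arbitrary choice here, not the evaluator's):

import math
import re

def extract_number(text):
    match = re.search(r'<NUMBER>\s*(-?\d+(?:\.\d+)?)\s*</NUMBER>', text)
    return float(match.group(1)) if match else None

def rmse(predictions, references):
    errors = []
    for pred, ref in zip(predictions, references):
        p, r = extract_number(pred), extract_number(ref)
        # Count unparsable pairs as a large fixed error instead of silently skipping them.
        errors.append((p - r) ** 2 if p is not None and r is not None else 100.0)
    return math.sqrt(sum(errors) / len(errors))

print(rmse(['<NUMBER>-2.1</NUMBER>'], ['<NUMBER>-2.0</NUMBER>']))  # ~0.1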
@ -0,0 +1,118 @@
|
||||
"""
|
||||
Summary: A config for AIME-2024 Evaluation.
|
||||
Setting:
|
||||
Shot: 0-shot
|
||||
Evaluator:
|
||||
- CascadeEvaluator
|
||||
- MATHVerifyEvaluator
|
||||
- GenericLLMEvaluator
|
||||
Repeat: 1
|
||||
Available Models:
|
||||
- Instruct/Chat Models
|
||||
"""
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets import generic_llmjudge_postprocess
|
||||
from opencompass.datasets import Aime2024Dataset
|
||||
from opencompass.evaluator import (
|
||||
CascadeEvaluator,
|
||||
GenericLLMEvaluator,
|
||||
MATHVerifyEvaluator
|
||||
)
|
||||
|
||||
|
||||
aime2024_reader_cfg = dict(input_columns=['question'], output_column='answer')
|
||||
|
||||
|
||||
aime2024_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt='{question}\nRemember to put your final answer within \\boxed{}.',
|
||||
),
|
||||
],
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
|
||||
GRADER_TEMPLATE = """
|
||||
Please act as a grading expert and judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT or INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
|
||||
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
cascade_evaluator = dict(
|
||||
type=CascadeEvaluator,
|
||||
rule_evaluator=dict(
|
||||
type=MATHVerifyEvaluator,
|
||||
),
|
||||
llm_evaluator= dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=Aime2024Dataset,
|
||||
path='opencompass/aime2024',
|
||||
reader_cfg=aime2024_reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
parallel=False,
|
||||
)
|
||||
|
||||
|
||||
aime2024_eval_cfg = dict(
|
||||
evaluator=cascade_evaluator,
|
||||
)
|
||||
|
||||
aime2024_datasets = [
|
||||
dict(
|
||||
abbr='aime2024',
|
||||
type=Aime2024Dataset,
|
||||
path='opencompass/aime2024',
|
||||
reader_cfg=aime2024_reader_cfg,
|
||||
infer_cfg=aime2024_infer_cfg,
|
||||
eval_cfg=aime2024_eval_cfg,
|
||||
n=1,  # evaluate the dataset n times
|
||||
)
|
||||
]
|
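The point of CascadeEvaluator here is cost control: every sample is first checked by the rule-based MATHVerifyEvaluator, and only samples that fail that check are escalated to the LLM judge driven by the GRADER_TEMPLATE above. A conceptual sketch of that control flow (not the evaluator's actual implementation):

def cascade_score(samples, rule_check, llm_judge):
    """samples: iterable of (prediction, answer) pairs; returns accuracy in percent."""
    verdicts = []
    for prediction, answer in samples:
        if rule_check(prediction, answer):
            verdicts.append(True)  # cheap rule-based verification already passed
        else:
            verdicts.append(llm_judge(prediction, answer))  # fall back to the judge
    return 100.0 * sum(verdicts) / len(verdicts)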
@ -1,4 +1,4 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from .aime2024_gen_6e39a4 import aime2024_datasets # noqa: F401, F403
|
||||
from .aime2024_gen_17d799 import aime2024_datasets # noqa: F401, F403
|
40
opencompass/configs/datasets/aime2024/aime2024_gen_17d799.py
Normal file
@ -0,0 +1,40 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.openicl.icl_evaluator import MATHEvaluator
|
||||
from opencompass.datasets import Aime2024Dataset
|
||||
|
||||
|
||||
aime2024_reader_cfg = dict(
|
||||
input_columns=['question'],
|
||||
output_column='answer'
|
||||
)
|
||||
|
||||
|
||||
aime2024_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
|
||||
],
|
||||
)
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer)
|
||||
)
|
||||
|
||||
aime2024_eval_cfg = dict(
|
||||
evaluator=dict(type=MATHEvaluator)
|
||||
)
|
||||
|
||||
aime2024_datasets = [
|
||||
dict(
|
||||
abbr='aime2024',
|
||||
type=Aime2024Dataset,
|
||||
path='opencompass/aime2024',
|
||||
reader_cfg=aime2024_reader_cfg,
|
||||
infer_cfg=aime2024_infer_cfg,
|
||||
eval_cfg=aime2024_eval_cfg,
|
||||
)
|
||||
]
|
@ -0,0 +1,4 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from .aime2024_llmjudge_gen_5e9f4f import aime2024_datasets # noqa: F401, F403
|
@ -1,7 +1,7 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets import CustomDataset
|
||||
from opencompass.datasets import Aime2024Dataset
|
||||
from opencompass.evaluator import GenericLLMEvaluator
|
||||
from opencompass.datasets import generic_llmjudge_postprocess
|
||||
|
||||
@ -69,8 +69,8 @@ aime2024_eval_cfg = dict(
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=CustomDataset,
|
||||
path='opencompass/aime2025',
|
||||
type=Aime2024Dataset,
|
||||
path='opencompass/aime2024',
|
||||
reader_cfg=aime2024_reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
@ -81,8 +81,8 @@ aime2024_eval_cfg = dict(
|
||||
aime2024_datasets = [
|
||||
dict(
|
||||
abbr='aime2024',
|
||||
type=CustomDataset,
|
||||
path='opencompass/aime2025',
|
||||
type=Aime2024Dataset,
|
||||
path='opencompass/aime2024',
|
||||
reader_cfg=aime2024_reader_cfg,
|
||||
infer_cfg=aime2024_infer_cfg,
|
||||
eval_cfg=aime2024_eval_cfg,
|
||||
|
@ -0,0 +1,115 @@
|
||||
"""
|
||||
Summary: A config for AIME-2025 Evaluation.
|
||||
Setting:
|
||||
Shot: 0-shot
|
||||
Evaluator:
|
||||
- CascadeEvaluator
|
||||
- MATHVerifyEvaluator
|
||||
- GenericLLMEvaluator
|
||||
Repeat: 1
|
||||
Available Models:
|
||||
- Instruct/Chat Models
|
||||
"""
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets import CustomDataset
|
||||
from opencompass.datasets import generic_llmjudge_postprocess
|
||||
from opencompass.evaluator import (
|
||||
CascadeEvaluator,
|
||||
GenericLLMEvaluator,
|
||||
MATHVerifyEvaluator
|
||||
)
|
||||
|
||||
aime2025_reader_cfg = dict(input_columns=['question'], output_column='answer')
|
||||
|
||||
|
||||
aime2025_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt='{question}\nRemember to put your final answer within \\boxed{}.',
|
||||
),
|
||||
],
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
|
||||
GRADER_TEMPLATE = """
|
||||
Please act as a grading expert and judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT or INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
|
||||
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
cascade_evaluator = dict(
|
||||
type=CascadeEvaluator,
|
||||
rule_evaluator=dict(
|
||||
type=MATHVerifyEvaluator,
|
||||
),
|
||||
llm_evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||
)
|
||||
],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||
],
|
||||
),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=CustomDataset,
|
||||
path='opencompass/aime2025',
|
||||
reader_cfg=aime2025_reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||
),
|
||||
parallel=False,
|
||||
)
|
||||
aime2025_eval_cfg = dict(
|
||||
evaluator=cascade_evaluator,
|
||||
)
|
||||
|
||||
aime2025_datasets = [
|
||||
dict(
|
||||
type=CustomDataset,
|
||||
abbr='aime2025',
|
||||
path='opencompass/aime2025',
|
||||
reader_cfg=aime2025_reader_cfg,
|
||||
infer_cfg=aime2025_infer_cfg,
|
||||
eval_cfg=aime2025_eval_cfg,
|
||||
n=1,
|
||||
)
|
||||
]
|
@ -1,4 +1,4 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from .bbh_gen_5b92b0 import bbh_datasets # noqa: F401, F403
|
||||
from .bbh_gen_ee62e9 import bbh_datasets # noqa: F401, F403
|
99
opencompass/configs/datasets/bbh/bbh_gen_ee62e9.py
Normal file
@ -0,0 +1,99 @@
|
||||
import os
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.openicl.icl_evaluator import AccEvaluator
|
||||
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
|
||||
|
||||
bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
|
||||
|
||||
bbh_multiple_choice_sets = [
|
||||
'temporal_sequences',
|
||||
'disambiguation_qa',
|
||||
'date_understanding',
|
||||
'tracking_shuffled_objects_three_objects',
|
||||
'penguins_in_a_table',
|
||||
'geometric_shapes',
|
||||
'snarks',
|
||||
'ruin_names',
|
||||
'tracking_shuffled_objects_seven_objects',
|
||||
'tracking_shuffled_objects_five_objects',
|
||||
'logical_deduction_three_objects',
|
||||
'hyperbaton',
|
||||
'logical_deduction_five_objects',
|
||||
'logical_deduction_seven_objects',
|
||||
'movie_recommendation',
|
||||
'salient_translation_error_detection',
|
||||
'reasoning_about_colored_objects',
|
||||
]
|
||||
bbh_free_form_sets = [
|
||||
'multistep_arithmetic_two',
|
||||
'navigate',
|
||||
'dyck_languages',
|
||||
'word_sorting',
|
||||
'sports_understanding',
|
||||
'boolean_expressions',
|
||||
'object_counting',
|
||||
'formal_fallacies',
|
||||
'causal_judgement',
|
||||
'web_of_lies',
|
||||
]
|
||||
|
||||
bbh_datasets = []
|
||||
for _name in bbh_multiple_choice_sets:
|
||||
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
|
||||
_hint = f.read()
|
||||
bbh_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=
|
||||
f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
|
||||
)
|
||||
])),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer))
|
||||
bbh_eval_cfg = dict(
|
||||
evaluator=dict(type=BBHEvaluator_mcq),
|
||||
pred_role='BOT',
|
||||
pred_postprocessor=dict(type=bbh_mcq_postprocess),
|
||||
dataset_postprocessor=dict(type=bbh_mcq_postprocess))
|
||||
|
||||
bbh_datasets.append(
|
||||
dict(
|
||||
type=BBHDataset,
|
||||
path='opencompass/bbh',
|
||||
name=_name,
|
||||
abbr='bbh-' + _name,
|
||||
reader_cfg=bbh_reader_cfg,
|
||||
infer_cfg=bbh_infer_cfg.copy(),
|
||||
eval_cfg=bbh_eval_cfg.copy()))
|
||||
|
||||
for _name in bbh_free_form_sets:
|
||||
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
|
||||
_hint = f.read()
|
||||
bbh_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=
|
||||
f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
|
||||
)
|
||||
])),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer))
|
||||
bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')
|
||||
|
||||
bbh_datasets.append(
|
||||
dict(
|
||||
type=BBHDataset,
|
||||
path='opencompass/bbh',
|
||||
name=_name,
|
||||
abbr='bbh-' + _name,
|
||||
reader_cfg=bbh_reader_cfg,
|
||||
infer_cfg=bbh_infer_cfg.copy(),
|
||||
eval_cfg=bbh_eval_cfg.copy()))
|
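bbh_mcq_postprocess reduces the chain-of-thought output to a bare option before BBHEvaluator_mcq compares it with the target; its real implementation ships with opencompass.datasets. The extraction below is only a hypothetical illustration of that step.

import re

def extract_mcq_option(text):
    """Hypothetical: pull '(B)' out of output like '... So the answer is (B).'"""
    match = re.search(r'answer is\s*\(?([A-R])\)?', text)
    if match:
        return f'({match.group(1)})'
    # otherwise fall back to the first parenthesised option letter in the text
    match = re.search(r'\(([A-R])\)', text)
    return f'({match.group(1)})' if match else ''

print(extract_mcq_option("Let's think step by step. ... So the answer is (B)."))  # (B)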
4
opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py
Normal file
@ -0,0 +1,4 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from .bbh_llmjudge_gen_b5bdf1 import bbh_datasets # noqa: F401, F403
|
@ -0,0 +1,44 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
|
||||
|
||||
bigcodebench_full_reader_cfg = dict(
|
||||
input_columns=['instruct_prompt'],
|
||||
output_column='test',
|
||||
)
|
||||
|
||||
bigcodebench_full_infer_cfg = dict(prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
|
||||
round=[
|
||||
dict(role='HUMAN', prompt='{instruct_prompt}'),
|
||||
])),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer))
|
||||
|
||||
bigcodebench_full_eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=BigCodeBenchEvaluator,
|
||||
release_version='v0.1.2',
|
||||
eval_type='instruct',
|
||||
# remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
|
||||
remote_execute_api=
|
||||
'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501
|
||||
dataset_version='full',
|
||||
),
|
||||
pred_role='BOT',
|
||||
)
|
||||
|
||||
bigcodebench_full_instruct_datasets = [
|
||||
dict(abbr='bigcodebench_full_instruct',
|
||||
type=BigCodeBenchDataset,
|
||||
path='opencompass/bigcodebench',
|
||||
reader_cfg=bigcodebench_full_reader_cfg,
|
||||
infer_cfg=bigcodebench_full_infer_cfg,
|
||||
eval_cfg=bigcodebench_full_eval_cfg,
|
||||
release_version='v0.1.2',
|
||||
n=5,
|
||||
k=3)
|
||||
]
|
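The n=5, k=3 fields above request five generations per problem with a pass@k-style aggregate (how OpenCompass combines them internally is not shown in this diff). For reference, the standard unbiased estimator is pass@k = 1 - C(n-c, k) / C(n, k), where c is the number of samples that pass the tests; a small sketch:

from math import comb

def pass_at_k(n, c, k):
    """Unbiased pass@k estimator: n samples drawn, c of them correct."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

# n=5 samples, 2 of them passing the tests -> pass@3 = 0.9
print(pass_at_k(5, 2, 3))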
@ -0,0 +1,7 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from .bigcodebench_hard_instruct_gen import bigcodebench_hard_instruct_datasets
|
||||
from .bigcodebench_hard_complete_gen import bigcodebench_hard_complete_datasets
|
||||
|
||||
bigcodebench_hard_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
Some files were not shown because too many files have changed in this diff.