Merge branch 'open-compass:main' into main

bittersweet1999 2025-04-09 16:26:07 +08:00 committed by GitHub
commit 975e4bcadf
260 changed files with 14112 additions and 2840 deletions

View File

@ -25,8 +25,8 @@ models = [
type=OpenAISDK,
key='EMPTY',
openai_api_base='http://localhost:23333/v1',
path='internlm2',
tokenizer_path='internlm/internlm2_5-7b-chat',
path='internlm3',
tokenizer_path='internlm/internlm3-8b-instruct',
rpm_verbose=True,
meta_template=api_meta_template,
query_per_second=128,
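
The hunk above only swaps the served model name and tokenizer. For orientation, a hedged sketch of the full API-model entry it belongs to is shown below; only the fields visible in the diff are taken from the config, while `abbr`, the meta-template contents, and the generation settings are illustrative assumptions.

```python
# Hedged reconstruction of the updated OpenAISDK entry; only path,
# tokenizer_path, key, openai_api_base, rpm_verbose, meta_template and
# query_per_second appear in the diff -- the remaining fields are assumptions.
from opencompass.models import OpenAISDK

api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
])

models = [
    dict(
        abbr='lmdeploy-api-test',  # assumed; mirrors the baseline key used later
        type=OpenAISDK,
        key='EMPTY',
        openai_api_base='http://localhost:23333/v1',
        path='internlm3',  # must match --model-name of the served backend
        tokenizer_path='internlm/internlm3-8b-instruct',
        rpm_verbose=True,
        meta_template=api_meta_template,
        query_per_second=128,
        max_out_len=1024,   # assumed
        batch_size=128,     # assumed
        temperature=0.01,   # assumed
    ),
]
```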

View File

@ -11,18 +11,10 @@ with read_base():
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
winogrande_datasets # noqa: F401, E501
# read hf models - chat models
from opencompass.configs.models.chatglm.hf_glm4_9b import \
models as hf_glm4_9b_model # noqa: F401, E501
from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \
models as lmdeploy_glm4_9b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \
models as hf_deepseek_7b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_67b_base import \
models as hf_deepseek_67b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
models as hf_deepseek_moe_16b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
models as hf_deepseek_v2_lite_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \
@ -49,12 +41,6 @@ with read_base():
models as hf_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
models as hf_internlm2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
models as hf_internlm2_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \
models as hf_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_20b import \
models as hf_internlm2_base_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
@ -65,14 +51,14 @@ with read_base():
models as lmdeploy_internlm2_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_20b import \
models as lmdeploy_internlm2_base_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama2_7b import \
models as hf_llama2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \
models as hf_llama3_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_8b import \
models as hf_llama3_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_70b import \
models as hf_llama3_70b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
models as lmdeploy_llama3_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \

View File

@ -15,14 +15,24 @@ with read_base():
models as vllm_glm4_9b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
models as hf_deepseek_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import \
models as hf_deepseek_67b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
models as hf_deepseek_moe_16b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \
models as hf_deepseek_v2_lite_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_chat import \
models as lmdeploy_deepseek_67b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_8b import \
models as \
lmdeploy_deepseek_r1_distill_llama_8b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_70b import \
models as \
lmdeploy_deepseek_r1_distill_llama_70b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_1_5b import \
models as \
lmdeploy_deepseek_r1_distill_qwen_1_5b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b import \
models as \
lmdeploy_deepseek_r1_distill_qwen_32b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \
models as lmdeploy_deepseek_v2_5_1210_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_lite import \
models as lmdeploy_deepseek_v2_lite_model # noqa: F401, E501
from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
models as vllm_deepseek_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma2_2b_it import \
@ -45,6 +55,8 @@ with read_base():
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \
models as hf_internlm2_5_20b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm3_8b_instruct import \
models as hf_internlm3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \
@ -57,6 +69,8 @@ with read_base():
models as lmdeploy_internlm2_chat_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
models as lmdeploy_internlm2_chat_7b_sft_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \
models as lmdeploy_internlm3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \
models as vllm_internlm2_chat_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \
@ -83,10 +97,6 @@ with read_base():
models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
models as hf_mistral_small_instruct_2409_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
models as hf_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \
models as hf_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
models as \
lmdeploy_mistral_large_instruct_2411_model # noqa: F401, E501
@ -95,14 +105,19 @@ with read_base():
from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
models as \
lmdeploy_mistral_small_instruct_2409_model # noqa: F401, E501
from opencompass.configs.models.mistral.lmdeploy_mixtral_8x22b_instruct_v0_1 import \
models as \
lmdeploy_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \
models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mixtral_8x22b_instruct_v0_1 import \
models as vllm_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
models as lmdeploy_nemotron_70b_instruct_hf_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_4 import \
models as hf_phi_4_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \
models as hf_qwen2_5_0_5b_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \
@ -142,6 +157,8 @@ with read_base():
from ...volc import infer as volc_infer # noqa: F401, E501
hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf'
race_datasets = [race_datasets[1]]
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
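
The closing `datasets = sum(...)` line works because every config pulled in under `with read_base():` binds a list whose name ends in `_datasets` into the module namespace. A self-contained illustration of that aggregation idiom follows; the two placeholder lists are made up and stand in for imported configs.

```python
# Stand-alone sketch of the *_datasets aggregation idiom used in the config
# above; the two lists are placeholders, not real OpenCompass configs.
gsm8k_datasets = [dict(abbr='demo_gsm8k')]
race_datasets = [dict(abbr='race-middle'), dict(abbr='race-high')]

# Collect every module-level variable whose name ends with '_datasets'.
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

assert sorted(d['abbr'] for d in datasets) == [
    'demo_gsm8k', 'race-high', 'race-middle']
```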

View File

@ -175,10 +175,11 @@ class TestApibench:
class TestVolcFullbench:
"""Test cases for chat model."""
@pytest.mark.parametrize(
'model, dataset',
[(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'objective')])
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
'internlm2_5-7b-chat-turbomind', 'qwen2.5-7b-instruct-turbomind',
'internlm2_5-7b-chat-pytorch', 'qwen2.5-7b-instruct-pytorch',
'internlm3-8b-instruct-turbomind', 'internlm3-8b-instruct-pytorch'
] for p2 in dataset_list(p1, 'objective')])
@pytest.mark.chat_objective
def test_chat_objective(self, baseline_scores_fullbench, result_scores,
model, dataset):
@ -245,10 +246,7 @@ class TestCmdCase:
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-hf', 'race-middle_accuracy'),
('internlm2_5-7b-hf', 'race-high_accuracy'),
('internlm2_5-7b-hf', 'demo_gsm8k_accuracy'),
('internlm2-1.8b-hf', 'race-middle_accuracy'),
('internlm2-1.8b-hf', 'race-high_accuracy'),
('internlm2-1.8b-hf', 'demo_gsm8k_accuracy')])
('internlm2_5-7b-hf', 'demo_gsm8k_accuracy')])
def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
@ -260,9 +258,9 @@ class TestCmdCase:
[('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'),
('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'),
('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'),
('internlm2-chat-1.8b-lmdeploy', 'race-middle_accuracy'),
('internlm2-chat-1.8b-lmdeploy', 'race-high_accuracy'),
('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k_accuracy')])
('internlm3-8b-instruct-lmdeploy', 'race-middle_accuracy'),
('internlm3-8b-instruct-lmdeploy', 'race-high_accuracy'),
('internlm3-8b-instruct-lmdeploy', 'demo_gsm8k_accuracy')])
def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
@ -280,13 +278,25 @@ class TestCmdCase:
@pytest.mark.case4
@pytest.mark.parametrize(
'model, dataset', [('internlm2_5-7b-chat_hf', 'race-middle_accuracy'),
('internlm2_5-7b-chat_hf', 'race-high_accuracy'),
('internlm2_5-7b-chat_hf', 'demo_gsm8k_accuracy')])
'model, dataset',
[('internlm3-8b-instruct_hf-lmdeploy', 'race-middle_accuracy'),
('internlm3-8b-instruct_hf-lmdeploy', 'race-high_accuracy'),
('internlm3-8b-instruct_hf-lmdeploy', 'demo_gsm8k_accuracy')])
def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score, dataset)
assert_score(model + '_batch', result_score, base_score, dataset)
@pytest.mark.case5
@pytest.mark.parametrize(
'model, dataset',
[('internlm3-8b-instruct_hf-vllm', 'race-middle_accuracy'),
('internlm3-8b-instruct_hf-vllm', 'race-high_accuracy'),
('internlm3-8b-instruct_hf-vllm', 'demo_gsm8k_accuracy')])
def test_cmd_case5(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score, dataset)
def assert_score(model_type, score, baseline, dataset: str = ''):
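
The body of `assert_score` is cut off at the hunk boundary. For readability, a hedged sketch of a comparison in the same spirit is shown below; the ±5-point band and the error messages are assumptions, not the repository's actual tolerance logic.

```python
# Hypothetical stand-in for the truncated assert_score helper: pass the case
# when the measured score stays within an assumed +/-5-point band around the
# recorded baseline.
def assert_score(model_type, score, baseline, dataset: str = ''):
    assert score is not None and baseline is not None, (
        f'{model_type}/{dataset}: missing result or baseline score')
    score, baseline = float(score), float(baseline)
    tolerance = 5.0  # assumed band; the real helper may be stricter or model-dependent
    assert baseline - tolerance <= score <= baseline + tolerance, (
        f'{model_type}/{dataset}: {score} outside '
        f'[{baseline - tolerance}, {baseline + tolerance}]')
```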

View File

@ -8,20 +8,25 @@ internlm2_5-7b_hf:
race-middle_accuracy: 91.78
race-high_accuracy: 90.02
internlm2-1.8b-hf:
demo_gsm8k_accuracy: 15.62
race-middle_accuracy: 71.66
race-high_accuracy: 66.38
internlm2_5-7b-chat-lmdeploy:
demo_gsm8k_accuracy: 89.06
demo_gsm8k_accuracy: 87.50
race-middle_accuracy: 92.76
race-high_accuracy: 90.54
internlm2-chat-1.8b-lmdeploy:
demo_gsm8k_accuracy: 31
race-middle_accuracy: 81.34
race-high_accuracy: 73.96
internlm3-8b-instruct-lmdeploy:
demo_gsm8k_accuracy: 73.44
race-middle_accuracy: 93.38
race-high_accuracy: 90.34
internlm3-8b-instruct_hf-lmdeploy:
demo_gsm8k_accuracy: 73.44
race-middle_accuracy: 93.38
race-high_accuracy: 90.34
internlm3-8b-instruct_hf-vllm:
demo_gsm8k_accuracy: 81.25
race-middle_accuracy: 92.20
race-high_accuracy: 89.88
internlm2_5-7b-chat_hf:
demo_gsm8k_accuracy: 87.50
@ -29,6 +34,6 @@ internlm2_5-7b-chat_hf:
race-high_accuracy: 90.48
lmdeploy-api-test:
gsm8k_accuracy: 68.75
race-middle_accuracy: 87.50
gsm8k_accuracy: 56.25
race-middle_accuracy: 93.75
race-high_accuracy: 93.75
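
Blocks like the one above are what the `baseline_scores` fixture hands to the cmd-case tests as nested dicts. A minimal sketch of such a fixture follows, assuming the baseline sits next to `oc_score_assert.py` under `.github/scripts/`; the exact file name is an assumption.

```python
# Hedged sketch of a baseline-loading fixture; the YAML path is an assumption
# about the repository layout, not taken from the diff.
import pytest
import yaml


@pytest.fixture(scope='session')
def baseline_scores():
    with open('.github/scripts/oc_score_baseline.yaml', encoding='utf-8') as f:
        return yaml.safe_load(f)
```

With data shaped like the block above, `baseline_scores.get('internlm3-8b-instruct-lmdeploy').get('demo_gsm8k_accuracy')` would return 73.44, which is exactly how the tests read it.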

View File

@ -9,7 +9,7 @@ internlm2_5-7b-chat-hf_fullbench:
drop_accuracy: 81.25
GPQA_diamond_accuracy: 25
hellaswag_accuracy: 87.5
TheoremQA_score: 18.75
TheoremQA_score: 12.50
musr_average_naive_average: 39.58
korbench_single_naive_average: 40
gsm8k_accuracy: 62.50
@ -24,8 +24,8 @@ internlm2_5-7b-chat-hf_fullbench:
lcb_test_output_pass@1: 18.75
bbh-logical_deduction_seven_objects_score: 50
bbh-multistep_arithmetic_two_score: 68.75
mmlu-other_naive_average: 72.6
cmmlu-china-specific_naive_average: 76.25
mmlu-other_accuracy: 72.6
cmmlu-china-specific_accuracy: 76.25
mmlu_pro_math_accuracy: 25
ds1000_Pandas_accuracy: 12.5
ds1000_Numpy_accuracy: 0
@ -39,15 +39,15 @@ internlm2_5-7b-chat-hf_fullbench:
college_knowledge_naive_average: 87.5
subjective:
alignment_bench_v1_1_总分: 0.66
alpaca_eval_total: 20
alpaca_eval_total: 0
arenahard_score: 50
Followbench_naive_average: 1
CompassArena_naive_average: 44.00
CompassArena_naive_average: 43
mtbench101_avg: 7.8
wildbench_average: -12.78
wildbench_average: -15.56
simpleqa_accuracy_given_attempted: 0
chinese_simpleqa_given_attempted_accuracy: 1
alignment_bench_v1_1_专业能力: 7.90
alignment_bench_v1_1_专业能力: 8.00
alignment_bench_v1_1_数学计算: 0
alignment_bench_v1_1_基本任务: 0
alignment_bench_v1_1_逻辑推理: 0
@ -55,10 +55,10 @@ internlm2_5-7b-chat-hf_fullbench:
alignment_bench_v1_1_文本写作: 0
alignment_bench_v1_1_角色扮演: 0
alignment_bench_v1_1_综合问答: 0
alpaca_eval_helpful_base: 20
alpaca_eval_helpful_base: 0
compassarena_language_naive_average: 35
compassarena_knowledge_naive_average: 55
compassarena_reason_v2_naive_average: 45.00
compassarena_reason_v2_naive_average: 40
compassarena_math_v2_naive_average: 55
compassarena_creationv2_zh_naive_average: 30
followbench_llmeval_en_HSR_AVG: 1
@ -78,53 +78,53 @@ internlm2_5-7b-chat-hf_fullbench:
internlm2_5-7b-chat-turbomind_fullbench:
objective:
race-high_accuracy: 93.75
ARC-c_accuracy: 93.75
ARC-c_accuracy: 87.50
BoolQ_accuracy: 68.75
triviaqa_wiki_1shot_score: 50
nq_open_1shot_score: 25
IFEval_Prompt-level-strict-accuracy: 56.25
drop_accuracy: 81.25
drop_accuracy: 75
GPQA_diamond_accuracy: 31.25
hellaswag_accuracy: 81.25
TheoremQA_score: 6.25
hellaswag_accuracy: 87.5
TheoremQA_score: 12.5
musr_average_naive_average: 39.58
korbench_single_naive_average: 37.50
gsm8k_accuracy: 68.75
math_accuracy: 68.75
korbench_single_naive_average: 40
gsm8k_accuracy: 62.5
math_accuracy: 75
cmo_fib_accuracy: 6.25
aime2024_accuracy: 6.25
wikibench-wiki-single_choice_cncircular_perf_4: 50.00
wikibench-wiki-single_choice_cncircular_perf_4: 25
sanitized_mbpp_score: 68.75
ds1000_naive_average: 16.96
ds1000_naive_average: 17.86
lcb_code_generation_pass@1: 12.5
lcb_code_execution_pass@1: 43.75
lcb_test_output_pass@1: 25.00
bbh-logical_deduction_seven_objects_score: 50.00
bbh-multistep_arithmetic_two_score: 68.75
mmlu-other_naive_average: 69.71
cmmlu-china-specific_naive_average: 75.83
lcb_test_output_pass@1: 18.75
bbh-logical_deduction_seven_objects_score: 56.25
bbh-multistep_arithmetic_two_score: 75
mmlu-other_accuracy: 72.6
cmmlu-china-specific_accuracy: 78.33
mmlu_pro_math_accuracy: 31.25
ds1000_Pandas_accuracy: 0
ds1000_Pandas_accuracy: 12.5
ds1000_Numpy_accuracy: 0
ds1000_Tensorflow_accuracy: 12.5
ds1000_Scipy_accuracy: 18.75
ds1000_Scipy_accuracy: 25
ds1000_Sklearn_accuracy: 18.75
ds1000_Pytorch_accuracy: 18.75
ds1000_Pytorch_accuracy: 6.25
ds1000_Matplotlib_accuracy: 50.00
openai_mmmlu_lite_AR-XY_accuracy: 37.5
college_naive_average: 12.50
college_knowledge_naive_average: 87.5
subjective:
alignment_bench_v1_1_总分: 0.70
alignment_bench_v1_1_总分: 0.66
alpaca_eval_total: 0
arenahard_score: 50
Followbench_naive_average: 1
CompassArena_naive_average: 38
mtbench101_avg: 7.80
wildbench_average: -4.86
CompassArena_naive_average: 40
mtbench101_avg: 8
wildbench_average: -6.81
simpleqa_accuracy_given_attempted: 0
chinese_simpleqa_given_attempted_accuracy: 1
alignment_bench_v1_1_专业能力: 8.4
alignment_bench_v1_1_专业能力: 7.9
alignment_bench_v1_1_数学计算: 0
alignment_bench_v1_1_基本任务: 0
alignment_bench_v1_1_逻辑推理: 0
@ -134,10 +134,10 @@ internlm2_5-7b-chat-turbomind_fullbench:
alignment_bench_v1_1_综合问答: 0
alpaca_eval_helpful_base: 0
compassarena_language_naive_average: 35
compassarena_knowledge_naive_average: 50
compassarena_reason_v2_naive_average: 30
compassarena_math_v2_naive_average: 50
compassarena_creationv2_zh_naive_average: 25
compassarena_knowledge_naive_average: 45
compassarena_reason_v2_naive_average: 25
compassarena_math_v2_naive_average: 60
compassarena_creationv2_zh_naive_average: 35
followbench_llmeval_en_HSR_AVG: 1
followbench_llmeval_en_SSR_AVG: 1
followbench_llmeval_en_HSR_L1: 1
@ -162,7 +162,7 @@ internlm2_5-7b-hf_fullbench:
drop_accuracy: 62.5
GPQA_diamond_accuracy: 62.5
hellaswag_accuracy: 93.75
TheoremQA_score: 25
TheoremQA_score: 12.50
winogrande_accuracy: 75
gsm8k_accuracy: 37.5
GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
@ -190,20 +190,20 @@ internlm2_5-7b-turbomind_fullbench:
drop_accuracy: 62.5
GPQA_diamond_accuracy: 62.5
hellaswag_accuracy: 93.75
TheoremQA_score: 25.00
TheoremQA_score: 12.50
winogrande_accuracy: 87.5
gsm8k_accuracy: 62.50
GaokaoBench_2010-2022_Math_II_MCQs_score: 81.25
gsm8k_accuracy: 56.25
GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75
GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
math_accuracy: 18.75
wikibench-wiki-single_choice_cncircular_perf_4: 25
sanitized_mbpp_score: 62.50
dingo_en_192_score: 31.25
dingo_en_192_score: 50.00
dingo_zh_170_score: 93.75
mmlu-other_accuracy: 76.92
cmmlu-china-specific_accuracy: 84.17
mmlu_pro_math_accuracy: 18.75
bbh-logical_deduction_seven_objects_score: 50
bbh-logical_deduction_seven_objects_score: 43.75
bbh-multistep_arithmetic_two_score: 56.25
college_naive_average: 12.5
college_knowledge_naive_average: 87.5
@ -234,15 +234,15 @@ internlm2_5-7b-turbomind:
sanitized_mbpp_score: 55.25
dingo_en_192_score: 60.94
dingo_zh_170_score: 67.65
mmlu-stem_naive_average: 63.72
mmlu-social-science_naive_average: 80.15
mmlu-humanities_naive_average: 74.27
mmlu-other_naive_average: 71.85
cmmlu-stem_naive_average: 67.07
cmmlu-social-science_naive_average: 81.49
cmmlu-humanities_naive_average: 85.84
cmmlu-other_naive_average: 82.69
cmmlu-china-specific_naive_average: 79.88
mmlu-stem_accuracy: 63.72
mmlu-social-science_accuracy: 80.15
mmlu-humanities_accuracy: 74.27
mmlu-other_accuracy: 71.85
cmmlu-stem_accuracy: 67.07
cmmlu-social-science_accuracy: 81.49
cmmlu-humanities_accuracy: 85.84
cmmlu-other_accuracy: 82.69
cmmlu-china-specific_accuracy: 79.88
mmlu_pro_biology_accuracy: 58.58
mmlu_pro_business_accuracy: 28.01
mmlu_pro_chemistry_accuracy: 22.79
@ -281,12 +281,12 @@ internlm2_5-7b-turbomind:
longbench_naive_average: 46.19
longbench_zh_naive_average: 49.3
longbench_en_naive_average: 43.97
longbench_single-document-qa_naive_average: 42.84
longbench_multi-document-qa_naive_average: 37.29
longbench_summarization_naive_average: 23.21
longbench_few-shot-learning_naive_average: 61.67
longbench_synthetic-tasks_naive_average: 60.05
longbench_code-completion_naive_average: 52.09
longbench_single-document-qa_score: 42.84
longbench_multi-document-qa_score: 41.25
longbench_summarization_score: 23.21
longbench_few-shot-learning_score: 61.67
longbench_synthetic-tasks_score: 60.05
longbench_code-completion_score: 52.09
internlm2_5-7b-chat-turbomind:
objective:
@ -327,15 +327,15 @@ internlm2_5-7b-chat-turbomind:
teval_naive_average: 80
SciCode_sub_accuracy: 5.56
qa_dingo_cn_score: 99.01
mmlu-stem_naive_average: 68.2
mmlu-social-science_naive_average: 75.8
mmlu-humanities_naive_average: 69.3
mmlu-other_naive_average: 71.3
cmmlu-stem_naive_average: 66.64
cmmlu-social-science_naive_average: 76
cmmlu-humanities_naive_average: 77.9
cmmlu-other_naive_average: 77.25
cmmlu-china-specific_naive_average: 73.6
mmlu-stem_accuracy: 68.2
mmlu-social-science_accuracy: 75.8
mmlu-humanities_accuracy: 69.3
mmlu-other_accuracy: 71.3
cmmlu-stem_accuracy: 66.64
cmmlu-social-science_accuracy: 76
cmmlu-humanities_accuracy: 77.9
cmmlu-other_accuracy: 77.25
cmmlu-china-specific_accuracy: 73.6
mmlu_pro_biology_accuracy: 66.67
mmlu_pro_business_accuracy: 47.91
mmlu_pro_chemistry_accuracy: 35
@ -366,7 +366,7 @@ internlm2_5-7b-chat-turbomind:
openai_mmmlu_lite_DE-DE_accuracy: 51.27
openai_mmmlu_lite_ES-LA_accuracy: 56.94
openai_mmmlu_lite_FR-FR_accuracy: 58.22
openai_mmmlu_lite_HI-IN_accuracy: 33.75
openai_mmmlu_lite_HI-IN_accuracy: 30.75
openai_mmmlu_lite_ID-ID_accuracy: 50.6
openai_mmmlu_lite_IT-IT_accuracy: 50.6
openai_mmmlu_lite_JA-JP_accuracy: 51.13
@ -391,10 +391,10 @@ internlm2_5-7b-chat-turbomind:
alpaca_eval_total: 25.96
arenahard_score: 17.15
Followbench_naive_average: 0.81
CompassArena_naive_average: 34.61
CompassArena_naive_average: 39.49
FoFo_naive_average: 0.38
mtbench101_avg: 8.01
wildbench_average: -15.69
wildbench_average: -10.49
simpleqa_accuracy_given_attempted: 0.04
chinese_simpleqa_given_attempted_accuracy: 0.34
alignment_bench_v1_1_专业能力: 6.05
@ -409,12 +409,12 @@ internlm2_5-7b-chat-turbomind:
alpaca_eval_koala: 28.21
alpaca_eval_oasst: 23.4
alpaca_eval_selfinstruct: 30.95
alpaca_eval_vicuna: 25
compassarena_language_naive_average: 52.5
alpaca_eval_vicuna: 33.75
compassarena_language_naive_average: 58.50
compassarena_knowledge_naive_average: 36
compassarena_reason_v2_naive_average: 35
compassarena_math_v2_naive_average: 19.91
compassarena_creationv2_zh_naive_average: 35.81
compassarena_math_v2_naive_average: 25.95
compassarena_creationv2_zh_naive_average: 43.64
fofo_test_prompts_overall: 0.35
fofo_test_prompts_cn_overall: 0.41
followbench_llmeval_en_HSR_AVG: 0.73
@ -448,9 +448,536 @@ internlm2_5-7b-chat-1m-turbomind:
babilong_32k_naive_average: 48.9
babilong_128k_naive_average: 40.8
babilong_256k_naive_average: 23.5
longbench_single-document-qa_naive_average: 43.56
longbench_multi-document-qa_naive_average: 46.24
longbench_summarization_naive_average: 24.32
longbench_few-shot-learning_naive_average: 51.67
longbench_synthetic-tasks_naive_average: 66.83
longbench_code-completion_naive_average: 45.99
longbench_single-document-qa_score: 43.56
longbench_multi-document-qa_score: 46.24
longbench_summarization_score: 24.32
longbench_few-shot-learning_score: 51.67
longbench_synthetic-tasks_score: 66.83
longbench_code-completion_score: 45.99
qwen2.5-7b-instruct-turbomind:
objective:
race-high_accuracy: 84.99
ARC-c_accuracy: 92.2
BoolQ_accuracy: 86.7
triviaqa_wiki_1shot_score: 53.06
nq_open_1shot_score: 17.51
mmmlu_lite_naive_average: 54.96
IFEval_Prompt-level-strict-accuracy: 71.53
drop_accuracy: 80.07
bbh_naive_average: 68.81
GPQA_diamond_accuracy: 34.34
hellaswag_accuracy: 85.42
TheoremQA_score: 18.38
musr_average_naive_average: 43.44
korbench_single_naive_average: 39.44
ARC_Prize_Public_Evaluation_accuracy: 0
gsm8k_accuracy: 92.57
GaokaoBench_weighted_average: 80.14
math_accuracy: 73.58
cmo_fib_accuracy: 25
aime2024_accuracy: 16.67
Mathbench_naive_average: 77.33
wikibench-wiki-single_choice_cncircular_perf_4: 34.9
cmmlu_naive_average: 75.97
mmlu_naive_average: 76.01
mmlu_pro_naive_average: 56.12
openai_humaneval_humaneval_pass@1: 83.54
sanitized_mbpp_score: 74.71
humanevalx_naive_average: 48.29
ds1000_naive_average: 18.66
lcb_code_generation_pass@1: 39.5
lcb_code_execution_pass@1: 42.38
lcb_test_output_pass@1: 50.68
bigcodebench_hard_instruct_pass@1: 16.22
bigcodebench_hard_complete_pass@1: 11.49
teval_naive_average: 79.72
SciCode_sub_accuracy: 10.76
qa_dingo_cn_score: 99.01
mmlu_accuracy: 76.01
mmlu-stem_accuracy: 77.59
mmlu-social-science_accuracy: 79.02
mmlu-humanities_accuracy: 72.07
mmlu-other_accuracy: 74.86
cmmlu_accuracy: 75.97
cmmlu-stem_accuracy: 73.09
cmmlu-social-science_accuracy: 75.95
cmmlu-humanities_accuracy: 76.53
cmmlu-other_accuracy: 78.79
cmmlu-china-specific_accuracy: 73.17
mmlu_pro_accuracy: 56.12
mmlu_pro_biology_accuracy: 71.41
mmlu_pro_business_accuracy: 67.68
mmlu_pro_chemistry_accuracy: 54.59
mmlu_pro_computer_science_accuracy: 58.29
mmlu_pro_economics_accuracy: 66.82
mmlu_pro_engineering_accuracy: 42.41
mmlu_pro_health_accuracy: 55.87
mmlu_pro_history_accuracy: 46.46
mmlu_pro_law_accuracy: 28.97
mmlu_pro_math_accuracy: 73.13
mmlu_pro_philosophy_accuracy: 44.89
mmlu_pro_physics_accuracy: 58.43
mmlu_pro_psychology_accuracy: 63.16
mmlu_pro_other_accuracy: 53.57
humanevalx-python_pass@1: 50
humanevalx-cpp_pass@1: 42.07
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 74.39
humanevalx-js_pass@1: 75
ds1000_Pandas_accuracy: 14.09
ds1000_Numpy_accuracy: 8.18
ds1000_Tensorflow_accuracy: 17.78
ds1000_Scipy_accuracy: 15.09
ds1000_Sklearn_accuracy: 10.43
ds1000_Pytorch_accuracy: 4.41
ds1000_Matplotlib_accuracy: 60.65
mmmlu_lite_accuracy: 54.96
openai_mmmlu_lite_AR-XY_accuracy: 42.32
openai_mmmlu_lite_BN-BD_accuracy: 42.25
openai_mmmlu_lite_DE-DE_accuracy: 59.93
openai_mmmlu_lite_ES-LA_accuracy: 66.53
openai_mmmlu_lite_FR-FR_accuracy: 66.88
openai_mmmlu_lite_HI-IN_accuracy: 49.26
openai_mmmlu_lite_ID-ID_accuracy: 61.26
openai_mmmlu_lite_IT-IT_accuracy: 65.47
openai_mmmlu_lite_JA-JP_accuracy: 61.54
openai_mmmlu_lite_KO-KR_accuracy: 60.28
openai_mmmlu_lite_PT-BR_accuracy: 55.51
openai_mmmlu_lite_SW-KE_accuracy: 36.42
openai_mmmlu_lite_YO-NG_accuracy: 32.14
openai_mmmlu_lite_ZH-CN_accuracy: 69.61
college_naive_average: 48
high_naive_average: 59
middle_naive_average: 78
primary_naive_average: 85.67
arithmetic_naive_average: 75.67
mathbench-a (average)_naive_average: 69.27
college_knowledge_naive_average: 83.86
high_knowledge_naive_average: 80.29
middle_knowledge_naive_average: 84.26
primary_knowledge_naive_average: 93.16
mathbench-t (average)_naive_average: 85.39
internlm2_5-7b-chat-pytorch:
objective:
race-high_accuracy: 86.39
ARC-c_accuracy: 90.51
BoolQ_accuracy: 88.01
triviaqa_wiki_1shot_score: 64.77
nq_open_1shot_score: 22.71
mmmlu_lite_naive_average: 45.02
IFEval_Prompt-level-strict-accuracy: 56.56
drop_accuracy: 75.46
bbh_naive_average: 73.34
GPQA_diamond_accuracy: 32.83
hellaswag_accuracy: 94.81
TheoremQA_score: 23.88
musr_average_naive_average: 51.31
korbench_single_naive_average: 32
ARC_Prize_Public_Evaluation_accuracy: 0.01
gsm8k_accuracy: 86.96
GaokaoBench_weighted_average: 78.05
math_accuracy: 60.34
cmo_fib_accuracy: 12.98
aime2024_accuracy: 3.33
Mathbench_naive_average: 64.82
wikibench-wiki-single_choice_cncircular_perf_4: 31.7
cmmlu_naive_average: 74.24
mmlu_naive_average: 70.2
mmlu_pro_naive_average: 45.39
openai_humaneval_humaneval_pass@1: 70.12
sanitized_mbpp_score: 64.59
humanevalx_naive_average: 38.78
ds1000_naive_average: 14.19
lcb_code_generation_pass@1: 16.5
lcb_code_execution_pass@1: 33.82
lcb_test_output_pass@1: 22.62
bigcodebench_hard_instruct_pass@1: 6.08
bigcodebench_hard_complete_pass@1: 6.76
teval_naive_average: 79.73
SciCode_sub_accuracy: 3.47
qa_dingo_cn_score: 100
mmlu_accuracy: 70.2
mmlu-stem_accuracy: 67.73
mmlu-social-science_accuracy: 75.49
mmlu-humanities_accuracy: 68.56
mmlu-other_accuracy: 70.58
cmmlu_accuracy: 74.24
cmmlu-stem_accuracy: 66.7
cmmlu-social-science_accuracy: 75.88
cmmlu-humanities_accuracy: 77.56
cmmlu-other_accuracy: 77.52
cmmlu-china-specific_accuracy: 73.46
mmlu_pro_accuracy: 45.39
mmlu_pro_biology_accuracy: 65.83
mmlu_pro_business_accuracy: 51.96
mmlu_pro_chemistry_accuracy: 36.84
mmlu_pro_computer_science_accuracy: 48.29
mmlu_pro_economics_accuracy: 56.16
mmlu_pro_engineering_accuracy: 29.1
mmlu_pro_health_accuracy: 44.5
mmlu_pro_history_accuracy: 42.26
mmlu_pro_law_accuracy: 24.98
mmlu_pro_math_accuracy: 54.85
mmlu_pro_philosophy_accuracy: 39.28
mmlu_pro_physics_accuracy: 37.41
mmlu_pro_psychology_accuracy: 58.27
mmlu_pro_other_accuracy: 45.78
humanevalx-python_pass@1: 56.1
humanevalx-cpp_pass@1: 20.73
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 59.15
humanevalx-js_pass@1: 57.93
ds1000_Pandas_accuracy: 8.93
ds1000_Numpy_accuracy: 4.09
ds1000_Tensorflow_accuracy: 11.11
ds1000_Scipy_accuracy: 7.55
ds1000_Sklearn_accuracy: 7.83
ds1000_Pytorch_accuracy: 8.82
ds1000_Matplotlib_accuracy: 50.97
mmmlu_lite_accuracy: 45.02
openai_mmmlu_lite_AR-XY_accuracy: 18.6
openai_mmmlu_lite_BN-BD_accuracy: 27.58
openai_mmmlu_lite_DE-DE_accuracy: 51.23
openai_mmmlu_lite_ES-LA_accuracy: 56.63
openai_mmmlu_lite_FR-FR_accuracy: 58.11
openai_mmmlu_lite_HI-IN_accuracy: 33.82
openai_mmmlu_lite_ID-ID_accuracy: 50.39
openai_mmmlu_lite_IT-IT_accuracy: 50.39
openai_mmmlu_lite_JA-JP_accuracy: 50.95
openai_mmmlu_lite_KO-KR_accuracy: 45.05
openai_mmmlu_lite_PT-BR_accuracy: 57.89
openai_mmmlu_lite_SW-KE_accuracy: 32.14
openai_mmmlu_lite_YO-NG_accuracy: 32.14
openai_mmmlu_lite_ZH-CN_accuracy: 65.33
college_naive_average: 21
high_naive_average: 47
middle_naive_average: 59.67
primary_naive_average: 76
arithmetic_naive_average: 62
mathbench-a (average)_naive_average: 53.13
college_knowledge_naive_average: 68.99
high_knowledge_naive_average: 70.06
middle_knowledge_naive_average: 78.53
primary_knowledge_naive_average: 88.49
mathbench-t (average)_naive_average: 76.51
qwen2.5-7b-instruct-pytorch:
objective:
race-high_accuracy: 85.16
ARC-c_accuracy: 90.85
BoolQ_accuracy: 86.61
triviaqa_wiki_1shot_score: 52.96
nq_open_1shot_score: 17.62
mmmlu_lite_naive_average: 54.7
IFEval_Prompt-level-strict-accuracy: 71.35
drop_accuracy: 80.23
bbh_naive_average: 68.88
GPQA_diamond_accuracy: 36.36
hellaswag_accuracy: 85.49
TheoremQA_score: 18.38
musr_average_naive_average: 43.3
korbench_single_naive_average: 39.44
ARC_Prize_Public_Evaluation_accuracy: 0
gsm8k_accuracy: 91.66
GaokaoBench_weighted_average: 80.02
math_accuracy: 73.74
cmo_fib_accuracy: 26.44
aime2024_accuracy: 13.33
Mathbench_naive_average: 77.08
wikibench-wiki-single_choice_cncircular_perf_4: 34
cmmlu_naive_average: 75.9
mmlu_naive_average: 76.27
mmlu_pro_naive_average: 56.14
openai_humaneval_humaneval_pass@1: 84.76
sanitized_mbpp_score: 74.71
humanevalx_naive_average: 48.17
ds1000_naive_average: 18.57
lcb_code_generation_pass@1: 38.75
lcb_code_execution_pass@1: 42.38
lcb_test_output_pass@1: 50.45
bigcodebench_hard_instruct_pass@1: 16.89
bigcodebench_hard_complete_pass@1: 12.16
teval_naive_average: 79.46
SciCode_sub_accuracy: 10.42
qa_dingo_cn_score: 100
mmlu_accuracy: 76.27
mmlu-stem_accuracy: 77.75
mmlu-social-science_accuracy: 78.65
mmlu-humanities_accuracy: 73.12
mmlu-other_accuracy: 75.05
cmmlu_accuracy: 75.9
cmmlu-stem_accuracy: 73.41
cmmlu-social-science_accuracy: 75.97
cmmlu-humanities_accuracy: 76.42
cmmlu-other_accuracy: 78.15
cmmlu-china-specific_accuracy: 73.27
mmlu_pro_accuracy: 56.14
mmlu_pro_biology_accuracy: 72.25
mmlu_pro_business_accuracy: 66.16
mmlu_pro_chemistry_accuracy: 55.65
mmlu_pro_computer_science_accuracy: 60.24
mmlu_pro_economics_accuracy: 66.82
mmlu_pro_engineering_accuracy: 41.38
mmlu_pro_health_accuracy: 54.89
mmlu_pro_history_accuracy: 46.46
mmlu_pro_law_accuracy: 29.06
mmlu_pro_math_accuracy: 73.58
mmlu_pro_philosophy_accuracy: 44.89
mmlu_pro_physics_accuracy: 60.05
mmlu_pro_psychology_accuracy: 61.9
mmlu_pro_other_accuracy: 52.6
humanevalx-python_pass@1: 51.83
humanevalx-cpp_pass@1: 42.68
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 73.78
humanevalx-js_pass@1: 72.56
ds1000_Pandas_accuracy: 14.09
ds1000_Numpy_accuracy: 8.64
ds1000_Tensorflow_accuracy: 17.78
ds1000_Scipy_accuracy: 15.09
ds1000_Sklearn_accuracy: 8.7
ds1000_Pytorch_accuracy: 4.41
ds1000_Matplotlib_accuracy: 61.29
mmmlu_lite_accuracy: 54.7
openai_mmmlu_lite_AR-XY_accuracy: 42.32
openai_mmmlu_lite_BN-BD_accuracy: 42.18
openai_mmmlu_lite_DE-DE_accuracy: 60
openai_mmmlu_lite_ES-LA_accuracy: 66.18
openai_mmmlu_lite_FR-FR_accuracy: 66.88
openai_mmmlu_lite_HI-IN_accuracy: 48.63
openai_mmmlu_lite_ID-ID_accuracy: 61.26
openai_mmmlu_lite_IT-IT_accuracy: 65.26
openai_mmmlu_lite_JA-JP_accuracy: 60.7
openai_mmmlu_lite_KO-KR_accuracy: 60.63
openai_mmmlu_lite_PT-BR_accuracy: 54.46
openai_mmmlu_lite_SW-KE_accuracy: 36
openai_mmmlu_lite_YO-NG_accuracy: 31.86
openai_mmmlu_lite_ZH-CN_accuracy: 69.4
college_naive_average: 48.33
high_naive_average: 59.33
middle_naive_average: 76.67
primary_naive_average: 86.67
arithmetic_naive_average: 74.33
mathbench-a (average)_naive_average: 69.07
college_knowledge_naive_average: 83.54
high_knowledge_naive_average: 80.82
middle_knowledge_naive_average: 83.79
primary_knowledge_naive_average: 92.22
mathbench-t (average)_naive_average: 85.1
internlm3-8b-instruct-turbomind:
objective:
race-high_accuracy: 89.22
ARC-c_accuracy: 92.54
BoolQ_accuracy: 86.45
triviaqa_wiki_1shot_score: 60.72
nq_open_1shot_score: 20.25
mmmlu_lite_naive_average: 41.82
IFEval_Prompt-level-strict-accuracy: 77.45
drop_accuracy: 83.27
bbh_naive_average: 55.22
GPQA_diamond_accuracy: 37.88
hellaswag_accuracy: 91.28
TheoremQA_score: 20.12
musr_average_naive_average: 36.86
korbench_single_naive_average: 41.2
ARC_Prize_Public_Evaluation_accuracy: 0.06
gsm8k_accuracy: 91.28
GaokaoBench_weighted_average: 86.59
math_accuracy: 76.96
cmo_fib_accuracy: 35.1
aime2024_accuracy: 16.67
Mathbench_naive_average: 78.96
wikibench-wiki-single_choice_cncircular_perf_4: 37.45
cmmlu_naive_average: 83.33
mmlu_naive_average: 76.21
mmlu_pro_naive_average: 57.96
openai_humaneval_humaneval_pass@1: 81.71
sanitized_mbpp_score: 69.65
humanevalx_naive_average: 40.73
ds1000_naive_average: 27.23
lcb_code_generation_pass@1: 34.75
lcb_code_execution_pass@1: 49.9
lcb_test_output_pass@1: 48.19
bigcodebench_hard_instruct_pass@1: 13.51
bigcodebench_hard_complete_pass@1: 15.54
teval_naive_average: 82.86
SciCode_sub_accuracy: 11.11
qa_dingo_cn_score: 100
mmlu_accuracy: 76.21
mmlu-stem_accuracy: 77.7
mmlu-social-science_accuracy: 80.98
mmlu-humanities_accuracy: 70.83
mmlu-other_accuracy: 75.01
cmmlu_accuracy: 83.33
cmmlu-stem_accuracy: 79.66
cmmlu-social-science_accuracy: 83.39
cmmlu-humanities_accuracy: 84.73
cmmlu-other_accuracy: 86.2
cmmlu-china-specific_accuracy: 81.77
mmlu_pro_accuracy: 57.96
mmlu_pro_biology_accuracy: 75.45
mmlu_pro_business_accuracy: 64.64
mmlu_pro_chemistry_accuracy: 59.81
mmlu_pro_computer_science_accuracy: 60.24
mmlu_pro_economics_accuracy: 68.6
mmlu_pro_engineering_accuracy: 44.79
mmlu_pro_health_accuracy: 58.31
mmlu_pro_history_accuracy: 49.87
mmlu_pro_law_accuracy: 32.43
mmlu_pro_math_accuracy: 70.17
mmlu_pro_philosophy_accuracy: 46.89
mmlu_pro_physics_accuracy: 59.58
mmlu_pro_psychology_accuracy: 66.29
mmlu_pro_other_accuracy: 54.33
humanevalx-python_pass@1: 43.9
humanevalx-cpp_pass@1: 20.12
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 74.39
humanevalx-js_pass@1: 65.24
ds1000_Pandas_accuracy: 16.49
ds1000_Numpy_accuracy: 34.09
ds1000_Tensorflow_accuracy: 26.67
ds1000_Scipy_accuracy: 17.92
ds1000_Sklearn_accuracy: 20.87
ds1000_Pytorch_accuracy: 19.12
ds1000_Matplotlib_accuracy: 55.48
mmmlu_lite_accuracy: 41.82
openai_mmmlu_lite_AR-XY_accuracy: 32.56
openai_mmmlu_lite_BN-BD_accuracy: 4.56
openai_mmmlu_lite_DE-DE_accuracy: 24.91
openai_mmmlu_lite_ES-LA_accuracy: 51.09
openai_mmmlu_lite_FR-FR_accuracy: 61.68
openai_mmmlu_lite_HI-IN_accuracy: 24.98
openai_mmmlu_lite_ID-ID_accuracy: 44.56
openai_mmmlu_lite_IT-IT_accuracy: 52.35
openai_mmmlu_lite_JA-JP_accuracy: 51.02
openai_mmmlu_lite_KO-KR_accuracy: 47.93
openai_mmmlu_lite_PT-BR_accuracy: 53.89
openai_mmmlu_lite_SW-KE_accuracy: 33.47
openai_mmmlu_lite_YO-NG_accuracy: 33.47
openai_mmmlu_lite_ZH-CN_accuracy: 69.05
college_naive_average: 45.67
high_naive_average: 64.67
middle_naive_average: 82.33
primary_naive_average: 90.33
arithmetic_naive_average: 74
mathbench-a (average)_naive_average: 71.4
college_knowledge_naive_average: 85.28
high_knowledge_naive_average: 79.43
middle_knowledge_naive_average: 87.9
primary_knowledge_naive_average: 93.42
mathbench-t (average)_naive_average: 86.51
internlm3-8b-instruct-pytorch:
objective:
race-high_accuracy: 89.02
ARC-c_accuracy: 93.56
BoolQ_accuracy: 86.67
triviaqa_wiki_1shot_score: 60.54
nq_open_1shot_score: 20.3
mmmlu_lite_naive_average: 42.6
IFEval_Prompt-level-strict-accuracy: 79.11
drop_accuracy: 83.32
bbh_naive_average: 54.76
GPQA_diamond_accuracy: 33.84
hellaswag_accuracy: 91.31
TheoremQA_score: 18
musr_average_naive_average: 36.62
korbench_single_naive_average: 41.84
ARC_Prize_Public_Evaluation_accuracy: 0.06
gsm8k_accuracy: 90.67
GaokaoBench_weighted_average: 86.27
math_accuracy: 76.68
cmo_fib_accuracy: 33.65
aime2024_accuracy: 10
Mathbench_naive_average: 78.92
wikibench-wiki-single_choice_cncircular_perf_4: 37.35
cmmlu_naive_average: 83.11
mmlu_naive_average: 76.23
mmlu_pro_naive_average: 58.16
openai_humaneval_humaneval_pass@1: 82.32
sanitized_mbpp_score: 70.04
humanevalx_naive_average: 39.76
ds1000_naive_average: 27.84
lcb_code_generation_pass@1: 34.5
lcb_code_execution_pass@1: 48.02
lcb_test_output_pass@1: 47.74
bigcodebench_hard_instruct_pass@1: 12.84
bigcodebench_hard_complete_pass@1: 15.54
teval_naive_average: 82.86
SciCode_sub_accuracy: 9.38
qa_dingo_cn_score: 100
mmlu_accuracy: 76.23
mmlu-stem_accuracy: 78.08
mmlu-social-science_accuracy: 80.31
mmlu-humanities_accuracy: 71.38
mmlu-other_accuracy: 74.63
cmmlu_accuracy: 83.11
cmmlu-stem_accuracy: 79.42
cmmlu-social-science_accuracy: 83.34
cmmlu-humanities_accuracy: 83.95
cmmlu-other_accuracy: 86.22
cmmlu-china-specific_accuracy: 81.5
mmlu_pro_accuracy: 58.16
mmlu_pro_biology_accuracy: 74.62
mmlu_pro_business_accuracy: 65.02
mmlu_pro_chemistry_accuracy: 60.69
mmlu_pro_computer_science_accuracy: 61.46
mmlu_pro_economics_accuracy: 68.25
mmlu_pro_engineering_accuracy: 45.3
mmlu_pro_health_accuracy: 60.15
mmlu_pro_history_accuracy: 50.66
mmlu_pro_law_accuracy: 31.7
mmlu_pro_math_accuracy: 70.32
mmlu_pro_philosophy_accuracy: 47.7
mmlu_pro_physics_accuracy: 59.51
mmlu_pro_psychology_accuracy: 65.41
mmlu_pro_other_accuracy: 53.46
humanevalx-python_pass@1: 42.68
humanevalx-cpp_pass@1: 19.51
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 72.56
humanevalx-js_pass@1: 64.02
ds1000_Pandas_accuracy: 14.09
ds1000_Numpy_accuracy: 35
ds1000_Tensorflow_accuracy: 24.44
ds1000_Scipy_accuracy: 20.75
ds1000_Sklearn_accuracy: 21.74
ds1000_Pytorch_accuracy: 22.06
ds1000_Matplotlib_accuracy: 56.77
mmmlu_lite_accuracy: 42.6
openai_mmmlu_lite_AR-XY_accuracy: 32.84
openai_mmmlu_lite_BN-BD_accuracy: 10.46
openai_mmmlu_lite_DE-DE_accuracy: 24.56
openai_mmmlu_lite_ES-LA_accuracy: 50.95
openai_mmmlu_lite_FR-FR_accuracy: 61.05
openai_mmmlu_lite_HI-IN_accuracy: 30.6
openai_mmmlu_lite_ID-ID_accuracy: 45.89
openai_mmmlu_lite_IT-IT_accuracy: 51.79
openai_mmmlu_lite_JA-JP_accuracy: 51.65
openai_mmmlu_lite_KO-KR_accuracy: 48.77
openai_mmmlu_lite_PT-BR_accuracy: 52.7
openai_mmmlu_lite_SW-KE_accuracy: 32.91
openai_mmmlu_lite_YO-NG_accuracy: 32.84
openai_mmmlu_lite_ZH-CN_accuracy: 69.33
college_naive_average: 47
high_naive_average: 66.67
middle_naive_average: 81.67
primary_naive_average: 89.33
arithmetic_naive_average: 73.67
mathbench-a (average)_naive_average: 71.67
college_knowledge_naive_average: 82.91
high_knowledge_naive_average: 79.86
middle_knowledge_naive_average: 88.92
primary_knowledge_naive_average: 92.96
mathbench-t (average)_naive_average: 86.16
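
The fullbench tests earlier in this commit parametrize over `dataset_list(model, 'objective')`, which presumably expands blocks like the ones above into (model, metric) pairs. A hedged sketch of such a helper is given below; the baseline file name is assumed rather than taken from the diff.

```python
# Hypothetical dataset_list-style helper: list the metric keys recorded under a
# model's 'objective' or 'subjective' block in the fullbench baseline YAML.
import yaml


def dataset_list(model, category,
                 path='.github/scripts/oc_score_baseline_fullbench.yaml'):
    # 'path' is an assumed location for the baseline file shown above.
    with open(path, encoding='utf-8') as f:
        baseline = yaml.safe_load(f)
    return list(baseline.get(model, {}).get(category, {}).keys())
```

For `internlm3-8b-instruct-turbomind` this would yield keys such as `race-high_accuracy` and `gsm8k_accuracy`, matching the parametrized test names.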

View File

@ -1,27 +1,30 @@
chat:
glm-4-9b-chat-hf:
gsm8k_accuracy: 68.75
race-high_accuracy: 90.62
gsm8k_accuracy: 56.25
race-high_accuracy: 84.38
glm-4-9b-chat-turbomind:
gsm8k_accuracy: 71.88
race-high_accuracy: 90.62
glm-4-9b-chat-vllm:
gsm8k_accuracy: 65.62
gsm8k_accuracy: 71.88
race-high_accuracy: 90.62
deepseek-7b-chat-hf:
gsm8k_accuracy: 46.88
race-high_accuracy: 81.25
deepseek-moe-16b-chat-hf:
gsm8k_accuracy: 50
race-high_accuracy: 68.75
deepseek-r1-distill-llama-8b-turbomind:
gsm8k_accuracy: 31.25
race-high_accuracy: 81.25
deepseek-r1-distill-qwen-1_5b-turbomind:
gsm8k_accuracy: 37.5
race-high_accuracy: 53.12
deepseek-7b-chat-vllm:
gsm8k_accuracy: 43.75
race-high_accuracy: 75
race-high_accuracy: 78.12
gemma2-2b-it-hf:
gsm8k_accuracy: 50
race-high_accuracy: 71.88
race-high_accuracy: 75
gemma2-9b-it-hf:
gsm8k_accuracy: 71.88
gsm8k_accuracy: 68.75
race-high_accuracy: 84.38
gemma-2b-it-hf:
gsm8k_accuracy: 3.12
@ -36,34 +39,40 @@ chat:
gsm8k_accuracy: 78.12
race-high_accuracy: 93.75
gemma-7b-it-vllm:
gsm8k_accuracy: 34.38
gsm8k_accuracy: 31.25
race-high_accuracy: 68.75
internlm2_5-7b-chat-hf:
gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
internlm3-8b-instruct-hf:
gsm8k_accuracy: 65.62
race-high_accuracy: 87.5
internlm2_5-7b-chat-turbomind:
gsm8k_accuracy: 87.50
gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
internlm2-chat-1.8b-turbomind:
gsm8k_accuracy: 28.12
race-high_accuracy: 84.38
internlm2-chat-1.8b-sft-turbomind:
gsm8k_accuracy: 21.88
gsm8k_accuracy: 31.25
race-high_accuracy: 84.38
internlm2-chat-7b-lmdeploy:
gsm8k_accuracy: 53.12
gsm8k_accuracy: 59.38
race-high_accuracy: 84.38
internlm2-chat-7b-sft-turbomind:
gsm8k_accuracy: 53.12
race-high_accuracy: 90.62
internlm2-chat-7b-vllm:
gsm8k_accuracy: 56.25
race-high_accuracy: 84.38
race-high_accuracy: 90.62
internlm3-8b-instruct-turbomind:
gsm8k_accuracy: 68.75
race-high_accuracy: 87.5
internlm2-chat-7b-vllm:
gsm8k_accuracy: 59.38
race-high_accuracy: 87.50
llama-3_1-8b-instruct-hf:
gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
llama-3_2-3b-instruct-hf:
gsm8k_accuracy: 65.62
gsm8k_accuracy: 71.88
race-high_accuracy: 81.25
llama-3-8b-instruct-hf:
gsm8k_accuracy: 68.75
@ -72,14 +81,14 @@ chat:
gsm8k_accuracy: 18.75
race-high_accuracy: 46.88
llama-3_1-8b-instruct-turbomind:
gsm8k_accuracy: 78.12
gsm8k_accuracy: 81.25
race-high_accuracy: 90.62
llama-3_2-3b-instruct-turbomind:
gsm8k_accuracy: 62.50
gsm8k_accuracy: 68.75
race-high_accuracy: 81.25
llama-3-8b-instruct-turbomind:
gsm8k_accuracy: 71.88
race-high_accuracy: 87.5
gsm8k_accuracy: 68.75
race-high_accuracy: 84.38
mistral-7b-instruct-v0.2-hf:
gsm8k_accuracy: 40.62
race-high_accuracy: 75
@ -90,17 +99,14 @@ chat:
gsm8k_accuracy: 75
race-high_accuracy: 81.25
mistral-nemo-instruct-2407-turbomind:
gsm8k_accuracy: 65.62
race-high_accuracy: 87.50
gsm8k_accuracy: 71.88
race-high_accuracy: 78.12
mistral-7b-instruct-v0.1-vllm:
gsm8k_accuracy: 34.38
race-high_accuracy: 68.75
race-high_accuracy: 65.62
mistral-7b-instruct-v0.2-vllm:
gsm8k_accuracy: 43.75
race-high_accuracy: 75
phi-3-mini-4k-instruct-hf:
gsm8k_accuracy: 81.25
race-high_accuracy: 87.50
gsm8k_accuracy: 21.88
race-high_accuracy: 78.12
qwen2.5-0.5b-instruct-hf:
gsm8k_accuracy: 34.38
race-high_accuracy: 46.88
@ -108,10 +114,10 @@ chat:
gsm8k_accuracy: 53.12
race-high_accuracy: 90.62
qwen2.5-0.5b-instruct-turbomind:
gsm8k_accuracy: 28.12
race-high_accuracy: 50
gsm8k_accuracy: 31.25
race-high_accuracy: 43.75
qwen2.5-3b-instruct-turbomind:
gsm8k_accuracy: 59.38
gsm8k_accuracy: 56.25
race-high_accuracy: 90.62
qwen1.5-0.5b-chat-hf:
gsm8k_accuracy: 0
@ -123,11 +129,11 @@ chat:
gsm8k_accuracy: 68.75
race-high_accuracy: 90.62
qwen2-1.5b-instruct-turbomind:
gsm8k_accuracy: 53.12
gsm8k_accuracy: 56.25
race-high_accuracy: 84.38
qwen2-7b-instruct-turbomind:
gsm8k_accuracy: 81.25
race-high_accuracy: 90.62
race-high_accuracy: 87.50
qwen1.5-0.5b-chat-vllm:
gsm8k_accuracy: 3.12
race-high_accuracy: 53.12
@ -143,11 +149,11 @@ chat:
yi-1.5-9b-chat-turbomind:
gsm8k_accuracy: 71.88
race-high_accuracy: 93.75
deepseek-v2-lite-chat-hf:
gsm8k_accuracy: 46.88
deepseek-v2_lite-chat-turbomind:
gsm8k_accuracy: 37.5
race-high_accuracy: 71.88
gemma2-27b-it-hf:
gsm8k_accuracy: 75
gsm8k_accuracy: 71.88
race-high_accuracy: 93.75
internlm2_5-20b-chat-hf:
gsm8k_accuracy: 84.38
@ -161,6 +167,9 @@ chat:
mistral-small-instruct-2409-turbomind:
gsm8k_accuracy: 81.25
race-high_accuracy: 87.50
phi-4:
gsm8k_accuracy: 81.25
race-high_accuracy: 87.50
qwen2.5-14b-instruct-hf:
gsm8k_accuracy: 71.88
race-high_accuracy: 96.88
@ -168,40 +177,41 @@ chat:
gsm8k_accuracy: 68.75
race-high_accuracy: 93.75
yi-1.5-34b-chat-turbomind:
gsm8k_accuracy: 78.12
gsm8k_accuracy: 75.00
race-high_accuracy: 93.75
deepseek-67b-chat-hf:
gsm8k_accuracy: 71.88
deepseek-67b-chat-turbomind:
gsm8k_accuracy: 75.00
race-high_accuracy: 78.12
deepseek-r1-distill-qwen-32b-turbomind:
gsm8k_accuracy: 25
race-high_accuracy: 90.62
llama-3_3-70b-instruct-turbomind:
gsm8k_accuracy: 93.75
race-high_accuracy: 87.5
mixtral-8x7b-instruct-v0.1-hf:
gsm8k_accuracy: 56.25
race-high_accuracy: 81.25
mixtral-large-instruct-2411-turbomind:
gsm8k_accuracy: 90.62
gsm8k_accuracy: 87.50
race-high_accuracy: 93.75
nvidia-3_1-Nemotron-70b-instruct-HF-turbomind:
gsm8k_accuracy: 87.5
race-high_accuracy: 46.88
gsm8k_accuracy: 93.75
race-high_accuracy: 50.00
qwen2.5-72b-instruct-turbomind:
gsm8k_accuracy: 75
race-high_accuracy: 93.75
gsm8k_accuracy: 81.25
race-high_accuracy: 90.62
deepseek-r1-distill-llama-70b-turbomind:
gsm8k_accuracy: 40.62
race-high_accuracy: 90.62
deepseek-v2_5-1210-turbomind:
gsm8k_accuracy: 90.62
race-high_accuracy: 84.38
mixtral-8x22b-instruct-v0.1-hf:
gsm8k_accuracy: 81.25
race-high_accuracy: 81.25
mixtral-8x22b-instruct-v0.1-turbomind:
gsm8k_accuracy: 78.12
race-high_accuracy: 78.12
mixtral-8x22b-instruct-v0.1-vllm:
gsm8k_accuracy: 78.12
race-high_accuracy: 78.12
base:
glm-4-9b-hf:
gsm8k_accuracy: 68.75
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
glm-4-9b-turbomind:
gsm8k_accuracy: 62.5
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
@ -210,15 +220,10 @@ base:
GPQA_diamond_accuracy: 0
race-high_accuracy: 46.88
winogrande_accuracy: 71.88
deepseek-moe-16b-base-hf:
gsm8k_accuracy: 21.88
GPQA_diamond_accuracy: 0
race-high_accuracy: 21.88
winogrande_accuracy: 65.62
deepseek-7b-base-turbomind:
gsm8k_accuracy: 21.88
gsm8k_accuracy: 18.75
GPQA_diamond_accuracy: 0
race-high_accuracy: 46.88
race-high_accuracy: 43.75
winogrande_accuracy: 84.38
deepseek-moe-16b-base-vllm:
gsm8k_accuracy: 21.88
@ -226,35 +231,40 @@ base:
race-high_accuracy: 25
winogrande_accuracy: 68.75
gemma2-2b-hf:
gsm8k_accuracy: 28.12
gsm8k_accuracy: 31.25
GPQA_diamond_accuracy: 3.12
race-high_accuracy: 56.25
winogrande_accuracy: 71.88
winogrande_accuracy: 75.00
gemma2-9b-hf:
gsm8k_accuracy: 68.75
gsm8k_accuracy: 75.00
GPQA_diamond_accuracy: 0
race-high_accuracy: 81.25
winogrande_accuracy: 84.38
race-high_accuracy: 84.38
winogrande_accuracy: 81.25
gemma-2b-hf:
gsm8k_accuracy: 18.75
gsm8k_accuracy: 21.88
GPQA_diamond_accuracy: 3.12
race-high_accuracy: 25
race-high_accuracy: 21.88
winogrande_accuracy: 53.12
gemma-7b-hf:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 6.25
GPQA_diamond_accuracy: 3.12
race-high_accuracy: 65.62
winogrande_accuracy: 78.12
winogrande_accuracy: 71.88
gemma-2-9b-turbomind:
gsm8k_accuracy: 68.75
GPQA_diamond_accuracy: 0
race-high_accuracy: 18.75
winogrande_accuracy: 46.88
gemma-2b-vllm:
gsm8k_accuracy: 15.62
GPQA_diamond_accuracy: 3.12
race-high_accuracy:
winogrande_accuracy:
race-high_accuracy: 28.12
winogrande_accuracy: 68.75
gemma-7b-vllm:
gsm8k_accuracy: 53.12
GPQA_diamond_accuracy: 9.38
race-high_accuracy:
winogrande_accuracy:
gsm8k_accuracy: 43.75
GPQA_diamond_accuracy: 6.25
race-high_accuracy: 81.25
winogrande_accuracy: 81.25
internlm2_5-7b-hf:
gsm8k_accuracy: 37.5
GPQA_diamond_accuracy: 25
@ -265,31 +275,26 @@ base:
GPQA_diamond_accuracy: 18.75
race-high_accuracy: 62.5
winogrande_accuracy: 78.12
internlm2-base-7b-hf:
gsm8k_accuracy: 3.12
GPQA_diamond_accuracy: 21.88
race-high_accuracy: 75
winogrande_accuracy: 65.62
internlm2-1.8b-turbomind:
gsm8k_accuracy: 12.5
GPQA_diamond_accuracy: 9.38
gsm8k_accuracy: 6.25
GPQA_diamond_accuracy: 12.5
race-high_accuracy: 71.88
winogrande_accuracy: 78.12
internlm2_5-7b-turbomind:
gsm8k_accuracy: 62.50
GPQA_diamond_accuracy: 34.38
race-high_accuracy: 93.75
winogrande_accuracy: 87.50
internlm2-7b-turbomind:
gsm8k_accuracy: 53.12
GPQA_diamond_accuracy: 21.88
race-high_accuracy: 71.88
winogrande_accuracy: 84.38
internlm2-base-7b-turbomind:
gsm8k_accuracy: 37.50
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 81.25
winogrande_accuracy: 75
internlm2_5-7b-turbomind:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 93.75
winogrande_accuracy: 87.5
internlm2-7b-turbomind:
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 34.38
race-high_accuracy: 78.12
winogrande_accuracy: 71.88
internlm2-base-7b-turbomind:
gsm8k_accuracy: 28.12
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 71.88
winogrande_accuracy: 62.50
llama-2-7b-hf:
gsm8k_accuracy: 21.88
GPQA_diamond_accuracy: 21.88
@ -306,15 +311,15 @@ base:
race-high_accuracy: 65.62
winogrande_accuracy: 65.62
llama-3.1-8b-turbomind:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 9.38
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 15.62
race-high_accuracy: 78.12
winogrande_accuracy: 78.12
llama-3-8b-turbomind:
gsm8k_accuracy: 50
gsm8k_accuracy: 46.88
GPQA_diamond_accuracy: 12.50
race-high_accuracy: 65.62
winogrande_accuracy: 78.12
winogrande_accuracy: 81.25
mistral-7b-v0.3-hf:
gsm8k_accuracy: 31.25
GPQA_diamond_accuracy: 6.25
@ -326,15 +331,15 @@ base:
race-high_accuracy: 87.5
winogrande_accuracy: 71.88
qwen2.5-1.5b-turbomind:
gsm8k_accuracy: 62.50
GPQA_diamond_accuracy: 12.50
race-high_accuracy: 78.12
winogrande_accuracy: 68.75
qwen2.5-7b-turbomind:
gsm8k_accuracy: 75.00
GPQA_diamond_accuracy: 25
race-high_accuracy: 87.5
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 18.75
race-high_accuracy: 75
winogrande_accuracy: 71.88
qwen2.5-7b-turbomind:
gsm8k_accuracy: 71.88
GPQA_diamond_accuracy: 18.75
race-high_accuracy: 87.5
winogrande_accuracy: 75.00
qwen1.5-moe-a2.7b-hf:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 18.75
@ -356,20 +361,20 @@ base:
race-high_accuracy: 87.5
winogrande_accuracy: 68.75
qwen2-1.5b-turbomind:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 9.38
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 6.25
race-high_accuracy: 81.25
winogrande_accuracy: 75
qwen2-7b-turbomind:
gsm8k_accuracy: 75.00
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 12.5
race-high_accuracy: 87.5
winogrande_accuracy: 71.88
winogrande_accuracy: 75
qwen1.5-0.5b-vllm:
gsm8k_accuracy: 9.38
GPQA_diamond_accuracy: 0
race-high_accuracy: 56.25
winogrande_accuracy: 62.5
winogrande_accuracy: 59.38
yi-1.5-6b-hf:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 3.12
@ -384,25 +389,10 @@ base:
gsm8k_accuracy: 78.12
GPQA_diamond_accuracy: 40.62
race-high_accuracy: 87.5
winogrande_accuracy: 71.88
deepseek-v2-lite-hf:
gsm8k_accuracy: 31.25
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 59.38
winogrande_accuracy: 71.88
internlm2-20b-hf:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 15.62
race-high_accuracy: 68.75
winogrande_accuracy: 75
internlm2-base-20b-hf:
gsm8k_accuracy: 12.5
GPQA_diamond_accuracy: 9.38
race-high_accuracy: 84.38
winogrande_accuracy: 65.62
internlm2-20b-turbomind:
gsm8k_accuracy: 71.88
GPQA_diamond_accuracy: 15.62
GPQA_diamond_accuracy: 18.75
race-high_accuracy: 68.75
winogrande_accuracy: 81.25
qwen2.5-14b-hf:
@ -420,33 +410,23 @@ base:
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 93.75
winogrande_accuracy: 81.25
deepseek-67b-base-hf:
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 81.25
winogrande_accuracy: 90.62
deepseek-67b-base-turbomind:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 81.25
winogrande_accuracy: 84.38
llama-3-70b-turbomind:
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 9.38
GPQA_diamond_accuracy: 34.38
race-high_accuracy: 78.12
winogrande_accuracy: 81.25
llama-3-70b-turbomind:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 15.62
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
qwen2.5-72b-turbomind:
gsm8k_accuracy: 84.38
GPQA_diamond_accuracy: 34.38
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 93.75
winogrande_accuracy: 87.5
deepseek-v2-turbomind:
gsm8k_accuracy: 71.88
GPQA_diamond_accuracy: 3.12
race-high_accuracy: 81.25
winogrande_accuracy: 75
llama-3-70b-hf:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 3.12
gsm8k_accuracy: 65.62
GPQA_diamond_accuracy: 9.38
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
winogrande_accuracy: 81.25

View File

@ -17,7 +17,7 @@ on:
required: false
description: 'whether to build lmdeploy'
type: boolean
default: false
default: true
repo_org_lmdeploy:
required: false
description: 'Tested repository organization name. Default is internlm/lmdeploy'
@ -44,7 +44,7 @@ on:
type: string
default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']"
schedule:
- cron: '15 14 * * *'
- cron: '15 14 * * 0,3'
env:
HF_DATASETS_OFFLINE: 1
@ -61,6 +61,7 @@ env:
HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
CONDA_ENV: regression_test
VLLM_WORKER_MULTIPROC_METHOD: spawn
jobs:
build-pypi:
@ -87,12 +88,11 @@ jobs:
name: my-artifact-${{ github.run_id }}
build-pypi-lmdeploy:
if: ${{!cancelled() && (github.event_name != 'schedule' && inputs.build_lmdeploy)}}
if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.build_lmdeploy)}}
strategy:
matrix:
pyver: [py310]
runs-on: ubuntu-latest
environment: 'prod'
env:
PYTHON_VERSION: ${{ matrix.pyver }}
PLAT_NAME: manylinux2014_x86_64
@ -126,8 +126,7 @@ jobs:
if: ${{!cancelled()}}
needs: ['build-pypi', 'build-pypi-lmdeploy']
runs-on: volc_cu12
environment: 'prod'
timeout-minutes: 240 #4hours
timeout-minutes: 120 #2hours
steps:
- name: Clone repository
uses: actions/checkout@v2
@ -148,7 +147,7 @@ jobs:
uses: nick-fields/retry@v3
with:
max_attempts: 1
timeout_minutes: 240
timeout_minutes: 120
command: |
. ${{env.CONDA_PATH}}/bin/activate
conda create -y --name ${{env.CONDA_ENV}} python=3.10
@ -157,20 +156,23 @@ jobs:
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[full] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[api] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /fs-computility/llm/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
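
Note that the prebuilt flash-attn wheel above is named for torch 2.5/cu12 while torch 2.6.0 is pinned a few lines earlier, so a quick post-install probe can surface an ABI mismatch before the long evaluation jobs start. A sketch follows; it is illustrative only and not part of the workflow.

```python
# Illustrative post-install probe, not part of the workflow: import both
# packages and report versions so a torch/flash-attn ABI mismatch fails fast.
import flash_attn
import torch

print('torch', torch.__version__, 'cuda', torch.version.cuda)
print('flash-attn', flash_attn.__version__)
assert torch.cuda.is_available(), 'CUDA runtime not visible to torch'
```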
- name: Prepare - reinstall lmdeploy - cu12
if: ${{inputs.build_lmdeploy}}
if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
uses: actions/download-artifact@v4
with:
name: my-artifact-${{ github.run_id }}-py310
- name: Prepare - reinstall lmdeploy - cu12
if: ${{inputs.build_lmdeploy}}
if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
pip uninstall -y lmdeploy
pip install lmdeploy-*.whl --no-deps
- name: conda env
run: |
@ -187,8 +189,7 @@ jobs:
matrix:
regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
runs-on: volc_cu12_daily
environment: 'prod'
timeout-minutes: 120 #2hours
timeout-minutes: 180 #3hours
steps:
- name: Clone repository
uses: actions/checkout@v2
@ -210,7 +211,7 @@ jobs:
uses: nick-fields/retry@v3
with:
max_attempts: 1
timeout_minutes: 120
timeout_minutes: 180
command: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
@ -228,8 +229,7 @@ jobs:
matrix:
regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}}
runs-on: volc_cu12_local
environment: 'prod'
timeout-minutes: 240 #4hours
timeout-minutes: 480 #8hours
steps:
- name: Clone repository
uses: actions/checkout@v2
@ -255,27 +255,33 @@ jobs:
conda info --envs
export from_tf=TRUE
python tools/list_configs.py internlm2_5 mmlu
opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
opencompass --models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a vllm --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily
python -m pytest -m case5 -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run model test - api
if: matrix.regression_func == 'api'
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
lmdeploy serve api_server internlm/internlm3-8b-instruct --max-batch-size 256 --model-name internlm3 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
echo "restful_pid=$!" >> "$GITHUB_ENV"
sleep 120s
sleep 180s
env | grep PROXY
env | grep proxy
unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy;
opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
@ -304,8 +310,7 @@ jobs:
matrix:
function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}}
runs-on: volc_cu12
environment: 'prod'
timeout-minutes: 360 #6hours
timeout-minutes: 480 #8hours
steps:
- name: Clone repository
uses: actions/checkout@v2
@ -322,7 +327,7 @@ jobs:
uses: nick-fields/retry@v3
with:
max_attempts: 1
timeout_minutes: 360
timeout_minutes: 480
command: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
@ -334,11 +339,10 @@ jobs:
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
if: ${{ always() && github.event_name == 'schedule' && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
timeout-minutes: 5
runs-on: self-hosted
environment: 'prod'
steps:
- name: notify
run: |

View File

@ -45,7 +45,7 @@ jobs:
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
python3 -m pip uninstall opencompass -y
python3 -m pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
python3 -m pip install -e ".[full]" --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
- name: conda env
run: |

View File

@ -20,7 +20,7 @@ jobs:
matrix:
python-version: ['3.10']
include:
- torch: 2.0.0
- torch: 2.5.1
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
@ -30,7 +30,7 @@ jobs:
- name: Upgrade pip
run: python -m pip install --upgrade pip
- name: Install PyTorch
run: pip install torch==${{matrix.torch}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
run: pip install torch==${{matrix.torch}} -f https://download.pytorch.org/whl/cpu/torch_stable.html
- name: Install system dependencies
run: |
sudo sed -i '$ a deb http://th.archive.ubuntu.com/ubuntu jammy main' /etc/apt/sources.list
@ -106,7 +106,7 @@ jobs:
- name: Upgrade pip
run: python -m pip install pip --upgrade
- name: Install PyTorch
run: pip install torch==2.0.0+${{matrix.platform}} -f https://download.pytorch.org/whl/${{matrix.platform}}/torch_stable.html
run: pip install torch==2.5.1 -f https://download.pytorch.org/whl/cpu/torch_stable.html
- name: Install opencompass dependencies
run: |
pip install -r requirements.txt

View File

@ -120,4 +120,4 @@ repos:
# hooks:
# - id: check-algo-readme
# - id: check-copyright
# args: ["mmocr", "tests", "tools"] # these directories will be checked
# args: ["mmocr", "tests", "tools"] # these directories will be checked

View File

@ -120,4 +120,4 @@ repos:
# hooks:
# - id: check-algo-readme
# - id: check-copyright
# args: ["mmocr", "tests", "tools"] # these directories will be checked
# args: ["mmocr", "tests", "tools"] # these directories will be checked

README.md
View File

@ -57,6 +57,10 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, a flexible evaluation mechanism that allows multiple evaluators to work in sequence. This enables creating customized evaluation pipelines for complex assessment scenarios. Check out the [documentation](docs/en/advanced_guides/llm_judge.md) for more details! 🔥🔥🔥
- **\[2025.03.11\]** We have supported evaluation for `SuperGPQA` which is a great benchmark for measuring LLM knowledge ability 🔥🔥🔥
- **\[2025.02.28\]** We have added a tutorial for `DeepSeek-R1` series model, please check [Evaluating Reasoning Model](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
- **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥
- **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model which has enhanced performance on reasoning and knowledge-intensive tasks.
- **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](examples/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it.
- **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](examples/eval_musr.py) and give it a spin! 🔥🔥🔥
@ -173,69 +177,83 @@ Some third-party features, like Humaneval and Llama, may require additional step
After ensuring that OpenCompass is installed correctly according to the above steps and the datasets are prepared, you can start your first evaluation using OpenCompass!
- Your first evaluation with OpenCompass!
### Your first evaluation with OpenCompass!
OpenCompass supports setting your configurations via the CLI or a Python script. For simple evaluation settings we recommend using the CLI; for more complex evaluations, using a script is suggested. You can find more example scripts under the configs folder.
```bash
# CLI
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen

# Python scripts
opencompass examples/eval_chat_demo.py
```
You can find more script examples under the [examples](./examples) folder.
- API evaluation
### API evaluation
OpenCompass, by design, does not discriminate between open-source models and API models. You can evaluate both model types in the same way, or even in one setting.
```bash
export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
# CLI
opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen

# Python scripts
opencompass examples/eval_api_demo.py

# You can use o1_mini_2024_09_12/o1_preview_2024_09_12 for o1 models, we set max_completion_tokens=8192 as default.
```
- Accelerated Evaluation
### Accelerated Evaluation
Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
```bash
# CLI
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy

# Python scripts
opencompass examples/eval_lmdeploy_demo.py
```
- Supported Models
### Supported Models and Datasets
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
```bash
# List all configurations
python tools/list_configs.py
# List all configurations related to llama and mmlu
python tools/list_configs.py llama mmlu
```
If the model is not on the list but supported by Huggingface AutoModel class, you can also evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists.
#### Supported Models
```bash
opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
```
If the model is not on the list but is supported by the Huggingface AutoModel class, or is deployed behind an inference engine that exposes an OpenAI-compatible interface (see [docs](https://opencompass.readthedocs.io/en/latest/advanced_guides/new_model.html) for details), you can also evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists.
If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`.
```bash
opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
```
```bash
CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
```
#### Supported Datasets
Currently, OpenCompass provides standard recommended configurations for its datasets. In general, config files ending with `_gen.py` or `_llm_judge_gen.py` point to the recommended config we provide for that dataset. You can refer to the [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for more details.
```bash
# Recommended Evaluation Config based on Rules
opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat
# Recommended Evaluation Config based on LLM Judge
opencompass --datasets aime2024_llm_judge_gen --models hf_internlm2_5_1_8b_chat
```
If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`.
```bash
CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
```
> \[!TIP\]
>
@ -279,263 +297,15 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide
## 📖 Dataset Support
<table align="center">
<tbody>
<tr align="center" valign="bottom">
<td>
<b>Language</b>
</td>
<td>
<b>Knowledge</b>
</td>
<td>
<b>Reasoning</b>
</td>
<td>
<b>Examination</b>
</td>
</tr>
<tr valign="top">
<td>
<details open>
<summary><b>Word Definition</b></summary>
A statistical list of all datasets that can be used on this platform is available in the documentation on the OpenCompass website.
- WiC
- SummEdits
You can quickly find the dataset you need from the list using the sorting, filtering, and search functions.
</details>
In addition, we provide a recommended configuration for each dataset, and some datasets also support LLM Judge-based configurations.
<details open>
<summary><b>Idiom Learning</b></summary>
Please refer to the dataset statistics chapter of [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for details.
- CHID
</details>
<details open>
<summary><b>Semantic Similarity</b></summary>
- AFQMC
- BUSTM
</details>
<details open>
<summary><b>Coreference Resolution</b></summary>
- CLUEWSC
- WSC
- WinoGrande
</details>
<details open>
<summary><b>Translation</b></summary>
- Flores
- IWSLT2017
</details>
<details open>
<summary><b>Multi-language Question Answering</b></summary>
- TyDi-QA
- XCOPA
</details>
<details open>
<summary><b>Multi-language Summary</b></summary>
- XLSum
</details>
</td>
<td>
<details open>
<summary><b>Knowledge Question Answering</b></summary>
- BoolQ
- CommonSenseQA
- NaturalQuestions
- TriviaQA
</details>
</td>
<td>
<details open>
<summary><b>Textual Entailment</b></summary>
- CMNLI
- OCNLI
- OCNLI_FC
- AX-b
- AX-g
- CB
- RTE
- ANLI
</details>
<details open>
<summary><b>Commonsense Reasoning</b></summary>
- StoryCloze
- COPA
- ReCoRD
- HellaSwag
- PIQA
- SIQA
</details>
<details open>
<summary><b>Mathematical Reasoning</b></summary>
- MATH
- GSM8K
</details>
<details open>
<summary><b>Theorem Application</b></summary>
- TheoremQA
- StrategyQA
- SciBench
</details>
<details open>
<summary><b>Comprehensive Reasoning</b></summary>
- BBH
</details>
</td>
<td>
<details open>
<summary><b>Junior High, High School, University, Professional Examinations</b></summary>
- C-Eval
- AGIEval
- MMLU
- GAOKAO-Bench
- CMMLU
- ARC
- Xiezhi
</details>
<details open>
<summary><b>Medical Examinations</b></summary>
- CMB
</details>
</td>
</tr>
</td>
</tr>
</tbody>
<tbody>
<tr align="center" valign="bottom">
<td>
<b>Understanding</b>
</td>
<td>
<b>Long Context</b>
</td>
<td>
<b>Safety</b>
</td>
<td>
<b>Code</b>
</td>
</tr>
<tr valign="top">
<td>
<details open>
<summary><b>Reading Comprehension</b></summary>
- C3
- CMRC
- DRCD
- MultiRC
- RACE
- DROP
- OpenBookQA
- SQuAD2.0
</details>
<details open>
<summary><b>Content Summary</b></summary>
- CSL
- LCSTS
- XSum
- SummScreen
</details>
<details open>
<summary><b>Content Analysis</b></summary>
- EPRSTMT
- LAMBADA
- TNEWS
</details>
</td>
<td>
<details open>
<summary><b>Long Context Understanding</b></summary>
- LEval
- LongBench
- GovReports
- NarrativeQA
- Qasper
</details>
</td>
<td>
<details open>
<summary><b>Safety</b></summary>
- CivilComments
- CrowsPairs
- CValues
- JigsawMultilingual
- TruthfulQA
</details>
<details open>
<summary><b>Robustness</b></summary>
- AdvGLUE
</details>
</td>
<td>
<details open>
<summary><b>Code</b></summary>
- HumanEval
- HumanEvalX
- MBPP
- APPs
- DS1000
</details>
</td>
</tr>
</td>
</tr>
</tbody>
</table>
<p align="right"><a href="#top">🔝Back to top</a></p>
## 📖 Model Support

View File

@ -57,6 +57,10 @@
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, which lets multiple evaluators work in sequence so that custom evaluation pipelines can be built for more complex scenarios. See the [documentation](docs/zh_cn/advanced_guides/llm_judge.md) for usage details! 🔥🔥🔥
- **\[2025.03.11\]** `SuperGPQA`, a knowledge benchmark covering 285 graduate-level disciplines, is now supported. Give it a try! 🔥🔥🔥
- **\[2025.02.28\]** We have added a tutorial for the `DeepSeek-R1` series of models; see [Evaluating Reasoning Models](docs/zh_cn/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
- **\[2025.02.15\]** We have added two practical evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHEvaluator` for mathematical reasoning assessments. See the [LLM Judge](docs/zh_cn/advanced_guides/llm_judge.md) and [Math Evaluation](docs/zh_cn/advanced_guides/general_math.md) documentation for more details! 🔥🔥🔥
- **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model, which achieves top performance among models of its size on reasoning and knowledge-intensive tasks. Give it a try.
- **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](configs/eval_academic_leaderboard_202412.py) leaderboard, which allows you to reproduce the official results with a simple configuration.
- **\[2024.10.14\]** The OpenAI multilingual QA dataset [MMMLU](https://huggingface.co/datasets/openai/MMMLU) is now supported. Give it a try! 🔥🔥🔥
@ -205,9 +209,9 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
```
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations with the [tools](./docs/zh_cn/tools.md#ListConfigs).
- ### Supported Models and Datasets
- ### Supported Models
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations with the [tools](./docs/zh_cn/tools.md#ListConfigs).
```bash
# List all configurations
@ -216,13 +220,27 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce
python tools/list_configs.py llama mmlu
```
If the model is not on the list but is supported by the Huggingface AutoModel class, you can still evaluate it with OpenCompass. You are welcome to help maintain the lists of models and datasets supported by OpenCompass.
#### Supported Models
If the model is not on the list but is supported by the Huggingface AutoModel class, or is deployed behind an inference engine that exposes an OpenAI-compatible interface (see the [official documentation](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/new_model.html) for details), you can still evaluate it with OpenCompass. You are welcome to help maintain the lists of models and datasets supported by OpenCompass.
```bash
opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
```
If you want to run model inference on multiple GPUs in data parallel, you can use the `--max-num-worker` argument.
#### Supported Datasets
Currently, OpenCompass provides standard recommended configurations for its datasets. In general, config files ending with `_gen.py` or `_llm_judge_gen.py` point to the recommended config we provide for that dataset. See the dataset statistics chapter of the [official documentation](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html) for details.
```bash
# Recommended rule-based evaluation config
opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat
# Recommended LLM-judge-based evaluation config
opencompass --datasets aime2024_llm_judge_gen --models hf_internlm2_5_1_8b_chat
```
In addition, if you want to run model inference on multiple GPUs in data parallel, you can use the `--max-num-worker` argument.
```bash
CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
@ -274,263 +292,11 @@ OpenCompass is a one-stop platform for large model evaluation. Its main features are as follows
## 📖 Dataset Support
<table align="center">
<tbody>
<tr align="center" valign="bottom">
<td>
<b>Language</b>
</td>
<td>
<b>Knowledge</b>
</td>
<td>
<b>Reasoning</b>
</td>
<td>
<b>Examination</b>
</td>
</tr>
<tr valign="top">
<td>
<details open>
<summary><b>Word Definition</b></summary>
A statistical list of all datasets that can be used on this platform is provided in the documentation on the OpenCompass website.
- WiC
- SummEdits
You can quickly find the dataset you need from the list using the sorting, filtering, and search functions.
</details>
<details open>
<summary><b>Idiom Learning</b></summary>
- CHID
</details>
<details open>
<summary><b>Semantic Similarity</b></summary>
- AFQMC
- BUSTM
</details>
<details open>
<summary><b>Coreference Resolution</b></summary>
- CLUEWSC
- WSC
- WinoGrande
</details>
<details open>
<summary><b>Translation</b></summary>
- Flores
- IWSLT2017
</details>
<details open>
<summary><b>Multi-language Question Answering</b></summary>
- TyDi-QA
- XCOPA
</details>
<details open>
<summary><b>Multi-language Summary</b></summary>
- XLSum
</details>
</td>
<td>
<details open>
<summary><b>Knowledge Question Answering</b></summary>
- BoolQ
- CommonSenseQA
- NaturalQuestions
- TriviaQA
</details>
</td>
<td>
<details open>
<summary><b>Textual Entailment</b></summary>
- CMNLI
- OCNLI
- OCNLI_FC
- AX-b
- AX-g
- CB
- RTE
- ANLI
</details>
<details open>
<summary><b>Commonsense Reasoning</b></summary>
- StoryCloze
- COPA
- ReCoRD
- HellaSwag
- PIQA
- SIQA
</details>
<details open>
<summary><b>Mathematical Reasoning</b></summary>
- MATH
- GSM8K
</details>
<details open>
<summary><b>Theorem Application</b></summary>
- TheoremQA
- StrategyQA
- SciBench
</details>
<details open>
<summary><b>Comprehensive Reasoning</b></summary>
- BBH
</details>
</td>
<td>
<details open>
<summary><b>Junior High, High School, University, Professional Examinations</b></summary>
- C-Eval
- AGIEval
- MMLU
- GAOKAO-Bench
- CMMLU
- ARC
- Xiezhi
</details>
<details open>
<summary><b>Medical Examinations</b></summary>
- CMB
</details>
</td>
</tr>
</td>
</tr>
</tbody>
<tbody>
<tr align="center" valign="bottom">
<td>
<b>Understanding</b>
</td>
<td>
<b>Long Context</b>
</td>
<td>
<b>Safety</b>
</td>
<td>
<b>Code</b>
</td>
</tr>
<tr valign="top">
<td>
<details open>
<summary><b>Reading Comprehension</b></summary>
- C3
- CMRC
- DRCD
- MultiRC
- RACE
- DROP
- OpenBookQA
- SQuAD2.0
</details>
<details open>
<summary><b>Content Summary</b></summary>
- CSL
- LCSTS
- XSum
- SummScreen
</details>
<details open>
<summary><b>Content Analysis</b></summary>
- EPRSTMT
- LAMBADA
- TNEWS
</details>
</td>
<td>
<details open>
<summary><b>Long Context Understanding</b></summary>
- LEval
- LongBench
- GovReports
- NarrativeQA
- Qasper
</details>
</td>
<td>
<details open>
<summary><b>Safety</b></summary>
- CivilComments
- CrowsPairs
- CValues
- JigsawMultilingual
- TruthfulQA
</details>
<details open>
<summary><b>Robustness</b></summary>
- AdvGLUE
</details>
</td>
<td>
<details open>
<summary><b>Code</b></summary>
- HumanEval
- HumanEvalX
- MBPP
- APPs
- DS1000
</details>
</td>
</tr>
</td>
</tr>
</tbody>
</table>
For details, please refer to the dataset statistics chapter of the [official documentation](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html).
<p align="right"><a href="#top">🔝Back to top</a></p>

dataset-index.yml Normal file
View File

@ -0,0 +1,999 @@
- ifeval:
name: IFEval
category: Instruction Following
paper: https://arxiv.org/pdf/2311.07911
configpath: opencompass/configs/datasets/IFEval/IFEval_gen.py
configpath_llmjudge: ''
- nphard:
name: NPHardEval
category: Reasoning
paper: https://arxiv.org/pdf/2312.14890v2
configpath: opencompass/configs/datasets/NPHardEval/NPHardEval_gen.py
configpath_llmjudge: ''
- pmmeval:
name: PMMEval
category: Language
paper: https://arxiv.org/pdf/2411.09116v1
configpath: opencompass/configs/datasets/PMMEval/pmmeval_gen.py
configpath_llmjudge: ''
- theoremqa:
name: TheoremQA
category: Reasoning
paper: https://arxiv.org/pdf/2305.12524
configpath: opencompass/configs/datasets/TheroremQA/TheoremQA_gen.py
configpath_llmjudge: ''
- agieval:
name: AGIEval
category: Examination
paper: https://arxiv.org/pdf/2304.06364
configpath: opencompass/configs/datasets/agieval/agieval_gen.py
configpath_llmjudge: ''
- babilong:
name: BABILong
category: Long Context
paper: https://arxiv.org/pdf/2406.10149
configpath: opencompass/configs/datasets/babilong
configpath_llmjudge: ''
- bigcodebench:
name: BigCodeBench
category: Code
paper: https://arxiv.org/pdf/2406.15877
configpath: opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py
configpath_llmjudge: ''
- calm:
name: CaLM
category: Reasoning
paper: https://arxiv.org/pdf/2405.00622
configpath: opencompass/configs/datasets/calm/calm.py
configpath_llmjudge: ''
- infinitebench:
name: InfiniteBench (∞Bench)
category: Long Context
paper: https://aclanthology.org/2024.acl-long.814.pdf
configpath: opencompass/configs/datasets/infinitebench/infinitebench.py
configpath_llmjudge: ''
- korbench:
name: KOR-Bench
category: Reasoning
paper: https://arxiv.org/pdf/2410.06526v1
configpath: opencompass/configs/datasets/korbench/korbench_gen.py
configpath_llmjudge: opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py
- lawbench:
name: LawBench
category: Knowledge / Law
paper: https://arxiv.org/pdf/2309.16289
configpath:
- opencompass/configs/datasets/lawbench/lawbench_zero_shot_gen_002588.py
- opencompass/configs/datasets/lawbench/lawbench_one_shot_gen_002588.py
configpath_llmjudge: ''
- leval:
name: L-Eval
category: Long Context
paper: https://arxiv.org/pdf/2307.11088v1
configpath: opencompass/configs/datasets/leval/leval.py
configpath_llmjudge: ''
- livecodebench:
name: LiveCodeBench
category: Code
paper: https://arxiv.org/pdf/2403.07974
configpath: opencompass/configs/datasets/livecodebench/livecodebench_gen.py
configpath_llmjudge: ''
- livemathbench:
name: LiveMathBench
category: Math
paper: https://arxiv.org/pdf/2412.13147
configpath: opencompass/configs/datasets/livemathbench/livemathbench_gen.py
configpath_llmjudge: ''
- livereasonbench:
name: LiveReasonBench
category: Reasoning
paper: ''
configpath: opencompass/configs/datasets/livereasonbench/livereasonbench_gen.py
configpath_llmjudge: ''
- longbench:
name: LongBench
category: Long Context
paper: https://github.com/THUDM/LongBench
configpath:
- opencompass/configs/datasets/longbench/longbench.py
- opencompass/configs/datasets/longbenchv2/longbenchv2_gen.py
configpath_llmjudge: ''
- lveval:
name: LV-Eval
category: Long Context
paper: https://arxiv.org/pdf/2402.05136
configpath: opencompass/configs/datasets/lveval/lveval.py
configpath_llmjudge: ''
- mastermath2024v1:
name: Mastermath2024v1
category: Math
paper: ''
configpath: opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py
configpath_llmjudge: ''
- medbench:
name: MedBench
category: Knowledge / Medicine
paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138
configpath: opencompass/configs/datasets/MedBench/medbench_gen.py
configpath_llmjudge: ''
- MedXpertQA:
name: MedXpertQA
category: Knowledge / Medicine
paper: https://arxiv.org/abs/2501.18362
configpath: opencompass/configs/datasets/MedXpertQA/MedXpertQA_gen.py
configpath_llmjudge: opencompass/configs/datasets/MedXpertQA/MedXpertQA_llmjudge_gen.py
- musr:
name: MuSR
category: Reasoning
paper: https://arxiv.org/pdf/2310.16049
configpath: opencompass/configs/datasets/musr/musr_gen.py
configpath_llmjudge: opencompass/configs/datasets/musr/musr_llm_judge_gen.py
- needlebench:
name: NeedleBench
category: Long Context
paper: https://arxiv.org/pdf/2407.11963
configpath: opencompass/configs/datasets/needlebench
configpath_llmjudge: ''
- ruler:
name: RULER
category: Long Context
paper: https://arxiv.org/pdf/2404.06654
configpath: opencompass/configs/datasets/ruler
configpath_llmjudge: ''
- alignment:
name: AlignBench
category: Subjective / Alignment
paper: https://arxiv.org/pdf/2311.18743
configpath: opencompass/configs/datasets/subjective/alignbench
configpath_llmjudge: ''
- alpaca:
name: AlpacaEval
category: Subjective / Instruction Following
paper: https://github.com/tatsu-lab/alpaca_eval
configpath: opencompass/configs/datasets/subjective/aplaca_eval
configpath_llmjudge: ''
- arenahard:
name: Arena-Hard
category: Subjective / Chatbot
paper: https://lmsys.org/blog/2024-04-19-arena-hard/
configpath: opencompass/configs/datasets/subjective/arena_hard
configpath_llmjudge: ''
- flames:
name: FLAMES
category: Subjective / Alignment
paper: https://arxiv.org/pdf/2311.06899
configpath: opencompass/configs/datasets/subjective/flames/flames_gen.py
configpath_llmjudge: ''
- fofo:
name: FOFO
category: Subjective / Format Following
paper: https://arxiv.org/pdf/2402.18667
configpath: opencompass/configs/datasets/subjective/fofo
configpath_llmjudge: ''
- followbench:
name: FollowBench
category: Subjective / Instruction Following
paper: https://arxiv.org/pdf/2310.20410
configpath: opencompass/configs/datasets/subjective/followbench
configpath_llmjudge: ''
- hellobench:
name: HelloBench
category: Subjective / Long Context
paper: https://arxiv.org/pdf/2409.16191
configpath: opencompass/configs/datasets/subjective/hellobench
configpath_llmjudge: ''
- judgerbench:
name: JudgerBench
category: Subjective / Long Context
paper: https://arxiv.org/pdf/2410.16256
configpath: opencompass/configs/datasets/subjective/judgerbench
configpath_llmjudge: ''
- multiround:
name: MT-Bench-101
category: Subjective / Multi-Round
paper: https://arxiv.org/pdf/2402.14762
configpath: opencompass/configs/datasets/subjective/multiround
configpath_llmjudge: ''
- wildbench:
name: WildBench
category: Subjective / Real Task
paper: https://arxiv.org/pdf/2406.04770
configpath: opencompass/configs/datasets/subjective/wildbench
configpath_llmjudge: ''
- teval:
name: T-Eval
category: Tool Utilization
paper: https://arxiv.org/pdf/2312.14033
configpath:
- opencompass/configs/datasets/teval/teval_en_gen.py
- opencompass/configs/datasets/teval/teval_zh_gen.py
configpath_llmjudge: ''
- finalceiq:
name: FinanceIQ
category: Knowledge / Finance
paper: https://github.com/Duxiaoman-DI/XuanYuan/tree/main/FinanceIQ
configpath: opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen.py
configpath_llmjudge: ''
- gaokaobench:
name: GAOKAOBench
category: Examination
paper: https://arxiv.org/pdf/2305.12474
configpath: opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen.py
configpath_llmjudge: ''
- lcbench:
name: LCBench
category: Code
paper: https://github.com/open-compass/CodeBench/
configpath: opencompass/configs/datasets/LCBench/lcbench_gen.py
configpath_llmjudge: ''
- MMLUArabic:
name: ArabicMMLU
category: Language
paper: https://arxiv.org/pdf/2402.12840
configpath: opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen.py
configpath_llmjudge: ''
- OpenFinData:
name: OpenFinData
category: Knowledge / Finance
paper: https://github.com/open-compass/OpenFinData
configpath: opencompass/configs/datasets/OpenFinData/OpenFinData_gen.py
configpath_llmjudge: ''
- QuALITY:
name: QuALITY
category: Long Context
paper: https://arxiv.org/pdf/2112.08608
configpath: opencompass/configs/datasets/QuALITY/QuALITY_gen.py
configpath_llmjudge: ''
- advglue:
name: Adversarial GLUE
category: Safety
paper: https://openreview.net/pdf?id=GF9cSKI3A_q
configpath:
- opencompass/configs/datasets/adv_glue/adv_glue_mnli/adv_glue_mnli_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_mnli_mm/adv_glue_mnli_mm_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_mnli_qnli/adv_glue_qnli_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_mnli_qqp/adv_glue_qqp_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_mnli_rte/adv_glue_rte_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_mnli_sst2/adv_glue_sst2_gen.py
configpath_llmjudge: ''
- afqmcd:
name: CLUE / AFQMC
category: Language
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen.py
configpath_llmjudge: ''
- aime2024:
name: AIME2024
category: Examination
paper: https://huggingface.co/datasets/Maxwell-Jia/AIME_2024
configpath: opencompass/configs/datasets/aime2024/aime2024_gen.py
configpath_llmjudge: opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py
- anli:
name: Adversarial NLI
category: Reasoning
paper: https://arxiv.org/pdf/1910.14599v2
configpath: opencompass/configs/datasets/anli/anli_gen.py
configpath_llmjudge: ''
- anthropics_evals:
name: Anthropics Evals
category: Safety
paper: https://arxiv.org/pdf/2212.09251
configpath:
- opencompass/configs/datasets/anthropics_evals/airisk_gen.py
- opencompass/configs/datasets/anthropics_evals/persona_gen.py
- opencompass/configs/datasets/anthropics_evals/sycophancy_gen.py
configpath_llmjudge: ''
- apps:
name: APPS
category: Code
paper: https://arxiv.org/pdf/2105.09938
configpath:
- opencompass/configs/datasets/apps/apps_gen.py
- opencompass/configs/datasets/apps/apps_mini_gen.py
configpath_llmjudge: ''
- arc:
name: ARC
category: Reasoning
paper: https://arxiv.org/pdf/1803.05457
configpath:
- opencompass/configs/datasets/ARC_c/ARC_c_gen.py
- opencompass/configs/datasets/ARC_e/ARC_e_gen.py
configpath_llmjudge: ''
- arc_prize_public_eval:
name: ARC Prize
category: ARC-AGI
paper: https://arcprize.org/guide#private
configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen.py
configpath_llmjudge: ''
- ax:
name: SuperGLUE / AX
category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath:
- opencompass/configs/datasets/SuperGLUE_AX_b/SuperGLUE_AX_b_gen.py
- opencompass/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_gen.py
configpath_llmjudge: ''
- bbh:
name: BIG-Bench Hard
category: Reasoning
paper: https://arxiv.org/pdf/2210.09261
configpath: opencompass/configs/datasets/bbh/bbh_gen.py
configpath_llmjudge: opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py
- bbeh:
name: BIG-Bench Extra Hard
category: Reasoning
paper: https://arxiv.org/abs/2502.19187
configpath: opencompass/configs/datasets/bbeh
configpath_llmjudge: ''
- BoolQ:
name: SuperGLUE / BoolQ
category: Knowledge
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen.py
configpath_llmjudge: ''
- c3:
name: CLUE / C3 (C³)
category: Understanding
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_C3/CLUE_C3_gen.py
configpath_llmjudge: ''
- cb:
name: SuperGLUE / CB
category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen.py
configpath_llmjudge: ''
- ceval:
name: C-EVAL
category: Examination
paper: https://arxiv.org/pdf/2305.08322v1
configpath: opencompass/configs/datasets/ceval/ceval_gen.py
configpath_llmjudge: ''
- charm:
name: CHARM
category: Reasoning
paper: https://arxiv.org/pdf/2403.14112
configpath: opencompass/configs/datasets/CHARM/charm_reason_gen.py
configpath_llmjudge: ''
- chembench:
name: ChemBench
category: Knowledge / Chemistry
paper: https://arxiv.org/pdf/2404.01475
configpath: opencompass/configs/datasets/ChemBench/ChemBench_gen.py
configpath_llmjudge: ''
- chid:
name: FewCLUE / CHID
category: Language
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen.py
configpath_llmjudge: ''
- chinese_simpleqa:
name: Chinese SimpleQA
category: Knowledge
paper: https://arxiv.org/pdf/2411.07140
configpath: opencompass/configs/datasets/chinese_simpleqa/chinese_simpleqa_gen.py
configpath_llmjudge: ''
- cibench:
name: CIBench
category: Code
paper: https://www.arxiv.org/pdf/2407.10499
configpath:
- opencompass/configs/datasets/CIBench/CIBench_generation_gen_8ab0dc.py
- opencompass/configs/datasets/CIBench/CIBench_template_gen_e6b12a.py
- opencompass/configs/datasets/CIBench/CIBench_template_oracle_gen_fecda1.py
configpath_llmjudge: ''
- civilcomments:
name: CivilComments
category: Safety
paper: https://arxiv.org/pdf/1903.04561
configpath: opencompass/configs/datasets/civilcomments/civilcomments_clp.py
configpath_llmjudge: ''
- clozeTest_maxmin:
name: Cloze Test-max/min
category: Code
paper: https://arxiv.org/pdf/2102.04664
configpath: opencompass/configs/datasets/clozeTest_maxmin/clozeTest_maxmin_gen.py
configpath_llmjudge: ''
- cluewsc:
name: FewCLUE / CLUEWSC
category: Language / WSC
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen.py
configpath_llmjudge: ''
- cmb:
name: CMB
category: Knowledge / Medicine
paper: https://arxiv.org/pdf/2308.08833
configpath: opencompass/configs/datasets/cmb/cmb_gen.py
configpath_llmjudge: ''
- cmmlu:
name: CMMLU
category: Understanding
paper: https://arxiv.org/pdf/2306.09212
configpath: opencompass/configs/datasets/cmmlu/cmmlu_gen.py
configpath_llmjudge: opencompass/configs/datasets/cmmlu/cmmlu_llm_judge_gen.py
- cmnli:
name: CLUE / CMNLI
category: Reasoning
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen.py
configpath_llmjudge: ''
- cmo_fib:
name: cmo_fib
category: Examination
paper: ''
configpath: opencompass/configs/datasets/cmo_fib/cmo_fib_gen.py
configpath_llmjudge: ''
- cmrc:
name: CLUE / CMRC
category: Understanding
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen.py
configpath_llmjudge: ''
- commonsenseqa:
name: CommonSenseQA
category: Knowledge
paper: https://arxiv.org/pdf/1811.00937v2
configpath: opencompass/configs/datasets/commonsenseqa/commonsenseqa_gen.py
configpath_llmjudge: ''
- commonsenseqa_cn:
name: CommonSenseQA-CN
category: Knowledge
paper: ''
configpath: opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen.py
configpath_llmjudge: ''
- copa:
name: SuperGLUE / COPA
category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen.py
configpath_llmjudge: ''
- crowspairs:
name: CrowsPairs
category: Safety
paper: https://arxiv.org/pdf/2010.00133
configpath: opencompass/configs/datasets/crowspairs/crowspairs_gen.py
configpath_llmjudge: ''
- crowspairs_cn:
name: CrowsPairs-CN
category: Safety
paper: ''
configpath: opencompass/configs/datasets/crowspairs_cn/crowspairscn_gen.py
configpath_llmjudge: ''
- cvalues:
name: CVALUES
category: Safety
paper: http://xdp-expriment.oss-cn-zhangjiakou.aliyuncs.com/shanqi.xgh/release_github/CValues.pdf
configpath: opencompass/configs/datasets/cvalues/cvalues_responsibility_gen.py
configpath_llmjudge: ''
- drcd:
name: CLUE / DRCD
category: Understanding
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen.py
configpath_llmjudge: ''
- drop:
name: DROP (DROP Simple Eval)
category: Understanding
paper: https://arxiv.org/pdf/1903.00161
configpath: opencompass/configs/datasets/drop/drop_gen.py
configpath_llmjudge: opencompass/configs/datasets/drop/drop_llm_judge_gen.py
- ds1000:
name: DS-1000
category: Code
paper: https://arxiv.org/pdf/2211.11501
configpath:
- opencompass/configs/datasets/ds1000/ds1000_gen_5c4bec.py
configpath_llmjudge: ''
- eprstmt:
name: FewCLUE / EPRSTMT
category: Understanding
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen.py
configpath_llmjudge: ''
- flores:
name: Flores
category: Language
paper: https://aclanthology.org/D19-1632.pdf
configpath: opencompass/configs/datasets/flores/flores_gen.py
configpath_llmjudge: ''
- game24:
name: Game24
category: Math
paper: https://huggingface.co/datasets/nlile/24-game
configpath: opencompass/configs/datasets/game24/game24_gen.py
configpath_llmjudge: ''
- govrepcrs:
name: Government Report Dataset
category: Long Context
paper: https://aclanthology.org/2021.naacl-main.112.pdf
configpath: opencompass/configs/datasets/govrepcrs/govrepcrs_gen.py
configpath_llmjudge: ''
- gpqa:
name: GPQA
category: Knowledge
paper: https://arxiv.org/pdf/2311.12022v1
configpath: opencompass/configs/datasets/gpqa/gpqa_gen.py
configpath_llmjudge: opencompass/configs/datasets/gpqa/gpqa_llm_judge_gen.py
- gsm8k:
name: GSM8K
category: Math
paper: https://arxiv.org/pdf/2110.14168v2
configpath: opencompass/configs/datasets/gsm8k/gsm8k_gen.py
configpath_llmjudge: ''
- gsm_hard:
name: GSM-Hard
category: Math
paper: https://proceedings.mlr.press/v202/gao23f/gao23f.pdf
configpath: opencompass/configs/datasets/gsm_hard/gsmhard_gen.py
configpath_llmjudge: ''
- hle:
name: HLE(Humanity's Last Exam)
category: Reasoning
paper: https://lastexam.ai/paper
configpath: opencompass/configs/datasets/HLE/hle_gen.py
configpath_llmjudge: ''
- hellaswag:
name: HellaSwag
category: Reasoning
paper: https://arxiv.org/pdf/1905.07830
configpath: opencompass/configs/datasets/hellaswag/hellaswag_gen.py
configpath_llmjudge: opencompass/configs/datasets/hellaswag/hellaswag_llm_judge_gen.py
- humaneval:
name: HumanEval
category: Code
paper: https://arxiv.org/pdf/2107.03374v2
configpath: opencompass/configs/datasets/humaneval/humaneval_gen.py
configpath_llmjudge: ''
- humaneval_cn:
name: HumanEval-CN
category: Code
paper: ''
configpath: opencompass/configs/datasets/humaneval_cn/humaneval_cn_gen.py
configpath_llmjudge: ''
- humaneval_multi:
name: Multi-HumanEval
category: Code
paper: https://arxiv.org/pdf/2210.14868
configpath: opencompass/configs/datasets/humaneval_multi/humaneval_multi_gen.py
configpath_llmjudge: ''
- humaneval_plus:
name: HumanEval+
category: Code
paper: https://arxiv.org/pdf/2305.01210
configpath: opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen.py
configpath_llmjudge: ''
- humanevalx:
name: HumanEval-X
category: Code
paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790
configpath: opencompass/configs/datasets/humanevalx/humanevalx_gen.py
configpath_llmjudge: ''
- hungarian_math:
name: Hungarian_Math
category: Math
paper: https://huggingface.co/datasets/keirp/hungarian_national_hs_finals_exam
configpath: opencompass/configs/datasets/hungarian_exam/hungarian_exam_gen.py
configpath_llmjudge: ''
- iwslt2017:
name: IWSLT2017
category: Language
paper: https://cris.fbk.eu/bitstream/11582/312796/1/iwslt17-overview.pdf
configpath: opencompass/configs/datasets/iwslt2017/iwslt2017_gen.py
configpath_llmjudge: ''
- jigsawmultilingual:
name: JigsawMultilingual
category: Safety
paper: https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data
configpath: opencompass/configs/datasets/jigsawmultilingual/jigsawmultilingual_clp.py
configpath_llmjudge: ''
- lambada:
name: LAMBADA
category: Understanding
paper: https://arxiv.org/pdf/1606.06031
configpath: opencompass/configs/datasets/lambada/lambada_gen.py
configpath_llmjudge: ''
- lcsts:
name: LCSTS
category: Understanding
paper: https://aclanthology.org/D15-1229.pdf
configpath: opencompass/configs/datasets/lcsts/lcsts_gen.py
configpath_llmjudge: ''
- livestembench:
name: LiveStemBench
category: ''
paper: ''
configpath: opencompass/configs/datasets/livestembench/livestembench_gen.py
configpath_llmjudge: ''
- llm_compression:
name: LLM Compression
category: Bits Per Character (BPC)
paper: https://arxiv.org/pdf/2404.09937
configpath: opencompass/configs/datasets/llm_compression/llm_compression.py
configpath_llmjudge: ''
- math:
name: MATH
category: Math
paper: https://arxiv.org/pdf/2103.03874
configpath: opencompass/configs/datasets/math/math_gen.py
configpath_llmjudge: opencompass/configs/datasets/math/math_llm_judge_gen.py
- math500:
name: MATH500
category: Math
paper: https://github.com/openai/prm800k
configpath: opencompass/configs/datasets/math/math_prm800k_500_gen.py
configpath_llmjudge: opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py
- math401:
name: MATH 401
category: Math
paper: https://arxiv.org/pdf/2304.02015
configpath: opencompass/configs/datasets/math401/math401_gen.py
configpath_llmjudge: ''
- mathbench:
name: MathBench
category: Math
paper: https://arxiv.org/pdf/2405.12209
configpath: opencompass/configs/datasets/mathbench/mathbench_gen.py
configpath_llmjudge: ''
- mbpp:
name: MBPP
category: Code
paper: https://arxiv.org/pdf/2108.07732
configpath: opencompass/configs/datasets/mbpp/mbpp_gen.py
configpath_llmjudge: ''
- mbpp_cn:
name: MBPP-CN
category: Code
paper: ''
configpath: opencompass/configs/datasets/mbpp_cn/mbpp_cn_gen.py
configpath_llmjudge: ''
- mbpp_plus:
name: MBPP-PLUS
category: Code
paper: ''
configpath: opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen.py
configpath_llmjudge: ''
- mgsm:
name: MGSM
category: Language / Math
paper: https://arxiv.org/pdf/2210.03057
configpath: opencompass/configs/datasets/mgsm/mgsm_gen.py
configpath_llmjudge: ''
- mmlu:
name: MMLU
category: Understanding
paper: https://arxiv.org/pdf/2009.03300
configpath: opencompass/configs/datasets/mmlu/mmlu_gen.py
configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py
- mmlu_cf:
name: MMLU-CF
category: Understanding
paper: https://arxiv.org/pdf/2412.15194
configpath: opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py
configpath_llmjudge: ''
- mmlu_pro:
name: MMLU-Pro
category: Understanding
paper: https://arxiv.org/pdf/2406.01574
configpath: opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen.py
configpath_llmjudge: opencompass/configs/datasets/mmlu_pro/mmlu_pro_llm_judge_gen.py
- mmmlu:
name: MMMLU
category: Language / Understanding
paper: https://huggingface.co/datasets/openai/MMMLU
configpath:
- opencompass/configs/datasets/mmmlu/mmmlu_gen.py
- opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen.py
configpath_llmjudge: ''
- multirc:
name: SuperGLUE / MultiRC
category: Understanding
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen.py
configpath_llmjudge: ''
- multipl_e:
name: MultiPL-E
category: Code
paper: https://arxiv.org/pdf/2210.14868
configpath: opencompass/configs/datasets/multipl_e
configpath_llmjudge: ''
- narrativeqa:
name: NarrativeQA
category: Understanding
paper: https://github.com/google-deepmind/narrativeqa
configpath: opencompass/configs/datasets/narrativeqa/narrativeqa_gen.py
configpath_llmjudge: ''
- natural_question:
name: NaturalQuestions
category: Knowledge
paper: https://github.com/google-research-datasets/natural-questions
configpath: opencompass/configs/datasets/nq/nq_gen.py
configpath_llmjudge: ''
- natural_question_cn:
name: NaturalQuestions-CN
category: Knowledge
paper: ''
configpath: opencompass/configs/datasets/nq_cn/nqcn_gen.py
configpath_llmjudge: ''
- obqa:
name: OpenBookQA
category: Knowledge
paper: https://arxiv.org/pdf/1809.02789v1
configpath: opencompass/configs/datasets/obqa/obqa_gen.py
configpath_llmjudge: ''
- olymmath:
name: OlymMATH
category: Math
paper: https://arxiv.org/abs/2503.21380
configpath: ''
configpath_llmjudge: opencompass/configs/datasets/OlymMATH/olymmath_llm_judeg_gen.py
- piqa:
name: PIQA
category: Knowledge / Physics
paper: https://arxiv.org/pdf/1911.11641v1
configpath: opencompass/configs/datasets/piqa/piqa_gen.py
configpath_llmjudge: ''
- py150:
name: py150
category: Code
paper: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line
configpath: opencompass/configs/datasets/py150/py150_gen.py
configpath_llmjudge: ''
- qasper:
name: Qasper
category: Long Context
paper: https://arxiv.org/pdf/2105.03011
configpath: opencompass/configs/datasets/qasper/qasper_gen.py
configpath_llmjudge: ''
- qaspercut:
name: Qasper-Cut
category: Long Context
paper: ''
configpath: opencompass/configs/datasets/qaspercut/qaspercut_gen.py
configpath_llmjudge: ''
- race:
name: RACE
category: Examination
paper: https://arxiv.org/pdf/1704.04683
configpath: opencompass/configs/datasets/race/race_gen.py
configpath_llmjudge: ''
- realtoxicprompts:
name: RealToxicPrompts
category: Safety
paper: https://arxiv.org/pdf/2009.11462
configpath: opencompass/configs/datasets/realtoxicprompts/realtoxicprompts_gen.py
configpath_llmjudge: ''
- record:
name: SuperGLUE / ReCoRD
category: Understanding
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen.py
configpath_llmjudge: ''
- rte:
name: SuperGLUE / RTE
category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_RTE/SuperGLUE_RTE_gen.py
configpath_llmjudge: ''
- ocnli:
name: CLUE / OCNLI
category: Reasoning
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen.py
configpath_llmjudge: ''
- ocnlifc:
name: FewCLUE / OCNLI-FC
category: Reasoning
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen.py
configpath_llmjudge: ''
- rolebench:
name: RoleBench
category: Role Play
paper: https://arxiv.org/pdf/2310.00746
configpath: opencompass/configs/datasets/rolebench
configpath_llmjudge: ''
- s3eval:
name: S3Eval
category: Long Context
paper: https://aclanthology.org/2024.naacl-long.69.pdf
configpath: opencompass/configs/datasets/s3eval/s3eval_gen.py
configpath_llmjudge: ''
- scibench:
name: SciBench
category: Reasoning
paper: https://sxkdz.github.io/files/publications/ICML/SciBench/SciBench.pdf
configpath: opencompass/configs/datasets/scibench/scibench_gen.py
configpath_llmjudge: ''
- scicode:
name: SciCode
category: Code
paper: https://arxiv.org/pdf/2407.13168
configpath: opencompass/configs/datasets/scicode/scicode_gen.py
configpath_llmjudge: ''
- simpleqa:
name: SimpleQA
category: Knowledge
paper: https://arxiv.org/pdf/2411.04368
configpath: opencompass/configs/datasets/SimpleQA/simpleqa_gen.py
configpath_llmjudge: ''
- siqa:
name: SocialIQA
category: Reasoning
paper: https://arxiv.org/pdf/1904.09728
configpath: opencompass/configs/datasets/siqa/siqa_gen.py
configpath_llmjudge: ''
- squad20:
name: SQuAD2.0
category: Understanding
paper: https://arxiv.org/pdf/1806.03822
configpath: opencompass/configs/datasets/squad20/squad20_gen.py
configpath_llmjudge: ''
- storycloze:
name: StoryCloze
category: Reasoning
paper: https://aclanthology.org/2022.emnlp-main.616.pdf
configpath: opencompass/configs/datasets/storycloze/storycloze_gen.py
configpath_llmjudge: ''
- strategyqa:
name: StrategyQA
category: Reasoning
paper: https://arxiv.org/pdf/2101.02235
configpath: opencompass/configs/datasets/strategyqa/strategyqa_gen.py
configpath_llmjudge: ''
- summedits:
name: SummEdits
category: Language
paper: https://aclanthology.org/2023.emnlp-main.600.pdf
configpath: opencompass/configs/datasets/summedits/summedits_gen.py
configpath_llmjudge: ''
- summscreen:
name: SummScreen
category: Understanding
paper: https://arxiv.org/pdf/2104.07091v1
configpath: opencompass/configs/datasets/summscreen/summscreen_gen.py
configpath_llmjudge: ''
- svamp:
name: SVAMP
category: Math
paper: https://aclanthology.org/2021.naacl-main.168.pdf
configpath: opencompass/configs/datasets/SVAMP/svamp_gen.py
configpath_llmjudge: ''
- tabmwp:
name: TabMWP
category: Math / Table
paper: https://arxiv.org/pdf/2209.14610
configpath: opencompass/configs/datasets/TabMWP/TabMWP_gen.py
configpath_llmjudge: ''
- taco:
name: TACO
category: Code
paper: https://arxiv.org/pdf/2312.14852
configpath: opencompass/configs/datasets/taco/taco_gen.py
configpath_llmjudge: ''
- tnews:
name: FewCLUE / TNEWS
category: Understanding
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen.py
configpath_llmjudge: ''
- bustm:
name: FewCLUE / BUSTM
category: Reasoning
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen.py
configpath_llmjudge: ''
- csl:
name: FewCLUE / CSL
category: Understanding
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen.py
configpath_llmjudge: ''
- ocnli_fc:
name: FewCLUE / OCNLI-FC
category: Reasoning
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc
configpath_llmjudge: ''
- triviaqa:
name: TriviaQA
category: Knowledge
paper: https://arxiv.org/pdf/1705.03551v2
configpath: opencompass/configs/datasets/triviaqa/triviaqa_gen.py
configpath_llmjudge: ''
- triviaqarc:
name: TriviaQA-RC
category: Knowledge / Understanding
paper: ''
configpath: opencompass/configs/datasets/triviaqarc/triviaqarc_gen.py
configpath_llmjudge: ''
- truthfulqa:
name: TruthfulQA
category: Safety
paper: https://arxiv.org/pdf/2109.07958v2
configpath: opencompass/configs/datasets/truthfulqa/truthfulqa_gen.py
configpath_llmjudge: ''
- tydiqa:
name: TyDi-QA
category: Language
paper: https://storage.googleapis.com/tydiqa/tydiqa.pdf
configpath: opencompass/configs/datasets/tydiqa/tydiqa_gen.py
configpath_llmjudge: ''
- wic:
name: SuperGLUE / WiC
category: Language
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_gen.py
configpath_llmjudge: ''
- wsc:
name: SuperGLUE / WSC
category: Language / WSC
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen.py
configpath_llmjudge: ''
- winogrande:
name: WinoGrande
category: Language / WSC
paper: https://arxiv.org/pdf/1907.10641v2
configpath: opencompass/configs/datasets/winogrande/winogrande_gen.py
configpath_llmjudge: ''
- xcopa:
name: XCOPA
category: Language
paper: https://arxiv.org/pdf/2005.00333
configpath: opencompass/configs/datasets/XCOPA/XCOPA_ppl.py
configpath_llmjudge: ''
- xiezhi:
name: Xiezhi
category: Knowledge
paper: https://arxiv.org/pdf/2306.05783
configpath: opencompass/configs/datasets/xiezhi/xiezhi_gen.py
configpath_llmjudge: ''
- xlsum:
name: XLSum
category: Understanding
paper: https://arxiv.org/pdf/2106.13822v1
configpath: opencompass/configs/datasets/XLSum/XLSum_gen.py
configpath_llmjudge: ''
- xsum:
name: Xsum
category: Understanding
paper: https://arxiv.org/pdf/1808.08745
configpath: opencompass/configs/datasets/Xsum/Xsum_gen.py
configpath_llmjudge: ''
- cola:
name: GLUE / CoLA
category: Understanding
paper: https://arxiv.org/pdf/1804.07461
configpath: opencompass/configs/datasets/GLUE_CoLA/GLUE_CoLA_ppl.py
configpath_llmjudge: ''
- mprc:
name: GLUE / MPRC
category: Understanding
paper: https://arxiv.org/pdf/1804.07461
configpath: opencompass/configs/datasets/GLUE_MRPC/GLUE_MRPC_ppl.py
configpath_llmjudge: ''
- qqp:
name: GLUE / QQP
category: Understanding
paper: https://arxiv.org/pdf/1804.07461
configpath: opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl.py
configpath_llmjudge: ''
- omni_math:
name: Omni-MATH
category: Math
paper: https://omni-math.github.io/
configpath: opencompass/configs/datasets/omni_math/omni_math_gen.py
configpath_llmjudge: ''
- wikibench:
name: WikiBench
category: Knowledge
paper: ''
configpath: opencompass/configs/datasets/wikibench/wikibench_gen.py
configpath_llmjudge: ''
- supergpqa:
name: SuperGPQA
category: Knowledge
paper: https://arxiv.org/pdf/2502.14739
configpath: opencompass/configs/datasets/supergpqa
configpath_llmjudge: ''

View File

@ -1,10 +1,20 @@
var collapsedSections = [];
var collapsedSections = ['Dataset Statistics'];
$(document).ready(function () {
$('.model-summary').DataTable({
$('.dataset').DataTable({
"stateSave": false,
"lengthChange": false,
"pageLength": 20,
"order": []
"order": [],
"language": {
"info": "Show _START_ to _END_ ItemsTotally _TOTAL_ ",
"infoFiltered": "Filtered from _MAX_ Items",
"search": "Search",
"zeroRecords": "Item Not Found",
"paginate": {
"next": "Next",
"previous": "Previous"
},
}
});
});

View File

@ -0,0 +1,370 @@
# LLM as Judge Evaluation
## Introduction
OpenCompass provides the GenericLLMEvaluator component to facilitate LLM-as-judge evaluations. It is particularly useful for scenarios where rule-based methods (such as regular expressions) cannot reliably judge outputs, for example:
- Cases where models output the answer content without option identifiers
- Factual-judgment datasets that are difficult to evaluate with rules
- Open-ended responses that require complex understanding and reasoning
- Evaluations that would otherwise require designing a large number of rules
## Dataset Format
The dataset for LLM judge evaluation should be in either JSON Lines (.jsonl) or CSV format. Each entry should contain at least:
- A problem or question
- A reference answer or gold standard
- (The model's prediction will be generated during evaluation)
Example JSONL format:
```json
{"problem": "What is the capital of France?", "answer": "Paris"}
```
Example CSV format:
```csv
problem,answer
"What is the capital of France?","Paris"
```
## Configuration
### Using LLM for Evaluation via Command Line
Some datasets in OpenCompass already include LLM judge configurations.
You need to use a model service (such as OpenAI or DeepSeek's official API) or start a model service locally using tools like LMDeploy, vLLM, or SGLang.
Then, you can set the environment variables for the evaluation service and evaluate models using the following commands:
```bash
export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct
export OC_JUDGE_API_KEY=sk-1234
export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
```
Note that by default, OpenCompass will use these three environment variables, but if you use configuration files to configure the evaluation service, these environment variables will not take effect.
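For example, a launch command might then look like the following; the model and dataset abbreviations below are placeholders and should be replaced with configs that exist in your OpenCompass installation (only datasets whose configs already include an LLM judge will use the judge service):
```bash
# Hypothetical example: the model/dataset names below are placeholders.
# With the OC_JUDGE_* variables exported, a dataset config that ships an
# LLM judge evaluator will pick up the judge service automatically.
opencompass --models hf_internlm2_5_7b_chat --datasets aime2024_llmjudge_gen
```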
### Using LLM for Evaluation via Configuration Files
To set up an LLM judge evaluation, you'll need to configure three main components:
1. Dataset Reader Configuration
```python
reader_cfg = dict(
input_columns=['problem'], # Column name for the question
output_column='answer' # Column name for the reference answer
)
```
2. Inference Configuration
```python
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}', # Template for prompting the model
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
```
3. Evaluation Configuration with LLM Judge
```python
eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator, # Using LLM as evaluator
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE), # Template for the judge
],
),
),
dataset_cfg=dict(
type=CustomDataset,
path='path/to/your/dataset',
file_name='your_dataset.jsonl',
reader_cfg=reader_cfg,
),
judge_cfg=YOUR_JUDGE_MODEL_CONFIG, # Configuration for the judge model
dict_postprocessor=dict(type=generic_llmjudge_postprocess), # Post-processing the judge's output
),
)
```
## Using CustomDataset with GenericLLMEvaluator
Here's how to set up a complete configuration for LLM judge evaluation:
```python
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.datasets import CustomDataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
# Import your judge model configuration
with read_base():
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
models as judge_model,
)
# Define your judge template
JUDGE_TEMPLATE = """
Please evaluate whether the following response correctly answers the question.
Question: {problem}
Reference Answer: {answer}
Model Response: {prediction}
Is the model response correct? If correct, answer "A"; if incorrect, answer "B".
""".strip()
# Dataset reader configuration
reader_cfg = dict(input_columns=['problem'], output_column='answer')
# Inference configuration for the model being evaluated
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration with LLM judge
eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=JUDGE_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=CustomDataset,
path='path/to/your/dataset',
file_name='your_dataset.jsonl',
reader_cfg=reader_cfg,
),
judge_cfg=judge_model[0],
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
# Dataset configuration
datasets = [
dict(
type=CustomDataset,
abbr='my-dataset',
path='path/to/your/dataset',
file_name='your_dataset.jsonl',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
]
# Model configuration for the model being evaluated
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='model-to-evaluate',
path='path/to/your/model',
# ... other model configurations
)
]
# Output directory
work_dir = './outputs/llm_judge_eval'
```
## GenericLLMEvaluator
The GenericLLMEvaluator is designed to use an LLM as a judge for evaluating model outputs. Key features include:
1. Flexible prompt templates for instructing the judge
2. Support for various judge models (local or API-based)
3. Customizable evaluation criteria through prompt engineering
4. Post-processing of judge outputs to extract structured evaluations
**Important Note**: The current generic version of the judge template only supports outputs in the format of "A" (correct) or "B" (incorrect), and does not support other output formats (like "CORRECT" or "INCORRECT"). This is because the post-processing function `generic_llmjudge_postprocess` is specifically designed to parse this format.
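To illustrate why the output format matters, here is a minimal conceptual sketch of the kind of parsing such a post-processor performs; it is not the actual `generic_llmjudge_postprocess` implementation, just the idea behind it.
```python
# Conceptual sketch only (not OpenCompass's actual implementation): pull an
# "A"/"B" verdict out of each judge response and aggregate into accuracy.
import re
from typing import Dict, List


def parse_verdict(judge_response: str) -> bool:
    """Return True if the judge's reply gives an 'A' (correct) verdict."""
    match = re.search(r'\b([AB])\b', judge_response)
    return match is not None and match.group(1) == 'A'


def score(judge_responses: List[str]) -> Dict[str, float]:
    verdicts = [parse_verdict(resp) for resp in judge_responses]
    return {'accuracy': 100.0 * sum(verdicts) / max(len(verdicts), 1)}


print(score(['A', 'B', 'The answer matches, so: A']))  # ~66.7% accuracy
```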
The evaluator works by:
1. Taking the original problem, reference answer, and model prediction
2. Formatting them into a prompt for the judge model
3. Parsing the judge's response to determine the evaluation result (looking for "A" or "B")
4. Aggregating results across the dataset
If you would like to see the full details of evaluation results, you can add `--dump-eval-details` to the command line when you start the job.
Example evaluation output:
```python
{
'accuracy': 75.0, # Percentage of responses judged as correct
'details': [
{
'origin_prompt': """
Please evaluate whether the following response correctly answers the question.
Question: What is the capital of France?
Reference Answer: Paris
Model Response: Paris
Is the model response correct? If correct, answer "A"; if incorrect, answer "B".
""",
'gold': 'Paris',
'prediction': 'A',
},
# ... more results
]
}
```
## CascadeEvaluator
OpenCompass also provides a CascadeEvaluator that combines the strengths of rule-based evaluation and LLM-based evaluation. The cascade evaluator has two modes:
1. **Cascade Mode (parallel=False)**: First evaluates all samples with a rule-based evaluator, then only sends samples that were deemed incorrect by the rule-based evaluation to an LLM judge for re-evaluation. This approach reduces reliance on LLM judgments while maintaining accuracy, thus lowering evaluation costs and time.
2. **Parallel Mode (parallel=True)**: Evaluates all samples with both the rule-based evaluator and LLM judge, then considers a sample correct if either method marks it as correct. This approach can increase the leniency of evaluation but may result in higher costs since all samples require LLM evaluation. A minimal per-sample sketch of both decision rules is shown below.
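The following sketch is only illustrative and is not the actual `CascadeEvaluator` implementation; `rule_eval` and `llm_judge` stand for arbitrary per-sample evaluator callables.
```python
# Illustrative sketch only (not the actual CascadeEvaluator): per-sample
# decision rules of the cascade and parallel modes.
from typing import Any, Callable


def evaluate_sample(sample: Any,
                    rule_eval: Callable[[Any], bool],
                    llm_judge: Callable[[Any], bool],
                    parallel: bool = False) -> bool:
    rule_ok = rule_eval(sample)
    if parallel:
        # Parallel mode: every sample also goes to the LLM judge, and the
        # sample counts as correct if either evaluator accepts it.
        llm_ok = llm_judge(sample)
        return rule_ok or llm_ok
    if rule_ok:
        # Cascade mode: a rule-based pass is accepted immediately ...
        return True
    # ... and only rule-based failures are re-checked by the LLM judge.
    return llm_judge(sample)
```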
### Configuring CascadeEvaluator
Here's an example of how to configure the CascadeEvaluator:
```python
# Define a rule-based evaluator
rule_evaluator = dict(type=MATHEvaluator)
# Define an LLM judge evaluator
llm_judge_evaluator = dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=YourDataset,
path='path/to/your/dataset',
reader_cfg=reader_cfg,
),
judge_cfg=dict(), # Can use environment variables to configure the judge model
)
# Configure cascade evaluator (cascade mode)
cascade_evaluator = dict(
type=CascadeEvaluator,
llm_evaluator=llm_judge_evaluator,
rule_evaluator=rule_evaluator,
parallel=False # Cascade mode
)
# For parallel mode, set parallel=True
parallel_evaluator = dict(
type=CascadeEvaluator,
llm_evaluator=llm_judge_evaluator,
rule_evaluator=rule_evaluator,
parallel=True # Parallel mode
)
# Use the cascade evaluator in your dataset evaluation config
eval_cfg = dict(evaluator=cascade_evaluator)
```
### Evaluation Results
The cascade evaluator outputs detailed evaluation statistics including:
- Accuracy of the rule-based evaluation
- Accuracy of the LLM evaluation (for samples that failed rule-based evaluation in cascade mode)
- Final combined accuracy
Example output:
```python
{
'accuracy': 85.0, # Final accuracy
'cascade_stats': {
'total_samples': 100,
'rule_correct': 70, # Number of samples correct by rule evaluation
'rule_accuracy': 70.0, # Accuracy of rule evaluation
'llm_evaluated': 30, # Number of samples evaluated by LLM (failed samples in cascade mode)
'llm_correct': 15, # Number of samples correct by LLM evaluation
'llm_accuracy': 50.0, # Accuracy of LLM evaluation
'final_correct': 85, # Total correct samples
'final_accuracy': 85.0, # Final accuracy
'parallel_mode': False, # Whether parallel mode was used
},
'details': [
# Detailed evaluation results for each sample
]
}
```
The cascade evaluator is particularly useful for:
1. Scenarios that require balancing evaluation cost and accuracy
2. Cases where rule-based evaluators are available but might not be comprehensive
3. Evaluation tasks that need more nuanced judgment for edge cases
## Complete Example
For a complete working example using GenericLLMEvaluator, refer to the `eval_llm_judge.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving.
For a complete working example using CascadeEvaluator, refer to the `eval_cascade_evaluator.py` file in the examples directory, which also demonstrates how to evaluate mathematical problem-solving.

View File

@ -0,0 +1,190 @@
# General Math Evaluation Guidance
## Introduction
Mathematical reasoning is a crucial capability for large language models (LLMs). To evaluate a model's mathematical abilities, we need to test its capability to solve mathematical problems step by step and provide accurate final answers. OpenCompass provides a convenient way to evaluate mathematical reasoning through the CustomDataset and MATHEvaluator components.
## Dataset Format
The math evaluation dataset should be in either JSON Lines (.jsonl) or CSV format. Each problem should contain at least:
- A problem statement
- A solution/answer (typically in LaTeX format with the final answer in \\boxed{})
Example JSONL format:
```json
{"problem": "Find the value of x if 2x + 3 = 7", "solution": "Let's solve step by step:\n2x + 3 = 7\n2x = 7 - 3\n2x = 4\nx = 2\nTherefore, \\boxed{2}"}
```
Example CSV format:
```csv
problem,solution
"Find the value of x if 2x + 3 = 7","Let's solve step by step:\n2x + 3 = 7\n2x = 7 - 3\n2x = 4\nx = 2\nTherefore, \\boxed{2}"
```
## Configuration
To evaluate mathematical reasoning, you'll need to set up three main components:
1. Dataset Reader Configuration
```python
math_reader_cfg = dict(
input_columns=['problem'], # Column name for the question
output_column='solution' # Column name for the answer
)
```
2. Inference Configuration
```python
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
```
3. Evaluation Configuration
```python
math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator),
)
```
## Using CustomDataset
Here's how to set up a complete configuration for math evaluation:
```python
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.datasets import CustomDataset
math_datasets = [
dict(
type=CustomDataset,
abbr='my-math-dataset', # Dataset abbreviation
path='path/to/your/dataset', # Path to your dataset file
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]
```
## MATHEvaluator
The MATHEvaluator is specifically designed to evaluate mathematical answers. It is developed based on the math_verify library, which provides mathematical expression parsing and verification capabilities, supporting extraction and equivalence verification for both LaTeX and general expressions.
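For context, `math_verify` can also be used on its own. The following is a rough sketch, assuming the `math_verify` package is installed and exposes `parse` and `verify`; the exact API surface may differ across versions.
```python
# Rough sketch of using math_verify directly, outside of OpenCompass.
from math_verify import parse, verify

# Parse the reference answer and a model prediction into symbolic form.
reference = parse(r"$\frac{1}{2}$")
prediction = parse(r"The answer is \boxed{0.5}")

# verify() checks mathematical equivalence of the parsed expressions,
# so 1/2 and 0.5 should be treated as the same answer.
print(verify(reference, prediction))
```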
The MATHEvaluator does the following:
1. Extracts answers from both predictions and references using LaTeX extraction
2. Handles various LaTeX formats and environments
3. Verifies mathematical equivalence between predicted and reference answers
4. Provides detailed evaluation results including:
- Accuracy score
- Detailed comparison between predictions and references
- Parse results of both predicted and reference answers
The evaluator supports:
- Basic arithmetic operations
- Fractions and decimals
- Algebraic expressions
- Trigonometric functions
- Roots and exponents
- Mathematical symbols and operators
Example evaluation output:
```python
{
'accuracy': 85.0, # Percentage of correct answers
'details': [
{
'predictions': 'x = 2', # Parsed prediction
'references': 'x = 2', # Parsed reference
'correct': True # Whether they match
},
# ... more results
]
}
```
## Complete Example
Here's a complete example of how to set up math evaluation:
```python
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.datasets import CustomDataset
from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
# Dataset reader configuration
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
# Inference configuration
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator),
)
# Dataset configuration
math_datasets = [
dict(
type=CustomDataset,
abbr='my-math-dataset',
path='path/to/your/dataset.jsonl', # or .csv
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]
# Model configuration
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='your-model-name',
path='your/model/path',
# ... other model configurations
)
]
# Output directory
work_dir = './outputs/math_eval'
```

View File

@ -90,4 +90,16 @@ Although OpenCompass has already included most commonly used datasets, users nee
return dataset
```
3. After completing the dataset script and config file, you need to register the information of your new dataset in the file `dataset-index.yml` at the main directory, so that it can be added to the dataset statistics list on the OpenCompass website.
- The keys that need to be filled in include `name`: the name of your dataset, `category`: the category of your dataset, `paper`: the URL of the paper or project, and `configpath`: the path to the dataset config file. Here's an example:
```
- mydataset:
name: MyDataset
category: Understanding
paper: https://arxiv.org/pdf/xxxxxxx
configpath: opencompass/configs/datasets/MyDataset
```
Detailed dataset configuration files and other required configuration files can be referred to in the [Configuration Files](../user_guides/config.md) tutorial. For guides on launching tasks, please refer to the [Quick Start](../get_started/quick_start.md) tutorial.

View File

@ -0,0 +1,65 @@
# Evaluation Results Persistence
## Introduction
Normally, OpenCompass saves evaluation results to your work directory. In some cases, however, users may want to share results with each other or quickly browse existing public evaluation results. We therefore provide an interface for quickly transferring evaluation results to an external public data station, along with functions for uploading, overwriting, and reading those results.
## Quick Start
### Uploading
By adding an argument to the evaluation command, or a configuration entry in the eval script, the evaluation results can be stored in the path you specify. Here are the two approaches:
(Approach 1) Add the `-sp` option to the command and specify your public path.
```bash
opencompass ... -sp '/your_path'
```
(Approach 2) Add the following configuration to the eval script.
```python
station_path = '/your_path'
```
### Overwriting
Before uploading, the storage method above first checks whether results for the same task already exist in the data station, based on the `abbr` attributes in the model and dataset configurations. If they do, the upload is skipped. If you need to update these results, add the `--station-overwrite` option to the command, for example:
```bash
opencompass ... -sp '/your_path' --station-overwrite
```
### Reading
You can read existing results directly from the data station to avoid duplicate evaluation tasks. The retrieved results participate directly in the `summarize` step. With this option, only tasks whose results are not yet stored in the data station will be launched. Here is an example:
```bash
opencompass ... -sp '/your_path' --read-from-station
```
### Command Combination
1. Upload only the results under your latest work directory to the data station, without re-running tasks whose results are missing:
```bash
opencompass ... -sp '/your_path' -r latest -m viz
```
## Storage Format of the Data Station
In the data station, the evaluation results of each `model-dataset` pair are stored as a `json` file. The directory layout is `/your_path/dataset_name/model_name.json`. Each `json` file stores a dictionary with the corresponding results, including `predictions`, `results`, and `cfg`; here is an example:
```python
Result = {
'predictions': List[Dict],
'results': Dict,
'cfg': Dict = {
'models': Dict,
'datasets': Dict,
(Only subjective datasets)'judge_models': Dict
}
}
```
Among these three keys, `predictions` records the model's prediction for each item in the dataset, `results` records the model's overall score on the dataset, and `cfg` records the detailed configurations of the model and the dataset used in this evaluation task.
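To illustrate the layout, here is a minimal sketch of reading one stored result back from the data station; the paths and names are placeholders.
```python
# Minimal sketch (paths and names are placeholders): load one stored
# model-dataset result from the data station and inspect its contents.
import json
from pathlib import Path

station = Path('/your_path')
result_file = station / 'my_dataset' / 'my_model.json'

with open(result_file) as f:
    result = json.load(f)

print(result['results'])            # aggregate scores on this dataset
print(len(result['predictions']))   # one entry per evaluated sample
print(list(result['cfg']))          # 'models', 'datasets' (plus 'judge_models' for subjective sets)
```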

View File

@ -117,6 +117,10 @@ html_js_files = [
'js/custom.js'
]
html_context = {
'github_version': 'main',
}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
@ -220,3 +224,11 @@ autodoc_typehints = 'none'
# The not found page
notfound_template = '404.html'
def builder_inited_handler(app):
subprocess.run(['./statis.py'])
def setup(app):
app.connect('builder-inited', builder_inited_handler)

View File

@ -39,8 +39,7 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
user_guides/evaluation.md
user_guides/experimentation.md
user_guides/metrics.md
user_guides/summarizer.md
user_guides/corebench.md
user_guides/deepseek_r1.md
.. _Prompt:
.. toctree::
@ -62,16 +61,13 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
advanced_guides/custom_dataset.md
advanced_guides/new_model.md
advanced_guides/evaluation_lmdeploy.md
advanced_guides/evaluation_lightllm.md
advanced_guides/accelerator_intro.md
advanced_guides/math_verify.md
advanced_guides/llm_judge.md
advanced_guides/code_eval.md
advanced_guides/code_eval_service.md
advanced_guides/prompt_attack.md
advanced_guides/longeval.md
advanced_guides/subjective_evaluation.md
advanced_guides/circular_eval.md
advanced_guides/contamination_eval.md
advanced_guides/needleinahaystack_eval.md
advanced_guides/persistence.md
.. _Tools:
.. toctree::
@ -80,6 +76,13 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
tools.md
.. _Dataset List:
.. toctree::
:maxdepth: 1
:caption: Dataset List
dataset_statistics.md
.. _Notes:
.. toctree::
:maxdepth: 1

103
docs/en/statis.py Executable file
View File

@ -0,0 +1,103 @@
#! /usr/bin/env python
from pathlib import Path
import yaml
from tabulate import tabulate
OC_ROOT = Path(__file__).absolute().parents[2]
GITHUB_PREFIX = 'https://github.com/open-compass/opencompass/tree/main/'
DATASETZOO_TEMPLATE = """\
# Dataset Statistics
On this page, we have listed all the datasets supported by OpenCompass.
You can use sorting and search functions to find the dataset you need.
We provide recommended running configurations for each dataset,
and for some datasets we also offer recommended configurations based on LLM Judge.
You can quickly start evaluation tasks based on the recommended configurations.
However, please note that these configurations may be updated over time.
"""
with open('dataset_statistics.md', 'w') as f:
f.write(DATASETZOO_TEMPLATE)
load_path = str(OC_ROOT / 'dataset-index.yml')
with open(load_path, 'r') as f2:
data_list = yaml.load(f2, Loader=yaml.FullLoader)
HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
recommanded_dataset_list = [
'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
'mmlu_pro', 'musr', 'math500'
]
def table_format(data_list):
table_format_list = []
for i in data_list:
table_format_list_sub = []
for j in i:
if j in recommanded_dataset_list:
link_token = '[link]('
else:
link_token = '[link(TBD)]('
for index in HEADER:
if index == 'paper':
table_format_list_sub.append('[link](' + i[j][index] + ')')
elif index == 'configpath_llmjudge':
if i[j][index] == '':
table_format_list_sub.append(i[j][index])
else:
table_format_list_sub.append(link_token +
GITHUB_PREFIX +
i[j][index] + ')')
elif index == 'configpath':
if isinstance(i[j][index], list):
sub_list_text = ''
for k in i[j][index]:
sub_list_text += (link_token + GITHUB_PREFIX + k +
') / ')
table_format_list_sub.append(sub_list_text[:-2])
else:
table_format_list_sub.append(link_token +
GITHUB_PREFIX +
i[j][index] + ')')
else:
table_format_list_sub.append(i[j][index])
table_format_list.append(table_format_list_sub)
return table_format_list
data_format_list = table_format(data_list)
def generate_table(data_list, title=None):
with open('dataset_statistics.md', 'a') as f:
if title is not None:
f.write(f'\n{title}')
f.write("""\n```{table}\n:class: dataset\n""")
header = [
'Name', 'Category', 'Paper or Repository', 'Recommended Config',
'Recommended Config (LLM Judge)'
]
table_cfg = dict(tablefmt='pipe',
floatfmt='.2f',
numalign='right',
stralign='center')
f.write(tabulate(data_list, header, **table_cfg))
f.write('\n```\n')
generate_table(
data_list=data_format_list,
title='## Supported Dataset List',
)

View File

@ -81,3 +81,43 @@ datasets += cmnli_datasets
Users can choose different abilities, different datasets and different evaluation methods configuration files to build the part of the dataset in the evaluation script according to their needs.
For information on how to start an evaluation task and how to evaluate self-built datasets, please refer to the relevant documents.
### Multiple Evaluations on the Dataset
In the dataset configuration, you can set the parameter `n` to perform multiple evaluations on the same dataset and return the average metrics, for example:
```python
afqmc_datasets = [
dict(
abbr="afqmc-dev",
type=AFQMCDatasetV2,
path="./data/CLUE/AFQMC/dev.json",
n=10, # Perform 10 evaluations
reader_cfg=afqmc_reader_cfg,
infer_cfg=afqmc_infer_cfg,
eval_cfg=afqmc_eval_cfg,
),
]
```
Additionally, for binary evaluation metrics (such as accuracy, pass-rate, etc.), you can also set the parameter `k` in conjunction with `n` for [G-Pass@k](http://arxiv.org/abs/2412.13147) evaluation. The formula for G-Pass@k is:
```{math}
\text{G-Pass@}k_\tau=E_{\text{Data}}\left[ \sum_{j=\lceil \tau \cdot k \rceil}^c \frac{{c \choose j} \cdot {n - c \choose k - j}}{{n \choose k}} \right],
```
where $n$ is the number of evaluations, and $c$ is the number of times that passed or were correct out of $n$ runs. An example configuration is as follows:
```python
aime2024_datasets = [
dict(
abbr='aime2024',
type=Aime2024Dataset,
path='opencompass/aime2024',
k=[2, 4], # Return results for G-Pass@2 and G-Pass@4
n=12, # 12 evaluations
...
)
]
```
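For reference, the following standalone sketch (independent of OpenCompass's own implementation) computes G-Pass@k for a single problem given `n` runs with `c` correct, following the formula above:
```python
# Standalone sketch (not OpenCompass code): G-Pass@k for one problem with
# n total runs, c correct runs, subset size k and threshold tau.
import math


def g_pass_at_k(n: int, c: int, k: int, tau: float) -> float:
    start = math.ceil(tau * k)
    total = 0.0
    for j in range(start, c + 1):
        if k - j < 0 or k - j > n - c:
            continue  # impossible draw, contributes nothing
        total += math.comb(c, j) * math.comb(n - c, k - j) / math.comb(n, k)
    return total


# e.g. 12 runs, 7 of them correct: probability that at least half of a
# randomly drawn subset of 4 runs is correct.
print(g_pass_at_k(n=12, c=7, k=4, tau=0.5))
```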

View File

@ -0,0 +1,192 @@
# Tutorial for Evaluating Reasoning Models
OpenCompass provides an evaluation tutorial for the DeepSeek R1 series of reasoning models on mathematical datasets.
- At the model level, we recommend sampling-based decoding to reduce the repetition caused by greedy decoding
- For datasets with a limited number of samples, we run the evaluation multiple times and report the average
- For answer validation, we use LLM-based verification to reduce the misjudgments of rule-based evaluation
## Installation and Preparation
Please follow OpenCompass's installation guide.
## Evaluation Configuration Setup
We provide example configurations in `examples/eval_deepseek_r1.py`. Below is the configuration explanation:
### Configuration Interpretation
#### 1. Dataset and Validator Configuration
```python
# Configuration supporting multiple runs (example)
from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets
datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')),
[],
)
# LLM validator configuration. Users need to deploy API services via LMDeploy/vLLM/SGLang or use OpenAI-compatible endpoints
verifier_cfg = dict(
abbr='qwen2-5-32B-Instruct',
type=OpenAISDK,
path='Qwen/Qwen2.5-32B-Instruct', # Replace with actual path
key='YOUR_API_KEY', # Use real API key
openai_api_base=['http://your-api-endpoint'], # Replace with API endpoint
query_per_second=16,
batch_size=1024,
temperature=0.001,
max_out_len=16384
)
# Apply validator to all datasets
for item in datasets:
if 'judge_cfg' in item['eval_cfg']['evaluator']:
item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg
```
#### 2. Model Configuration
We provide an example that uses LMDeploy as the inference backend; users can modify `path` (i.e., the HuggingFace model path) to evaluate other models.
```python
# LMDeploy model configuration example
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-7b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
gen_config=dict(
do_sample=True,
temperature=0.6,
top_p=0.95,
max_new_tokens=32768
),
max_seq_len=32768,
batch_size=64,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content)
),
# Extendable 14B/32B configurations...
]
```
#### 3. Evaluation Process Configuration
```python
# Inference configuration
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=1),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
# Evaluation configuration
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=8),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLEvalTask)),
)
```
#### 4. Summary Configuration
```python
# Multiple runs results average configuration
summary_groups = [
{
'name': 'AIME2024-Average8',
'subsets':[[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)]
},
# Other dataset average configurations...
]
summarizer = dict(
dataset_abbrs=[
['AIME2024-Average8', 'naive_average'],
# Other dataset metrics...
],
summary_groups=summary_groups
)
# Work directory configuration
work_dir = "outputs/deepseek_r1_reasoning"
```
## Evaluation Execution
### Scenario 1: Model loaded on 1 GPU, data evaluated by 1 worker, using a total of 1 GPU
```bash
opencompass examples/eval_deepseek_r1.py --debug --dump-eval-details
```
Evaluation logs will be output in the command line.
### Scenario 2: Model loaded on 1 GPU, data evaluated by 8 workers, using a total of 8 GPUs
You need to modify the `infer` configuration in the configuration file and set `num_worker` to 8
```python
# Inference configuration
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
```
At the same time, remove the `--debug` parameter from the evaluation command
```bash
opencompass examples/eval_deepseek_r1.py --dump-eval-details
```
In this mode, OpenCompass uses multithreading to launch `$num_worker` tasks. Detailed logs are not shown in the command line; instead, per-task evaluation logs are written under `$work_dir`.
### Scenario 3: Model loaded on 2 GPUs, data evaluated by 4 workers, using a total of 8 GPUs
Note that in the model configuration, `num_gpus` in `run_cfg` needs to be set to 2 (if an inference backend is used, its parallelism parameters must be changed accordingly, e.g. `tp=2` for LMDeploy), and `num_worker` in the `infer` configuration needs to be set to 4:
```python
models += [
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-14b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
gen_config=dict(
do_sample=True,
temperature=0.6,
top_p=0.95,
max_new_tokens=32768),
max_seq_len=32768,
max_out_len=32768,
batch_size=128,
run_cfg=dict(num_gpus=2),
pred_postprocessor=dict(type=extract_non_reasoning_content)
),
]
```
```python
# Inference configuration
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=4),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
```
### Evaluation Results
The evaluation results are displayed as follows:
```bash
dataset            version    metric         mode    deepseek-r1-distill-qwen-7b-turbomind
-----------------  ---------  -------------  ------  ---------------------------------------
MATH               -          -              -
AIME2024-Average8  -          naive_average  gen     56.25
```
## Performance Baseline
Since the model uses Sampling for decoding, and the AIME dataset size is small, there may still be a performance fluctuation of 1-3 points even when averaging over 8 evaluations.
| Model | Dataset | Metric | Value |
| ---------------------------- | -------- | -------- | ----- |
| DeepSeek-R1-Distill-Qwen-7B | AIME2024 | Accuracy | 56.3 |
| DeepSeek-R1-Distill-Qwen-14B | AIME2024 | Accuracy | 74.2 |
| DeepSeek-R1-Distill-Qwen-32B | AIME2024 | Accuracy | 74.2 |

View File

@ -57,7 +57,7 @@ The parameter explanation is as follows:
- `-w`: Specify the working path, default is `./outputs/default`.
- `-l`: Enable status reporting via Lark bot.
- `--dry-run`: When enabled, inference and evaluation tasks will be dispatched but won't actually run for debugging.
- `--dump-eval-details`: When enabled, evaluation results under the `results` folder will include more details, such as the correctness of each sample.
- `--dump-eval-details`: Enabled by default; evaluation results under the `results` folder will include more details, such as the correctness of each sample. Set `--dump-eval-details False` to disable it.
Using run mode `-m all` as an example, the overall execution flow is as follows:

View File

@ -1,10 +1,20 @@
var collapsedSections = [];
var collapsedSections = ['数据集统计'];
$(document).ready(function () {
$('.model-summary').DataTable({
$('.dataset').DataTable({
"stateSave": false,
"lengthChange": false,
"pageLength": 20,
"order": []
"order": [],
"language": {
"info": "显示 _START_ 至 _END_ 条目(总计 _TOTAL_ ",
"infoFiltered": "(筛选自 _MAX_ 条目)",
"search": "搜索:",
"zeroRecords": "没有找到任何条目",
"paginate": {
"next": "下一页",
"previous": "上一页"
},
}
});
});

View File

@ -0,0 +1,368 @@
# LLM 作为评判器
## 简介
GenericLLMEvaluator组件特别适用于那些难以通过规则式方法如正则表达式进行完美判断的场景例如
- 模型不输出选项标识而只输出选项内容的情况
- 需要事实性判断的数据集
- 需要复杂理解和推理的开放式回答
- 需要设计大量规则的判断
OpenCompass提供了GenericLLMEvaluator组件来实现LLM作为评判器的评估。
## 数据集格式
用于LLM评判的数据集应该是JSON Lines (.jsonl)或CSV格式。每个条目至少应包含
- 问题或任务
- 参考答案或标准答案
- (模型的预测将在评估过程中生成)
JSONL格式示例
```json
{"problem": "法国的首都是什么?", "answer": "巴黎"}
```
CSV格式示例
```csv
problem,answer
"法国的首都是什么?","巴黎"
```
## 配置说明
### 基于命令行使用LLM进行评估
OpenCompass中部分数据集已经包含了LLM评判器的配置。
你需要使用一个模型服务如OpenAI或DeepSeek官方提供的API或本地使用LMDeploy、vLLM、SGLang等工具启动一个模型服务。
然后,你可以通过以下命令设置相关评估服务的环境变量,并对模型进行评估:
```bash
export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct
export OC_JUDGE_API_KEY=sk-1234
export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
```
注意默认情况下OpenCompass会使用这三个环境变量但如果你使用了基于配置文件的方式配置评估服务这三个环境变量将不会生效。
### 基于配置文件使用LLM进行评估
对一个数据集设置LLM评判评估你需要配置三个主要组件
1. 数据集读取配置
```python
reader_cfg = dict(
input_columns=['problem'], # 问题列的名称
output_column='answer' # 参考答案列的名称
)
```
2. 推理配置
```python
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}', # 提示模型的模板
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
```
3. 使用LLM评判器的评估配置
```python
eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator, # 使用LLM作为评估器
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="你是一个负责评估模型输出正确性和质量的助手。",
)
],
round=[
dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE), # 评判器的模板
],
),
),
dataset_cfg=dict(
type=CustomDataset,
path='path/to/your/dataset',
file_name='your_dataset.jsonl',
reader_cfg=reader_cfg,
),
judge_cfg=YOUR_JUDGE_MODEL_CONFIG, # 评判模型的配置
dict_postprocessor=dict(type=generic_llmjudge_postprocess), # 处理评判器输出的后处理器
),
)
```
## 使用CustomDataset和GenericLLMEvaluator
以下是如何设置完整的LLM评判评估配置
```python
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.datasets import CustomDataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
# 导入评判模型配置
with read_base():
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
models as judge_model,
)
# 定义评判模板
JUDGE_TEMPLATE = """
请评估以下回答是否正确地回答了问题。
问题:{problem}
参考答案:{answer}
模型回答:{prediction}
模型回答是否正确?如果正确,请回答"A";如果不正确,请回答"B"。
""".strip()
# 数据集读取配置
reader_cfg = dict(input_columns=['problem'], output_column='answer')
# 被评估模型的推理配置
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# 使用LLM评判器的评估配置
eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="你是一个负责评估模型输出正确性和质量的助手。",
)
],
round=[
dict(role='HUMAN', prompt=JUDGE_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=CustomDataset,
path='path/to/your/dataset',
file_name='your_dataset.jsonl',
reader_cfg=reader_cfg,
),
judge_cfg=judge_model[0],
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
# 数据集配置
datasets = [
dict(
type=CustomDataset,
abbr='my-dataset',
path='path/to/your/dataset',
file_name='your_dataset.jsonl',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
]
# 被评估模型的配置
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='model-to-evaluate',
path='path/to/your/model',
# ... 其他模型配置
)
]
# 输出目录
work_dir = './outputs/llm_judge_eval'
```
## GenericLLMEvaluator
GenericLLMEvaluator专为使用LLM作为评判器评估模型输出而设计。主要特点包括
1. 灵活的提示模板,用于指导评判器
2. 支持各种评判模型本地或基于API
3. 通过提示工程自定义评估标准
4. 对评判器输出进行后处理以提取结构化评估
**重要说明**:目前通用版本的评判模板只支持输出"A"(正确)或"B"(不正确)的格式,不支持其他输出格式(如"正确"或"不正确")。这是因为后处理函数`generic_llmjudge_postprocess`专门设计为解析这种格式。
评估器的工作原理:
1. 获取原始问题、参考答案和模型预测
2. 将它们格式化为评判模型的提示
3. 解析评判器的响应以确定评估结果(寻找"A"或"B"
4. 汇总整个数据集的结果
如果需要查看评估的详细结果,可以在启动任务时添加`--dump-eval-details`到命令行。
评估输出示例:
```python
{
'accuracy': 75.0, # 被判断为正确的回答百分比
'details': [
{
'origin_prompt': """
请评估以下回答是否正确地回答了问题。
问题:法国的首都是什么?
参考答案:巴黎
模型回答:法国的首都是巴黎。
模型回答是否正确?如果正确,请回答"A";如果不正确,请回答"B"。""",
'gold': '巴黎',
'prediction': 'A',
},
# ... 更多结果
]
}
```
## 级联评估器 (CascadeEvaluator)
OpenCompass还提供了级联评估器`CascadeEvaluator`它结合了规则式评估和LLM评估的优势。级联评估器有两种模式
1. **级联模式Cascade Mode, parallel=False**首先使用规则式评估器评估所有样本然后只将规则式评估认为不正确的样本发送给LLM评判器进行重新评估。这种方式可以在保持准确性的同时减少对LLM评判的依赖从而降低评估成本和时间。
2. **并行模式Parallel Mode, parallel=True**使用规则式评估器和LLM评判器同时评估所有样本如果任何一个评估器认为样本是正确的则将该样本视为正确。这种方式可以提高评估的宽容度但可能会导致更高的成本因为所有样本都需要LLM评估。
### 配置CascadeEvaluator
以下是配置`CascadeEvaluator`的示例:
```python
# 定义规则式评估器
rule_evaluator = dict(type=MATHEvaluator)
# 定义LLM评判器
llm_judge_evaluator = dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="你是一个负责评估模型输出正确性和质量的助手。",
)
],
round=[
dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=YourDataset,
path='path/to/your/dataset',
reader_cfg=reader_cfg,
),
judge_cfg=dict(), # 可以使用环境变量配置评判模型
)
# 配置级联评估器(级联模式)
cascade_evaluator = dict(
type=CascadeEvaluator,
llm_evaluator=llm_judge_evaluator,
rule_evaluator=rule_evaluator,
parallel=False # 级联模式
)
# 如果需要并行模式可以设置parallel=True
parallel_evaluator = dict(
type=CascadeEvaluator,
llm_evaluator=llm_judge_evaluator,
rule_evaluator=rule_evaluator,
parallel=True # 并行模式
)
# 在数据集评估配置中使用级联评估器
eval_cfg = dict(evaluator=cascade_evaluator)
```
### 评估结果
级联评估器会输出详细的评估统计信息,包括:
- 规则评估的准确率
- LLM评估的准确率针对规则评估失败的样本
- 最终的综合准确率
输出示例:
```python
{
'accuracy': 85.0, # 最终准确率
'cascade_stats': {
'total_samples': 100,
'rule_correct': 70, # 规则评估认为正确的样本数
'rule_accuracy': 70.0, # 规则评估的准确率
'llm_evaluated': 30, # LLM评估的样本数级联模式下为规则评估失败的样本数
'llm_correct': 15, # LLM评估认为正确的样本数
'llm_accuracy': 50.0, # LLM评估的准确率
'final_correct': 85, # 最终正确的样本数
'final_accuracy': 85.0, # 最终准确率
'parallel_mode': False, # 是否是并行模式
},
'details': [
# 每个样本的详细评估结果
]
}
```
级联评估器特别适用于:
1. 需要平衡评估成本和准确性的场景
2. 有可用的规则式评估器但可能不够完善的情况
3. 需要对边界情况进行更精确判断的评估任务
## 完整示例
如果希望了解通用LLM评判器请参考examples目录中的`eval_llm_judge.py`文件该示例展示了如何使用LLM评判器评估数学问题。
如果希望了解级联评估器请参考examples目录中的`eval_cascade_evaluator.py`文件,该示例展示了如何使用级联评估器评估数学问题。

View File

@ -0,0 +1,190 @@
# 数学能力评测
## 简介
数学推理能力是大语言模型(LLMs)的一项关键能力。为了评估模型的数学能力我们需要测试其逐步解决数学问题并提供准确最终答案的能力。OpenCompass 通过 CustomDataset 和 MATHEvaluator 组件提供了一种便捷的数学推理评测方式。
## 数据集格式
数学评测数据集应该是 JSON Lines (.jsonl) 或 CSV 格式。每个问题至少应包含:
- 问题陈述
- 解答/答案(通常使用 LaTeX 格式,最终答案需要用 \\boxed{} 括起来)
JSONL 格式示例:
```json
{"problem": "求解方程 2x + 3 = 7", "solution": "让我们逐步解决:\n2x + 3 = 7\n2x = 7 - 3\n2x = 4\nx = 2\n因此\\boxed{2}"}
```
CSV 格式示例:
```csv
problem,solution
"求解方程 2x + 3 = 7","让我们逐步解决:\n2x + 3 = 7\n2x = 7 - 3\n2x = 4\nx = 2\n因此\\boxed{2}"
```
## 配置说明
要进行数学推理评测,你需要设置三个主要组件:
1. 数据集读取配置
```python
math_reader_cfg = dict(
input_columns=['problem'], # 问题列的名称
output_column='solution' # 答案列的名称
)
```
2. 推理配置
```python
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}\n请逐步推理并将最终答案放在 \\boxed{} 中。',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
```
3. 评测配置
```python
math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator),
)
```
## 使用 CustomDataset
以下是如何设置完整的数学评测配置:
```python
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.datasets import CustomDataset
math_datasets = [
dict(
type=CustomDataset,
abbr='my-math-dataset', # 数据集简称
path='path/to/your/dataset', # 数据集文件路径
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]
```
## MATHEvaluator
MATHEvaluator 是专门设计用于评估数学答案的评测器。它基于 math_verify 库进行开发,该库提供了数学表达式解析和验证功能,支持 LaTeX 和一般表达式的提取与等价性验证。
MATHEvaluator 具有以下功能:
1. 使用 LaTeX 提取器从预测和参考答案中提取答案
2. 处理各种 LaTeX 格式和环境
3. 验证预测答案和参考答案之间的数学等价性
4. 提供详细的评测结果,包括:
- 准确率分数
- 预测和参考答案的详细比较
- 预测和参考答案的解析结果
评测器支持:
- 基本算术运算
- 分数和小数
- 代数表达式
- 三角函数
- 根式和指数
- 数学符号和运算符
评测输出示例:
```python
{
'accuracy': 85.0, # 正确答案的百分比
'details': [
{
'predictions': 'x = 2', # 解析后的预测答案
'references': 'x = 2', # 解析后的参考答案
'correct': True # 是否匹配
},
# ... 更多结果
]
}
```
## 完整示例
以下是设置数学评测的完整示例:
```python
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.datasets import CustomDataset
from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
# 数据集读取配置
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
# 推理配置
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}\n请逐步推理并将最终答案放在 \\boxed{} 中。',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# 评测配置
math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator),
)
# 数据集配置
math_datasets = [
dict(
type=CustomDataset,
abbr='my-math-dataset',
path='path/to/your/dataset.jsonl', # 或 .csv
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]
# 模型配置
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='your-model-name',
path='your/model/path',
# ... 其他模型配置
)
]
# 输出目录
work_dir = './outputs/math_eval'
```

View File

@ -91,4 +91,16 @@
return dataset
```
3. 在完成数据集脚本和配置文件的构建后需要在OpenCompass主目录下的`dataset-index.yml`配置文件中登记新数据集的相关信息以使其加入OpenCompass官网Doc的数据集统计列表中。
- 需要填写的字段包括数据集名称`name`、数据集类型`category`、原文或项目地址`paper`、以及数据集配置文件的路径`configpath`。具体示例如下:
```
- mydataset:
name: MyDataset
category: Understanding
paper: https://arxiv.org/pdf/xxxxxxx
configpath: opencompass/configs/datasets/MyDataset
```
详细的数据集配置文件以及其他需要的配置文件可以参考[配置文件](../user_guides/config.md)教程,启动任务相关的教程可以参考[快速开始](../get_started/quick_start.md)教程。

View File

@ -0,0 +1,65 @@
# 评测结果持久化
## 介绍
通常情况下OpenCompass的评测结果将会保存到工作目录下。 但在某些情况下,可能会产生用户间的数据共享,以及快速查看已有的公共评测结果等需求。 因此,我们提供了一个能够将评测结果快速转存到外部公共数据站的接口,并且在此基础上提供了对数据站的上传、更新、读取等功能。
## 快速开始
### 向数据站存储数据
通过在CLI评测指令中添加`args`或在Eval脚本中添加配置即可将本次评测结果存储到您所指定的路径示例如下
方式1在指令中添加`args`选项并指定你的公共路径地址。
```bash
opencompass ... -sp '/your_path'
```
方式2在Eval脚本中添加配置。
```python
station_path = '/your_path'
```
### 向数据站更新数据
上述存储方法在上传数据前会首先根据模型和数据集配置中的`abbr`属性来判断数据站中是否已有相同任务结果。若已有结果,则取消本次存储。如果您需要更新这部分结果,请在指令中添加`station-overwrite`选项,示例如下:
```bash
opencompass ... -sp '/your_path' --station-overwrite
```
### 读取数据站中已有的结果
您可以直接从数据站中读取已有的结果,以避免重复进行评测任务。读取到的结果会直接参与到`summarize`步骤。采用该配置时,仅有数据站中未存储结果的任务会被启动。示例如下:
```bash
opencompass ... -sp '/your_path' --read-from-station
```
### 指令组合
1. 仅向数据站上传最新工作目录下结果,不补充运行缺失结果的任务:
```bash
opencompass ... -sp '/your_path' -r latest -m viz
```
## 数据站存储格式
在数据站中,评测结果按照每个`model-dataset`对的结果存储为`json`文件。具体的目录组织形式为`/your_path/dataset_name/model_name.json`。每个`json`文件都存储了对应结果的字典,包括`predictions`、`results`以及`cfg`三个子项,具体示例如下:
```python
Result = {
'predictions': List[Dict],
'results': Dict,
'cfg': Dict = {
'models': Dict,
'datasets': Dict,
(Only subjective datasets)'judge_models': Dict
}
}
```
其中,`predictions`记录了模型对数据集中每一条数据的prediction的结果`results`记录了模型在该数据集上的评分,`cfg`记录了该评测任务中模型和数据集的详细配置。

View File

@ -117,6 +117,10 @@ html_js_files = [
'js/custom.js'
]
html_context = {
'github_version': 'main',
}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
@ -224,6 +228,7 @@ notfound_template = '404.html'
def builder_inited_handler(app):
subprocess.run(['./cp_origin_docs.sh'])
subprocess.run(['./statis.py'])
def setup(app):

View File

@ -40,8 +40,7 @@ OpenCompass 上手路线
user_guides/evaluation.md
user_guides/experimentation.md
user_guides/metrics.md
user_guides/summarizer.md
user_guides/corebench.md
user_guides/deepseek_r1.md
.. _提示词:
.. toctree::
@ -62,17 +61,13 @@ OpenCompass 上手路线
advanced_guides/custom_dataset.md
advanced_guides/new_model.md
advanced_guides/evaluation_lmdeploy.md
advanced_guides/evaluation_lightllm.md
advanced_guides/accelerator_intro.md
advanced_guides/math_verify.md
advanced_guides/llm_judge.md
advanced_guides/code_eval.md
advanced_guides/code_eval_service.md
advanced_guides/prompt_attack.md
advanced_guides/longeval.md
advanced_guides/subjective_evaluation.md
advanced_guides/circular_eval.md
advanced_guides/contamination_eval.md
advanced_guides/compassbench_intro.md
advanced_guides/needleinahaystack_eval.md
advanced_guides/persistence.md
.. _工具:
.. toctree::
@ -81,6 +76,13 @@ OpenCompass 上手路线
tools.md
.. _数据集列表:
.. toctree::
:maxdepth: 1
:caption: 数据集列表
dataset_statistics.md
.. _其他说明:
.. toctree::
:maxdepth: 1

98
docs/zh_cn/statis.py Executable file
View File

@ -0,0 +1,98 @@
#! /usr/bin/env python
from pathlib import Path
import yaml
from tabulate import tabulate
OC_ROOT = Path(__file__).absolute().parents[2]
GITHUB_PREFIX = 'https://github.com/open-compass/opencompass/tree/main/'
DATASETZOO_TEMPLATE = """\
# 数据集统计
在本页面中我们列举了OpenCompass所支持的所有数据集
你可以使用排序和搜索功能找到需要的数据集
我们对每一个数据集都给出了推荐的运行配置部分数据集中还提供了基于LLM Judge的推荐配置
你可以基于推荐配置快速启动评测但请注意推荐配置可能随时间推移被更新
"""
with open('dataset_statistics.md', 'w') as f:
f.write(DATASETZOO_TEMPLATE)
load_path = str(OC_ROOT / 'dataset-index.yml')
with open(load_path, 'r') as f2:
data_list = yaml.load(f2, Loader=yaml.FullLoader)
HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
recommanded_dataset_list = [
'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
'mmlu_pro', 'musr', 'math500'
]
def table_format(data_list):
table_format_list = []
for i in data_list:
table_format_list_sub = []
for j in i:
if j in recommanded_dataset_list:
link_token = '[链接]('
else:
link_token = '[链接(TBD)]('
for index in HEADER:
if index == 'paper':
table_format_list_sub.append('[链接](' + i[j][index] + ')')
elif index == 'configpath_llmjudge':
if i[j][index] == '':
table_format_list_sub.append(i[j][index])
else:
table_format_list_sub.append(link_token +
GITHUB_PREFIX +
i[j][index] + ')')
elif index == 'configpath':
if isinstance(i[j][index], list):
sub_list_text = ''
for k in i[j][index]:
sub_list_text += (link_token + GITHUB_PREFIX + k +
') / ')
table_format_list_sub.append(sub_list_text[:-2])
else:
table_format_list_sub.append(link_token +
GITHUB_PREFIX +
i[j][index] + ')')
else:
table_format_list_sub.append(i[j][index])
table_format_list.append(table_format_list_sub)
return table_format_list
data_format_list = table_format(data_list)
def generate_table(data_list, title=None):
with open('dataset_statistics.md', 'a') as f:
if title is not None:
f.write(f'\n{title}')
f.write("""\n```{table}\n:class: dataset\n""")
header = ['数据集名称', '数据集类型', '原文或资源地址', '推荐配置', '推荐配置(基于LLM评估)']
table_cfg = dict(tablefmt='pipe',
floatfmt='.2f',
numalign='right',
stralign='center')
f.write(tabulate(data_list, header, **table_cfg))
f.write('\n```\n')
generate_table(
data_list=data_format_list,
title='## 支持数据集列表',
)

View File

@ -81,3 +81,42 @@ datasets += cmnli_datasets
用户可以根据需要,选择不同能力不同数据集以及不同评测方式的配置文件来构建评测脚本中数据集的部分。
有关如何启动评测任务,以及如何评测自建数据集可以参考相关文档。
### 数据集多次评测
在数据集配置中可以通过设置参数`n`来对同一数据集进行多次评测,最终返回平均指标,例如:
```python
afqmc_datasets = [
dict(
abbr="afqmc-dev",
type=AFQMCDatasetV2,
path="./data/CLUE/AFQMC/dev.json",
n=10, # 进行10次评测
reader_cfg=afqmc_reader_cfg,
infer_cfg=afqmc_infer_cfg,
eval_cfg=afqmc_eval_cfg,
),
]
```
另外对于二值评测指标例如accuracypass-rate等还可以通过设置参数`k`配合`n`进行[G-Pass@k](http://arxiv.org/abs/2412.13147)评测。G-Pass@k计算公式为
```{math}
\text{G-Pass@}k_\tau=E_{\text{Data}}\left[ \sum_{j=\lceil \tau \cdot k \rceil}^c \frac{{c \choose j} \cdot {n - c \choose k - j}}{{n \choose k}} \right],
```
其中 $n$ 为评测次数, $c$ 为 $n$ 次运行中通过或正确的次数。配置例子如下:
```python
aime2024_datasets = [
dict(
abbr='aime2024',
type=Aime2024Dataset,
path='opencompass/aime2024',
k=[2, 4], # 返回 G-Pass@2和G-Pass@4的结果
n=12, # 12次评测
...
)
]
```

View File

@ -0,0 +1,192 @@
# 强推理模型评测教程
OpenCompass提供针对DeepSeek R1系列推理模型的评测教程数学数据集
- 在模型层面我们建议使用Sampling方式以减少因为Greedy评测带来的大量重复
- 在数据集层面,我们对数据量较小的评测基准,使用多次评测并取平均的方式。
- 在答案验证层面为了减少基于规则评测带来的误判我们统一使用基于LLM验证的方式进行评测。
## 安装和准备
请按OpenCompass安装教程进行安装。
## 构建评测配置
我们在 `examples/eval_deepseek_r1.py` 中提供了示例配置,以下对评测配置进行解读
### 评测配置解读
#### 1. 数据集与验证器配置
```python
# 支持多运行次数的数据集配置(示例)
from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets
datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')),
[],
)
# 设置LLM验证器 用户需事先通过LMDeploy/vLLM/SGLang等工具启动API 评测服务器或者直接使用兼容OpenAI标准接口的模型服务
verifier_cfg = dict(
abbr='qwen2-5-32B-Instruct',
type=OpenAISDK,
path='Qwen/Qwen2.5-32B-Instruct', # 需替换实际路径
key='YOUR_API_KEY', # 需替换真实API Key
openai_api_base=['http://your-api-endpoint'], # 需替换API地址
query_per_second=16,
batch_size=1024,
temperature=0.001,
max_out_len=16384
)
# 应用验证器到所有数据集
for item in datasets:
if 'judge_cfg' in item['eval_cfg']['evaluator']:
item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg
```
#### 2. 模型配置
我们提供了基于LMDeploy作为推理后端的评测示例用户可以通过修改path即HF路径
```python
# LMDeploy模型配置示例
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-7b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
gen_config=dict(
do_sample=True,
temperature=0.6,
top_p=0.95,
max_new_tokens=32768
),
max_seq_len=32768,
batch_size=64,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content)
),
# 可扩展14B/32B配置...
]
```
#### 3. 评估流程配置
```python
# 推理配置
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=1),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
# 评估配置
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=8),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLEvalTask)),
)
```
#### 4. 结果汇总配置
```python
# 多运行结果平均配置
summary_groups = [
{
'name': 'AIME2024-Average8',
'subsets':[[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)]
},
# 其他数据集平均配置...
]
summarizer = dict(
dataset_abbrs=[
['AIME2024-Average8', 'naive_average'],
# 其他数据集指标...
],
summary_groups=summary_groups
)
# 工作目录设置
work_dir = "outputs/deepseek_r1_reasoning"
```
## 执行评测
### 场景1模型1卡加载数据1个worker评测共使用1个GPU
```bash
opencompass examples/eval_deepseek_r1.py --debug --dump-eval-details
```
评测日志会在命令行输出。
### 场景2模型1卡加载数据8个worker评测共使用8个GPU
需要修改配置文件中的infer配置将num_worker设置为8
```python
# 推理配置
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
```
同时评测命令去掉`--debug`参数
```bash
opencompass examples/eval_deepseek_r1.py --dump-eval-details
```
此模式下OpenCompass将使用多线程启动`$num_worker`个任务,命令行不展示具体日志,具体的评测日志将会在`$work_dir`下中展示。
### 场景3模型2卡加载数据4个worker评测共使用8个GPU
需要注意模型配置中,`run_cfg`中的`num_gpus`需要设置为2(如使用推理后端则推理后端的参数也需要同步修改比如LMDeploy中的tp需要设置为2),同时修改`infer`配置中的`num_worker`为4
```python
models += [
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-14b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
gen_config=dict(
do_sample=True,
temperature=0.6,
top_p=0.95,
max_new_tokens=32768),
max_seq_len=32768,
max_out_len=32768,
batch_size=128,
run_cfg=dict(num_gpus=2),
pred_postprocessor=dict(type=extract_non_reasoning_content)
),
]
```
```python
# 推理配置
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=4),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
```
### 评测结果
评测结果展示如下:
```bash
dataset            version    metric         mode    deepseek-r1-distill-qwen-7b-turbomind
-----------------  ---------  -------------  ------  ---------------------------------------
MATH               -          -              -
AIME2024-Average8  -          naive_average  gen     56.25
```
## 性能基线参考
由于模型使用Sampling进行解码同时AIME数据量较小使用8次评测取平均情况下仍会出现1-3分的性能抖动
| 模型 | 数据集 | 指标 | 数值 |
| ---------------------------- | -------- | -------- | ---- |
| DeepSeek-R1-Distill-Qwen-7B | AIME2024 | Accuracy | 56.3 |
| DeepSeek-R1-Distill-Qwen-14B | AIME2024 | Accuracy | 74.2 |
| DeepSeek-R1-Distill-Qwen-32B | AIME2024 | Accuracy | 74.2 |

View File

@ -57,7 +57,7 @@ python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--deb
- `-w`: 指定工作路径,默认为 `./outputs/default`
- `-l`: 打开飞书机器人状态上报。
- `--dry-run`: 开启时,推理和评测任务仅会分发但不会真正运行,便于调试;
- `--dump-eval-details`: 开启`results` 下的评测结果中将会包含更加详细的评测结果信息,例如每条样本是否正确等。
- `--dump-eval-details`: 默认开启,`results` 下的评测结果中将会包含更加详细的评测结果信息,例如每条样本是否正确等。如不需要开启,需设置`--dump-eval-details False`。
以运行模式 `-m all` 为例,整体运行流如下:

View File

@ -0,0 +1,137 @@
# flake8: noqa
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner, VOLCRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Datasets Part
# Knowledge
# Math
from opencompass.configs.datasets.aime2024.aime2024_0shot_nocot_genericllmeval_academic_gen import \
aime2024_datasets
from opencompass.configs.datasets.bbh.bbh_0shot_nocot_academic_gen import \
bbh_datasets
# General Reasoning
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
gpqa_datasets
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import \
humaneval_datasets
# Instruction Following
from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \
ifeval_datasets
from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import \
LCBCodeGeneration_dataset
from opencompass.configs.datasets.math.math_prm800k_500_0shot_cot_gen import \
math_datasets
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
mmlu_pro_datasets
# Model List
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model
# Summary Groups
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
# Only take LCB generation for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')),
[]) + [LCBCodeGeneration_dataset]
# LLM judge config: using LLM to evaluate predictions
judge_cfg = dict()
for dataset in datasets:
dataset['infer_cfg']['inferencer']['max_out_len'] = 32768
if 'judge_cfg' in dataset['eval_cfg']['evaluator']:
dataset['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
core_summary_groups = [
{
'name':
'core_average',
'subsets': [
['IFEval', 'Prompt-level-strict-accuracy'],
['bbh', 'naive_average'],
['math_prm800k_500', 'accuracy'],
['aime2024', 'accuracy'],
['GPQA_diamond', 'accuracy'],
['mmlu_pro', 'naive_average'],
['openai_humaneval', 'humaneval_pass@1'],
['lcb_code_generation', 'pass@1'],
],
},
]
summarizer = dict(
dataset_abbrs=[
['core_average', 'naive_average'],
'',
'Instruction Following',
['IFEval', 'Prompt-level-strict-accuracy'],
'',
'General Reasoning',
['bbh', 'naive_average'],
['GPQA_diamond', 'accuracy'],
'',
'Math Calculation',
['math_prm800k_500', 'accuracy'],
['aime2024', 'accuracy'],
'',
'Knowledge',
['mmlu_pro', 'naive_average'],
'',
'Code',
['openai_humaneval', 'humaneval_pass@1'],
['lcb_code_generation', 'pass@1'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
#######################################################################
# PART 3 Models List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask),
),
)
# eval with local runner
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLEvalTask)),
)
#######################################################################
#                     PART 5  Utils Configuration                     #
#######################################################################
work_dir = './outputs/oc_academic_202502'

View File

@ -0,0 +1,127 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator, CascadeEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.openicl.icl_evaluator import MATHEvaluator
from opencompass.datasets import (
MATHDataset,
math_postprocess_v2,
normalize_final_answer,
)
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
    # Model to be evaluated
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct_model,
)
reader_cfg = dict(input_columns=['problem'], output_column='solution')
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
########################## Evaluator #################################
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
llm_judge_evaluator = dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=MATHDataset,
path='opencompass/math',
file_name='test_prm800k_500.json',
),
judge_cfg=dict(),
)
rule_evaluator = dict(type=MATHEvaluator)
cascade_evaluator = dict(
    type=CascadeEvaluator,
    llm_evaluator=llm_judge_evaluator,
    rule_evaluator=rule_evaluator,
    parallel=False,
)
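# Cascade behaviour (an assumption about CascadeEvaluator, not verified here):
# with parallel=False the rule-based MATHEvaluator runs first and only samples
# it marks incorrect are re-judged by the LLM evaluator; with parallel=True
# both evaluators are expected to score every sample.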
##########################  Dataset  #################################
eval_cfg = dict()
# eval_cfg['evaluator'] = rule_evaluator
# eval_cfg['evaluator'] = llm_judge_evaluator
eval_cfg['evaluator'] = cascade_evaluator
math_datasets = [
dict(
abbr='math_prm800k_500',
type=MATHDataset,
path='opencompass/math',
file_name='test_prm800k_500.json',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
]
datasets = math_datasets
models = lmdeploy_qwen2_5_7b_instruct_model
work_dir = 'math_prm800k_500_cascade_evaluator'

View File

@ -0,0 +1,212 @@
# Support AIME-2024 with Repeat8
# Support MATH-500
# Support OlympiadBench
# Support OmniMath
# Support LiveMathBench-202412-Hard
import os.path as osp
from itertools import product
from opencompass.models import OpenAISDK
from mmengine.config import read_base
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
from opencompass.runners import LocalRunner
from opencompass.models import (
TurboMindModelwithChatTemplate,
)
#######################################################################
# PART 1 Datasets List #
#######################################################################
with read_base():
# You can comment out the datasets you don't want to evaluate
# Datasets
# from opencompass.configs.datasets.math.math_prm800k_500_llmverify_gen_6ff468 import math_datasets # 1 Run
from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets # 8 Run
# from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import olympiadbench_datasets
# from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_ccf9c0 import omnimath_datasets # 1 Run
# from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_llmverify_gen_85d0ef import livemathbench_datasets
# Summarizer
from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups
datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')),
[],
)
# Set LLM Verifier used for each dataset
verifier_cfg = dict(
abbr='qwen2-5-32B-Instruct',
type=OpenAISDK,
path='Qwen/Qwen2.5-32B-Instruct', # You need to set your own judge model path
key='sk-1234', # You need to set your own API key
openai_api_base=[
'http://172.30.56.1:4000/v1', # You need to set your own API base
],
meta_template=dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
),
query_per_second=16,
batch_size=1024,
temperature=0.001,
tokenizer_path='gpt-4o-2024-05-13',
verbose=True,
max_out_len=16384,
# max_seq_len=32768,
max_seq_len=49152,
)
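# Note: any OpenAI-compatible endpoint can serve as the verifier. max_seq_len
# has to cover the grading prompt plus the (often very long) reasoning output
# being judged, which is presumably why 49152 is used here rather than 32768.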
for item in datasets:
    # item['infer_cfg']['inferencer']['max_out_len'] = 32768  # Uncomment to raise the output limit and avoid length cutoff
if 'judge_cfg' in item['eval_cfg']['evaluator']:
item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg
#######################################################################
# PART 2 Model List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
models += [
# You can comment out the models you don't want to evaluate
# All models use sampling mode
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-7b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
gen_config=dict(
do_sample=True,
temperature=0.6,
top_p=0.95,
max_new_tokens=32768),
max_seq_len=32768,
max_out_len=32768,
batch_size=64,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content)
),
# dict(
# type=TurboMindModelwithChatTemplate,
# abbr='deepseek-r1-distill-qwen-14b-turbomind',
# path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
# engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
# gen_config=dict(
# do_sample=True,
# temperature=0.6,
# top_p=0.95,
# max_new_tokens=32768),
# max_seq_len=32768,
# max_out_len=32768,
# batch_size=128,
# run_cfg=dict(num_gpus=2),
# pred_postprocessor=dict(type=extract_non_reasoning_content)
# ),
# dict(
# type=TurboMindModelwithChatTemplate,
# abbr='deepseek-r1-distill-qwen-32b-turbomind',
# path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
# engine_config=dict(session_len=32768, max_batch_size=128, tp=4),
# gen_config=dict(
# do_sample=True,
# temperature=0.6,
# top_p=0.95,
# max_new_tokens=16384),
# max_seq_len=32768,
# max_out_len=16384,
# batch_size=128,
# run_cfg=dict(num_gpus=4),
# pred_postprocessor=dict(type=extract_non_reasoning_content)
# ),
]
#######################################################################
# PART 3 Inference/Evaluation #
#######################################################################
# Inference configuration
infer = dict(
partitioner=dict(
type=NumWorkerPartitioner,
        num_worker=1
        # Similar to data parallelism: how many workers run inference in parallel,
        # each handling a part of the dataset. Total GPUs = num_worker * num_gpus_per_worker
        # For example, with 8 GPUs and a 7B model using 1 GPU per instance, set num_worker=8
        # to fully utilize the GPUs.
        # With 8 GPUs and a 14B model using 2 GPUs per instance, set num_worker=4
),
runner=dict(
type=LocalRunner,
task=dict(type=OpenICLInferTask)
),
)
# Evaluation configuration
eval = dict(
partitioner=dict(
type=NaivePartitioner, n=8
),
runner=dict(
type=LocalRunner,
task=dict(
type=OpenICLEvalTask)
),
)
#######################################################################
# PART 4 Summarizer #
#######################################################################
summary_groups = sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []
)
summary_groups.extend([
    {
        'name': 'AIME2024-Average8',
        'subsets': [[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)]
    },
    {
        'name': 'LiveMathBench-v202412-Hard-Average8',
        'subsets': [[
            f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']
            for split, run_idx in product(['hard_cn', 'hard_en'], range(8))
        ]
    }
])
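# The 'aime2024-run{idx}' abbrs come from the repeat-8 AIME2024 config imported
# above; averaging accuracy over the 8 sampled runs gives a more stable score
# than a single run.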
# Summarizer
summarizer = dict(
dataset_abbrs=[
'MATH',
# ['LiveMathBench-k1-n1', 'pass@1'],
# ['LiveMathBench-v202412-greedy', 'G-Pass@1_0.0'],
# ['aime2024', 'accuracy'],
['math_prm800k_500-llmjudge', 'accuracy'],
        ['AIME2024-Average8', 'naive_average'],
        ['LiveMathBench-v202412-Hard-Average8', 'naive_average'],
['OlympiadBenchMath', 'accuracy'],
['OmniMath', 'accuracy'],
],
summary_groups=summary_groups,
)
#######################################################################
# PART 5 Utils #
#######################################################################
work_dir = 'outputs/deepseek_r1_reasoning'

View File

@ -1,7 +1,7 @@
from mmengine.config import read_base
with read_base():
from .datasets.dingo.dingo_gen import datasets
from .models.hf_internlm.hf_internlm_7b import models
from opencompass.configs.datasets.dingo.dingo_gen import datasets
from opencompass.configs.models.hf_internlm.hf_internlm_7b import models
work_dir = './outputs/eval_dingo'

examples/eval_llm_judge.py
View File

@ -0,0 +1,116 @@
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAISDK
# Import pre-configured models from OpenCompass
with read_base():
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct_model,
)
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
models as lmdeploy_qwen2_5_14b_instruct_model,
)
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import CustomDataset
# Dataset reader configuration
math_reader_cfg = dict(input_columns=['problem'], output_column='answer')
# Inference configuration
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}\nRemember to put your final answer within \\boxed{}.',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Template for the LLM judge
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Evaluation configuration using LLM as judge
math_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=CustomDataset,
path='opencompass/math',
file_name='test_prm800k_500.jsonl',
reader_cfg=math_reader_cfg,
),
judge_cfg=lmdeploy_qwen2_5_14b_instruct_model[0],
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
# Dataset configuration
datasets = [
dict(
type=CustomDataset,
path='opencompass/math',
file_name='test_prm800k_500.jsonl',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]
# Model to be evaluated
models = lmdeploy_qwen2_5_7b_instruct_model
# Limiting test to first 8 examples for quick testing
math_reader_cfg['test_range'] = '[0:8]'
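# Remove this line (or widen the slice, e.g. '[0:500]') to evaluate the full
# 500-problem test_prm800k_500 split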
# Output directory
work_dir = 'outputs/llm_judge'

View File

@ -0,0 +1,77 @@
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
with read_base():
from opencompass.configs.datasets.math.math_500_gen import math_datasets
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-llama-8b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
gen_config=dict(
top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
),
max_seq_len=32768,
max_out_len=32768,
batch_size=32,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content),
),
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-7b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
gen_config=dict(
temperature=0.6,
top_p=0.95,
max_new_tokens=32768,
do_sample=True,
),
max_seq_len=32768,
max_out_len=32768,
batch_size=32,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content),
),
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-1_5b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
engine_config=dict(session_len=32768, max_batch_size=16, tp=1),
gen_config=dict(
top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
),
max_seq_len=32768,
max_out_len=32768,
batch_size=32,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content),
),
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-14b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
engine_config=dict(session_len=32768, max_batch_size=16, tp=2),
gen_config=dict(
top_k=1,
temperature=0.6,
top_p=0.95,
max_new_tokens=32768,
do_sample=True,
),
max_seq_len=32768,
max_out_len=32768,
batch_size=16,
run_cfg=dict(num_gpus=2),
pred_postprocessor=dict(type=extract_non_reasoning_content),
),
]
datasets = [*math_datasets]
work_dir = './outputs/math_500'

View File

@ -36,8 +36,8 @@ infer = dict(
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=[gpt_4o_2024_05_13_model],
judge_models=[gpt_4o_2024_05_13_model],
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=256,

View File

@ -1 +1 @@
__version__ = '0.4.0'
__version__ = '0.4.2'

View File

@ -12,7 +12,8 @@ from mmengine.config import Config, DictAction
from opencompass.registry import PARTITIONERS, RUNNERS, build_from_cfg
from opencompass.runners import SlurmRunner
from opencompass.summarizers import DefaultSummarizer
from opencompass.utils import LarkReporter, get_logger
from opencompass.utils import (LarkReporter, get_logger, read_from_station,
save_to_station)
from opencompass.utils.run import (fill_eval_cfg, fill_infer_cfg,
get_config_from_arg)
@ -118,8 +119,11 @@ def parse_args():
parser.add_argument(
'--dump-eval-details',
help='Whether to dump the evaluation details, including the '
'correctness of each sample, bpb, etc.',
action='store_true',
'correctness of each sample, bpb, etc. Defaults to True.',
nargs='?',
const=True,
default=True,
type=lambda x: False if x and x.lower() == 'false' else True
)
parser.add_argument(
'--dump-extract-rate',
@ -127,6 +131,27 @@ def parse_args():
'correctness of each sample, bpb, etc.',
action='store_true',
)
parser.add_argument('-sp',
'--station-path',
help='Path to your results station.',
type=str,
default=None,
)
parser.add_argument('--station-overwrite',
help='Whether to overwrite the results at station.',
action='store_true',
)
parser.add_argument(
'--read-from-station',
        help='Whether to read existing evaluation results from the '
        'data station.',
action='store_true',
)
# set srun args
slurm_parser = parser.add_argument_group('slurm_args')
parse_slurm_args(slurm_parser)
@ -177,8 +202,6 @@ def parse_dlc_args(dlc_parser):
type=str)
def parse_hf_args(hf_parser):
"""These args are all for the quick construction of HuggingFace models."""
hf_parser.add_argument('--hf-type', type=str, choices=['base', 'chat'], default='chat', help='The type of the HuggingFace model, base or chat')
@ -213,7 +236,6 @@ def parse_custom_dataset_args(custom_dataset_parser):
def main():
args = parse_args()
if args.num_gpus is not None:
raise ValueError('The `--num-gpus` argument is deprecated, please use '
'`--hf-num-gpus` to describe number of gpus used for '
@ -243,9 +265,11 @@ def main():
else:
dir_time_str = args.reuse
logger.info(f'Reusing experiements from {dir_time_str}')
elif args.mode in ['eval', 'viz']:
raise ValueError('You must specify -r or --reuse when running in eval '
'or viz mode!')
elif args.mode in ['eval', 'viz'] and not args.read_from_station:
raise ValueError(
'You must specify -r or --reuse, or you have to specify '
'--read-from-station and --station-path when running in eval '
'or viz mode!')
# update "actual" work_dir
cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str)
@ -262,6 +286,12 @@ def main():
# types cannot be serialized
cfg = Config.fromfile(output_config_path, format_python_code=False)
# get existed results from station
if args.read_from_station:
existing_results_list = read_from_station(cfg, args)
rs_exist_results = [comb['combination'] for comb in existing_results_list]
cfg['rs_exist_results'] = rs_exist_results
# report to lark bot if specify --lark
if not args.lark:
cfg['lark_bot_url'] = None
@ -269,6 +299,7 @@ def main():
content = f'{getpass.getuser()}\'s task has been launched!'
LarkReporter(cfg['lark_bot_url']).post(content)
# infer
if args.mode in ['all', 'infer']:
# When user have specified --slurm or --dlc, or have not set
# "infer" in config, we will provide a default configuration
@ -321,6 +352,9 @@ def main():
if args.dlc or args.slurm or cfg.get('eval', None) is None:
fill_eval_cfg(cfg, args)
if args.dump_eval_details:
        logger.warning('Default to dump eval details, it might take extra '
                       'space to save all the evaluation details. '
                       'Set --dump-eval-details False to skip the details dump')
cfg.eval.runner.task.dump_details = True
if args.dump_extract_rate:
cfg.eval.runner.task.cal_extract_rate = True
@ -350,6 +384,10 @@ def main():
else:
runner(tasks)
# save to station
if args.station_path is not None or cfg.get('station_path') is not None:
save_to_station(cfg, args)
# visualize
if args.mode in ['all', 'eval', 'viz']:
summarizer_cfg = cfg.get('summarizer', {})

View File

@ -0,0 +1,56 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.arc_prize_public_evaluation import ARCPrizeDataset, ARCPrizeEvaluator
# The system_prompt defines the initial instructions for the model,
# setting the context for solving ARC tasks.
system_prompt = '''You are a puzzle solving wizard. You are given a puzzle from the abstraction and reasoning corpus developed by Francois Chollet.'''
# User message template is a template for creating user prompts. It includes placeholders for training data and test input data,
# guiding the model to learn the rule and apply it to solve the given puzzle.
user_message_template = '''Here are the example input and output pairs from which you should learn the underlying rule to later predict the output for the given test input:
----------------------------------------
{training_data}
----------------------------------------
Now, solve the following puzzle based on its input grid by applying the rules you have learned from the training data:
----------------------------------------
[{{'input': {input_test_data}, 'output': [[]]}}]
----------------------------------------
What is the output grid? Only provide the output grid in the form as in the example input and output pairs. Do not provide any additional information:'''
arc_prize_public_evaluation_reader_cfg = dict(
input_columns=['training_data', 'input_test_data'],
output_column='output_test_data'
)
arc_prize_public_evaluation_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='SYSTEM',fallback_role='HUMAN', prompt=system_prompt),
dict(role='HUMAN', prompt=user_message_template),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
arc_prize_public_evaluation_eval_cfg = dict(
evaluator=dict(type=ARCPrizeEvaluator)
)
arc_prize_public_evaluation_datasets = [
dict(
abbr='ARC_Prize_Public_Evaluation',
type=ARCPrizeDataset,
path='opencompass/arc_prize_public_evaluation',
reader_cfg=arc_prize_public_evaluation_reader_cfg,
infer_cfg=arc_prize_public_evaluation_infer_cfg,
eval_cfg=arc_prize_public_evaluation_eval_cfg
)
]

View File

@ -0,0 +1,45 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GaokaoBenchDataset
from mmengine.config import read_base
with read_base():
from .GaokaoBench_prompts import MCQ_prompts, FBQ_prompts
GaokaoBench_datasets = []
for folder, prompts in [
('Multiple-choice_Questions', MCQ_prompts),
('Fill-in-the-blank_Questions', FBQ_prompts),
]:
for p in prompts:
reader_cfg = {
'input_columns': ['question'],
'output_column': 'answer',
}
infer_cfg = {
'ice_template': {
'type': PromptTemplate,
'template': {'round': [{'role': 'HUMAN', 'prompt': p['prefix_prompt'] + '{question}'}]},
'ice_token': '</E>',
},
'retriever': {'type': ZeroRetriever},
'inferencer': {'type': GenInferencer},
}
eval_cfg = {
'evaluator': {'type': 'GaokaoBenchEvaluator' + '_' + p['type']},
'pred_role': 'BOT',
}
_base_path = 'opencompass/GAOKAO-BENCH'
dataset = {
'type': GaokaoBenchDataset,
'abbr': 'GaokaoBench_' + p['keyword'],
'path': _base_path,
'filename': '/' + folder + '/' + p['keyword'] + '.json',
'name': p['keyword'],
'reader_cfg': reader_cfg,
'infer_cfg': infer_cfg,
'eval_cfg': eval_cfg,
}
GaokaoBench_datasets.append(dataset)

View File

@ -0,0 +1,5 @@
from mmengine.config import read_base
with read_base():
# Default use LLM as a judge
from .hle_llmverify_gen_6ff468 import hle_datasets # noqa: F401, F403

View File

@ -0,0 +1,91 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import HLEDataset
# ----------------------------- Detailed Config -----------------------------
math_reader_cfg = dict(input_columns=['problem'], output_column='answer')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Evaluation configuration
math_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=HLEDataset,
path='cais/hle',
reader_cfg=math_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
hle_datasets = [
dict(
type=HLEDataset,
abbr='hle_llmjudge',
path='cais/hle',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .IFEval_gen_3321a3 import ifeval_datasets # noqa: F401, F403
from .IFEval_gen_353ae7 import ifeval_datasets # noqa: F401, F403

View File

@ -0,0 +1,81 @@
from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, math_postprocess_v2
from opencompass.utils.text_postprocessors import first_option_postprocess
with read_base():
from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
# Max for this dataset is 4
num_shot = 0
# Generate reasoning path or not, only for single choice
with_reasoning = True
# Use circular evaluation or not
with_circular_eval = True
# Use PPL mode in single choice test or not
use_ppl_single_choice = False
assert 0 <= num_shot <= 4
if num_shot == 0:
prompts = zero_shot_prompts
else:
prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
mathbench_datasets = []
for _split in mathbench_sets:
for _name in mathbench_sets[_split]:
if 'single_choice' in _name:
if with_reasoning:
template_round = prompts[_name + '_with_reasoning']
else:
template_round = prompts[_name]
else:
template_round = prompts[_name]
if 'single_choice' in _name:
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
else:
pred_postprocessor = dict(type=math_postprocess_v2)
if 'single_choice' in _name and with_circular_eval:
evaluator = dict(type=CircularEvaluator)
else:
evaluator = dict(type=AccEvaluator)
# assemble the final config
mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
template = {}
for answer in ['A', 'B', 'C', 'D']:
one_template_round = deepcopy(template_round)
one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer)
template[answer] = dict(round=one_template_round)
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=template),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
else:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
mathbench_datasets.append(
dict(
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f'data/mathbench_v1/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=mathbench_reader_cfg,
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
)
)

View File

@ -0,0 +1,57 @@
from opencompass.datasets import MedXpertQADataset, MedXpertQAEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this?
ZERO_SHOT_PROMPT = 'Q: {question}\nA: Among {start} through {end}, the answer is'
# Reader configuration
reader_cfg = dict(
input_columns=[
'question',
'options',
'medical_task',
'body_system',
'question_type',
'prompt_mode',
],
output_column='label',
)
# Inference configuration
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
],
round=[
dict(
role='HUMAN',
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(type=MedXpertQAEvaluator),
pred_role='BOT',
)
medxpertqa_dataset = dict(
type=MedXpertQADataset,
abbr='medxpertqa',
path='TsinghuaC3I/MedXpertQA',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
medxpertqa_datasets = [medxpertqa_dataset]

View File

@ -0,0 +1,104 @@
from opencompass.datasets import MedXpertQADataset, MedXpertQA_llmjudge_postprocess
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.evaluator import GenericLLMEvaluator
SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this?
ZERO_SHOT_PROMPT = 'Q: {question}\nA: Among {start} through {end}, the answer is'
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: Q: {question}\nA: Among {start} through {end}, the answer is\n<Original Question End>\n\n
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Reader configuration
reader_cfg = dict(
input_columns=[
'question',
'options',
'medical_task',
'body_system',
'question_type',
'prompt_mode',
],
output_column='label',
)
# Inference configuration
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
],
round=[
dict(
role='HUMAN',
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=MedXpertQADataset,
path='TsinghuaC3I/MedXpertQA',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=MedXpertQA_llmjudge_postprocess),
),
)
medxpertqa_dataset = dict(
type=MedXpertQADataset,
abbr='medxpertqa',
path='TsinghuaC3I/MedXpertQA',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
medxpertqa_datasets = [medxpertqa_dataset]

View File

@ -0,0 +1,60 @@
# OlymMATH
[GitHub Link](https://github.com/RUCAIBox/OlymMATH)
For details on the OlymMATH dataset, please refer to the paper:
Challenging the Boundaries of Reasoning: An Olympiad-Level Math Benchmark for Large Language Models, by Haoxiang Sun, Yingqian Min, Zhipeng Chen, Wayne Xin Zhao, Zheng Liu, Zhongyuan Wang, Lei Fang, and Ji-Rong Wen.
## How to evaluate OlymMATH with an LLM judge
This is a simple example:
```python
from opencompass.models import OpenAISDK, OpenAI
from mmengine.config import read_base
with read_base():
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as qwen2_5_7b_instruct_model
from opencompass.configs.datasets.OlymMATH.olymmath_gen import olymmath_datasets
################## Judge Config ##################
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
], )
judge_cfg = dict(
# An API model with OpenAI API format is required for Judge
abbr='qwen2-5-32B-Instruct',
type=OpenAISDK,
path='Qwen/Qwen2.5-32B-Instruct',
key='sk-1234',
openai_api_base=[
'http://172.30.56.1:4000/v1',
],
meta_template=api_meta_template,
query_per_second=16,
batch_size=1024,
temperature=0.001,
max_completion_tokens=32768,
tokenizer_path='gpt-4o-2024-05-13',
verbose=True,
max_out_len=16384,
max_seq_len=32768,
)
################## Model Config ##################
models = [*qwen2_5_7b_instruct_model]
################## Dataset Config ##################
datasets = [*olymmath_datasets]
# Set judge_cfg for evaluation
for item in datasets:
item['infer_cfg']['inferencer']['max_out_len'] = 32768
if 'judge_cfg' in item['eval_cfg']['evaluator']:
item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
work_dir = './outputs/olymmath_llm_eval'
```
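The imported `olymmath_datasets` typically contains one dataset per subset (`en-hard`, `zh-hard`, `en-easy`, `zh-easy`). If you only want the hard splits, a minimal sketch is to filter by dataset `abbr` before launching; the exact abbr suffixes are an assumption based on the subset configs in this repo:
```python
# Keep only the hard subsets; abbr names are assumed to end with the subset id,
# e.g. 'olymmath_llmjudge_en-hard'.
datasets = [d for d in datasets if d['abbr'].endswith(('en-hard', 'zh-hard'))]
```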

View File

@ -0,0 +1,5 @@
from mmengine.config import read_base
with read_base():
# Default use LLM as a judge
from .olymmath_llmverify_gen_97b203 import olymmath_datasets # noqa: F401, F403

View File

@ -0,0 +1,99 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import OlymMATHDataset
# ----------------------------- Detailed Config -----------------------------
math_reader_cfg = dict(input_columns=['problem'], output_column='answer', train_split='test')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
sub_sets = ['en-hard', 'zh-hard', 'en-easy', 'zh-easy']
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Evaluation configuration
olymmath_datasets = []
for sub_set in sub_sets:
math_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=OlymMATHDataset,
path='RUC-AIBOX/OlymMATH',
reader_cfg=math_reader_cfg,
subset=sub_set,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
olymmath_datasets.append(
dict(
type=OlymMATHDataset,
abbr=f'olymmath_llmjudge_{sub_set}',
path='RUC-AIBOX/OlymMATH',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
subset=sub_set,
)
)

View File

@ -0,0 +1,105 @@
from mmengine.config import read_base
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import OlympiadBenchDataset, OlympiadBenchEvaluator, olympiadbench_postprocess_v2
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
with read_base():
from .OlympiadBench_categories import math_categories as categories
# Create prompter instance for problems
olympiadbench_prompter_cfg = dict(
type='OlympiadBenchPrompter'
)
olympiadbench_reader_cfg = dict(
input_columns=[
'problem', 'language', 'subject', 'question_type',
'answer_type', 'is_multiple_answer', 'unit', 'questions'
],
output_column='solution'
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
olympiadbenchMath_datasets = []
for _name in categories:
olympiadbench_infer_cfg = dict(
prompt_template=dict(
type='OlympiadBenchTemplate'
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
olympiadbench_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=OlympiadBenchDataset,
path='opencompass/OlympiadBench',
name=_name,
reader_cfg=olympiadbench_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
olympiadbenchMath_datasets.append(
dict(
type=OlympiadBenchDataset,
abbr=f'OlympiadBench_{_name}',
path='opencompass/OlympiadBench',
name=_name,
reader_cfg=olympiadbench_reader_cfg,
infer_cfg=olympiadbench_infer_cfg,
eval_cfg=olympiadbench_eval_cfg,
)
)
del _name

View File

@ -0,0 +1,109 @@
from mmengine.config import read_base
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import OlympiadBenchDataset, OlympiadBenchEvaluator, olympiadbench_postprocess_v2
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
with read_base():
from .OlympiadBench_categories import categories
# Create prompter instance for problems
olympiadbench_prompter_cfg = dict(
type='OlympiadBenchPrompter'
)
olympiadbench_reader_cfg = dict(
input_columns=[
'problem', 'language', 'subject', 'question_type',
'answer_type', 'is_multiple_answer', 'unit', 'questions'
],
output_column='solution'
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
olympiadbench_datasets = []
for _name in categories:
olympiadbench_infer_cfg = dict(
prompt_template=dict(
type='OlympiadBenchTemplate'
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# olympiadbench_eval_cfg = dict(
# evaluator=dict(type=OlympiadBenchEvaluator, version='v2'),
# pred_postprocessor=dict(type=olympiadbench_postprocess_v2),
# )
# Evaluation configuration
olympiadbench_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=OlympiadBenchDataset,
path='opencompass/OlympiadBench',
name=_name,
reader_cfg=olympiadbench_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
olympiadbench_datasets.append(
dict(
type=OlympiadBenchDataset,
abbr=f'OlympiadBench_{_name}',
path='opencompass/OlympiadBench',
name=_name,
reader_cfg=olympiadbench_reader_cfg,
infer_cfg=olympiadbench_infer_cfg,
eval_cfg=olympiadbench_eval_cfg,
)
)
del _name

View File

@ -5,3 +5,14 @@ categories = [
'OE_TO_physics_en_COMP', # OpenEnded - TextOnly - physics - COMP
'OE_TO_physics_zh_CEE' # OpenEnded - TextOnly - physics - CEE
]
math_categories = [
'OE_TO_maths_en_COMP', # OpenEnded - TextOnly - maths - COMP
'OE_TO_maths_zh_COMP', # OpenEnded - TextOnly - maths - COMP
'OE_TO_maths_zh_CEE', # OpenEnded - TextOnly - maths - CEE
]
physics_categories = [
'OE_TO_physics_en_COMP', # OpenEnded - TextOnly - physics - COMP
'OE_TO_physics_zh_CEE' # OpenEnded - TextOnly - physics - CEE
]

View File

@ -0,0 +1,98 @@
# flake8: noqa
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import Aime2024Dataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets.generic import generic_llmjudge_academic_postprocess
aime2024_reader_cfg = dict(
input_columns=['question'],
output_column='answer'
)
aime2024_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN',
prompt='{question}\nRemember to put your final answer within \\boxed{}.'),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048)
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
aime2024_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE),
]),
),
dataset_cfg=dict(
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess,
metric_name='accuracy'),
),
pred_role='BOT',
)
aime2024_datasets = [
dict(
abbr='aime2024',
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
infer_cfg=aime2024_infer_cfg,
eval_cfg=aime2024_eval_cfg,
mode='singlescore',
)
]

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .aime2024_gen_6e39a4 import aime2024_datasets # noqa: F401, F403
from .aime2024_gen_17d799 import aime2024_datasets # noqa: F401, F403

View File

@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import MATHEvaluator
from opencompass.datasets import Aime2024Dataset
aime2024_reader_cfg = dict(
input_columns=['question'],
output_column='answer'
)
aime2024_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
aime2024_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator)
)
aime2024_datasets = [
dict(
abbr='aime2024',
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
infer_cfg=aime2024_infer_cfg,
eval_cfg=aime2024_eval_cfg,
)
]

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .aime2024_llmjudge_gen_5e9f4f import aime2024_datasets # noqa: F401, F403

View File

@ -0,0 +1,90 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import Aime2024Dataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
aime2024_reader_cfg = dict(input_columns=['question'], output_column='answer')
aime2024_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{question}\nRemember to put your final answer within \\boxed{}.',
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
aime2024_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
)
)
aime2024_datasets = [
dict(
abbr='aime2024',
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
infer_cfg=aime2024_infer_cfg,
eval_cfg=aime2024_eval_cfg,
)
]

View File

@ -0,0 +1,96 @@
# CoT: No CoT
# K-Shot: 0-Shot
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
aime2024_reader_cfg = dict(
input_columns=['question'],
output_column='answer'
)
aime2024_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}\nRemember to put your final answer within \\boxed{}.'),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
aime2024_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
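# Register 16 copies of the same dataset (aime2024-run0 ... aime2024-run15) so
# accuracy can be averaged across repeated runs when results are summarized.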
aime2024_datasets = [
dict(
abbr=f'aime2024-run{idx}',
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
infer_cfg=aime2024_infer_cfg,
eval_cfg=aime2024_eval_cfg,
mode='singlescore',
)
for idx in range(16)
]

View File

@ -0,0 +1,96 @@
# CoT: No CoT
# K-Shot: 0-Shot
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
aime2024_reader_cfg = dict(
input_columns=['question'],
output_column='answer'
)
aime2024_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}\nRemember to put your final answer within \\boxed{}.'),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
aime2024_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
aime2024_datasets = [
dict(
abbr=f'aime2024-run{idx}',
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
infer_cfg=aime2024_infer_cfg,
eval_cfg=aime2024_eval_cfg,
mode='singlescore',
)
for idx in range(8)
]

View File

@ -0,0 +1,90 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CustomDataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
aime2025_reader_cfg = dict(input_columns=['question'], output_column='answer')
aime2025_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{question}\nRemember to put your final answer within \\boxed{}.',
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
aime2025_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=CustomDataset,
path='opencompass/aime2025',
reader_cfg=aime2025_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
aime2025_datasets = [
dict(
type=CustomDataset,
abbr='aime2025',
path='opencompass/aime2025',
reader_cfg=aime2025_reader_cfg,
infer_cfg=aime2025_infer_cfg,
eval_cfg=aime2025_eval_cfg,
)
]

View File

@ -0,0 +1,26 @@
# BBEH
```bash
python3 run.py --models hf_internlm2_7b --datasets bbeh_gen --debug
python3 run.py --models hf_meta_llama3_8b_instruct --datasets bbeh_gen --debug
```
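A minimal sketch of wiring these datasets into your own evaluation config; the module path below is assumed from the config layout used elsewhere in this change and may need adjusting:

```python
# Sketch only: import the BBEH dataset configs and expose them as `datasets`,
# then launch with `python3 run.py <this_config>.py --debug`.
from mmengine.config import read_base

with read_base():
    # Assumed module path, mirroring the other opencompass.configs.datasets.* imports.
    from opencompass.configs.datasets.bbeh.bbeh_gen import \
        bbeh_datasets  # noqa: F401

datasets = [*bbeh_datasets]
```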
## Models
| model | score |
|:-----------------------------------------:|------:|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 10.93 |
### Details
| model | boolean_expressions | disambiguation_qa | geometric_shapes | hyperbaton | movie_recommendation | nycc | shuffled_objects | boardgame_qa |
|:-----------------------------------------:|--------------------:|------------------:|-----------------:|-----------:|---------------------:|-----:|-----------------:|-------------:|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 14.00 | 33.33 | 13.50 | 1.00 | 28.00 | 11.00 | 10.00 | 18.50 |
| model | buggy_tables | causal_understanding | dyck_languages | linguini | multistep_arithmetic | object_counting | object_properties | sarc_triples |
|:-----------------------------------------:|-------------:|---------------------:|---------------:|---------:|---------------------:|----------------:|------------------:|-------------:|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 0.00 | 42.50 | 3.50 | 2.00 | 0.00 | 0.00 | 1.00 | 17.00 |
| model | spatial_reasoning | sportqa | temporal_sequence | time_arithmetic | web_of_lies | word_sorting | zebra_puzzles |
|:-----------------------------------------:|------------------:|-------:|-----------------:|----------------:|------------:|-------------:|--------------:|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 4.00 | 5.00 | 2.00 | 3.00 | 7.50 | 2.00 | 3.50 |

View File

@ -0,0 +1,93 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBEHDataset, BBEHEvaluator, bbeh_mcq_postprocess, BBEHEvaluator_mcq
bbeh_reader_cfg = dict(input_columns=['input'], output_column='target')
bbeh_multiple_choice_sets = [
'bbeh_boolean_expressions',
'bbeh_disambiguation_qa',
'bbeh_geometric_shapes',
'bbeh_hyperbaton',
'bbeh_movie_recommendation',
'bbeh_nycc',
'bbeh_shuffled_objects',
]
bbeh_free_form_sets = [
'bbeh_boardgame_qa',
'bbeh_buggy_tables',
'bbeh_causal_understanding',
'bbeh_dyck_languages',
'bbeh_linguini',
'bbeh_multistep_arithmetic',
'bbeh_object_counting',
'bbeh_object_properties',
'bbeh_sarc_triples',
'bbeh_spatial_reasoning',
'bbeh_sportqa',
'bbeh_temporal_sequence',
'bbeh_time_arithmetic',
'bbeh_web_of_lies',
'bbeh_word_sorting',
'bbeh_zebra_puzzles',
]
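# Build one dataset entry per subset: multiple-choice subsets are scored with
# BBEHEvaluator_mcq, free-form subsets with BBEHEvaluator.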
bbeh_datasets = []
for _name in bbeh_multiple_choice_sets:
bbeh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\". If the question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: (a)\"\n\nQ: {{input}}\nA: "
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=8192))
bbeh_eval_cfg = dict(
evaluator=dict(type=BBEHEvaluator_mcq),
pred_role='BOT',
pred_postprocessor=dict(type=bbeh_mcq_postprocess),
dataset_postprocessor=dict(type=bbeh_mcq_postprocess))
bbeh_datasets.append(
dict(
type=BBEHDataset,
path='opencompass/bbeh',
name=_name,
abbr=_name,
reader_cfg=bbeh_reader_cfg,
infer_cfg=bbeh_infer_cfg.copy(),
eval_cfg=bbeh_eval_cfg.copy()))
for _name in bbeh_free_form_sets:
bbeh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\". If the question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: (a)\"\n\nQ: {{input}}\nA: "
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=8192))
    bbeh_eval_cfg = dict(
        evaluator=dict(type=BBEHEvaluator),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbeh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbeh_mcq_postprocess))
bbeh_datasets.append(
dict(
type=BBEHDataset,
path='opencompass/bbeh',
name=_name,
abbr=_name,
reader_cfg=bbeh_reader_cfg,
infer_cfg=bbeh_infer_cfg.copy(),
eval_cfg=bbeh_eval_cfg.copy()))

View File

@ -0,0 +1,126 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
BBEHDataset,
generic_llmjudge_postprocess,
)
from opencompass.evaluator import GenericLLMEvaluator
bbeh_reader_cfg = dict(input_columns=['input'], output_column='target')
bbeh_multiple_choice_sets = [
'bbeh_boolean_expressions',
'bbeh_disambiguation_qa',
'bbeh_geometric_shapes',
'bbeh_hyperbaton',
'bbeh_movie_recommendation',
'bbeh_nycc',
'bbeh_shuffled_objects',
]
bbeh_free_form_sets = [
'bbeh_boardgame_qa',
'bbeh_buggy_tables',
'bbeh_causal_understanding',
'bbeh_dyck_languages',
'bbeh_linguini',
'bbeh_multistep_arithmetic',
'bbeh_object_counting',
'bbeh_object_properties',
'bbeh_sarc_triples',
'bbeh_spatial_reasoning',
'bbeh_sportqa',
'bbeh_temporal_sequence',
'bbeh_time_arithmetic',
'bbeh_web_of_lies',
'bbeh_word_sorting',
'bbeh_zebra_puzzles',
]
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{input}\n<Original Question End>\n\n
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
bbeh_datasets = []
for _name in bbeh_multiple_choice_sets + bbeh_free_form_sets:
bbeh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\". If the question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: (a)\"\n\nQ: {{input}}\nA: ",
)
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
bbeh_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=BBEHDataset,
path='opencompass/bbeh',
name=_name,
abbr=_name,
reader_cfg=bbeh_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
bbeh_datasets.append(
dict(
type=BBEHDataset,
path='opencompass/bbeh',
name=_name,
abbr=_name,
reader_cfg=bbeh_reader_cfg,
infer_cfg=bbeh_infer_cfg,
eval_cfg=bbeh_eval_cfg,
)
)

View File

@ -0,0 +1,189 @@
# flake8: noqa
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import BBHDataset
from opencompass.datasets.generic import generic_llmjudge_academic_postprocess
bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
bbh_multiple_choice_sets = [
'temporal_sequences',
'disambiguation_qa',
'date_understanding',
'tracking_shuffled_objects_three_objects',
'penguins_in_a_table',
'geometric_shapes',
'snarks',
'ruin_names',
'tracking_shuffled_objects_seven_objects',
'tracking_shuffled_objects_five_objects',
'logical_deduction_three_objects',
'hyperbaton',
'logical_deduction_five_objects',
'logical_deduction_seven_objects',
'movie_recommendation',
'salient_translation_error_detection',
'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
'multistep_arithmetic_two',
'navigate',
'dyck_languages',
'word_sorting',
'sports_understanding',
'boolean_expressions',
'object_counting',
'formal_fallacies',
'causal_judgement',
'web_of_lies',
]
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{input}\n<Original Question End>\n\n
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
bbh_sets = bbh_multiple_choice_sets + bbh_free_form_sets
# For zero shot inference in bbh
bbh_datasets = []
for _name in bbh_sets:
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' "
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=BBHDataset,
name=_name,
path='opencompass/bbh',
reader_cfg=bbh_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'),
),
pred_role='BOT',
)
bbh_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy())
)
# For original 3 shot inference in bbh
bbh_3_shot_datasets = []
for _name in bbh_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=BBHDataset,
name=_name,
path='opencompass/bbh',
reader_cfg=bbh_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'),
),
pred_role='BOT',
)
bbh_3_shot_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .bbh_gen_5b92b0 import bbh_datasets # noqa: F401, F403
from .bbh_gen_ee62e9 import bbh_datasets # noqa: F401, F403

View File

@ -0,0 +1,99 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
bbh_multiple_choice_sets = [
'temporal_sequences',
'disambiguation_qa',
'date_understanding',
'tracking_shuffled_objects_three_objects',
'penguins_in_a_table',
'geometric_shapes',
'snarks',
'ruin_names',
'tracking_shuffled_objects_seven_objects',
'tracking_shuffled_objects_five_objects',
'logical_deduction_three_objects',
'hyperbaton',
'logical_deduction_five_objects',
'logical_deduction_seven_objects',
'movie_recommendation',
'salient_translation_error_detection',
'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
'multistep_arithmetic_two',
'navigate',
'dyck_languages',
'word_sorting',
'sports_understanding',
'boolean_expressions',
'object_counting',
'formal_fallacies',
'causal_judgement',
'web_of_lies',
]
bbh_datasets = []
for _name in bbh_multiple_choice_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
bbh_eval_cfg = dict(
evaluator=dict(type=BBHEvaluator_mcq),
pred_role='BOT',
pred_postprocessor=dict(type=bbh_mcq_postprocess),
dataset_postprocessor=dict(type=bbh_mcq_postprocess))
bbh_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))
for _name in bbh_free_form_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')
bbh_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .bbh_llmjudge_gen_b5bdf1 import bbh_datasets # noqa: F401, F403

View File

@ -0,0 +1,189 @@
# flake8: noqa
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import BBHDataset
from opencompass.datasets.generic import generic_llmjudge_academic_postprocess
bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
bbh_multiple_choice_sets = [
'temporal_sequences',
'disambiguation_qa',
'date_understanding',
'tracking_shuffled_objects_three_objects',
'penguins_in_a_table',
'geometric_shapes',
'snarks',
'ruin_names',
'tracking_shuffled_objects_seven_objects',
'tracking_shuffled_objects_five_objects',
'logical_deduction_three_objects',
'hyperbaton',
'logical_deduction_five_objects',
'logical_deduction_seven_objects',
'movie_recommendation',
'salient_translation_error_detection',
'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
'multistep_arithmetic_two',
'navigate',
'dyck_languages',
'word_sorting',
'sports_understanding',
'boolean_expressions',
'object_counting',
'formal_fallacies',
'causal_judgement',
'web_of_lies',
]
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{input}\n<Original Question End>\n\n
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
bbh_sets = bbh_multiple_choice_sets + bbh_free_form_sets
# For zero shot inference in bbh
bbh_datasets = []
for _name in bbh_sets:
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' "
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
bbh_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=BBHDataset,
name=_name,
path='opencompass/bbh',
reader_cfg=bbh_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'),
),
pred_role='BOT',
)
bbh_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy())
)
# For original 3 shot inference in bbh
bbh_3_shot_datasets = []
for _name in bbh_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
bbh_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=BBHDataset,
name=_name,
path='opencompass/bbh',
reader_cfg=bbh_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'),
),
pred_role='BOT',
)
bbh_3_shot_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))

View File

@ -1,53 +1,43 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
BigCodeBenchDataset,
BigCodeBenchEvaluator
)
from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
bigcodebench_full_reader_cfg = dict(
input_columns=['complete_prompt'],
output_column='test',
input_columns=['complete_prompt'],
output_column='test',
)
bigcodebench_full_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system',
fallback_role='HUMAN',
prompt='')],
round=[
dict(role='HUMAN', prompt='{complete_prompt}'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
bigcodebench_full_infer_cfg = dict(prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
round=[
dict(role='HUMAN', prompt='{complete_prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer,
max_out_len=1024))
bigcodebench_full_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='complete',
remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
# remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
remote_execute_api=
'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501
dataset_version='full',
),
pred_role='BOT',
)
bigcodebench_full_complete_datasets = [
dict(
abbr='bigcodebench_full_complete',
type=BigCodeBenchDataset,
path='opencompass/bigcodebench',
reader_cfg=bigcodebench_full_reader_cfg,
infer_cfg=bigcodebench_full_infer_cfg,
eval_cfg=bigcodebench_full_eval_cfg,
release_version='v0.1.2'
)
]
dict(abbr='bigcodebench_full_complete',
type=BigCodeBenchDataset,
path='opencompass/bigcodebench',
reader_cfg=bigcodebench_full_reader_cfg,
infer_cfg=bigcodebench_full_infer_cfg,
eval_cfg=bigcodebench_full_eval_cfg,
release_version='v0.1.2')
]

View File

@ -1,53 +1,43 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
BigCodeBenchDataset,
BigCodeBenchEvaluator
)
from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
bigcodebench_full_reader_cfg = dict(
input_columns=['instruct_prompt'],
output_column='test',
input_columns=['instruct_prompt'],
output_column='test',
)
bigcodebench_full_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system',
fallback_role='HUMAN',
prompt='')],
round=[
dict(role='HUMAN', prompt='{instruct_prompt}'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=8192)
)
bigcodebench_full_infer_cfg = dict(prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
round=[
dict(role='HUMAN', prompt='{instruct_prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer,
max_out_len=8192))
bigcodebench_full_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='instruct',
remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
# remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
remote_execute_api=
'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501
dataset_version='full',
),
pred_role='BOT',
)
bigcodebench_full_instruct_datasets = [
dict(
abbr='bigcodebench_full_instruct',
type=BigCodeBenchDataset,
path='opencompass/bigcodebench',
reader_cfg=bigcodebench_full_reader_cfg,
infer_cfg=bigcodebench_full_infer_cfg,
eval_cfg=bigcodebench_full_eval_cfg,
release_version='v0.1.2'
)
]
dict(abbr='bigcodebench_full_instruct',
type=BigCodeBenchDataset,
path='opencompass/bigcodebench',
reader_cfg=bigcodebench_full_reader_cfg,
infer_cfg=bigcodebench_full_infer_cfg,
eval_cfg=bigcodebench_full_eval_cfg,
release_version='v0.1.2')
]

View File

@ -0,0 +1,7 @@
from mmengine.config import read_base
with read_base():
from .bigcodebench_hard_instruct_gen import bigcodebench_hard_instruct_datasets
from .bigcodebench_hard_complete_gen import bigcodebench_hard_complete_datasets
# Gather every *_datasets list imported above into one flat list.
bigcodebench_hard_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

View File

@ -0,0 +1,45 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
bigcodebench_hard_reader_cfg = dict(
input_columns=['complete_prompt'],
output_column='test',
)
bigcodebench_hard_infer_cfg = dict(prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
round=[
dict(role='HUMAN', prompt='{complete_prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
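# Scoring is delegated to a remote BigCodeBench evaluator service (the HF Space
# configured via remote_execute_api below).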
bigcodebench_hard_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='complete',
# remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
remote_execute_api=
'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501
dataset_version='hard',
),
pred_role='BOT',
)
bigcodebench_hard_complete_datasets = [
dict(
abbr='bigcodebench_hard_complete',
type=BigCodeBenchDataset,
path='opencompass/bigcodebench',
reader_cfg=bigcodebench_hard_reader_cfg,
infer_cfg=bigcodebench_hard_infer_cfg,
eval_cfg=bigcodebench_hard_eval_cfg,
release_version='v0.1.2',
dataset_version='hard',
)
]

View File

@ -1,40 +1,32 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
BigCodeBenchDataset,
BigCodeBenchEvaluator
)
from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
bigcodebench_hard_reader_cfg = dict(
input_columns=['complete_prompt'],
output_column='test',
input_columns=['complete_prompt'],
output_column='test',
)
bigcodebench_hard_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system',
fallback_role='HUMAN',
prompt='')],
round=[
dict(role='HUMAN', prompt='{complete_prompt}'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
bigcodebench_hard_infer_cfg = dict(prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
round=[
dict(role='HUMAN', prompt='{complete_prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer,
max_out_len=1024))
bigcodebench_hard_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='complete',
remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
# remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
remote_execute_api=
'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501
dataset_version='hard',
),
pred_role='BOT',
@ -51,4 +43,4 @@ bigcodebench_hard_complete_datasets = [
release_version='v0.1.2',
dataset_version='hard',
)
]
]

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .bigcodebench_hard_instruct_gen_8815eb import bigcodebench_hard_instruct_datasets # noqa: F401, F403
from .bigcodebench_hard_instruct_gen_c3d5ad import bigcodebench_hard_instruct_datasets # noqa: F401, F403

View File

@ -1,40 +1,32 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
BigCodeBenchDataset,
BigCodeBenchEvaluator
)
from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
bigcodebench_hard_reader_cfg = dict(
input_columns=['instruct_prompt'],
output_column='test',
input_columns=['instruct_prompt'],
output_column='test',
)
bigcodebench_hard_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system',
fallback_role='HUMAN',
prompt='')],
round=[
dict(role='HUMAN', prompt='{instruct_prompt}'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=8192)
)
bigcodebench_hard_infer_cfg = dict(prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
round=[
dict(role='HUMAN', prompt='{instruct_prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer,
max_out_len=8192))
bigcodebench_hard_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='instruct',
remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
# remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
remote_execute_api=
'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501
dataset_version='hard',
),
pred_role='BOT',
@ -51,4 +43,4 @@ bigcodebench_hard_instruct_datasets = [
release_version='v0.1.2',
dataset_version='hard',
)
]
]

View File

@ -0,0 +1,46 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
bigcodebench_hard_reader_cfg = dict(
input_columns=['instruct_prompt'],
output_column='test',
)
bigcodebench_hard_infer_cfg = dict(prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
round=[
dict(role='HUMAN', prompt='{instruct_prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
bigcodebench_hard_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='instruct',
# remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
remote_execute_api=
'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501
dataset_version='hard',
),
pred_role='BOT',
)
bigcodebench_hard_instruct_datasets = [
dict(
abbr='bigcodebench_hard_instruct',
type=BigCodeBenchDataset,
path='opencompass/bigcodebench',
reader_cfg=bigcodebench_hard_reader_cfg,
infer_cfg=bigcodebench_hard_infer_cfg,
eval_cfg=bigcodebench_hard_eval_cfg,
release_version='v0.1.2',
dataset_version='hard',
)
]

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .cmmlu_gen_c13365 import cmmlu_datasets # noqa: F401, F403
from .cmmlu_0shot_cot_gen_305931 import cmmlu_datasets # noqa: F401, F403

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .cmmlu_llmjudge_gen_e1cd9a import cmmlu_datasets # noqa: F401, F403

View File

@ -0,0 +1,185 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
cmmlu_subject_mapping = {
'agronomy': '农学',
'anatomy': '解剖学',
'ancient_chinese': '古汉语',
'arts': '艺术学',
'astronomy': '天文学',
'business_ethics': '商业伦理',
'chinese_civil_service_exam': '中国公务员考试',
'chinese_driving_rule': '中国驾驶规则',
'chinese_food_culture': '中国饮食文化',
'chinese_foreign_policy': '中国外交政策',
'chinese_history': '中国历史',
'chinese_literature': '中国文学',
'chinese_teacher_qualification': '中国教师资格',
'clinical_knowledge': '临床知识',
'college_actuarial_science': '大学精算学',
'college_education': '大学教育学',
'college_engineering_hydrology': '大学工程水文学',
'college_law': '大学法律',
'college_mathematics': '大学数学',
'college_medical_statistics': '大学医学统计',
'college_medicine': '大学医学',
'computer_science': '计算机科学',
'computer_security': '计算机安全',
'conceptual_physics': '概念物理学',
'construction_project_management': '建设工程管理',
'economics': '经济学',
'education': '教育学',
'electrical_engineering': '电气工程',
'elementary_chinese': '小学语文',
'elementary_commonsense': '小学常识',
'elementary_information_and_technology': '小学信息技术',
'elementary_mathematics': '初等数学',
'ethnology': '民族学',
'food_science': '食品科学',
'genetics': '遗传学',
'global_facts': '全球事实',
'high_school_biology': '高中生物',
'high_school_chemistry': '高中化学',
'high_school_geography': '高中地理',
'high_school_mathematics': '高中数学',
'high_school_physics': '高中物理学',
'high_school_politics': '高中政治',
'human_sexuality': '人类性行为',
'international_law': '国际法学',
'journalism': '新闻学',
'jurisprudence': '法理学',
'legal_and_moral_basis': '法律与道德基础',
'logical': '逻辑学',
'machine_learning': '机器学习',
'management': '管理学',
'marketing': '市场营销',
'marxist_theory': '马克思主义理论',
'modern_chinese': '现代汉语',
'nutrition': '营养学',
'philosophy': '哲学',
'professional_accounting': '专业会计',
'professional_law': '专业法学',
'professional_medicine': '专业医学',
'professional_psychology': '专业心理学',
'public_relations': '公共关系',
'security_study': '安全研究',
'sociology': '社会学',
'sports_science': '体育学',
'traditional_chinese_medicine': '中医中药',
'virology': '病毒学',
'world_history': '世界历史',
'world_religions': '世界宗教',
}
QUERY_TEMPLATE = """
你回答的最后一行**必须**是以下格式 '答案: $选项' (不带引号), 其中选项是ABCD之一.
{question}
A) {A}
B) {B}
C) {C}
D) {D}
""".strip()
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
cmmlu_datasets = []
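# build one dataset entry per CMMLU subject; each is graded by an LLM judge via GenericLLMEvaluator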
for _name in cmmlu_all_sets:
_ch_name = cmmlu_subject_mapping[_name]
prompt_prefix = f'请回答以下关于{_ch_name}的单项选择题, '
cmmlu_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=prompt_prefix + QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
cmmlu_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test',
),
),
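            # judge_cfg is left empty here; the judge model config is expected to be filled in at evaluation time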
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
cmmlu_datasets.append(
dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
abbr=f'cmmlu-{_name}',
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test',
),
infer_cfg=cmmlu_infer_cfg,
eval_cfg=cmmlu_eval_cfg,
mode='singlescore',
)
)
del _name, _ch_name

View File

@ -0,0 +1,39 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CMOFibDataset, MATHEvaluator, math_postprocess_v2
cmo_fib_reader_cfg = dict(
input_columns=['question'],
output_column='answer'
)
cmo_fib_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}\n请一步一步地推理,并将最终答案写入\\boxed{}.'),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
cmo_fib_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2)
)
cmo_fib_datasets = [
dict(
abbr='cmo_fib',
type=CMOFibDataset,
path='opencompass/cmo_fib',
reader_cfg=cmo_fib_reader_cfg,
infer_cfg=cmo_fib_infer_cfg,
eval_cfg=cmo_fib_eval_cfg
)
]

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .drop_openai_simple_evals_gen_3857b0 import drop_datasets
from .drop_openai_simple_evals_gen_3857b0 import drop_datasets # noqa: F401, F403

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .drop_llmjudge_gen_3857b0 import drop_datasets # noqa: F401, F403

View File

@ -0,0 +1,89 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import DropOpenAIDataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
with read_base():
from .drop_examples import drop_examples # noqa: F401, F403
drop_reader_cfg = dict(
input_columns=['prompt'],
output_column='answers',
train_split='validation',
test_split='validation',
)
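# few-shot passages and Q&A pairs from drop_examples are embedded directly into the prompt string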
template = f'You will be asked to read a passage and answer a question. Some examples of passages and Q&A are provided below.\n\n{drop_examples}\n\n# Your Task\n\n---\n{{prompt}}\n\nThink step by step, then write a line of the form "Answer: $ANSWER" at the end of your response.'
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: {prompt}\n \n<Original Question End>\n\n
<Gold Target Begin>: \n{answers}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
drop_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[dict(role='HUMAN', prompt=template)]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
drop_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=DropOpenAIDataset,
path='data/drop_simple_eval/dev.jsonl',
reader_cfg=drop_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
drop_datasets = [
dict(
abbr='drop',
type=DropOpenAIDataset,
path='data/drop_simple_eval/dev.jsonl',
reader_cfg=drop_reader_cfg,
infer_cfg=drop_infer_cfg,
eval_cfg=drop_eval_cfg,
)
]

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets
from .gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets # noqa: F401, F403

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .gpqa_0shot_nocot_genericllmeval_gen_772ea0 import gpqa_datasets # noqa: F401, F403

View File

@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GSM8KDataset, gsm8k_dataset_postprocess
from opencompass.datasets import MATHEvaluator, math_postprocess_v2
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
gsm8k_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
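# the prompt requests a \boxed{} answer, so scoring reuses the MATH-style evaluator and post-processor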
gsm8k_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator, version='v2'),
pred_postprocessor=dict(type=math_postprocess_v2),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
)
gsm8k_datasets = [
dict(
abbr='gsm8k',
type=GSM8KDataset,
path='opencompass/gsm8k',
reader_cfg=gsm8k_reader_cfg,
infer_cfg=gsm8k_infer_cfg,
eval_cfg=gsm8k_eval_cfg,
)
]

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .hellaswag_gen_6faab5 import hellaswag_datasets # noqa: F401, F403
from .hellaswag_10shot_gen_e42710 import hellaswag_datasets # noqa: F401, F403

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .hellaswag_llmjudge_gen_809ef1 import hellaswag_datasets # noqa: F401, F403

View File

@ -0,0 +1,97 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HellaswagDatasetwithICE
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
hellaswag_reader_cfg = dict(
input_columns=['ctx', 'A', 'B', 'C', 'D'],
output_column='label',
train_split='train',
test_split='val',
)
align_prompt = """Continue the following text without adding any additional information or formatting:
{ctx}
A) {A}
B) {B}
C) {C}
D) {D}
What is the right option?'"""
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: {ctx}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
hellaswag_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=align_prompt),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
hellaswag_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=HellaswagDatasetwithICE,
path='opencompass/hellaswag_ice',
reader_cfg=hellaswag_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
hellaswag_datasets = [
dict(
abbr='hellaswag',
type=HellaswagDatasetwithICE,
path='opencompass/hellaswag_ice',
reader_cfg=hellaswag_reader_cfg,
infer_cfg=hellaswag_infer_cfg,
eval_cfg=hellaswag_eval_cfg,
)
]

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .humaneval_gen_8e312c import humaneval_datasets # noqa: F401, F403
from .humaneval_openai_sample_evals_gen_dcae0e import humaneval_datasets # noqa: F401, F403

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .korbench_single_0_shot_gen import korbench_0shot_single_datasets # noqa: F401, F403

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .korbench_single_0shot_genericllmeval_gen_56cf43 import korbench_0shot_single_datasets # noqa: F401, F403

Some files were not shown because too many files have changed in this diff.