* 'main' of https://github.com/domonic18/opencompass:
  [Refactor] Refactorize openicl eval task (#1990)
  [ci] update baseline for kernel change of vllm and lmdeploy (#2011)
  [Feature] Make dump-eval-details default behavior (#1999)
  [Fix] OpenICL Math Evaluator Config (#2007)
  [Feature] Add CascadeEvaluator (#1992)
  [Dataset] Add MedXpertQA (#2002)
  [Dataset] Update dingo 1.5.0 (#2008)
  [CI] fix baseline score (#2000)
  [Doc] Fix links between zh & en (#2001)
Deadwalk 2025-04-10 11:11:35 +08:00
commit 72b7caa575
59 changed files with 2285 additions and 1426 deletions

View File

@ -24,9 +24,9 @@ models = [
abbr='lmdeploy-api-test',
type=OpenAISDK,
key='EMPTY',
openai_api_base='http://0.0.0.0:23333/v1',
path='internlm2',
tokenizer_path='internlm/internlm2_5-7b-chat',
openai_api_base='http://localhost:23333/v1',
path='internlm3',
tokenizer_path='internlm/internlm3-8b-instruct',
rpm_verbose=True,
meta_template=api_meta_template,
query_per_second=128,

View File

@ -11,18 +11,10 @@ with read_base():
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
winogrande_datasets # noqa: F401, E501
# read hf models - chat models
from opencompass.configs.models.chatglm.hf_glm4_9b import \
models as hf_glm4_9b_model # noqa: F401, E501
from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \
models as lmdeploy_glm4_9b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \
models as hf_deepseek_7b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_67b_base import \
models as hf_deepseek_67b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
models as hf_deepseek_moe_16b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
models as hf_deepseek_v2_lite_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \
@ -49,12 +41,6 @@ with read_base():
models as hf_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
models as hf_internlm2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
models as hf_internlm2_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \
models as hf_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_20b import \
models as hf_internlm2_base_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
@ -65,14 +51,14 @@ with read_base():
models as lmdeploy_internlm2_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_20b import \
models as lmdeploy_internlm2_base_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama2_7b import \
models as hf_llama2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \
models as hf_llama3_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_8b import \
models as hf_llama3_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_70b import \
models as hf_llama3_70b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
models as lmdeploy_llama3_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \

View File

@ -15,14 +15,24 @@ with read_base():
models as vllm_glm4_9b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
models as hf_deepseek_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import \
models as hf_deepseek_67b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
models as hf_deepseek_moe_16b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \
models as hf_deepseek_v2_lite_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_chat import \
models as lmdeploy_deepseek_67b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_8b import \
models as \
lmdeploy_deepseek_r1_distill_llama_8b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_70b import \
models as \
lmdeploy_deepseek_r1_distill_llama_70b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_1_5b import \
models as \
lmdeploy_deepseek_r1_distill_qwen_1_5b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b import \
models as \
lmdeploy_deepseek_r1_distill_qwen_32b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \
models as lmdeploy_deepseek_v2_5_1210_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_lite import \
models as lmdeploy_deepseek_v2_lite_model # noqa: F401, E501
from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
models as vllm_deepseek_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma2_2b_it import \
@ -45,6 +55,8 @@ with read_base():
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \
models as hf_internlm2_5_20b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm3_8b_instruct import \
models as hf_internlm3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \
@ -57,6 +69,8 @@ with read_base():
models as lmdeploy_internlm2_chat_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
models as lmdeploy_internlm2_chat_7b_sft_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \
models as lmdeploy_internlm3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \
models as vllm_internlm2_chat_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \
@ -83,10 +97,6 @@ with read_base():
models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
models as hf_mistral_small_instruct_2409_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
models as hf_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \
models as hf_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
models as \
lmdeploy_mistral_large_instruct_2411_model # noqa: F401, E501
@ -95,14 +105,19 @@ with read_base():
from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
models as \
lmdeploy_mistral_small_instruct_2409_model # noqa: F401, E501
from opencompass.configs.models.mistral.lmdeploy_mixtral_8x22b_instruct_v0_1 import \
models as \
lmdeploy_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \
models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mixtral_8x22b_instruct_v0_1 import \
models as vllm_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
models as lmdeploy_nemotron_70b_instruct_hf_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_4 import \
models as hf_phi_4_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \
models as hf_qwen2_5_0_5b_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \
@ -142,6 +157,8 @@ with read_base():
from ...volc import infer as volc_infer # noqa: F401, E501
hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf'
race_datasets = [race_datasets[1]]
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

View File

@ -175,10 +175,11 @@ class TestApibench:
class TestVolcFullbench:
"""Test cases for chat model."""
@pytest.mark.parametrize(
'model, dataset',
[(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'objective')])
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
'internlm2_5-7b-chat-turbomind', 'qwen2.5-7b-instruct-turbomind',
'internlm2_5-7b-chat-pytorch', 'qwen2.5-7b-instruct-pytorch',
'internlm3-8b-instruct-turbomind', 'internlm3-8b-instruct-pytorch'
] for p2 in dataset_list(p1, 'objective')])
@pytest.mark.chat_objective
def test_chat_objective(self, baseline_scores_fullbench, result_scores,
model, dataset):
@ -245,10 +246,7 @@ class TestCmdCase:
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-hf', 'race-middle_accuracy'),
('internlm2_5-7b-hf', 'race-high_accuracy'),
('internlm2_5-7b-hf', 'demo_gsm8k_accuracy'),
('internlm2-1.8b-hf', 'race-middle_accuracy'),
('internlm2-1.8b-hf', 'race-high_accuracy'),
('internlm2-1.8b-hf', 'demo_gsm8k_accuracy')])
('internlm2_5-7b-hf', 'demo_gsm8k_accuracy')])
def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
@ -260,9 +258,9 @@ class TestCmdCase:
[('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'),
('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'),
('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'),
('internlm2-chat-1.8b-lmdeploy', 'race-middle_accuracy'),
('internlm2-chat-1.8b-lmdeploy', 'race-high_accuracy'),
('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k_accuracy')])
('internlm3-8b-instruct-lmdeploy', 'race-middle_accuracy'),
('internlm3-8b-instruct-lmdeploy', 'race-high_accuracy'),
('internlm3-8b-instruct-lmdeploy', 'demo_gsm8k_accuracy')])
def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
@ -280,13 +278,25 @@ class TestCmdCase:
@pytest.mark.case4
@pytest.mark.parametrize(
'model, dataset', [('internlm2_5-7b-chat_hf', 'race-middle_accuracy'),
('internlm2_5-7b-chat_hf', 'race-high_accuracy'),
('internlm2_5-7b-chat_hf', 'demo_gsm8k_accuracy')])
'model, dataset',
[('internlm3-8b-instruct_hf-lmdeploy', 'race-middle_accuracy'),
('internlm3-8b-instruct_hf-lmdeploy', 'race-high_accuracy'),
('internlm3-8b-instruct_hf-lmdeploy', 'demo_gsm8k_accuracy')])
def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score, dataset)
assert_score(model + '_batch', result_score, base_score, dataset)
@pytest.mark.case5
@pytest.mark.parametrize(
'model, dataset',
[('internlm3-8b-instruct_hf-vllm', 'race-middle_accuracy'),
('internlm3-8b-instruct_hf-vllm', 'race-high_accuracy'),
('internlm3-8b-instruct_hf-vllm', 'demo_gsm8k_accuracy')])
def test_cmd_case5(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score, dataset)
def assert_score(model_type, score, baseline, dataset: str = ''):

View File

@ -8,20 +8,25 @@ internlm2_5-7b_hf:
race-middle_accuracy: 91.78
race-high_accuracy: 90.02
internlm2-1.8b-hf:
demo_gsm8k_accuracy: 15.62
race-middle_accuracy: 71.66
race-high_accuracy: 66.38
internlm2_5-7b-chat-lmdeploy:
demo_gsm8k_accuracy: 89.06
demo_gsm8k_accuracy: 87.50
race-middle_accuracy: 92.76
race-high_accuracy: 90.54
internlm2-chat-1.8b-lmdeploy:
demo_gsm8k_accuracy: 31
race-middle_accuracy: 81.34
race-high_accuracy: 73.96
internlm3-8b-instruct-lmdeploy:
demo_gsm8k_accuracy: 73.44
race-middle_accuracy: 93.38
race-high_accuracy: 90.34
internlm3-8b-instruct_hf-lmdeploy:
demo_gsm8k_accuracy: 73.44
race-middle_accuracy: 93.38
race-high_accuracy: 90.34
internlm3-8b-instruct_hf-vllm:
demo_gsm8k_accuracy: 81.25
race-middle_accuracy: 92.20
race-high_accuracy: 89.88
internlm2_5-7b-chat_hf:
demo_gsm8k_accuracy: 87.50
@ -29,6 +34,6 @@ internlm2_5-7b-chat_hf:
race-high_accuracy: 90.48
lmdeploy-api-test:
gsm8k_accuracy: 68.75
race-middle_accuracy: 87.50
gsm8k_accuracy: 56.25
race-middle_accuracy: 93.75
race-high_accuracy: 93.75

View File

@ -9,7 +9,7 @@ internlm2_5-7b-chat-hf_fullbench:
drop_accuracy: 81.25
GPQA_diamond_accuracy: 25
hellaswag_accuracy: 87.5
TheoremQA_score: 18.75
TheoremQA_score: 12.50
musr_average_naive_average: 39.58
korbench_single_naive_average: 40
gsm8k_accuracy: 62.50
@ -39,15 +39,15 @@ internlm2_5-7b-chat-hf_fullbench:
college_knowledge_naive_average: 87.5
subjective:
alignment_bench_v1_1_总分: 0.66
alpaca_eval_total: 20
alpaca_eval_total: 0
arenahard_score: 50
Followbench_naive_average: 1
CompassArena_naive_average: 43
mtbench101_avg: 7.8
wildbench_average: -12.78
wildbench_average: -15.56
simpleqa_accuracy_given_attempted: 0
chinese_simpleqa_given_attempted_accuracy: 1
alignment_bench_v1_1_专业能力: 7.90
alignment_bench_v1_1_专业能力: 8.00
alignment_bench_v1_1_数学计算: 0
alignment_bench_v1_1_基本任务: 0
alignment_bench_v1_1_逻辑推理: 0
@ -55,7 +55,7 @@ internlm2_5-7b-chat-hf_fullbench:
alignment_bench_v1_1_文本写作: 0
alignment_bench_v1_1_角色扮演: 0
alignment_bench_v1_1_综合问答: 0
alpaca_eval_helpful_base: 20
alpaca_eval_helpful_base: 0
compassarena_language_naive_average: 35
compassarena_knowledge_naive_average: 55
compassarena_reason_v2_naive_average: 40
@ -78,53 +78,53 @@ internlm2_5-7b-chat-hf_fullbench:
internlm2_5-7b-chat-turbomind_fullbench:
objective:
race-high_accuracy: 93.75
ARC-c_accuracy: 93.75
ARC-c_accuracy: 87.50
BoolQ_accuracy: 68.75
triviaqa_wiki_1shot_score: 50
nq_open_1shot_score: 25
IFEval_Prompt-level-strict-accuracy: 56.25
drop_accuracy: 81.25
drop_accuracy: 75
GPQA_diamond_accuracy: 31.25
hellaswag_accuracy: 81.25
TheoremQA_score: 6.25
hellaswag_accuracy: 87.5
TheoremQA_score: 12.5
musr_average_naive_average: 39.58
korbench_single_naive_average: 37.50
gsm8k_accuracy: 68.75
math_accuracy: 68.75
korbench_single_naive_average: 40
gsm8k_accuracy: 62.5
math_accuracy: 75
cmo_fib_accuracy: 6.25
aime2024_accuracy: 6.25
wikibench-wiki-single_choice_cncircular_perf_4: 50.00
wikibench-wiki-single_choice_cncircular_perf_4: 25
sanitized_mbpp_score: 68.75
ds1000_naive_average: 16.96
ds1000_naive_average: 17.86
lcb_code_generation_pass@1: 12.5
lcb_code_execution_pass@1: 43.75
lcb_test_output_pass@1: 25.00
bbh-logical_deduction_seven_objects_score: 50.00
bbh-multistep_arithmetic_two_score: 68.75
mmlu-other_accuracy: 69.71
cmmlu-china-specific_accuracy: 75.83
lcb_test_output_pass@1: 18.75
bbh-logical_deduction_seven_objects_score: 56.25
bbh-multistep_arithmetic_two_score: 75
mmlu-other_accuracy: 72.6
cmmlu-china-specific_accuracy: 78.33
mmlu_pro_math_accuracy: 31.25
ds1000_Pandas_accuracy: 0
ds1000_Pandas_accuracy: 12.5
ds1000_Numpy_accuracy: 0
ds1000_Tensorflow_accuracy: 12.5
ds1000_Scipy_accuracy: 18.75
ds1000_Scipy_accuracy: 25
ds1000_Sklearn_accuracy: 18.75
ds1000_Pytorch_accuracy: 18.75
ds1000_Pytorch_accuracy: 6.25
ds1000_Matplotlib_accuracy: 50.00
openai_mmmlu_lite_AR-XY_accuracy: 37.5
college_naive_average: 12.50
college_knowledge_naive_average: 87.5
subjective:
alignment_bench_v1_1_总分: 0.70
alignment_bench_v1_1_总分: 0.66
alpaca_eval_total: 0
arenahard_score: 50
Followbench_naive_average: 1
CompassArena_naive_average: 38
mtbench101_avg: 7.80
wildbench_average: -4.86
CompassArena_naive_average: 40
mtbench101_avg: 8
wildbench_average: -6.81
simpleqa_accuracy_given_attempted: 0
chinese_simpleqa_given_attempted_accuracy: 1
alignment_bench_v1_1_专业能力: 8.4
alignment_bench_v1_1_专业能力: 7.9
alignment_bench_v1_1_数学计算: 0
alignment_bench_v1_1_基本任务: 0
alignment_bench_v1_1_逻辑推理: 0
@ -134,10 +134,10 @@ internlm2_5-7b-chat-turbomind_fullbench:
alignment_bench_v1_1_综合问答: 0
alpaca_eval_helpful_base: 0
compassarena_language_naive_average: 35
compassarena_knowledge_naive_average: 50
compassarena_reason_v2_naive_average: 30
compassarena_math_v2_naive_average: 50
compassarena_creationv2_zh_naive_average: 25
compassarena_knowledge_naive_average: 45
compassarena_reason_v2_naive_average: 25
compassarena_math_v2_naive_average: 60
compassarena_creationv2_zh_naive_average: 35
followbench_llmeval_en_HSR_AVG: 1
followbench_llmeval_en_SSR_AVG: 1
followbench_llmeval_en_HSR_L1: 1
@ -162,7 +162,7 @@ internlm2_5-7b-hf_fullbench:
drop_accuracy: 62.5
GPQA_diamond_accuracy: 62.5
hellaswag_accuracy: 93.75
TheoremQA_score: 25
TheoremQA_score: 12.50
winogrande_accuracy: 75
gsm8k_accuracy: 37.5
GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
@ -190,20 +190,20 @@ internlm2_5-7b-turbomind_fullbench:
drop_accuracy: 62.5
GPQA_diamond_accuracy: 62.5
hellaswag_accuracy: 93.75
TheoremQA_score: 25.00
TheoremQA_score: 12.50
winogrande_accuracy: 87.5
gsm8k_accuracy: 62.50
GaokaoBench_2010-2022_Math_II_MCQs_score: 81.25
gsm8k_accuracy: 56.25
GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75
GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
math_accuracy: 18.75
wikibench-wiki-single_choice_cncircular_perf_4: 25
sanitized_mbpp_score: 62.50
dingo_en_192_score: 31.25
dingo_en_192_score: 50.00
dingo_zh_170_score: 93.75
mmlu-other_accuracy: 76.92
cmmlu-china-specific_accuracy: 84.17
mmlu_pro_math_accuracy: 18.75
bbh-logical_deduction_seven_objects_score: 50
bbh-logical_deduction_seven_objects_score: 43.75
bbh-multistep_arithmetic_two_score: 56.25
college_naive_average: 12.5
college_knowledge_naive_average: 87.5
@ -391,7 +391,7 @@ internlm2_5-7b-chat-turbomind:
alpaca_eval_total: 25.96
arenahard_score: 17.15
Followbench_naive_average: 0.81
CompassArena_naive_average: 34.61
CompassArena_naive_average: 39.49
FoFo_naive_average: 0.38
mtbench101_avg: 8.01
wildbench_average: -10.49
@ -409,11 +409,11 @@ internlm2_5-7b-chat-turbomind:
alpaca_eval_koala: 28.21
alpaca_eval_oasst: 23.4
alpaca_eval_selfinstruct: 30.95
alpaca_eval_vicuna: 25
compassarena_language_naive_average: 52.5
alpaca_eval_vicuna: 33.75
compassarena_language_naive_average: 58.50
compassarena_knowledge_naive_average: 36
compassarena_reason_v2_naive_average: 35
compassarena_math_v2_naive_average: 19.91
compassarena_math_v2_naive_average: 25.95
compassarena_creationv2_zh_naive_average: 43.64
fofo_test_prompts_overall: 0.35
fofo_test_prompts_cn_overall: 0.41
@ -454,3 +454,530 @@ internlm2_5-7b-chat-1m-turbomind:
longbench_few-shot-learning_score: 51.67
longbench_synthetic-tasks_score: 66.83
longbench_code-completion_score: 45.99
qwen2.5-7b-instruct-turbomind:
objective:
race-high_accuracy: 84.99
ARC-c_accuracy: 92.2
BoolQ_accuracy: 86.7
triviaqa_wiki_1shot_score: 53.06
nq_open_1shot_score: 17.51
mmmlu_lite_naive_average: 54.96
IFEval_Prompt-level-strict-accuracy: 71.53
drop_accuracy: 80.07
bbh_naive_average: 68.81
GPQA_diamond_accuracy: 34.34
hellaswag_accuracy: 85.42
TheoremQA_score: 18.38
musr_average_naive_average: 43.44
korbench_single_naive_average: 39.44
ARC_Prize_Public_Evaluation_accuracy: 0
gsm8k_accuracy: 92.57
GaokaoBench_weighted_average: 80.14
math_accuracy: 73.58
cmo_fib_accuracy: 25
aime2024_accuracy: 16.67
Mathbench_naive_average: 77.33
wikibench-wiki-single_choice_cncircular_perf_4: 34.9
cmmlu_naive_average: 75.97
mmlu_naive_average: 76.01
mmlu_pro_naive_average: 56.12
openai_humaneval_humaneval_pass@1: 83.54
sanitized_mbpp_score: 74.71
humanevalx_naive_average: 48.29
ds1000_naive_average: 18.66
lcb_code_generation_pass@1: 39.5
lcb_code_execution_pass@1: 42.38
lcb_test_output_pass@1: 50.68
bigcodebench_hard_instruct_pass@1: 16.22
bigcodebench_hard_complete_pass@1: 11.49
teval_naive_average: 79.72
SciCode_sub_accuracy: 10.76
qa_dingo_cn_score: 99.01
mmlu_accuracy: 76.01
mmlu-stem_accuracy: 77.59
mmlu-social-science_accuracy: 79.02
mmlu-humanities_accuracy: 72.07
mmlu-other_accuracy: 74.86
cmmlu_accuracy: 75.97
cmmlu-stem_accuracy: 73.09
cmmlu-social-science_accuracy: 75.95
cmmlu-humanities_accuracy: 76.53
cmmlu-other_accuracy: 78.79
cmmlu-china-specific_accuracy: 73.17
mmlu_pro_accuracy: 56.12
mmlu_pro_biology_accuracy: 71.41
mmlu_pro_business_accuracy: 67.68
mmlu_pro_chemistry_accuracy: 54.59
mmlu_pro_computer_science_accuracy: 58.29
mmlu_pro_economics_accuracy: 66.82
mmlu_pro_engineering_accuracy: 42.41
mmlu_pro_health_accuracy: 55.87
mmlu_pro_history_accuracy: 46.46
mmlu_pro_law_accuracy: 28.97
mmlu_pro_math_accuracy: 73.13
mmlu_pro_philosophy_accuracy: 44.89
mmlu_pro_physics_accuracy: 58.43
mmlu_pro_psychology_accuracy: 63.16
mmlu_pro_other_accuracy: 53.57
humanevalx-python_pass@1: 50
humanevalx-cpp_pass@1: 42.07
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 74.39
humanevalx-js_pass@1: 75
ds1000_Pandas_accuracy: 14.09
ds1000_Numpy_accuracy: 8.18
ds1000_Tensorflow_accuracy: 17.78
ds1000_Scipy_accuracy: 15.09
ds1000_Sklearn_accuracy: 10.43
ds1000_Pytorch_accuracy: 4.41
ds1000_Matplotlib_accuracy: 60.65
mmmlu_lite_accuracy: 54.96
openai_mmmlu_lite_AR-XY_accuracy: 42.32
openai_mmmlu_lite_BN-BD_accuracy: 42.25
openai_mmmlu_lite_DE-DE_accuracy: 59.93
openai_mmmlu_lite_ES-LA_accuracy: 66.53
openai_mmmlu_lite_FR-FR_accuracy: 66.88
openai_mmmlu_lite_HI-IN_accuracy: 49.26
openai_mmmlu_lite_ID-ID_accuracy: 61.26
openai_mmmlu_lite_IT-IT_accuracy: 65.47
openai_mmmlu_lite_JA-JP_accuracy: 61.54
openai_mmmlu_lite_KO-KR_accuracy: 60.28
openai_mmmlu_lite_PT-BR_accuracy: 55.51
openai_mmmlu_lite_SW-KE_accuracy: 36.42
openai_mmmlu_lite_YO-NG_accuracy: 32.14
openai_mmmlu_lite_ZH-CN_accuracy: 69.61
college_naive_average: 48
high_naive_average: 59
middle_naive_average: 78
primary_naive_average: 85.67
arithmetic_naive_average: 75.67
mathbench-a (average)_naive_average: 69.27
college_knowledge_naive_average: 83.86
high_knowledge_naive_average: 80.29
middle_knowledge_naive_average: 84.26
primary_knowledge_naive_average: 93.16
mathbench-t (average)_naive_average: 85.39
internlm2_5-7b-chat-pytorch:
objective:
race-high_accuracy: 86.39
ARC-c_accuracy: 90.51
BoolQ_accuracy: 88.01
triviaqa_wiki_1shot_score: 64.77
nq_open_1shot_score: 22.71
mmmlu_lite_naive_average: 45.02
IFEval_Prompt-level-strict-accuracy: 56.56
drop_accuracy: 75.46
bbh_naive_average: 73.34
GPQA_diamond_accuracy: 32.83
hellaswag_accuracy: 94.81
TheoremQA_score: 23.88
musr_average_naive_average: 51.31
korbench_single_naive_average: 32
ARC_Prize_Public_Evaluation_accuracy: 0.01
gsm8k_accuracy: 86.96
GaokaoBench_weighted_average: 78.05
math_accuracy: 60.34
cmo_fib_accuracy: 12.98
aime2024_accuracy: 3.33
Mathbench_naive_average: 64.82
wikibench-wiki-single_choice_cncircular_perf_4: 31.7
cmmlu_naive_average: 74.24
mmlu_naive_average: 70.2
mmlu_pro_naive_average: 45.39
openai_humaneval_humaneval_pass@1: 70.12
sanitized_mbpp_score: 64.59
humanevalx_naive_average: 38.78
ds1000_naive_average: 14.19
lcb_code_generation_pass@1: 16.5
lcb_code_execution_pass@1: 33.82
lcb_test_output_pass@1: 22.62
bigcodebench_hard_instruct_pass@1: 6.08
bigcodebench_hard_complete_pass@1: 6.76
teval_naive_average: 79.73
SciCode_sub_accuracy: 3.47
qa_dingo_cn_score: 100
mmlu_accuracy: 70.2
mmlu-stem_accuracy: 67.73
mmlu-social-science_accuracy: 75.49
mmlu-humanities_accuracy: 68.56
mmlu-other_accuracy: 70.58
cmmlu_accuracy: 74.24
cmmlu-stem_accuracy: 66.7
cmmlu-social-science_accuracy: 75.88
cmmlu-humanities_accuracy: 77.56
cmmlu-other_accuracy: 77.52
cmmlu-china-specific_accuracy: 73.46
mmlu_pro_accuracy: 45.39
mmlu_pro_biology_accuracy: 65.83
mmlu_pro_business_accuracy: 51.96
mmlu_pro_chemistry_accuracy: 36.84
mmlu_pro_computer_science_accuracy: 48.29
mmlu_pro_economics_accuracy: 56.16
mmlu_pro_engineering_accuracy: 29.1
mmlu_pro_health_accuracy: 44.5
mmlu_pro_history_accuracy: 42.26
mmlu_pro_law_accuracy: 24.98
mmlu_pro_math_accuracy: 54.85
mmlu_pro_philosophy_accuracy: 39.28
mmlu_pro_physics_accuracy: 37.41
mmlu_pro_psychology_accuracy: 58.27
mmlu_pro_other_accuracy: 45.78
humanevalx-python_pass@1: 56.1
humanevalx-cpp_pass@1: 20.73
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 59.15
humanevalx-js_pass@1: 57.93
ds1000_Pandas_accuracy: 8.93
ds1000_Numpy_accuracy: 4.09
ds1000_Tensorflow_accuracy: 11.11
ds1000_Scipy_accuracy: 7.55
ds1000_Sklearn_accuracy: 7.83
ds1000_Pytorch_accuracy: 8.82
ds1000_Matplotlib_accuracy: 50.97
mmmlu_lite_accuracy: 45.02
openai_mmmlu_lite_AR-XY_accuracy: 18.6
openai_mmmlu_lite_BN-BD_accuracy: 27.58
openai_mmmlu_lite_DE-DE_accuracy: 51.23
openai_mmmlu_lite_ES-LA_accuracy: 56.63
openai_mmmlu_lite_FR-FR_accuracy: 58.11
openai_mmmlu_lite_HI-IN_accuracy: 33.82
openai_mmmlu_lite_ID-ID_accuracy: 50.39
openai_mmmlu_lite_IT-IT_accuracy: 50.39
openai_mmmlu_lite_JA-JP_accuracy: 50.95
openai_mmmlu_lite_KO-KR_accuracy: 45.05
openai_mmmlu_lite_PT-BR_accuracy: 57.89
openai_mmmlu_lite_SW-KE_accuracy: 32.14
openai_mmmlu_lite_YO-NG_accuracy: 32.14
openai_mmmlu_lite_ZH-CN_accuracy: 65.33
college_naive_average: 21
high_naive_average: 47
middle_naive_average: 59.67
primary_naive_average: 76
arithmetic_naive_average: 62
mathbench-a (average)_naive_average: 53.13
college_knowledge_naive_average: 68.99
high_knowledge_naive_average: 70.06
middle_knowledge_naive_average: 78.53
primary_knowledge_naive_average: 88.49
mathbench-t (average)_naive_average: 76.51
qwen2.5-7b-instruct-pytorch:
objective:
race-high_accuracy: 85.16
ARC-c_accuracy: 90.85
BoolQ_accuracy: 86.61
triviaqa_wiki_1shot_score: 52.96
nq_open_1shot_score: 17.62
mmmlu_lite_naive_average: 54.7
IFEval_Prompt-level-strict-accuracy: 71.35
drop_accuracy: 80.23
bbh_naive_average: 68.88
GPQA_diamond_accuracy: 36.36
hellaswag_accuracy: 85.49
TheoremQA_score: 18.38
musr_average_naive_average: 43.3
korbench_single_naive_average: 39.44
ARC_Prize_Public_Evaluation_accuracy: 0
gsm8k_accuracy: 91.66
GaokaoBench_weighted_average: 80.02
math_accuracy: 73.74
cmo_fib_accuracy: 26.44
aime2024_accuracy: 13.33
Mathbench_naive_average: 77.08
wikibench-wiki-single_choice_cncircular_perf_4: 34
cmmlu_naive_average: 75.9
mmlu_naive_average: 76.27
mmlu_pro_naive_average: 56.14
openai_humaneval_humaneval_pass@1: 84.76
sanitized_mbpp_score: 74.71
humanevalx_naive_average: 48.17
ds1000_naive_average: 18.57
lcb_code_generation_pass@1: 38.75
lcb_code_execution_pass@1: 42.38
lcb_test_output_pass@1: 50.45
bigcodebench_hard_instruct_pass@1: 16.89
bigcodebench_hard_complete_pass@1: 12.16
teval_naive_average: 79.46
SciCode_sub_accuracy: 10.42
qa_dingo_cn_score: 100
mmlu_accuracy: 76.27
mmlu-stem_accuracy: 77.75
mmlu-social-science_accuracy: 78.65
mmlu-humanities_accuracy: 73.12
mmlu-other_accuracy: 75.05
cmmlu_accuracy: 75.9
cmmlu-stem_accuracy: 73.41
cmmlu-social-science_accuracy: 75.97
cmmlu-humanities_accuracy: 76.42
cmmlu-other_accuracy: 78.15
cmmlu-china-specific_accuracy: 73.27
mmlu_pro_accuracy: 56.14
mmlu_pro_biology_accuracy: 72.25
mmlu_pro_business_accuracy: 66.16
mmlu_pro_chemistry_accuracy: 55.65
mmlu_pro_computer_science_accuracy: 60.24
mmlu_pro_economics_accuracy: 66.82
mmlu_pro_engineering_accuracy: 41.38
mmlu_pro_health_accuracy: 54.89
mmlu_pro_history_accuracy: 46.46
mmlu_pro_law_accuracy: 29.06
mmlu_pro_math_accuracy: 73.58
mmlu_pro_philosophy_accuracy: 44.89
mmlu_pro_physics_accuracy: 60.05
mmlu_pro_psychology_accuracy: 61.9
mmlu_pro_other_accuracy: 52.6
humanevalx-python_pass@1: 51.83
humanevalx-cpp_pass@1: 42.68
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 73.78
humanevalx-js_pass@1: 72.56
ds1000_Pandas_accuracy: 14.09
ds1000_Numpy_accuracy: 8.64
ds1000_Tensorflow_accuracy: 17.78
ds1000_Scipy_accuracy: 15.09
ds1000_Sklearn_accuracy: 8.7
ds1000_Pytorch_accuracy: 4.41
ds1000_Matplotlib_accuracy: 61.29
mmmlu_lite_accuracy: 54.7
openai_mmmlu_lite_AR-XY_accuracy: 42.32
openai_mmmlu_lite_BN-BD_accuracy: 42.18
openai_mmmlu_lite_DE-DE_accuracy: 60
openai_mmmlu_lite_ES-LA_accuracy: 66.18
openai_mmmlu_lite_FR-FR_accuracy: 66.88
openai_mmmlu_lite_HI-IN_accuracy: 48.63
openai_mmmlu_lite_ID-ID_accuracy: 61.26
openai_mmmlu_lite_IT-IT_accuracy: 65.26
openai_mmmlu_lite_JA-JP_accuracy: 60.7
openai_mmmlu_lite_KO-KR_accuracy: 60.63
openai_mmmlu_lite_PT-BR_accuracy: 54.46
openai_mmmlu_lite_SW-KE_accuracy: 36
openai_mmmlu_lite_YO-NG_accuracy: 31.86
openai_mmmlu_lite_ZH-CN_accuracy: 69.4
college_naive_average: 48.33
high_naive_average: 59.33
middle_naive_average: 76.67
primary_naive_average: 86.67
arithmetic_naive_average: 74.33
mathbench-a (average)_naive_average: 69.07
college_knowledge_naive_average: 83.54
high_knowledge_naive_average: 80.82
middle_knowledge_naive_average: 83.79
primary_knowledge_naive_average: 92.22
mathbench-t (average)_naive_average: 85.1
internlm3-8b-instruct-turbomind:
objective:
race-high_accuracy: 89.22
ARC-c_accuracy: 92.54
BoolQ_accuracy: 86.45
triviaqa_wiki_1shot_score: 60.72
nq_open_1shot_score: 20.25
mmmlu_lite_naive_average: 41.82
IFEval_Prompt-level-strict-accuracy: 77.45
drop_accuracy: 83.27
bbh_naive_average: 55.22
GPQA_diamond_accuracy: 37.88
hellaswag_accuracy: 91.28
TheoremQA_score: 20.12
musr_average_naive_average: 36.86
korbench_single_naive_average: 41.2
ARC_Prize_Public_Evaluation_accuracy: 0.06
gsm8k_accuracy: 91.28
GaokaoBench_weighted_average: 86.59
math_accuracy: 76.96
cmo_fib_accuracy: 35.1
aime2024_accuracy: 16.67
Mathbench_naive_average: 78.96
wikibench-wiki-single_choice_cncircular_perf_4: 37.45
cmmlu_naive_average: 83.33
mmlu_naive_average: 76.21
mmlu_pro_naive_average: 57.96
openai_humaneval_humaneval_pass@1: 81.71
sanitized_mbpp_score: 69.65
humanevalx_naive_average: 40.73
ds1000_naive_average: 27.23
lcb_code_generation_pass@1: 34.75
lcb_code_execution_pass@1: 49.9
lcb_test_output_pass@1: 48.19
bigcodebench_hard_instruct_pass@1: 13.51
bigcodebench_hard_complete_pass@1: 15.54
teval_naive_average: 82.86
SciCode_sub_accuracy: 11.11
qa_dingo_cn_score: 100
mmlu_accuracy: 76.21
mmlu-stem_accuracy: 77.7
mmlu-social-science_accuracy: 80.98
mmlu-humanities_accuracy: 70.83
mmlu-other_accuracy: 75.01
cmmlu_accuracy: 83.33
cmmlu-stem_accuracy: 79.66
cmmlu-social-science_accuracy: 83.39
cmmlu-humanities_accuracy: 84.73
cmmlu-other_accuracy: 86.2
cmmlu-china-specific_accuracy: 81.77
mmlu_pro_accuracy: 57.96
mmlu_pro_biology_accuracy: 75.45
mmlu_pro_business_accuracy: 64.64
mmlu_pro_chemistry_accuracy: 59.81
mmlu_pro_computer_science_accuracy: 60.24
mmlu_pro_economics_accuracy: 68.6
mmlu_pro_engineering_accuracy: 44.79
mmlu_pro_health_accuracy: 58.31
mmlu_pro_history_accuracy: 49.87
mmlu_pro_law_accuracy: 32.43
mmlu_pro_math_accuracy: 70.17
mmlu_pro_philosophy_accuracy: 46.89
mmlu_pro_physics_accuracy: 59.58
mmlu_pro_psychology_accuracy: 66.29
mmlu_pro_other_accuracy: 54.33
humanevalx-python_pass@1: 43.9
humanevalx-cpp_pass@1: 20.12
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 74.39
humanevalx-js_pass@1: 65.24
ds1000_Pandas_accuracy: 16.49
ds1000_Numpy_accuracy: 34.09
ds1000_Tensorflow_accuracy: 26.67
ds1000_Scipy_accuracy: 17.92
ds1000_Sklearn_accuracy: 20.87
ds1000_Pytorch_accuracy: 19.12
ds1000_Matplotlib_accuracy: 55.48
mmmlu_lite_accuracy: 41.82
openai_mmmlu_lite_AR-XY_accuracy: 32.56
openai_mmmlu_lite_BN-BD_accuracy: 4.56
openai_mmmlu_lite_DE-DE_accuracy: 24.91
openai_mmmlu_lite_ES-LA_accuracy: 51.09
openai_mmmlu_lite_FR-FR_accuracy: 61.68
openai_mmmlu_lite_HI-IN_accuracy: 24.98
openai_mmmlu_lite_ID-ID_accuracy: 44.56
openai_mmmlu_lite_IT-IT_accuracy: 52.35
openai_mmmlu_lite_JA-JP_accuracy: 51.02
openai_mmmlu_lite_KO-KR_accuracy: 47.93
openai_mmmlu_lite_PT-BR_accuracy: 53.89
openai_mmmlu_lite_SW-KE_accuracy: 33.47
openai_mmmlu_lite_YO-NG_accuracy: 33.47
openai_mmmlu_lite_ZH-CN_accuracy: 69.05
college_naive_average: 45.67
high_naive_average: 64.67
middle_naive_average: 82.33
primary_naive_average: 90.33
arithmetic_naive_average: 74
mathbench-a (average)_naive_average: 71.4
college_knowledge_naive_average: 85.28
high_knowledge_naive_average: 79.43
middle_knowledge_naive_average: 87.9
primary_knowledge_naive_average: 93.42
mathbench-t (average)_naive_average: 86.51
internlm3-8b-instruct-pytorch:
objective:
race-high_accuracy: 89.02
ARC-c_accuracy: 93.56
BoolQ_accuracy: 86.67
triviaqa_wiki_1shot_score: 60.54
nq_open_1shot_score: 20.3
mmmlu_lite_naive_average: 42.6
IFEval_Prompt-level-strict-accuracy: 79.11
drop_accuracy: 83.32
bbh_naive_average: 54.76
GPQA_diamond_accuracy: 33.84
hellaswag_accuracy: 91.31
TheoremQA_score: 18
musr_average_naive_average: 36.62
korbench_single_naive_average: 41.84
ARC_Prize_Public_Evaluation_accuracy: 0.06
gsm8k_accuracy: 90.67
GaokaoBench_weighted_average: 86.27
math_accuracy: 76.68
cmo_fib_accuracy: 33.65
aime2024_accuracy: 10
Mathbench_naive_average: 78.92
wikibench-wiki-single_choice_cncircular_perf_4: 37.35
cmmlu_naive_average: 83.11
mmlu_naive_average: 76.23
mmlu_pro_naive_average: 58.16
openai_humaneval_humaneval_pass@1: 82.32
sanitized_mbpp_score: 70.04
humanevalx_naive_average: 39.76
ds1000_naive_average: 27.84
lcb_code_generation_pass@1: 34.5
lcb_code_execution_pass@1: 48.02
lcb_test_output_pass@1: 47.74
bigcodebench_hard_instruct_pass@1: 12.84
bigcodebench_hard_complete_pass@1: 15.54
teval_naive_average: 82.86
SciCode_sub_accuracy: 9.38
qa_dingo_cn_score: 100
mmlu_accuracy: 76.23
mmlu-stem_accuracy: 78.08
mmlu-social-science_accuracy: 80.31
mmlu-humanities_accuracy: 71.38
mmlu-other_accuracy: 74.63
cmmlu_accuracy: 83.11
cmmlu-stem_accuracy: 79.42
cmmlu-social-science_accuracy: 83.34
cmmlu-humanities_accuracy: 83.95
cmmlu-other_accuracy: 86.22
cmmlu-china-specific_accuracy: 81.5
mmlu_pro_accuracy: 58.16
mmlu_pro_biology_accuracy: 74.62
mmlu_pro_business_accuracy: 65.02
mmlu_pro_chemistry_accuracy: 60.69
mmlu_pro_computer_science_accuracy: 61.46
mmlu_pro_economics_accuracy: 68.25
mmlu_pro_engineering_accuracy: 45.3
mmlu_pro_health_accuracy: 60.15
mmlu_pro_history_accuracy: 50.66
mmlu_pro_law_accuracy: 31.7
mmlu_pro_math_accuracy: 70.32
mmlu_pro_philosophy_accuracy: 47.7
mmlu_pro_physics_accuracy: 59.51
mmlu_pro_psychology_accuracy: 65.41
mmlu_pro_other_accuracy: 53.46
humanevalx-python_pass@1: 42.68
humanevalx-cpp_pass@1: 19.51
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 72.56
humanevalx-js_pass@1: 64.02
ds1000_Pandas_accuracy: 14.09
ds1000_Numpy_accuracy: 35
ds1000_Tensorflow_accuracy: 24.44
ds1000_Scipy_accuracy: 20.75
ds1000_Sklearn_accuracy: 21.74
ds1000_Pytorch_accuracy: 22.06
ds1000_Matplotlib_accuracy: 56.77
mmmlu_lite_accuracy: 42.6
openai_mmmlu_lite_AR-XY_accuracy: 32.84
openai_mmmlu_lite_BN-BD_accuracy: 10.46
openai_mmmlu_lite_DE-DE_accuracy: 24.56
openai_mmmlu_lite_ES-LA_accuracy: 50.95
openai_mmmlu_lite_FR-FR_accuracy: 61.05
openai_mmmlu_lite_HI-IN_accuracy: 30.6
openai_mmmlu_lite_ID-ID_accuracy: 45.89
openai_mmmlu_lite_IT-IT_accuracy: 51.79
openai_mmmlu_lite_JA-JP_accuracy: 51.65
openai_mmmlu_lite_KO-KR_accuracy: 48.77
openai_mmmlu_lite_PT-BR_accuracy: 52.7
openai_mmmlu_lite_SW-KE_accuracy: 32.91
openai_mmmlu_lite_YO-NG_accuracy: 32.84
openai_mmmlu_lite_ZH-CN_accuracy: 69.33
college_naive_average: 47
high_naive_average: 66.67
middle_naive_average: 81.67
primary_naive_average: 89.33
arithmetic_naive_average: 73.67
mathbench-a (average)_naive_average: 71.67
college_knowledge_naive_average: 82.91
high_knowledge_naive_average: 79.86
middle_knowledge_naive_average: 88.92
primary_knowledge_naive_average: 92.96
mathbench-t (average)_naive_average: 86.16

View File

@ -1,7 +1,7 @@
chat:
glm-4-9b-chat-hf:
gsm8k_accuracy: 68.75
race-high_accuracy: 90.62
gsm8k_accuracy: 56.25
race-high_accuracy: 84.38
glm-4-9b-chat-turbomind:
gsm8k_accuracy: 71.88
race-high_accuracy: 90.62
@ -11,11 +11,14 @@ chat:
deepseek-7b-chat-hf:
gsm8k_accuracy: 46.88
race-high_accuracy: 81.25
deepseek-moe-16b-chat-hf:
gsm8k_accuracy: 50
race-high_accuracy: 68.75
deepseek-r1-distill-llama-8b-turbomind:
gsm8k_accuracy: 31.25
race-high_accuracy: 81.25
deepseek-r1-distill-qwen-1_5b-turbomind:
gsm8k_accuracy: 37.5
race-high_accuracy: 53.12
deepseek-7b-chat-vllm:
gsm8k_accuracy: 50
gsm8k_accuracy: 43.75
race-high_accuracy: 78.12
gemma2-2b-it-hf:
gsm8k_accuracy: 50
@ -36,34 +39,40 @@ chat:
gsm8k_accuracy: 78.12
race-high_accuracy: 93.75
gemma-7b-it-vllm:
gsm8k_accuracy: 46.88
gsm8k_accuracy: 31.25
race-high_accuracy: 68.75
internlm2_5-7b-chat-hf:
gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
internlm3-8b-instruct-hf:
gsm8k_accuracy: 65.62
race-high_accuracy: 87.5
internlm2_5-7b-chat-turbomind:
gsm8k_accuracy: 87.50
gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
internlm2-chat-1.8b-turbomind:
gsm8k_accuracy: 28.12
race-high_accuracy: 84.38
internlm2-chat-1.8b-sft-turbomind:
gsm8k_accuracy: 21.88
gsm8k_accuracy: 31.25
race-high_accuracy: 84.38
internlm2-chat-7b-lmdeploy:
gsm8k_accuracy: 53.12
gsm8k_accuracy: 59.38
race-high_accuracy: 84.38
internlm2-chat-7b-sft-turbomind:
gsm8k_accuracy: 53.12
gsm8k_accuracy: 56.25
race-high_accuracy: 90.62
internlm3-8b-instruct-turbomind:
gsm8k_accuracy: 68.75
race-high_accuracy: 87.5
internlm2-chat-7b-vllm:
gsm8k_accuracy: 43.75
race-high_accuracy: 84.38
gsm8k_accuracy: 59.38
race-high_accuracy: 87.50
llama-3_1-8b-instruct-hf:
gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
llama-3_2-3b-instruct-hf:
gsm8k_accuracy: 68.75
gsm8k_accuracy: 71.88
race-high_accuracy: 81.25
llama-3-8b-instruct-hf:
gsm8k_accuracy: 68.75
@ -72,14 +81,14 @@ chat:
gsm8k_accuracy: 18.75
race-high_accuracy: 46.88
llama-3_1-8b-instruct-turbomind:
gsm8k_accuracy: 78.12
gsm8k_accuracy: 81.25
race-high_accuracy: 90.62
llama-3_2-3b-instruct-turbomind:
gsm8k_accuracy: 65.62
gsm8k_accuracy: 68.75
race-high_accuracy: 81.25
llama-3-8b-instruct-turbomind:
gsm8k_accuracy: 71.88
race-high_accuracy: 87.5
gsm8k_accuracy: 68.75
race-high_accuracy: 84.38
mistral-7b-instruct-v0.2-hf:
gsm8k_accuracy: 40.62
race-high_accuracy: 75
@ -94,13 +103,10 @@ chat:
race-high_accuracy: 78.12
mistral-7b-instruct-v0.1-vllm:
gsm8k_accuracy: 34.38
race-high_accuracy: 68.75
race-high_accuracy: 65.62
mistral-7b-instruct-v0.2-vllm:
gsm8k_accuracy: 31.25
race-high_accuracy: 75
phi-3-mini-4k-instruct-hf:
gsm8k_accuracy: 81.25
race-high_accuracy: 87.50
gsm8k_accuracy: 21.88
race-high_accuracy: 78.12
qwen2.5-0.5b-instruct-hf:
gsm8k_accuracy: 34.38
race-high_accuracy: 46.88
@ -108,10 +114,10 @@ chat:
gsm8k_accuracy: 53.12
race-high_accuracy: 90.62
qwen2.5-0.5b-instruct-turbomind:
gsm8k_accuracy: 28.12
race-high_accuracy: 50
gsm8k_accuracy: 31.25
race-high_accuracy: 43.75
qwen2.5-3b-instruct-turbomind:
gsm8k_accuracy: 59.38
gsm8k_accuracy: 56.25
race-high_accuracy: 90.62
qwen1.5-0.5b-chat-hf:
gsm8k_accuracy: 0
@ -123,11 +129,11 @@ chat:
gsm8k_accuracy: 68.75
race-high_accuracy: 90.62
qwen2-1.5b-instruct-turbomind:
gsm8k_accuracy: 53.12
gsm8k_accuracy: 56.25
race-high_accuracy: 84.38
qwen2-7b-instruct-turbomind:
gsm8k_accuracy: 81.25
race-high_accuracy: 90.62
race-high_accuracy: 87.50
qwen1.5-0.5b-chat-vllm:
gsm8k_accuracy: 3.12
race-high_accuracy: 53.12
@ -143,11 +149,11 @@ chat:
yi-1.5-9b-chat-turbomind:
gsm8k_accuracy: 71.88
race-high_accuracy: 93.75
deepseek-v2-lite-chat-hf:
gsm8k_accuracy: 46.88
deepseek-v2_lite-chat-turbomind:
gsm8k_accuracy: 37.5
race-high_accuracy: 71.88
gemma2-27b-it-hf:
gsm8k_accuracy: 75
gsm8k_accuracy: 71.88
race-high_accuracy: 93.75
internlm2_5-20b-chat-hf:
gsm8k_accuracy: 84.38
@ -161,6 +167,9 @@ chat:
mistral-small-instruct-2409-turbomind:
gsm8k_accuracy: 81.25
race-high_accuracy: 87.50
phi-4:
gsm8k_accuracy: 81.25
race-high_accuracy: 87.50
qwen2.5-14b-instruct-hf:
gsm8k_accuracy: 71.88
race-high_accuracy: 96.88
@ -168,40 +177,41 @@ chat:
gsm8k_accuracy: 68.75
race-high_accuracy: 93.75
yi-1.5-34b-chat-turbomind:
gsm8k_accuracy: 78.12
gsm8k_accuracy: 75.00
race-high_accuracy: 93.75
deepseek-67b-chat-hf:
gsm8k_accuracy: 71.88
deepseek-67b-chat-turbomind:
gsm8k_accuracy: 75.00
race-high_accuracy: 78.12
deepseek-r1-distill-qwen-32b-turbomind:
gsm8k_accuracy: 25
race-high_accuracy: 90.62
llama-3_3-70b-instruct-turbomind:
gsm8k_accuracy: 93.75
race-high_accuracy: 87.5
mixtral-8x7b-instruct-v0.1-hf:
gsm8k_accuracy: 59.38
race-high_accuracy: 81.25
mixtral-large-instruct-2411-turbomind:
gsm8k_accuracy: 90.62
gsm8k_accuracy: 87.50
race-high_accuracy: 93.75
nvidia-3_1-Nemotron-70b-instruct-HF-turbomind:
gsm8k_accuracy: 87.5
race-high_accuracy: 46.88
gsm8k_accuracy: 93.75
race-high_accuracy: 50.00
qwen2.5-72b-instruct-turbomind:
gsm8k_accuracy: 75
race-high_accuracy: 93.75
gsm8k_accuracy: 81.25
race-high_accuracy: 90.62
deepseek-r1-distill-llama-70b-turbomind:
gsm8k_accuracy: 40.62
race-high_accuracy: 90.62
deepseek-v2_5-1210-turbomind:
gsm8k_accuracy: 90.62
race-high_accuracy: 84.38
mixtral-8x22b-instruct-v0.1-hf:
gsm8k_accuracy: 81.25
race-high_accuracy: 81.25
mixtral-8x22b-instruct-v0.1-turbomind:
gsm8k_accuracy: 78.12
race-high_accuracy: 78.12
mixtral-8x22b-instruct-v0.1-vllm:
gsm8k_accuracy: 78.12
race-high_accuracy: 78.12
base:
glm-4-9b-hf:
gsm8k_accuracy: 68.75
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
glm-4-9b-turbomind:
gsm8k_accuracy: 62.5
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
@ -210,15 +220,10 @@ base:
GPQA_diamond_accuracy: 0
race-high_accuracy: 46.88
winogrande_accuracy: 71.88
deepseek-moe-16b-base-hf:
gsm8k_accuracy: 21.88
GPQA_diamond_accuracy: 0
race-high_accuracy: 21.88
winogrande_accuracy: 65.62
deepseek-7b-base-turbomind:
gsm8k_accuracy: 21.88
gsm8k_accuracy: 18.75
GPQA_diamond_accuracy: 0
race-high_accuracy: 46.88
race-high_accuracy: 43.75
winogrande_accuracy: 84.38
deepseek-moe-16b-base-vllm:
gsm8k_accuracy: 21.88
@ -245,16 +250,21 @@ base:
GPQA_diamond_accuracy: 3.12
race-high_accuracy: 65.62
winogrande_accuracy: 71.88
gemma-2-9b-turbomind:
gsm8k_accuracy: 68.75
GPQA_diamond_accuracy: 0
race-high_accuracy: 18.75
winogrande_accuracy: 46.88
gemma-2b-vllm:
gsm8k_accuracy: 15.62
GPQA_diamond_accuracy: 3.12
race-high_accuracy:
winogrande_accuracy:
race-high_accuracy: 28.12
winogrande_accuracy: 68.75
gemma-7b-vllm:
gsm8k_accuracy: 53.12
GPQA_diamond_accuracy: 9.38
race-high_accuracy:
winogrande_accuracy:
gsm8k_accuracy: 43.75
GPQA_diamond_accuracy: 6.25
race-high_accuracy: 81.25
winogrande_accuracy: 81.25
internlm2_5-7b-hf:
gsm8k_accuracy: 37.5
GPQA_diamond_accuracy: 25
@ -265,31 +275,26 @@ base:
GPQA_diamond_accuracy: 18.75
race-high_accuracy: 62.5
winogrande_accuracy: 78.12
internlm2-base-7b-hf:
gsm8k_accuracy: 3.12
GPQA_diamond_accuracy: 21.88
race-high_accuracy: 75
winogrande_accuracy: 65.62
internlm2-1.8b-turbomind:
gsm8k_accuracy: 12.5
GPQA_diamond_accuracy: 9.38
gsm8k_accuracy: 6.25
GPQA_diamond_accuracy: 12.5
race-high_accuracy: 71.88
winogrande_accuracy: 78.12
internlm2_5-7b-turbomind:
gsm8k_accuracy: 62.50
GPQA_diamond_accuracy: 34.38
race-high_accuracy: 93.75
winogrande_accuracy: 87.50
internlm2-7b-turbomind:
gsm8k_accuracy: 53.12
GPQA_diamond_accuracy: 21.88
race-high_accuracy: 71.88
winogrande_accuracy: 84.38
internlm2-base-7b-turbomind:
gsm8k_accuracy: 37.50
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 81.25
winogrande_accuracy: 75
internlm2_5-7b-turbomind:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 93.75
winogrande_accuracy: 87.5
internlm2-7b-turbomind:
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 34.38
race-high_accuracy: 78.12
winogrande_accuracy: 71.88
internlm2-base-7b-turbomind:
gsm8k_accuracy: 28.12
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 71.88
winogrande_accuracy: 62.50
llama-2-7b-hf:
gsm8k_accuracy: 21.88
GPQA_diamond_accuracy: 21.88
@ -306,15 +311,15 @@ base:
race-high_accuracy: 65.62
winogrande_accuracy: 65.62
llama-3.1-8b-turbomind:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 9.38
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 15.62
race-high_accuracy: 78.12
winogrande_accuracy: 78.12
llama-3-8b-turbomind:
gsm8k_accuracy: 50
gsm8k_accuracy: 46.88
GPQA_diamond_accuracy: 12.50
race-high_accuracy: 65.62
winogrande_accuracy: 78.12
winogrande_accuracy: 81.25
mistral-7b-v0.3-hf:
gsm8k_accuracy: 31.25
GPQA_diamond_accuracy: 6.25
@ -326,15 +331,15 @@ base:
race-high_accuracy: 87.5
winogrande_accuracy: 71.88
qwen2.5-1.5b-turbomind:
gsm8k_accuracy: 62.50
GPQA_diamond_accuracy: 12.50
race-high_accuracy: 78.12
winogrande_accuracy: 68.75
qwen2.5-7b-turbomind:
gsm8k_accuracy: 75.00
GPQA_diamond_accuracy: 25
race-high_accuracy: 87.5
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 18.75
race-high_accuracy: 75
winogrande_accuracy: 71.88
qwen2.5-7b-turbomind:
gsm8k_accuracy: 71.88
GPQA_diamond_accuracy: 18.75
race-high_accuracy: 87.5
winogrande_accuracy: 75.00
qwen1.5-moe-a2.7b-hf:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 18.75
@ -356,20 +361,20 @@ base:
race-high_accuracy: 87.5
winogrande_accuracy: 68.75
qwen2-1.5b-turbomind:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 9.38
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 6.25
race-high_accuracy: 81.25
winogrande_accuracy: 75
qwen2-7b-turbomind:
gsm8k_accuracy: 75.00
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 12.5
race-high_accuracy: 87.5
winogrande_accuracy: 71.88
winogrande_accuracy: 75
qwen1.5-0.5b-vllm:
gsm8k_accuracy: 9.38
GPQA_diamond_accuracy: 0
race-high_accuracy: 56.25
winogrande_accuracy: 62.5
winogrande_accuracy: 59.38
yi-1.5-6b-hf:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 3.12
@ -384,25 +389,10 @@ base:
gsm8k_accuracy: 78.12
GPQA_diamond_accuracy: 40.62
race-high_accuracy: 87.5
winogrande_accuracy: 71.88
deepseek-v2-lite-hf:
gsm8k_accuracy: 31.25
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 59.38
winogrande_accuracy: 71.88
internlm2-20b-hf:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 15.62
race-high_accuracy: 68.75
winogrande_accuracy: 75
internlm2-base-20b-hf:
gsm8k_accuracy: 12.5
GPQA_diamond_accuracy: 9.38
race-high_accuracy: 84.38
winogrande_accuracy: 65.62
internlm2-20b-turbomind:
gsm8k_accuracy: 71.88
GPQA_diamond_accuracy: 15.62
GPQA_diamond_accuracy: 18.75
race-high_accuracy: 68.75
winogrande_accuracy: 81.25
qwen2.5-14b-hf:
@ -420,33 +410,23 @@ base:
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 93.75
winogrande_accuracy: 81.25
deepseek-67b-base-hf:
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 81.25
winogrande_accuracy: 90.62
deepseek-67b-base-turbomind:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 81.25
winogrande_accuracy: 84.38
llama-3-70b-turbomind:
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 9.38
GPQA_diamond_accuracy: 34.38
race-high_accuracy: 78.12
winogrande_accuracy: 81.25
llama-3-70b-turbomind:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 15.62
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
qwen2.5-72b-turbomind:
gsm8k_accuracy: 84.38
GPQA_diamond_accuracy: 34.38
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 93.75
winogrande_accuracy: 87.5
deepseek-v2-turbomind:
gsm8k_accuracy: 65.62
GPQA_diamond_accuracy: 15.62
GPQA_diamond_accuracy: 9.38
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
llama-3-70b-hf:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 3.12
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
winogrande_accuracy: 81.25

View File

@ -44,7 +44,7 @@ on:
type: string
default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']"
schedule:
- cron: '15 14 * * 0,2'
- cron: '15 14 * * 0,3'
env:
HF_DATASETS_OFFLINE: 1
@ -61,6 +61,7 @@ env:
HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
CONDA_ENV: regression_test
export VLLM_WORKER_MULTIPROC_METHOD: spawn
jobs:
build-pypi:
@ -92,7 +93,6 @@ jobs:
matrix:
pyver: [py310]
runs-on: ubuntu-latest
environment: 'prod'
env:
PYTHON_VERSION: ${{ matrix.pyver }}
PLAT_NAME: manylinux2014_x86_64
@ -126,7 +126,6 @@ jobs:
if: ${{!cancelled()}}
needs: ['build-pypi', 'build-pypi-lmdeploy']
runs-on: volc_cu12
environment: 'prod'
timeout-minutes: 120 #2hours
steps:
- name: Clone repository
@ -190,7 +189,6 @@ jobs:
matrix:
regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
runs-on: volc_cu12_daily
environment: 'prod'
timeout-minutes: 180 #3hours
steps:
- name: Clone repository
@ -231,7 +229,6 @@ jobs:
matrix:
regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}}
runs-on: volc_cu12_local
environment: 'prod'
timeout-minutes: 480 #8hours
steps:
- name: Clone repository
@ -258,27 +255,33 @@ jobs:
conda info --envs
export from_tf=TRUE
python tools/list_configs.py internlm2_5 mmlu
opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
opencompass --models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a vllm --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily
python -m pytest -m case5 -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run model test - api
if: matrix.regression_func == 'api'
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
lmdeploy serve api_server internlm/internlm3-8b-instruct --max-batch-size 256 --model-name internlm3 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
echo "restful_pid=$!" >> "$GITHUB_ENV"
sleep 180s
env | grep PROXY
env | grep proxy
unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy;
opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
@ -307,7 +310,6 @@ jobs:
matrix:
function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}}
runs-on: volc_cu12
environment: 'prod'
timeout-minutes: 480 #8hours
steps:
- name: Clone repository
@ -341,7 +343,6 @@ jobs:
needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
timeout-minutes: 5
runs-on: self-hosted
environment: 'prod'
steps:
- name: notify
run: |

View File

@ -120,4 +120,4 @@ repos:
# hooks:
# - id: check-algo-readme
# - id: check-copyright
# args: ["mmocr", "tests", "tools"] # these directories will be checked
# args: ["mmocr", "tests", "tools"] # these directories will be checked

View File

@ -120,4 +120,4 @@ repos:
# hooks:
# - id: check-algo-readme
# - id: check-copyright
# args: ["mmocr", "tests", "tools"] # these directories will be checked
# args: ["mmocr", "tests", "tools"] # these directories will be checked

View File

@ -57,6 +57,7 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, a flexible evaluation mechanism that allows multiple evaluators to work in sequence. This enables creating customized evaluation pipelines for complex assessment scenarios. Check out the [documentation](docs/en/advanced_guides/llm_judge.md) for more details! 🔥🔥🔥
- **\[2025.03.11\]** We have supported evaluation for `SuperGPQA` which is a great benchmark for measuring LLM knowledge ability 🔥🔥🔥
- **\[2025.02.28\]** We have added a tutorial for `DeepSeek-R1` series model, please check [Evaluating Reasoning Model](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
- **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥

View File

@ -57,8 +57,9 @@
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, which lets multiple evaluators work in sequence so that custom evaluation pipelines can be built for more complex scenarios. See the [documentation](docs/zh_cn/advanced_guides/llm_judge.md) for usage details! 🔥🔥🔥
- **\[2025.03.11\]** We now support `SuperGPQA`, a knowledge benchmark covering 285 graduate-level disciplines. Give it a try! 🔥🔥🔥
- **\[2025.02.28\]** We have added a tutorial for the `DeepSeek-R1` series of models; see [Evaluating Reasoning Models](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
- **\[2025.02.28\]** We have added a tutorial for the `DeepSeek-R1` series of models; see [Evaluating Reasoning Models](docs/zh_cn/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
- **\[2025.02.15\]** We have added two practical evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluation and `MATHEvaluator` for mathematical reasoning assessment. See the [LLM Judge](docs/zh_cn/advanced_guides/llm_judge.md) and [Math Evaluation](docs/zh_cn/advanced_guides/general_math.md) docs for more details! 🔥🔥🔥
- **\[2025.01.16\]** We now support [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct), which achieves the best performance among models of comparable size on reasoning and knowledge tasks. Give it a try.
- **\[2024.12.17\]** We provide the December CompassAcademic leaderboard evaluation script [CompassAcademic](configs/eval_academic_leaderboard_202412.py); the official results can be reproduced with a simple configuration.

View File

@ -116,6 +116,12 @@
paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138
configpath: opencompass/configs/datasets/MedBench/medbench_gen.py
configpath_llmjudge: ''
- MedXpertQA:
name: MedXpertQA
category: Knowledge / Medicine
paper: https://arxiv.org/abs/2501.18362
configpath: opencompass/configs/datasets/MedXpertQA/MedXpertQA_gen.py
configpath_llmjudge: opencompass/configs/datasets/MedXpertQA/MedXpertQA_llmjudge_gen.py
- musr:
name: MuSR
category: Reasoning
@ -615,8 +621,8 @@
name: MATH
category: Math
paper: https://arxiv.org/pdf/2103.03874
configpath: opencompass/configs/datasets/math
configpath_llmjudge: ''
configpath: opencompass/configs/datasets/math/math_gen.py
configpath_llmjudge: opencompass/configs/datasets/math/math_llm_judge_gen.py
- math500:
name: MATH500
category: Math

View File

@ -49,7 +49,7 @@ export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
Note that by default, OpenCompass will use these three environment variables, but if you use configuration files to configure the evaluation service, these environment variables will not take effect.
### ### Using LLM for Evaluation via Configuration Files
### Using LLM for Evaluation via Configuration Files
To set up an LLM judge evaluation, you'll need to configure three main components:
@ -264,6 +264,107 @@ Example evaluation output:
}
```
## CascadeEvaluator
OpenCompass also provides a CascadeEvaluator that combines the strengths of rule-based evaluation and LLM-based evaluation. The cascade evaluator has two modes:
1. **Cascade Mode (parallel=False)**: First evaluates all samples with a rule-based evaluator, then only sends samples that were deemed incorrect by the rule-based evaluation to an LLM judge for re-evaluation. This approach reduces reliance on LLM judgments while maintaining accuracy, thus lowering evaluation costs and time.
2. **Parallel Mode (parallel=True)**: Evaluates all samples with both the rule-based evaluator and LLM judge, then considers a sample correct if either method marks it as correct. This approach can increase the leniency of evaluation but may result in higher costs since all samples require LLM evaluation.
### Configuring CascadeEvaluator
Here's an example of how to configure the CascadeEvaluator:
```python
# Define a rule-based evaluator
rule_evaluator = dict(type=MATHEvaluator)
# Define an LLM judge evaluator
llm_judge_evaluator = dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=YourDataset,
path='path/to/your/dataset',
reader_cfg=reader_cfg,
),
judge_cfg=dict(), # Can use environment variables to configure the judge model
)
# Configure cascade evaluator (cascade mode)
cascade_evaluator = dict(
type=CascadeEvaluator,
llm_evaluator=llm_judge_evaluator,
rule_evaluator=rule_evaluator,
parallel=False # Cascade mode
)
# For parallel mode, set parallel=True
parallel_evaluator = dict(
type=CascadeEvaluator,
llm_evaluator=llm_judge_evaluator,
rule_evaluator=rule_evaluator,
parallel=True # Parallel mode
)
# Use the cascade evaluator in your dataset evaluation config
eval_cfg = dict(evaluator=cascade_evaluator)
```
### Evaluation Results
The cascade evaluator outputs detailed evaluation statistics including:
- Accuracy of the rule-based evaluation
- Accuracy of the LLM evaluation (for samples that failed rule-based evaluation in cascade mode)
- Final combined accuracy
Example output:
```python
{
'accuracy': 85.0, # Final accuracy
'cascade_stats': {
'total_samples': 100,
'rule_correct': 70, # Number of samples correct by rule evaluation
'rule_accuracy': 70.0, # Accuracy of rule evaluation
'llm_evaluated': 30, # Number of samples evaluated by LLM (failed samples in cascade mode)
'llm_correct': 15, # Number of samples correct by LLM evaluation
'llm_accuracy': 50.0, # Accuracy of LLM evaluation
'final_correct': 85, # Total correct samples
'final_accuracy': 85.0, # Final accuracy
'parallel_mode': False, # Whether parallel mode was used
},
'details': [
# Detailed evaluation results for each sample
]
}
```
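In this cascade-mode example, the final count is simply the two stages combined: `final_correct = rule_correct + llm_correct = 70 + 15 = 85`, because only the 30 samples that failed the rule-based check are re-scored by the LLM judge. In parallel mode, every sample is sent to the LLM judge and a sample counts as correct if either stage accepts it, so the final count is a union rather than a sum.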
The cascade evaluator is particularly useful for:
1. Scenarios that require balancing evaluation cost and accuracy
2. Cases where rule-based evaluators are available but might not be comprehensive
3. Evaluation tasks that need more nuanced judgment for edge cases
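Besides a full `rule_evaluator` config, `CascadeEvaluator` also accepts a plain `sample_score_fn` callable for the rule stage (at least one of the two must be provided). Below is a minimal sketch reusing the `llm_judge_evaluator` defined above; the `exact_match` helper is illustrative only and not part of OpenCompass.
```python
from opencompass.evaluator import CascadeEvaluator


def exact_match(prediction: str, reference: str) -> dict:
    # Return a dict with at least a 'correct' field; a bare bool is also
    # accepted and wrapped into this form by the evaluator.
    is_correct = prediction.strip() == reference.strip()
    return {'correct': is_correct, 'pred': prediction, 'answer': reference}


cascade_evaluator = dict(
    type=CascadeEvaluator,
    llm_evaluator=llm_judge_evaluator,  # LLM judge config shown above
    sample_score_fn=exact_match,        # replaces rule_evaluator for the rule stage
    parallel=False,                     # cascade mode: only rule failures reach the judge
)
```
This is convenient when the rule check is simple enough that a dedicated evaluator class would be overkill.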
## Complete Example
For a complete working example, refer to the `eval_llm_judge.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving using an LLM judge.
For a complete working example using GenericLLMEvaluator, refer to the `eval_llm_judge.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving with an LLM judge.
For a complete working example using CascadeEvaluator, refer to the `eval_cascade_evaluator.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving with cascade evaluation.

View File

@ -117,6 +117,10 @@ html_js_files = [
'js/custom.js'
]
html_context = {
'github_version': 'main',
}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.

View File

@ -35,7 +35,7 @@ HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
recommanded_dataset_list = [
'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
'mmlu_pro', 'musr'
'mmlu_pro', 'musr', 'math500'
]

View File

@ -57,7 +57,7 @@ The parameter explanation is as follows:
- `-w`: Specify the working path, default is `./outputs/default`.
- `-l`: Enable status reporting via Lark bot.
- `--dry-run`: When enabled, inference and evaluation tasks will be dispatched but won't actually run for debugging.
- `--dump-eval-details`: When enabled, the evaluation results under the `results` folder will include more details, such as the correctness of each sample.
- `--dump-eval-details`: Enabled by default; the evaluation results under the `results` folder will include more details, such as the correctness of each sample. Set `--dump-eval-details False` to disable it (see the sketch below).
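For reference, here is a minimal, self-contained sketch of how this flag is parsed; it mirrors the `--dump-eval-details` argument definition added to `run.py` in this change, with everything else omitted:
```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '--dump-eval-details',
    nargs='?',
    const=True,    # a bare `--dump-eval-details` keeps dumping enabled
    default=True,  # omitting the flag also keeps dumping enabled
    type=lambda x: False if x and x.lower() == 'false' else True,
)

print(parser.parse_args([]).dump_eval_details)                                 # True
print(parser.parse_args(['--dump-eval-details']).dump_eval_details)            # True
print(parser.parse_args(['--dump-eval-details', 'False']).dump_eval_details)   # False
```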
Using run mode `-m all` as an example, the overall execution flow is as follows:

View File

@ -263,6 +263,106 @@ GenericLLMEvaluator专为使用LLM作为评判器评估模型输出而设计。
}
```
## CascadeEvaluator
OpenCompass also provides a `CascadeEvaluator`, which combines the strengths of rule-based evaluation and LLM-based evaluation. The cascade evaluator has two modes:
1. **Cascade Mode (parallel=False)**: First evaluates all samples with a rule-based evaluator, and only the samples judged incorrect by the rule-based step are sent to an LLM judge for re-evaluation. This reduces reliance on LLM judgments while maintaining accuracy, thereby lowering evaluation cost and time.
2. **Parallel Mode (parallel=True)**: Evaluates all samples with both the rule-based evaluator and the LLM judge, and counts a sample as correct if either evaluator marks it correct. This makes the evaluation more lenient but may cost more, since every sample requires LLM evaluation.
### Configuring CascadeEvaluator
Here is an example of how to configure the `CascadeEvaluator`:
```python
# Define a rule-based evaluator
rule_evaluator = dict(type=MATHEvaluator)
# Define an LLM judge evaluator
llm_judge_evaluator = dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=YourDataset,
path='path/to/your/dataset',
reader_cfg=reader_cfg,
),
judge_cfg=dict(), # The judge model can be configured via environment variables
)
# Configure the cascade evaluator (cascade mode)
cascade_evaluator = dict(
type=CascadeEvaluator,
llm_evaluator=llm_judge_evaluator,
rule_evaluator=rule_evaluator,
parallel=False # Cascade mode
)
# For parallel mode, set parallel=True
parallel_evaluator = dict(
type=CascadeEvaluator,
llm_evaluator=llm_judge_evaluator,
rule_evaluator=rule_evaluator,
parallel=True # Parallel mode
)
# Use the cascade evaluator in the dataset evaluation config
eval_cfg = dict(evaluator=cascade_evaluator)
```
### Evaluation Results
The cascade evaluator outputs detailed evaluation statistics, including:
- Accuracy of the rule-based evaluation
- Accuracy of the LLM evaluation (for samples that failed the rule-based evaluation in cascade mode)
- Final combined accuracy
Example output:
```python
{
'accuracy': 85.0, # Final accuracy
'cascade_stats': {
'total_samples': 100,
'rule_correct': 70, # Number of samples judged correct by the rule-based evaluation
'rule_accuracy': 70.0, # Accuracy of the rule-based evaluation
'llm_evaluated': 30, # Number of samples evaluated by the LLM (in cascade mode, the samples that failed rule-based evaluation)
'llm_correct': 15, # Number of samples judged correct by the LLM evaluation
'llm_accuracy': 50.0, # Accuracy of the LLM evaluation
'final_correct': 85, # Final number of correct samples
'final_accuracy': 85.0, # Final accuracy
'parallel_mode': False, # Whether parallel mode was used
},
'details': [
# Detailed evaluation results for each sample
]
}
```
The cascade evaluator is particularly useful for:
1. Scenarios that require balancing evaluation cost and accuracy
2. Cases where a rule-based evaluator is available but may not be comprehensive
3. Evaluation tasks that need more precise judgment of edge cases
## Complete Example
For a complete working example, refer to the `eval_llm_judge.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving with an LLM judge.
To learn about the generic LLM judge, refer to the `eval_llm_judge.py` file in the examples directory, which shows how to evaluate math problems with an LLM judge.
To learn about the cascade evaluator, refer to the `eval_cascade_evaluator.py` file in the examples directory, which shows how to evaluate math problems with the cascade evaluator.

View File

@ -117,6 +117,10 @@ html_js_files = [
'js/custom.js'
]
html_context = {
'github_version': 'main',
}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.

View File

@ -33,7 +33,7 @@ HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
recommanded_dataset_list = [
'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
'mmlu_pro', 'musr'
'mmlu_pro', 'musr', 'math500'
]

View File

@ -57,7 +57,7 @@ python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--deb
- `-w`: Specify the working path; defaults to `./outputs/default`.
- `-l`: Enable status reporting via the Lark bot.
- `--dry-run`: When enabled, inference and evaluation tasks are only dispatched but not actually run, which is convenient for debugging;
- `--dump-eval-details`: When enabled, the evaluation results under `results` will include more detailed information, such as whether each sample is correct.
- `--dump-eval-details`: Enabled by default; the evaluation results under `results` will include more detailed information, such as whether each sample is correct. Set `--dump-eval-details False` to disable it.
Taking run mode `-m all` as an example, the overall execution flow is as follows:

View File

@ -0,0 +1,127 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator, CascadeEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.openicl.icl_evaluator import MATHEvaluator
from opencompass.datasets import (
MATHDataset,
math_postprocess_v2,
normalize_final_answer,
)
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Models
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct_model,
)
reader_cfg = dict(input_columns=['problem'], output_column='solution')
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
########################## Evaluator #################################
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
llm_judge_evaluator = dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=MATHDataset,
path='opencompass/math',
file_name='test_prm800k_500.json',
),
judge_cfg=dict(),
)
rule_evaluator = dict(type=MATHEvaluator)
cascade_evaluator = dict(type=CascadeEvaluator,
llm_evaluator=llm_judge_evaluator,
rule_evaluator=rule_evaluator,
parallel=False
)
########################## #################################
eval_cfg = dict()
# eval_cfg['evaluator'] = rule_evaluator
# eval_cfg['evaluator'] = llm_judge_evaluator
eval_cfg['evaluator'] = cascade_evaluator
math_datasets = [
dict(
abbr='math_prm800k_500',
type=MATHDataset,
path='opencompass/math',
file_name='test_prm800k_500.json',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
]
datasets = math_datasets
models = lmdeploy_qwen2_5_7b_instruct_model
work_dir = 'math_prm800k_500_cascade_evaluator'

View File

@ -1,7 +1,7 @@
from mmengine.config import read_base
with read_base():
from .datasets.dingo.dingo_gen import datasets
from .models.hf_internlm.hf_internlm_7b import models
from opencompass.configs.datasets.dingo.dingo_gen import datasets
from opencompass.configs.models.hf_internlm.hf_internlm_7b import models
work_dir = './outputs/eval_dingo'

View File

@ -119,8 +119,11 @@ def parse_args():
parser.add_argument(
'--dump-eval-details',
help='Whether to dump the evaluation details, including the '
'correctness of each sample, bpb, etc.',
action='store_true',
'correctness of each sample, bpb, etc. Defaults to True.',
nargs='?',
const=True,
default=True,
type=lambda x: False if x and x.lower() == 'false' else True
)
parser.add_argument(
'--dump-extract-rate',
@ -233,7 +236,6 @@ def parse_custom_dataset_args(custom_dataset_parser):
def main():
args = parse_args()
if args.num_gpus is not None:
raise ValueError('The `--num-gpus` argument is deprecated, please use '
'`--hf-num-gpus` to describe number of gpus used for '
@ -350,6 +352,9 @@ def main():
if args.dlc or args.slurm or cfg.get('eval', None) is None:
fill_eval_cfg(cfg, args)
if args.dump_eval_details:
logger.warning('Default to dump eval details, it might take extra '
'space to save all the evaluation details. '
'Set --dump-eval-details False to skip the details dump')
cfg.eval.runner.task.dump_details = True
if args.dump_extract_rate:
cfg.eval.runner.task.cal_extract_rate = True

View File

@ -0,0 +1,57 @@
from opencompass.datasets import MedXpertQADataset, MedXpertQAEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this?
ZERO_SHOT_PROMPT = 'Q: {question}\nA: Among {start} through {end}, the answer is'
# Reader configuration
reader_cfg = dict(
input_columns=[
'question',
'options',
'medical_task',
'body_system',
'question_type',
'prompt_mode',
],
output_column='label',
)
# Inference configuration
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
],
round=[
dict(
role='HUMAN',
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(type=MedXpertQAEvaluator),
pred_role='BOT',
)
medxpertqa_dataset = dict(
type=MedXpertQADataset,
abbr='medxpertqa',
path='TsinghuaC3I/MedXpertQA',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
medxpertqa_datasets = [medxpertqa_dataset]

View File

@ -0,0 +1,104 @@
from opencompass.datasets import MedXpertQADataset, MedXpertQA_llmjudge_postprocess
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.evaluator import GenericLLMEvaluator
SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this?
ZERO_SHOT_PROMPT = 'Q: {question}\nA: Among {start} through {end}, the answer is'
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: Q: {question}\nA: Among {start} through {end}, the answer is\n<Original Question End>\n\n
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Reader configuration
reader_cfg = dict(
input_columns=[
'question',
'options',
'medical_task',
'body_system',
'question_type',
'prompt_mode',
],
output_column='label',
)
# Inference configuration
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
],
round=[
dict(
role='HUMAN',
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=MedXpertQADataset,
path='TsinghuaC3I/MedXpertQA',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=MedXpertQA_llmjudge_postprocess),
),
)
medxpertqa_dataset = dict(
type=MedXpertQADataset,
abbr='medxpertqa',
path='TsinghuaC3I/MedXpertQA',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
medxpertqa_datasets = [medxpertqa_dataset]

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .math_gen_265cce import math_datasets # noqa: F401, F403
from .math_gen_a58d9d import math_datasets # noqa: F401, F403

View File

@ -0,0 +1,38 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset
from opencompass.openicl.icl_evaluator import MATHEvaluator
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(role='HUMAN', prompt='Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:'),
dict(role='BOT', prompt='The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'),
dict(role='HUMAN', prompt='Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:'),
dict(role='BOT', prompt='We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'),
dict(role='HUMAN', prompt='Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:'),
dict(role='BOT', prompt='If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'),
dict(role='HUMAN', prompt='Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:'),
dict(role='BOT', prompt='If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n'),
dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator)
)
math_datasets = [
dict(
type=MATHDataset,
abbr='math',
path='opencompass/math',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg)
]

View File

@ -1,35 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess
QUERY_TEMPLATE = """
Solve the following math problem step by step. The last line of your response should be of the form ANSWER: $ANSWER (without quotes) where $ANSWER is the answer to the problem.
{problem}
Remember to put your answer on its own line after "ANSWER:", and you do not need to use a \\boxed command.
""".strip()
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024))
math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess))
math_datasets = [
dict(
type=MATHDataset,
abbr='math',
path='opencompass/math',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg)
]

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .math_llm_judge_gen_56606f import math_datasets # noqa: F401, F403

View File

@ -0,0 +1,85 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import MATHDataset
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(role='HUMAN', prompt="Question: {problem}\nLet's think step by step\nAnswer:")
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Evaluation configuration
math_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=MATHDataset,
path='opencompass/math',
reader_cfg=math_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
math_datasets = [
dict(
type=MATHDataset,
abbr='math',
path='opencompass/math',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg)
]

View File

@ -0,0 +1,22 @@
from opencompass.models import TurboMindModelwithChatTemplate
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='mixtral-8x22b-instruct-v0.1-turbomind',
path='mistralai/Mixtral-8x22B-Instruct-v0.1',
engine_config=dict(
session_len=32768,
max_batch_size=16,
tp=8,
cache_max_entry_count=0.7,
),
gen_config=dict(
top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
),
max_seq_len=32768,
max_out_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=8),
)
]

View File

@ -0,0 +1,225 @@
import re
from datasets import Dataset, load_dataset
from opencompass.openicl import BaseEvaluator
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from opencompass.utils import get_logger
from .base import BaseDataset
def _parse(item, prompt_mode):
item['start'] = chr(65)
item['end'] = chr(65 + len(item.get('options', [])) - 1)
item['prompt_mode'] = prompt_mode
return item
@LOAD_DATASET.register_module()
class MedXpertQADataset(BaseDataset):
@staticmethod
def load(path: str, prompt_mode: str, **kwargs):
dataset = load_dataset(path, 'Text', split='test')
# dataset = load_dataset(path, 'Text', split='dev')
if prompt_mode == 'zero-shot':
dataset = dataset.map(lambda item: _parse(item, prompt_mode))
elif prompt_mode == 'few-shot':
pass # TODO: Implement few-shot prompt
return dataset
class MedXpertQAEvaluator(BaseEvaluator):
def score(self, predictions, references, test_set):
method = test_set['prompt_mode'][0]
if len(predictions) != len(references):
return {'error': 'preds and refs have different length'}
correct = 0
count = 0
details = []
for idx, (i, j) in enumerate(zip(predictions, references)):
i = answer_cleansing(method, i, test_set['options'][idx],
test_set['label'][idx])
detail = {'pred': i, 'answer': j, 'correct': False}
count += 1
if i == j:
correct += 1
detail['correct'] = True
details.append(detail)
result = {'accuracy': 100 * correct / count, 'details': details}
return result
@TEXT_POSTPROCESSORS.register_module()
def answer_cleansing(
method: str,
prediction: str,
options: list,
label: str,
) -> str:
# Clean up unwanted phrases in the prediction
for unwanted_phrase in [
'I understand',
'A through J',
'A through E',
'A through D',
]:
prediction = prediction.replace(unwanted_phrase, '')
options_num = len(options)
options = [chr(65 + i) for i in range(options_num)]
options_str = r'\b(' + '|'.join(options) + r')\b'
prediction = re.findall(options_str, prediction)
if len(prediction) == 0:
prediction = []
else:
# If there is a "label" and its length is 1,
# process prediction accordingly
if len(label) == 1:
if method == 'few-shot':
answer_flag = True if len(prediction) > 1 else False
# choose the first or last element based on the answer_flag
if answer_flag:
prediction = [prediction[0]]
else:
prediction = [prediction[-1]]
elif method == 'zero-shot':
# choose the first element in list
prediction = [prediction[0]]
else:
raise ValueError('Method is not properly defined ...')
# Remove trailing period if it exists
if prediction[0] and prediction[0].endswith('.'):
prediction[0] = prediction[0][:-1]
return prediction[0]
def _generic_llmjudge_postprocess(judgement: str):
match = re.search(r'(A|B)', judgement)
grade_letter = (match.group(0) if match else 'B'
) # Default to "INCORRECT" if no match
return grade_letter
def MedXpertQA_llmjudge_postprocess(
output: dict,
output_path: str,
dataset: Dataset,
) -> dict:
# Get the original dataset
original_dataset = dataset.reader.dataset['test']
judged_answers = []
original_responses = []
references = []
details = []
# Initialize statistics dictionaries
stats = {'medical_task': {}, 'body_system': {}, 'question_type': {}}
total_correct = 0
total_count = 0
# Process each sample
for k, v in output.items():
idx = int(k) # Convert key to integer for indexing
original_responses.append(v['prediction'])
processed_judge = _generic_llmjudge_postprocess(v['prediction'])
# Get category information from the dataset
sample = original_dataset[idx]
medical_task = sample.get('medical_task', 'unknown')
body_system = sample.get('body_system', 'unknown')
question_type = sample.get('question_type', 'unknown')
# Initialize category stats if not exists
for level, key in [
('medical_task', medical_task),
('body_system', body_system),
('question_type', question_type),
]:
if key not in stats[level]:
stats[level][key] = {'correct': 0, 'total': 0}
# Record the judgment
if processed_judge is not None:
judged_answers.append(processed_judge)
try:
gold = v['gold']
references.append(gold)
except KeyError:
get_logger().warning(
f'No gold answer for {k}, use empty string as reference!')
gold = ''
references.append('')
# Check if the answer is correct (A means correct)
is_correct = processed_judge == 'A'
total_count += 1
if is_correct:
total_correct += 1
# Update category stats
for level, key in [
('medical_task', medical_task),
('body_system', body_system),
('question_type', question_type),
]:
stats[level][key]['correct'] += 1
# Update category totals
for level, key in [
('medical_task', medical_task),
('body_system', body_system),
('question_type', question_type),
]:
stats[level][key]['total'] += 1
# Add to details
details.append({
'id': k,
'question': sample['question'],
'options': sample['options'],
'origin_prompt': v['origin_prompt'],
'llm_judge': processed_judge,
'gold': gold,
'is_correct': is_correct,
'medical_task': medical_task,
'body_system': body_system,
'question_type': question_type,
})
# Calculate overall accuracy with two decimal places
overall_accuracy = (round(
(total_correct / total_count * 100), 2) if total_count > 0 else 0.00)
# Initialize results dictionary
results = {
'accuracy': overall_accuracy,
'total_correct': total_correct,
'total_count': total_count,
'details': details,
}
# Calculate accuracy for each category and flatten into results
for level in stats:
for key, value in stats[level].items():
if value['total'] > 0:
# Calculate accuracy with two decimal places
accuracy = round((value['correct'] / value['total'] * 100), 2)
# Create a flattened key for the category
flat_key = f'MedXpertQA-{key}'
# Add to results
results[flat_key] = accuracy
return results

View File

@ -93,6 +93,7 @@ from .math_intern import * # noqa: F401, F403
from .mathbench import * # noqa: F401, F403
from .mbpp import * # noqa: F401, F403
from .medbench import * # noqa: F401, F403
from .MedXpertQA import * # noqa: F401, F403
from .mgsm import * # noqa: F401, F403
from .mmlu import * # noqa: F401, F403
from .mmlu_cf import * # noqa: F401, F403

View File

@ -68,7 +68,7 @@ class DingoEvaluator(BaseEvaluator):
json.dump(d, f, ensure_ascii=False)
f.write('\n')
input_data = {
'eval_model': 'llm_base',
'eval_group': 'llm_base',
'input_path': file_name,
'output_path': './outputs/dingo/',
'save_data': True,

View File

@ -7,6 +7,7 @@ from .alpacaeval import alpacaeval_postprocess # noqa: F401, F403
from .arena_hard import ArenaHardDataset # noqa: F401, F403
from .arena_hard import arenahard_bradleyterry_postprocess # noqa: F401, F403
from .arena_hard import arenahard_postprocess # noqa: F401, F403
from .commonbench import commonbench_postprocess
from .compass_arena import CompassArenaDataset # noqa: F401, F403
from .compass_arena import \
compassarena_bradleyterry_postprocess # noqa: F401, F403

View File

@ -0,0 +1,56 @@
# flake8: noqa: E501
import re
from collections import defaultdict
from typing import Optional
from opencompass.registry import DICT_POSTPROCESSORS
from .utils import get_judgeanswer_and_reference
def post_process(judgement: str):
"""Input a string like below:
xxx[[5]]xxx, and extract the score
"""
judgement = judgement['prediction']
pattern = r'\[\[([\d.]+)\]\]'
matched_result = re.findall(pattern, judgement)
if matched_result:
score = float(matched_result[0])
else:
return None
return {'score': score}
def get_capability_results(judged_answers, references):
capability_ratings = defaultdict(int)
capability_counts = defaultdict(int)
for ans, ref in zip(judged_answers, references):
capability_ratings['total'] += ans['score']
capability_counts['total'] += 1
capability_ratings[ref['capability']] += ans['score']
capability_counts[ref['capability']] += 1
capability_avg_ratings = defaultdict(float)
for capability, total_score in capability_ratings.items():
s = total_score / capability_counts[capability]
s = round(s, 2)
capability_avg_ratings[capability] = s
return capability_avg_ratings
@DICT_POSTPROCESSORS.register_module('commenbench')
def commonbench_postprocess(
output: dict,
output_path: str,
post_process: Optional[callable] = post_process,
) -> dict:
judged_answers, references = get_judgeanswer_and_reference(
output, output_path, post_process)
results = get_capability_results(judged_answers, references)
results['details'] = output
return results

View File

@ -1 +1,2 @@
from .cascade_evaluator import CascadeEvaluator # noqa
from .generic_llm_evaluator import GenericLLMEvaluator # noqa

View File

@ -0,0 +1,302 @@
import os
from typing import Any, Callable, Dict, List, Optional
import mmengine
from datasets import Dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS
from opencompass.utils.logging import get_logger
@ICL_EVALUATORS.register_module()
class CascadeEvaluator(BaseEvaluator):
"""Cascade Evaluator.
First uses a rule-based method to judge predictions.
If a sample is marked as incorrect by the rule-based method,
then it uses an LLM judge to re-evaluate it.
Arguments:
llm_evaluator (dict): Configuration for the LLM evaluator.
rule_evaluator (Optional[dict]): Configuration for the
rule-based evaluator.
sample_score_fn (Optional[Callable]): A function to
score individual samples. If provided without rule_evaluator,
this function will be used directly.
parallel (bool): Whether to run in parallel mode.
"""
def __init__(
self,
llm_evaluator: Dict,
rule_evaluator: Optional[Dict] = None,
sample_score_fn: Optional[Callable] = None,
parallel: bool = True,
) -> None:
self.logger = get_logger()
# Initialize the LLM evaluator
llm_evaluator_type = llm_evaluator.pop('type')
if isinstance(llm_evaluator_type, str):
llm_evaluator_type = ICL_EVALUATORS.get(llm_evaluator_type)
self.llm_evaluator = llm_evaluator_type(**llm_evaluator)
# Initialize the rule evaluator if provided
self.rule_evaluator = None
if rule_evaluator:
rule_evaluator_type = rule_evaluator.pop('type')
if isinstance(rule_evaluator_type, str):
rule_evaluator_type = ICL_EVALUATORS.get(rule_evaluator_type)
self.rule_evaluator = rule_evaluator_type(**rule_evaluator)
self.sample_score_fn = sample_score_fn
self.parallel = parallel
# At least one of rule_evaluator or sample_score_fn must be provided
if not self.rule_evaluator and not self.sample_score_fn:
raise ValueError(
'Either rule_evaluator or sample_score_fn must be provided')
def sample_score(self, prediction: str, reference: str) -> Dict[str, Any]:
"""Score a single sample using sample_score_fn or rule_evaluator.
Args:
prediction: The model's prediction.
reference: The ground truth.
Returns:
Dict: A dictionary containing the score and other details.
"""
if self.sample_score_fn:
# Use user-provided function to evaluate a single sample
result = self.sample_score_fn(prediction, reference)
if not isinstance(result, dict):
# Ensure result is a dictionary with at least 'correct' field
result = {
'correct': bool(result),
'pred': prediction,
'answer': reference,
}
return result
else:
# Use rule_evaluator to evaluate a single sample by calling
# the score method with single-element lists
result = self.rule_evaluator.score([prediction], [reference])
if 'details' in result and len(result['details']) > 0:
return result['details'][0]
else:
# Fallback if rule_evaluator doesn't provide detailed results
return {
'correct': result.get('accuracy', 0) > 0,
'pred': prediction,
'answer': reference,
}
def _get_llm_correctness(self, llm_detail):
"""Determine if the LLM judge considers the answer correct.
Args:
llm_detail: The evaluation details from the LLM judge.
Returns:
bool: Whether the answer is correct according to the LLM judge.
"""
if 'prediction' in llm_detail:
response = llm_detail['prediction'].strip().upper()
return response == 'A' or response.startswith('CORRECT')
elif 'correct' in llm_detail:
return llm_detail['correct']
elif 'score' in llm_detail:
return llm_detail['score'] > 0.5
return False
def score(
self,
predictions: List[str],
references: List[str],
test_set: Optional[Dataset] = None,
) -> Dict[str, Any]:
"""Score predictions using cascade or parallel evaluation.
Args:
predictions: List of model predictions.
references: List of ground truths.
test_set: Huggingface Dataset containing original test samples.
Returns:
Dict: A dictionary containing the scores and details.
"""
self.logger.info(
f"Running {'parallel' if self.parallel else 'cascade'} evaluation")
# Step 1: Evaluate each sample individually using rule-based evaluation
details = []
failed_predictions = []
failed_references = []
failed_indices = []
for i, (pred, ref) in enumerate(zip(predictions, references)):
result = self.sample_score(pred, ref)
result['evaluation_method'] = 'rule'
details.append({'rule_evaluation': result})
# If the sample failed rule-based evaluation or in parallel
# mode, mark it for LLM evaluation
if not result.get('correct', False) or self.parallel:
failed_predictions.append(pred)
failed_references.append(ref)
failed_indices.append(i)
# Calculate initial accuracy based on rule evaluation
initial_correct = sum(
1 for detail in details
if detail['rule_evaluation'].get('correct', False))
initial_accuracy = (100 * initial_correct /
len(predictions) if predictions else 0)
self.logger.info(
f'Rule-based evaluation: {initial_correct}/{len(predictions)} '
f'correct ({initial_accuracy:.2f}%)')
eval_mode = ('parallel (all samples)'
if self.parallel else 'cascade (only failed samples)')
self.logger.info(f'Samples requiring LLM evaluation ({eval_mode}): '
f'{len(failed_indices)}')
# Step 2: If there are samples for LLM evaluation
if failed_predictions and test_set is not None:
self.logger.info(f'Running LLM evaluation in {eval_mode} mode...')
# Create a subset of the test_set for LLM evaluation
failed_subset = test_set.select(failed_indices)
# Add prediction and reference columns to the dataset
failed_subset = failed_subset.add_column('prediction',
failed_predictions)
failed_subset = failed_subset.add_column('reference',
failed_references)
# Set a custom output path for LLM evaluation
original_out_dir = getattr(self.llm_evaluator, '_out_dir', None)
self.llm_evaluator._out_dir = f'{self._out_dir}_llm_judge'
# Check if results already exist to avoid re-evaluation
llm_results_path = f'{self.llm_evaluator._out_dir}.json'
if os.path.exists(llm_results_path):
self.logger.info(
f'Loading existing LLM evaluation results from '
f'{llm_results_path}')
llm_results = mmengine.load(llm_results_path)
# Extract details from loaded results
if llm_results.get('details', []):
loaded_details = llm_results['details']
else:
loaded_details = llm_results
# Strictly verify that the loaded results match
# the current evaluation needs
if len(loaded_details) != len(failed_indices):
error_msg = (
f'Error: Loaded LLM results contain '
f'{len(loaded_details)} samples, but current '
f'evaluation requires {len(failed_indices)} samples. '
f"The cached results at {llm_results_path} don't match"
f'the current evaluation needs. '
f'Please remove the cache file or fix the mismatch.')
self.logger.error(error_msg)
raise ValueError(error_msg)
else:
# Use GenericLLMEvaluator to evaluate samples
# unset dataset_cfg for GenericLLMEvaluator to
# directly use test_set
self.llm_evaluator.dataset_cfg = None
llm_results = self.llm_evaluator.score(
predictions=failed_predictions,
references=failed_references,
test_set=failed_subset,
)
# Restore original output directory
if original_out_dir:
self.llm_evaluator._out_dir = original_out_dir
if llm_results.get('details', []):
llm_details = llm_results['details']
else:
llm_details = llm_results
# Initialize counters for accuracy calculation
final_correct = initial_correct if not self.parallel else 0
llm_correct = 0
llm_evaluated = 0
# Update the details for samples that were evaluated by LLM
for i, llm_detail in enumerate(llm_details.values()):
original_index = failed_indices[i]
# Store original rule-based evaluation result
rule_result = details[original_index].copy()
rule_correct = rule_result['rule_evaluation'].get(
'correct', False)
# Add LLM evaluation details
details[original_index]['llm_evaluation'] = llm_detail
# Determine LLM correctness judgment and store it
is_correct = self._get_llm_correctness(llm_detail)
details[original_index]['llm_evaluation'][
'llm_correct'] = is_correct
# Count LLM evaluation statistics
llm_evaluated += 1
if is_correct:
llm_correct += 1
# Update final_correct counter based on evaluation mode
if self.parallel:
# In parallel mode, either rule-based or LLM evaluations
# should be correct
if rule_correct or is_correct:
final_correct += 1
else:
# In cascade mode, if rule was incorrect but LLM
# correct, increment
# (rule correct samples are already counted
# in initial_correct)
if not rule_correct and is_correct:
final_correct += 1
# Calculate final accuracy
final_accuracy = (100 * final_correct /
len(predictions) if predictions else 0)
llm_accuracy = (100 * llm_correct /
llm_evaluated if llm_evaluated else 0)
self.logger.info(
f'Final evaluation: {final_correct}/{len(predictions)} '
f'correct ({final_accuracy:.2f}%)')
if llm_evaluated > 0:
self.logger.info(
f'LLM evaluation: {llm_correct}/{llm_evaluated} '
f'correct ({llm_accuracy:.2f}%)')
result = {
'accuracy': final_accuracy,
'cascade_stats': {
'total_samples': len(predictions),
'rule_correct': initial_correct,
'rule_accuracy': initial_accuracy,
'llm_evaluated': llm_evaluated,
'llm_correct': llm_correct,
'llm_accuracy': llm_accuracy,
'final_correct': final_correct,
'final_accuracy': final_accuracy,
'parallel_mode': self.parallel,
},
'details': details,
}
return result

View File

@ -3,6 +3,7 @@ import os.path as osp
from typing import Dict, List, Optional
import mmengine
from datasets import Dataset
from mmengine.config import ConfigDict
from opencompass.openicl.icl_evaluator import BaseEvaluator
@ -82,10 +83,19 @@ class GenericLLMEvaluator(BaseEvaluator):
self,
predictions,
references: Optional[List] = None,
test_set: Optional[Dataset] = None,
) -> Dict:
"""Apply to single-model scoring."""
"""Apply to single-model scoring.
Args:
predictions: List of model predictions
references: List of reference answers
test_set: Optional Dataset containing additional
context for evaluation
"""
assert len(predictions) == len(
references), 'predictions and references must have the same length'
# -------------- Build Inferencer ----------------
self.build_inferencer()
@ -93,9 +103,7 @@ class GenericLLMEvaluator(BaseEvaluator):
predictions = self.pred_postprocess(predictions)
# For Single Round Dialogue
prediction_dict = {}
prediction_dict['prediction'] = predictions
prediction_dict['obj_gold'] = references
prediction_dict = {'prediction': predictions, 'obj_gold': references}
# ---------------- Build Dataset for LLM Judge -----------------
if self.dataset_cfg:
@ -109,19 +117,42 @@ class GenericLLMEvaluator(BaseEvaluator):
dataset.reader.dataset['test'] = dataset.test.add_column(
'reference', references)
else:
# build a default dataset just for comparison
# Handle test_set in the else branch
from opencompass.datasets.lmeval import LMEvalDataset
input_columns = list(prediction_dict.keys())
if references:
input_columns.append('reference')
if test_set is not None:
# If test_set is provided, use it as the base
# Ensure necessary columns exist
if 'prediction' not in test_set.column_names:
test_set = test_set.add_column('prediction', predictions)
if 'reference' not in test_set.column_names:
test_set = test_set.add_column('reference', references)
# Prepare input_columns and data dictionary
input_columns = test_set.column_names
data_dict = {
column: test_set[column]
for column in test_set.column_names
}
else:
# Original default dataset building logic
input_columns = list(prediction_dict.keys())
if references:
input_columns.append('reference')
data_dict = prediction_dict.copy()
if references:
data_dict['reference'] = references
# Create LMEvalDataset
dataset = LMEvalDataset(
reader_cfg=dict(input_columns=input_columns,
output_column=None,
train_split='test'),
reference=references,
**prediction_dict,
reader_cfg=dict(
input_columns=input_columns,
output_column=None,
train_split='test',
),
**data_dict,
)
dataset.reader.output_column = 'reference'
retriever = ZeroRetriever(dataset)
# ----------------- LLM Judge ----------------

View File

@ -91,7 +91,8 @@ class BaseEvaluator:
):
# Check if predictions and references have the
# same length if both are provided
if 'predictions' in score_kwargs and 'references' in score_kwargs:
if ('predictions' in score_kwargs and 'references' in score_kwargs
and score_kwargs['references'] is not None):
if len(score_kwargs['predictions']) != len(
score_kwargs['references']):
raise ValueError(

View File

@ -22,26 +22,16 @@ class MATHEvaluator(BaseEvaluator):
details = []
for i, j in zip(predictions, references):
count += 1
j_with_env = f'${j}$'
gold_parsed = parse(
j,
j_with_env,
extraction_mode='first_match',
extraction_config=[
LatexExtractionConfig(),
ExprExtractionConfig(),
],
)
# If parsing result is empty, try adding LaTeX
# environment and parse again
if len(gold_parsed) == 0:
j_with_env = f'${j}$'
gold_parsed = parse(
j_with_env,
extraction_mode='first_match',
extraction_config=[
LatexExtractionConfig(),
ExprExtractionConfig(),
],
)
if len(gold_parsed) != 0:
# We require the answer to be provided in correct
# latex (no malformed operators)

View File

@ -147,6 +147,5 @@ class CommonSummarizer(CompassArenaSummarizer):
f.write(','.join(new_header) + '\n')
for line in new_table:
f.write(','.join(map(str, line)) + '\n')
print(t)
print(output_file)
return {'qa_bench_' + show_dataset_abbr:json_result}

View File

@ -7,7 +7,6 @@ import random
import statistics
import sys
import time
from collections import Counter
from inspect import signature
from typing import List
@ -19,7 +18,7 @@ from opencompass.registry import (ICL_EVALUATORS, MODELS, TASKS,
TEXT_POSTPROCESSORS)
from opencompass.tasks.base import BaseTask, extract_role_pred
from opencompass.utils import (build_dataset_from_cfg, get_infer_output_path,
get_logger, task_abbr_from_cfg)
get_logger)
@TASKS.register_module()
@ -86,6 +85,26 @@ class OpenICLEvalTask(BaseTask):
self._score()
def _score(self):
# Load and preprocess test data
test_set = self._load_and_preprocess_test_data()
# Load predictions
pred_dicts, pred_strs = self._load_predictions()
# Process predictions
pred_strs = self._process_predictions(pred_strs)
# Evaluate predictions
result = self._evaluate_predictions(
pred_strs,
test_set,
pred_dicts,
)
# Save results
self._save_results(result)
def _load_and_preprocess_test_data(self):
"""Load test dataset and apply postprocessing if needed."""
test_set = build_dataset_from_cfg(self.dataset_cfg).test
# Postprocess dataset if necessary
if 'dataset_postprocessor' in self.eval_cfg:
@ -100,7 +119,10 @@ class OpenICLEvalTask(BaseTask):
test_set = test_set.map(postprocess)
# Load predictions
return test_set
def _load_predictions(self):
"""Load model predictions from files."""
filename = get_infer_output_path(
self.model_cfg,
self.dataset_cfg,
@ -110,217 +132,188 @@ class OpenICLEvalTask(BaseTask):
root, ext = osp.splitext(filename)
partial_filename = root + '_0' + ext
# Get sc_size if use Self-Consistency
sc_size = self.eval_cfg.get('sc_size')
if not osp.exists(osp.realpath(filename)) and not osp.exists(
osp.realpath(partial_filename)):
result = {'error': 'No predictions found.'}
raise FileNotFoundError(
f'Prediction files not found: neither {filename} '
f'nor {partial_filename} exists')
if osp.exists(osp.realpath(filename)):
preds = mmengine.load(filename)
preds = [preds[str(i)] for i in range(len(preds))]
else:
if osp.exists(osp.realpath(filename)):
preds = mmengine.load(filename)
preds = [preds[str(i)] for i in range(len(preds))]
filename = partial_filename
preds = []
i = 1
while osp.exists(osp.realpath(filename)):
sub_preds = mmengine.load(filename)
preds.extend(
[sub_preds[str(i)] for i in range(len(sub_preds))])
filename = root + f'_{i}' + ext
i += 1
pred_dicts = copy.deepcopy(preds)
preds = {k: [pred.get(k) for pred in preds] for k in preds[0]}
pred_strs = preds.pop('prediction', None)
return pred_dicts, pred_strs
def _process_predictions(self, pred_strs):
"""Apply various processing steps to predictions."""
# Check if we're dealing with a list of lists (pred_list_flag)
pred_list_flag = pred_strs is not None and isinstance(
pred_strs[0], list)
# Extract role predictions if needed
if ('pred_role' in self.eval_cfg and 'meta_template' in self.model_cfg
and not MODELS.get(self.model_cfg['type']).is_api):
# Create a prompt template for role config parsing
from opencompass.models.base import LMTemplateParser
parser = LMTemplateParser(self.model_cfg['meta_template'])
role = parser.roles[self.eval_cfg['pred_role']]
if pred_list_flag:
pred_strs = [[
extract_role_pred(
_pred,
role.get('begin', None),
role.get('end', None),
) for _pred in pred
] for pred in pred_strs]
else:
filename = partial_filename
preds = []
i = 1
while osp.exists(osp.realpath(filename)):
sub_preds = mmengine.load(filename)
preds.extend(
[sub_preds[str(i)] for i in range(len(sub_preds))])
filename = root + f'_{i}' + ext
i += 1
pred_dicts = copy.deepcopy(preds)
preds = {k: [pred.get(k) for pred in preds] for k in preds[0]}
pred_strs = preds.pop('prediction', None)
pred_list_flag = pred_strs is not None and isinstance(
pred_strs[0], list)
if ('pred_role' in self.eval_cfg
and 'meta_template' in self.model_cfg
and not MODELS.get(self.model_cfg['type']).is_api):
# Create a prompt template for role config parsing
from opencompass.models.base import LMTemplateParser
parser = LMTemplateParser(self.model_cfg['meta_template'])
role = parser.roles[self.eval_cfg['pred_role']]
if sc_size is not None:
assert pred_list_flag, (
'The prediction for Self-Consistency '
'must be list.')
if pred_list_flag:
pred_strs = [[
extract_role_pred(
_pred,
role.get('begin', None),
role.get('end', None),
) for _pred in pred
] for pred in pred_strs]
else:
pred_strs = [
extract_role_pred(
pred,
role.get('begin', None),
role.get('end', None),
) for pred in pred_strs
]
# Postprocess predictions if necessary
# Model Specified Postprocessor
if 'pred_postprocessor' in self.model_cfg:
kwargs = copy.deepcopy(self.model_cfg['pred_postprocessor'])
proc = kwargs.pop('type')
if isinstance(proc, str):
proc = TEXT_POSTPROCESSORS.get(proc)
if pred_list_flag:
pred_strs = [[proc(s, **kwargs) for s in preds]
for preds in pred_strs]
else:
pred_strs = [proc(s, **kwargs) for s in pred_strs]
# Dataset Specified Postprocessor
if 'pred_postprocessor' in self.eval_cfg:
kwargs = copy.deepcopy(self.eval_cfg['pred_postprocessor'])
proc = kwargs.pop('type')
if isinstance(proc, str):
proc = TEXT_POSTPROCESSORS.get(proc)
if pred_list_flag:
pred_strs = [[proc(s, **kwargs) for s in preds]
for preds in pred_strs]
else:
pred_strs = [proc(s, **kwargs) for s in pred_strs]
model_pred_strs = []
if 'model_postprocessor' in self.eval_cfg:
references = (test_set[self.output_column]
if self.output_column else None)
model_pred_dicts = copy.deepcopy(pred_dicts)
for i, pred_dict in enumerate(model_pred_dicts):
pred_dict['reference'] = [references[i]]
self.logger.info('Postprocessing model predictions...')
kwargs = self.eval_cfg['model_postprocessor']
proc = kwargs.pop('type')
if isinstance(proc, str):
proc = TEXT_POSTPROCESSORS.get(proc)
if pred_list_flag:
model_pred_strs = [[
proc(model_pred_dict, **kwargs)
for model_pred_dict in model_pred_dicts
]]
else:
model_pred_strs = proc(model_pred_dicts, **kwargs)
# Get majority voting predictions if use self-consistency
if sc_size is not None:
pred_strs = [
Counter(s).most_common(1)[0][0] for s in pred_strs
]
icl_evaluator = ICL_EVALUATORS.build(self.eval_cfg['evaluator'])
# need results dir to save other files
out_path = get_infer_output_path(
self.model_cfg,
self.dataset_cfg,
osp.join(self.work_dir, 'results'),
)
icl_evaluator._out_dir = osp.splitext(out_path)[
0] # strip extension
preds['predictions'] = pred_strs
preds['references'] = (test_set[self.output_column]
if self.output_column else None)
preds['test_set'] = test_set
if 'origin_prompt' not in preds:
try:
preds['origin_prompt'] = [
None for _ in range(len(pred_strs))
]
except TypeError:
preds['origin_prompt'] = None
preds = {
k: preds[k]
for k in signature(icl_evaluator.score).parameters
}
k = self.dataset_cfg.get('k', 1)
n = self.dataset_cfg.get('n', 1)
result = icl_evaluator.evaluate(k, n, copy.deepcopy(test_set),
**preds)
# Get model postprocess result
model_details = None
model_result = None
if 'model_postprocessor' in self.eval_cfg:
model_preds = copy.deepcopy(preds)
model_preds['predictions'] = model_pred_strs
model_result = icl_evaluator.evaluate(k, n,
copy.deepcopy(test_set),
**model_preds)
for key in model_result:
if key == 'details':
model_details = model_result[key]
continue
new_key = 'model_postprocess_' + key
result[new_key] = model_result[key]
if self.dump_details:
details = result.get('details', None)
            # Try to format details if they are not provided by the evaluator
            if details is None:
                self.logger.info(
                    'Details not given by evaluator, trying to format them')
try:
result['details'] = self.format_details(
pred_strs,
model_pred_strs,
test_set[self.output_column],
details,
model_details,
pred_dicts,
)
                self.logger.warning(
                    f"result['details']: {result['details']}")
result['type'] = result['details'].pop('type', None)
if self.cal_extract_rate:
# Calculate the extraction success
# rate for prediction
result['extract_rate'] = self.extract_rate(result)
if 'PPL' in str(
self.dataset_cfg.infer_cfg.inferencer.type):
result['correct_bpb'], result['incorrect_bpb'] = (
self.calculate_bpb(pred_dicts))
except Exception as e:
self.logger.warning(
f'Skip dumping details due to: {e}.')
# Apply postprocessors if configured
# Postprocess predictions if necessary
# Model Specified Postprocessor
if 'pred_postprocessor' in self.model_cfg:
kwargs = copy.deepcopy(self.model_cfg['pred_postprocessor'])
proc = kwargs.pop('type')
if isinstance(proc, str):
proc = TEXT_POSTPROCESSORS.get(proc)
if pred_list_flag:
pred_strs = [[proc(s, **kwargs) for s in preds]
for preds in pred_strs]
else:
pred_strs = [proc(s, **kwargs) for s in pred_strs]
# Dataset Specified Postprocessor
if 'pred_postprocessor' in self.eval_cfg:
kwargs = copy.deepcopy(self.eval_cfg['pred_postprocessor'])
proc = kwargs.pop('type')
if isinstance(proc, str):
proc = TEXT_POSTPROCESSORS.get(proc)
if pred_list_flag:
pred_strs = [[proc(s, **kwargs) for s in preds]
for preds in pred_strs]
else:
pred_strs = [proc(s, **kwargs) for s in pred_strs]
return pred_strs
def _evaluate_predictions(
self,
pred_strs,
test_set,
pred_dicts,
):
"""Evaluate predictions using the configured evaluator."""
# Get references from test set
references = (None if self.output_column is None else
[sample[self.output_column] for sample in test_set])
# Build evaluator from config
evaluator_cfg = self.eval_cfg.get('evaluator', {})
evaluator_type = evaluator_cfg.get('type')
if isinstance(evaluator_type, str):
evaluator_type = ICL_EVALUATORS.get(evaluator_type)
# Prepare evaluator inputs
evaluator_cfg_copy = copy.deepcopy(evaluator_cfg)
evaluator_cfg_copy.pop('type', None)
# Initialize evaluator with appropriate parameters
sig = signature(evaluator_type)
if 'predictions' in sig.parameters and 'references' in sig.parameters:
evaluator = evaluator_type(
predictions=pred_strs,
references=references,
**evaluator_cfg_copy,
)
else:
evaluator = evaluator_type(**evaluator_cfg_copy)
# Save result
# Set output directory for the evaluator
out_path = get_infer_output_path(
self.model_cfg,
self.dataset_cfg,
osp.join(self.work_dir, 'results'),
)
evaluator._out_dir = osp.splitext(out_path)[0] # strip extension
# If preds contains keys that match the score method
# parameters, include them
if pred_dicts:
preds = {
k: [pred.get(k) for pred in pred_dicts]
for k in pred_dicts[0]
}
# Add predictions and references if they're expected
# by the score method
preds['predictions'] = pred_strs
preds['references'] = (test_set[self.output_column]
if self.output_column else None)
preds['test_set'] = test_set
if 'origin_prompt' not in preds:
try:
preds['origin_prompt'] = [None for _ in range(len(pred_strs))]
except TypeError:
preds['origin_prompt'] = None
preds = {k: preds[k] for k in signature(evaluator.score).parameters}
# Call evaluate with the appropriate parameters
k = self.dataset_cfg.get('k', 1)
n = self.dataset_cfg.get('n', 1)
result = evaluator.evaluate(k, n, copy.deepcopy(test_set), **preds)
# Format details if needed
if self.dump_details:
# Get detailed results if available
details = result.get('details', None)
if details is None:
                self.logger.info(
                    'Details not given by evaluator, trying to format them')
try:
result['details'] = self.format_details(
pred_strs,
references,
details,
pred_dicts,
)
# Calculate extraction rate if needed
if self.cal_extract_rate and details is not None:
result['extract_rate'] = self.extract_rate(result)
# Calculate BPB if applicable
if pred_dicts and 'BPB' in pred_dicts[0].get(
list(pred_dicts[0].keys())[0], {}):
correct_bpb, incorrect_bpb = self.calculate_bpb(
pred_dicts)
result['correct_bpb'] = correct_bpb
result['incorrect_bpb'] = incorrect_bpb
except Exception as e:
self.logger.warning(f'Skip dumping details due to: {e}.')
else:
result.pop('details', None)
return result
def _save_results(self, result):
"""Save evaluation results to file."""
out_path = get_infer_output_path(
self.model_cfg,
self.dataset_cfg,
@ -351,10 +344,8 @@ class OpenICLEvalTask(BaseTask):
def format_details(
self,
predictions,
model_pred_strs,
references,
details,
model_details,
pred_dicts,
):
"""This function is responsible for formatting prediction details.
@ -393,20 +384,6 @@ class OpenICLEvalTask(BaseTask):
result['predictions'] = str(predictions[i])
result['references'] = str(references[i])
result['correct'] = str(predictions[i]) == str(references[i])
elif details is not None and model_details is not None:
assert (
model_pred_strs != []
), 'Model details is not None, but model_pred_strs is empty'
self.logger.info(
f"model_details[i]['pred']: {model_details[i]['pred']}")
results['type'] = 'GEN'
result['prompt'] = origin_prediction['origin_prompt']
result['origin_prediction'] = pred_dicts[i]['prediction']
result['predictions'] = details[i]['pred']
result['model_extract_predictions'] = model_details[i]['pred']
result['references'] = details[i]['answer']
result['correct'] = details[i]['correct']
result['model_extract_correct'] = model_details[i]['correct']
elif details is not None:
results['type'] = 'GEN'
result['prompt'] = origin_prediction['origin_prompt']

View File

@ -10,9 +10,7 @@ from .fileio import * # noqa
from .lark import * # noqa
from .logging import * # noqa
from .menu import * # noqa
from .model_postprocessors import * # noqa
from .network import * # noqa
from .postprocessors import * # noqa
from .prompt import * # noqa
from .result_station import * # noqa
from .text_postprocessors import * # noqa

View File

@ -1,135 +0,0 @@
from functools import partial
from multiprocessing import Pool
from typing import Union
from tqdm import tqdm
from opencompass.registry import TEXT_POSTPROCESSORS
from .postprocessors.naive import NaiveExtractor, format_input_naive
from .postprocessors.xfinder.extractor import Extractor
from .postprocessors.xfinder.xfinder_utils import (DataProcessor,
convert_to_xfinder_format)
def gen_output_naive(ori_data, extractor):
extracted_answers = []
for item in tqdm(ori_data):
user_input = extractor.prepare_input(item)
extracted_answer = extractor.gen_output(user_input)
item['extracted_answer'] = extracted_answer
extracted_answers.append(extracted_answer)
return extracted_answers
@TEXT_POSTPROCESSORS.register_module('naive')
def naive_model_postprocess(preds: list,
model_name: str,
custom_instruction: str,
api_url: Union[str, list],
num_processes: int = 8,
**kwargs) -> list:
"""Postprocess the text extracted by custom model.
Args:
preds (list): The question, reference answer and model prediction.
model_name (str): The name of the model.
custom_instruction (str): Custom instruction for the dataset.
        api_url (Union[str, list]): The api url of the model.
Returns:
list: The postprocessed answers.
"""
def _eval_pred(texts, extractor, num_processes):
ori_data = texts
extracted_answers = []
batched_ori_data = []
# Split data into batches
num_processes = min(num_processes, len(ori_data))
batch_size = len(ori_data) // num_processes
for i in range(0, len(ori_data), batch_size):
batched_ori_data.append(ori_data[i:i + batch_size])
with Pool(num_processes) as p:
results = p.map(partial(gen_output_naive, extractor=extractor),
batched_ori_data)
for result in results:
extracted_answers.extend(result)
return extracted_answers
format_data = format_input_naive(preds)
assert api_url is not None, 'Please provide the api url.'
extractor = NaiveExtractor(
model_name=model_name,
custom_instruction=custom_instruction,
url=api_url.split(',') if ',' in api_url else api_url)
calc_acc_func = partial(_eval_pred,
extractor=extractor,
num_processes=num_processes)
extracted_answers = calc_acc_func(format_data)
return extracted_answers
def gen_output_xfinder(ori_data, extractor):
ext_cor_pairs = []
extracted_data = []
extracted_answers = []
for item in tqdm(ori_data):
user_input = extractor.prepare_input(item)
extracted_answer = extractor.gen_output(user_input)
ext_cor_pairs.append([
item['key_answer_type'], item['standard_answer_range'],
extracted_answer, item['correct_answer']
])
item['xfinder_extracted_answer'] = extracted_answer
extracted_answers.append(extracted_answer)
extracted_data.append(item)
return extracted_answers, ext_cor_pairs, extracted_data
@TEXT_POSTPROCESSORS.register_module('xfinder')
def xfinder_postprocess(preds: list, question_type: str, model_name: str,
api_url: Union[str, list], **kwargs) -> list:
"""Postprocess the text extracted by xFinder model.
Args:
preds (list): The question, reference answer and model prediction.
question_type (str): The type of the question.
        api_url (Union[str, list]): The api url of the xFinder model.
Returns:
list: The postprocessed texts.
"""
def _eval_pred(texts, data_processor, extractor, num_processes=8):
ori_data = data_processor.read_data(texts)
extracted_correct_pairs = []
extracted_data = []
extracted_answers = []
batched_ori_data = []
# Split data into batches
num_processes = min(num_processes, len(ori_data))
batch_size = len(ori_data) // num_processes
for i in range(0, len(ori_data), batch_size):
batched_ori_data.append(ori_data[i:i + batch_size])
with Pool(num_processes) as p:
results = p.map(partial(gen_output_xfinder, extractor=extractor),
batched_ori_data)
for result in results:
extracted_answers += result[0]
extracted_correct_pairs += result[1]
extracted_data += result[2]
return extracted_answers
format_data = convert_to_xfinder_format(question_type, preds)
assert api_url is not None, 'Please provide the api url.'
data_processor = DataProcessor()
extractor = Extractor(
model_name=model_name,
url=api_url.split(',') if ',' in api_url else api_url)
calc_acc_func = partial(_eval_pred,
data_processor=data_processor,
extractor=extractor)
extracted_answers = calc_acc_func(format_data)
return extracted_answers

View File

@ -1,11 +0,0 @@
OPTION_NAVIE_PROMPT_TEMPLATE = """
There is a detailed explanation of the final answer you should extract:
1. You should extract the final answer option like 'A', 'B', 'C', 'D' ... from the given output sentences.
2. The question is a single choice question, so the final answer option should be one of the options, not a combination of options.
""" # noqa
MATH_NAVIE_PROMPT_TEMPLATE = """
This is a detailed explanation of the final answer you should extract:
1. The question type is a math question, so the final answer should be a number, set, vector, matrix, interval, expression, function, equation, or inequality and any combination of them.
2. If the final answer includes additional symbols, such as units, you should exclude them and only extract the pure final answer.
""" # noqa

View File

@ -1,71 +0,0 @@
## Short Usage Introduction for Naive Model Postprocessor with Custom Model
### Step 1: Deploy an API server using vLLM or LMDeploy
```bash
lmdeploy serve api_server meta-llama/Meta-Llama-3-8B-Instruct --model-name llama3-8b-instruct --server-port 23333 --backend turbomind --tp 1
```
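Before moving on to the config, it can help to confirm that the server actually answers requests. The snippet below is a minimal sanity-check sketch, assuming the server started above is listening on port 23333; it is not part of the original guide.
```python
# Minimal sanity check for the server deployed above (assumes port 23333).
from openai import OpenAI

client = OpenAI(api_key='EMPTY', base_url='http://0.0.0.0:23333/v1')
model_id = client.models.list().data[0].id  # name of the served model
reply = client.chat.completions.create(
    model=model_id,
    messages=[{'role': 'user', 'content': 'What is 3 + 5?'}],
    max_tokens=32,
)
print(reply.choices[0].message.content)
```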
### Step 2: Add Naive Model Postprocessor to the configuration file
Taking GSM8K as an example, add the following lines to the configuration file and replace `api_url` with the actual address of your API server.
```python
...
from opencompass.utils.model_postprocessors import navie_model_postprocess
from opencompass.utils.postprocessors.naive import MATH_NAVIE_PROMPT_TEMPLATE
...
gsm8k_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator, version='v2'),
pred_postprocessor=dict(type=math_postprocess_v2),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
# Add the following line to use the naive model postprocessor
model_postprocessor=dict(
type=navie_model_postprocess,
custom_instruction=MATH_NAVIE_PROMPT_TEMPLATE,
model_name='llama3-8b-instruct',
api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1')
)
...
```
The extraction prompt can also be customized through the `custom_instruction` parameter. Two default templates are provided: `MATH_NAVIE_PROMPT_TEMPLATE` for math problems such as GSM8K and MATH, and `OPTION_NAVIE_PROMPT_TEMPLATE` for option problems such as MMLU. You can also write your own prompt template, for example:
```python
OPTION_NAVIE_PROMPT_TEMPLATE = """
There is a detailed explanation of the final answer you should extract:
1. You should extract the final answer option like 'A', 'B', 'C', 'D' ... from the given output sentences.
2. The question is a single choice question, so the final answer option should be one of the options, not a combination of options.
"""
```
Your prompt should start with `There is a detailed explanation of the final answer you should extract:` and be followed by your customized instructions.
### Step 3: Run the Evaluation as Usual
Now you can run the evaluation as usual with the modified configuration file. The evaluation will use the custom model as the postprocess model, and the final result will appear as `model_postprocess_accuracy` in the evaluation output, like:
```Markdown
dataset version metric mode llama-3-8b-instruct-turbomind
------------------------------------------------- --------- -------------------------- ------ -------------------------------
gsm8k a58960 accuracy gen 73.46
gsm8k a58960 model_postprocess_accuracy gen 78.77
```
## Experiment Results
We have tested the model postprocess method with different models (Qwen2-72B-Chat, Llama3-8b-Chat) as the postprocess model on the GSM8K and MMLU datasets for `Meta-Llama-3-8B-Instruct` with the above settings, and the results are as follows:
```Markdown
| Dataset | Type | Config ID | Regex Postprocess Score | Model Postprocess Score (Llama3-8b-Instruct) | Model Postprocess Score (Qwen2-72B-Chat) |
| ------- | --------------- | ------------------------ | ----------------------- | ----------------------- |----------------------- |
| gsm8k | math | a58960 | 73.46 | 79.08 | 78.77 |
| mmlu | option | 4d595a | 67.89 | 65.26 | 67.94 |
```
The `Model Postprocess Score` columns correspond to the `model_postprocess_accuracy` metric, i.e. the result after the `Naive Model Postprocessor` is applied.

View File

@ -1,2 +0,0 @@
from .extractor import * # noqa
from .PROMPT_TEMPLATE import * # noqa

View File

@ -1,121 +0,0 @@
# Naive model extractor for OpenCompass, modified from xFinder: https://github.com/IAAR-Shanghai/xFinder # noqa
import json
import time
from logging import getLogger
from openai import OpenAI
Meta_Instruction = """I will provide you with a question, output sentences along with an answer range. The output sentences are the response of the question provided. The answer range could either describe the type of answer expected or list all possible valid answers. Using the information provided, you must accurately and precisely determine and extract the intended key answer from the output sentences. Please don't have your subjective thoughts about the question.
First, you need to determine whether the content of the output sentences is relevant to the given question. If the entire output sentences are unrelated to the question (meaning the output sentences are not addressing the question), then output [No valid answer].
Otherwise, ignore the parts of the output sentences that have no relevance to the question and then extract the key answer that matches the answer range.
Below are some special cases you need to be aware of:
(1) If the output sentences present multiple different answers, carefully determine if the later provided answer is a correction or modification of a previous one. If so, extract this corrected or modified answer as the final response. Conversely, if the output sentences fluctuate between multiple answers without a clear final answer, you should output [No valid answer].
(2) If the answer range is a list and the key answer in the output sentences is not explicitly listed among the candidate options in the answer range, also output [No valid answer].
(3) You should only return the precise answer you extract, without processing the answer. Please return only the answer and do not add any additional content.
""" # noqa
def format_input_naive(data):
format_data = []
for item in data:
template = {}
question = item['origin_prompt'][-1]['prompt']
llm_output = item['prediction']
correct_answer = item['reference'] if item['reference'] else item[
'gold']
template['correct_answer'] = correct_answer
template['question'] = question
template['llm_output'] = llm_output
format_data.append(template)
return format_data
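# Illustrative sketch (assumption, not part of the original file): for a
# prediction record such as
#     {'origin_prompt': [{'role': 'HUMAN', 'prompt': 'What is 2 + 2?'}],
#      'prediction': 'The answer is 4.', 'reference': '4'}
# format_input_naive returns
#     [{'correct_answer': '4', 'question': 'What is 2 + 2?',
#       'llm_output': 'The answer is 4.'}]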
class NaiveExtractor:
def __init__(
self,
model_name,
model_path=None,
url=None,
temperature=0,
max_tokens=3000,
api_key='EMPTY',
SYSTEM='You are a help assistant tasked with extracting the precise key answer from given output sentences. You must only provide the extracted key answer without including any additional text.', # noqa
custom_instruction=''):
self.model_name = model_name
self.SYSTEM = SYSTEM
self.model_path = model_path
self.url = url
self.api_key = api_key
self.temperature = temperature
self.max_tokens = max_tokens
self.custom_instruction = custom_instruction
self.logger = getLogger(__name__)
def prepare_input(self, item):
user_input = Meta_Instruction + self.custom_instruction + \
"Question: \"\"\"" + item['question'] + "\"\"\"\n\n" + \
"Output sentences: \"\"\"" + item['llm_output'] + "\"\"\"\n\n" + \
'Key extracted answer: '
return user_input
def gen_output(self, query):
return self.openai_infer(query)
def openai_infer(self, query: str, retry=9) -> str:
"""Perform inference on the OpenAI model.
Args:
query (str): The input query.
Returns:
str: The extracted answer (xFinder's output).
"""
if isinstance(self.url, list):
            # Randomly choose an api url for better load balancing
import random
self.url = random.choice(self.url)
self.client = OpenAI(
api_key=self.api_key,
base_url=self.url,
)
self.retry = retry
t = time.time()
retry = self.retry
response = ''
while retry > 0:
try:
chat_response = self.client.chat.completions.create(
model=self.client.models.list().data[0].id
if self.model_name == '' else self.model_name,
messages=[
{
'role': 'system',
'content': self.SYSTEM
},
{
'role': 'user',
'content': query
},
],
temperature=self.temperature,
max_tokens=self.max_tokens,
)
js_response = json.loads(chat_response.model_dump_json())
response = js_response['choices'][0]['message']['content']
break
except Exception as e:
self.logger.info(f'Error: {e}')
self.logger.info(f'{self.url} is down. Retrying...')
self.logger.info(f'Time elapsed: {time.time() - t} seconds')
time.sleep(6)
retry -= 1
if retry == 0:
response = 'Error: Failed to get response.'
self.logger.info(f'{response} after {self.retry} tries.')
raise ValueError('The api is down')
return response.strip()
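# Illustrative usage sketch (assumption, not part of the original file):
#     extractor = NaiveExtractor(model_name='llama3-8b-instruct',
#                                url='http://0.0.0.0:23333/v1')
#     item = {'question': '1 + 1 = ?', 'llm_output': 'The answer is 2.'}
#     answer = extractor.gen_output(extractor.prepare_input(item))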

View File

@ -1,194 +0,0 @@
## Extract Final Answers with Postprocess Models
OpenCompass now supports postprocessing (extracting) prediction answers with postprocess models, to better reflect the true ability of models. Currently, we use [xFinder](https://github.com/IAAR-Shanghai/xFinder) as our first postprocess model to extract the final answers from model outputs.
The following task types are supported now:
1. **math**: for math questions with numerical or formula answers, like GSM8K, MATH, etc.
2. **alphabet_option**: for multiple-choice questions with alphabet answers, like CommonsenseQA, MMLU, etc.
3. **short_text**: for questions whose answer is a short text selected from a set of candidate short texts.
Here are some examples of the supported question types from the official xFinder repo:
```json
[
{
"model_name": "Llama3_8B_Instruct",
"dataset": "MetaMathQA",
"key_answer_type": "math",
"question": "Find the smallest, positive four-digit multiple of $15.$",
"llm_output": "The smallest four-digit multiple of $15$ is $\\boxed{1005}.$",
"correct_answer": "1005",
"standard_answer_range": "a(n) number / set / vector / matrix / interval / expression / function / equation / inequality"
},
{
"model_name": "LLaMA2_7B_Chat",
"dataset": "SIQA_text",
"key_answer_type": "short_text",
"question": "Jordan kept Carson up all night talking because Jordan's girlfriend left them and they were quite upset. How would Carson feel as a result? Answer Choices: drowsy / upset their girlfriend left / energetic",
"llm_output": "Based on the information provided in the question, Carson would likely feel upset because Jordan's girlfriend left them and they were quite upset. Therefore, the correct answer is (B) upset their girlfriend left.",
"correct_answer": "drowsy",
"standard_answer_range": [
"drowsy",
"upset their girlfriend left",
"energetic"
]
},
{
"model_name": "PHI2",
"dataset": "CommonsenseQA_alpha",
"key_answer_type": "alphabet_option",
"question": "Why would you go to an office? Answer Choices: (A) work (B) school building (C) paper (D) city (E) habit",
"llm_output": "A) work (B) school building (C) paper (D) city (E) habit\n\nExplanation:\n We are looking for a reason why someone would go to an office. The only answer choice that matches this is option A) work. Therefore, the correct answer is A) work.",
"correct_answer": "A",
"standard_answer_range": [
[
"A",
"work"
],
[
"B",
"school building"
],
[
"C",
"paper"
],
[
"D",
"city"
],
[
"E",
"habit"
]
]
}
]
```
## How to Use Model Postprocess in OpenCompass
### Step 1: Deploy the Postprocess Model Server
For now, there are two xFinder models you can use; you can download them from the Hugging Face model hub:
1. **IAAR-Shanghai/xFinder-qwen1505**
2. **IAAR-Shanghai/xFinder-llama38it**
You can use LMDeploy or vLLM to deploy the xFinder model server. For example, the following command deploys it with LMDeploy:
```bash
lmdeploy serve api_server IAAR-Shanghai/xFinder-qwen1505 --model-name xFinder-qwen1505 --server-port 23333 --backend turbomind --tp 1
```
### Step 2: Set the Postprocess Model Config in the Dataset Configuration
The model postprocess is implemented as a common postprocess function in OpenCompass and can be used together with the default regex-based extraction at the same time. All you need to do is deploy the postprocess model server and add a `model_postprocessor` entry to the dataset's original `eval_cfg`, as in the following example:
```python
from opencompass.utils.model_postprocessors import xfinder_postprocess
...
model_postprocessor=dict(
type=xfinder_postprocess,
question_type='math',
xfinder_model_name='xFinder-qwen1505',
xfiner_api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1')
```
Explanation of the parameters:
- `question_type`: the type of the question, which can be one of the three types mentioned above.
- `xfinder_model_name`: the name you gave the model when deploying the model server.
- `xfiner_api_url`: the URL of the model server; you can set multiple URLs separated by `,` to use several model servers in parallel, which can speed up postprocessing (see the direct-call sketch after this list).
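The same function can also be called directly on a list of OpenCompass prediction records, which can be handy when debugging a deployed server. This is only a sketch based on the function signature shown in this PR; `predictions` and the URL are placeholders, not values from the original document.
```python
# Hypothetical direct call for debugging; `predictions` is a placeholder list of
# OpenCompass prediction dicts (with origin_prompt / prediction / gold fields).
from opencompass.utils.model_postprocessors import xfinder_postprocess

extracted = xfinder_postprocess(
    preds=predictions,
    question_type='alphabet_option',
    model_name='xFinder-qwen1505',
    api_url='http://0.0.0.0:23333/v1',
)
print(extracted[:3])  # first few extracted answers
```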
📢 **Please pay attention to the following points**:
1. Currently, only the zero-shot setting is supported for answer extraction.
2. For alphabet_option problems, the options should follow a format like '\\nA. xxx\\nB. xxx\\nC. xxx\\nD. xxx\\nE. xxx\\n ...' or '\\n(A) xxx\\n(B) xxx\\n(C) xxx\\n(D) xxx\\n(E) xxx\\n ...', and the correct answer should be the letter of the correct option, such as 'A', 'B', 'C', 'D', or 'E'.
For more details about the xFinder model, refer to [xFinder](https://github.com/IAAR-Shanghai/xFinder). For a complete example, see the following GSM8K dataset configuration with the xFinder postprocess model:
```python
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GSM8KDataset, gsm8k_dataset_postprocess, Gsm8kEvaluator
from opencompass.datasets import MATHEvaluator, math_postprocess_v2
from opencompass.utils.model_postprocessors import xfinder_postprocess
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
gsm8k_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
gsm8k_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator, version='v2'),
pred_postprocessor=dict(type=math_postprocess_v2),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
model_postprocessor=dict(
type=xfinder_postprocess,
question_type='math',
xfinder_model_name='xFinder-qwen1505',
xfiner_api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1')
)
gsm8k_datasets = [
dict(
abbr='gsm8k',
type=GSM8KDataset,
path='opencompass/gsm8k',
reader_cfg=gsm8k_reader_cfg,
infer_cfg=gsm8k_infer_cfg,
eval_cfg=gsm8k_eval_cfg,
)
]
```
In the evaluation results, `accuracy` is the score with the default postprocess and `model_postprocess_accuracy` is the score with the xFinder postprocess; the gap can widen when the model does not answer questions in the expected format.
You can also pass the `--dump-eval-details` flag to dump detailed evaluation results and inspect the model postprocess outputs in the `results` folder.
## Results Comparison with Different Question Types
We have tested the model postprocess method with the xFinder model on the GSM8K, MMLU, and Natural Questions (NQ) datasets for `Meta-Llama-3-8B-Instruct` with the above settings, and the results are as follows:
| Dataset | Type | Config Name | Regex Postprocess Score | Model Postprocess Score |
| ------- | --------------- | ------------------------ | ----------------------- | ----------------------- |
| gsm8k | math | gsm8k_xfinder_gen_a58960 | 73.46 | 78.09 |
| nq | short_text | nq_xfinder_gen_3dcea1 | 22.33 | 37.53 |
| mmlu | alphabet_option | mmlu_xfinder_gen_4d595a | 67.89 | 67.93 |
## Citation
```bibtex
@misc{2023opencompass,
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
author={OpenCompass Contributors},
howpublished = {\url{https://github.com/open-compass/opencompass}},
year={2023}
}
@misc{yu2024xfinderrobustpinpointanswer,
title={xFinder: Robust and Pinpoint Answer Extraction for Large Language Models},
author={Qingchen Yu and Zifan Zheng and Shichao Song and Zhiyu Li and Feiyu Xiong and Bo Tang and Ding Chen},
year={2024},
eprint={2405.11874},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2405.11874},
}
```

View File

@ -1,175 +0,0 @@
import json
import time
from logging import getLogger
import requests
from openai import OpenAI
from .xfinder_utils import PROMPT_TEMPLATE
Instruction = """I will provide you with a question, output sentences along with an answer range. The output sentences are the response of the question provided. The answer range could either describe the type of answer expected or list all possible valid answers. Using the information provided, you must accurately and precisely determine and extract the intended key answer from the output sentences. Please don't have your subjective thoughts about the question.
First, you need to determine whether the content of the output sentences is relevant to the given question. If the entire output sentences are unrelated to the question (meaning the output sentences are not addressing the question), then output [No valid answer].
Otherwise, ignore the parts of the output sentences that have no relevance to the question and then extract the key answer that matches the answer range.
Below are some special cases you need to be aware of:
(1) If the output sentences present multiple different answers, carefully determine if the later provided answer is a correction or modification of a previous one. If so, extract this corrected or modified answer as the final response. Conversely, if the output sentences fluctuate between multiple answers without a clear final answer, you should output [No valid answer].
(2) If the answer range is a list and the key answer in the output sentences is not explicitly listed among the candidate options in the answer range, also output [No valid answer].
""" # noqa
class Extractor:
def __init__(
self,
model_name,
model_path=None,
url=None,
temperature=0,
max_tokens=3000,
api_key='EMPTY',
SYSTEM='You are a help assistant tasked with extracting the precise key answer from given output sentences. You must only provide the extracted key answer without including any additional text.' # noqa
):
self.model_name = model_name
self.PROMPT_TEMPLATE = PROMPT_TEMPLATE[model_name]
self.SYSTEM = SYSTEM
self.model_path = model_path
self.url = url
self.api_key = api_key
self.temperature = temperature
self.max_tokens = max_tokens
self.mode = 'API' if self.url is not None else 'Local'
self.logger = getLogger(__name__)
if self.mode == 'Local':
from vllm import LLM, SamplingParams
self.sampling_params = SamplingParams(temperature=self.temperature,
max_tokens=self.max_tokens,
stop=[
'<|endoftext|>',
'<|im_end|>', '<eoa>',
'<||>', '<end_of_turn>',
'<|eot_id|>'
])
self.llm = LLM(model=self.model_path, gpu_memory_utilization=0.5)
@staticmethod
def prepare_input(item):
user_input = Instruction + \
"Question: \"\"\"" + item['question'] + "\"\"\"\n\n" + \
"Output sentences: \"\"\"" + item['llm_output'] + "\"\"\"\n\n" + \
'Answer range: ' + item['standard_answer_range'] + '\n\n' + \
'Key extracted answer: '
return user_input
def gen_output(self, query):
if self.mode == 'API':
# return self.send_request(query)
return self.openai_infer(query)
else:
return self.offline_infer(query)
def send_request(self, query: str) -> str:
"""Send a request to the model's API and return the response.
Args:
query (str): The input query.
Returns:
str: The extracted answer (xFinder's output).
"""
prompt = self.PROMPT_TEMPLATE.format(system=self.SYSTEM, input=query)
payload = json.dumps({
'prompt':
prompt,
'temperature':
self.temperature,
'max_tokens':
self.max_tokens,
'stop': [
'<|endoftext|>', '<|im_end|>', '<eoa>', '<||>',
'<end_of_turn>', '<|eot_id|>'
],
})
headers = {'Content-Type': 'application/json'}
res = requests.request('POST', self.url, headers=headers, data=payload)
res = res.json()['text'][0]
res = res.replace(prompt, '')
# res = requests.post(self.url, json=payload)
# res = res.json()['text']
res = res.strip()
return res
def openai_infer(self, query: str, retry=9) -> str:
"""Perform inference on the OpenAI model.
Args:
query (str): The input query.
Returns:
str: The extracted answer (xFinder's output).
"""
if isinstance(self.url, list):
            # Randomly choose an api url for better load balancing
import random
self.url = random.choice(self.url)
self.client = OpenAI(
api_key=self.api_key,
base_url=self.url,
)
self.retry = retry
t = time.time()
retry = self.retry
response = ''
while retry > 0:
try:
chat_response = self.client.chat.completions.create(
model=self.client.models.list().data[0].id
if self.model_name == '' else self.model_name,
messages=[
{
'role': 'system',
'content': self.SYSTEM
},
{
'role': 'user',
'content': query
},
],
stop=[
'<|endoftext|>', '<|im_end|>', '<eoa>', '<||>',
'<end_of_turn>', '<|eot_id|>'
],
temperature=self.temperature,
max_tokens=self.max_tokens,
)
js_response = json.loads(chat_response.model_dump_json())
response = js_response['choices'][0]['message']['content']
break
except Exception as e:
self.logger.info(f'Error: {e}')
self.logger.info(f'{self.url} is down. Retrying...')
self.logger.info(f'Time elapsed: {time.time() - t} seconds')
time.sleep(6)
retry -= 1
if retry == 0:
response = 'Error: Failed to get response.'
self.logger.info(f'{response} after {self.retry} tries.')
raise ValueError('The api is down')
return response.strip()
def offline_infer(self, query: str) -> str:
"""Perform inference on the local xFinder model.
Args:
query (str): The input query.
Returns:
str: The extracted answer (xFinder's output).
"""
prompt = self.PROMPT_TEMPLATE.format(system=self.SYSTEM, input=query)
res = self.llm.generate(prompt, self.sampling_params)
res = res[0]
res = res.outputs[0].text.strip()
return res

View File

@ -1,14 +0,0 @@
PROMPT_TEMPLATE = {
'xFinder-qwen1505':
"""<|System|>:{system}
<|User|>:{input}
<|Bot|>:""",
'xFinder-llama38it':
"""<|start_header_id|>system<|end_header_id|>
{system}<|eot_id|><|start_header_id|>user<|end_header_id|>
{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
""",
}

View File

@ -1,3 +0,0 @@
from .convert_data import * # noqa
from .data_process import * # noqa
from .PROMPT_TEMPLATE import * # noqa

View File

@ -1,123 +0,0 @@
# Convert OpenCompass prediction data to XFinder format
import copy
import json
import re
xfinder_template = {
'math': {
'model_name':
'',
'dataset':
'',
'key_answer_type':
'math',
'question':
'',
'llm_output':
'',
'correct_answer':
'',
'standard_answer_range':
'a(n) number / set / vector / matrix / interval / expression / function / equation / inequality' # noqa
},
'alphabet_option': {
'model_name': '',
'dataset': '',
'key_answer_type': 'alphabet_option',
'question': '',
'llm_output': '.',
'correct_answer': '',
'standard_answer_range': []
},
'categorical_label': {
'model_name': '',
'dataset': '',
'key_answer_type': '',
'question': '',
'llm_output': '',
'correct_answer': '',
'standard_answer_range': []
},
'short_text': {
'model_name': '',
'dataset': '',
'key_answer_type': 'short_text',
'question': '',
'llm_output': '',
'correct_answer': '',
'standard_answer_range': []
}
}
def parse_options(text: str):
lines = text.split('\n')
parsed_options = []
option_pattern = r'^[A-Z]\)|[A-Z]\.|[A-Z]\)|[A-Z]:|\([A-Z]\)'
for line in lines:
line = line.strip()
match = re.match(option_pattern, line)
if match:
option = ''
            # Take the first alphabetic character as the option letter
for c in line:
if c.isalpha():
option = c
break
content_start = match.end() + 1
content = line[content_start:].strip()
parsed_options.append([option, content])
return parsed_options
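# Illustrative example (assumption, not part of the original file): for the
# option block '(A) work\n(B) school building\n(C) paper', parse_options
# returns [['A', 'work'], ['B', 'school building'], ['C', 'paper']].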
def convert_to_xfinder_format(typ, data, model_name='', dataset_name=''):
assert typ in xfinder_template.keys(), f'Invalid type {typ}'
format_data = []
for item in data:
template = copy.deepcopy(xfinder_template[typ])
question = item['origin_prompt'][-1]['prompt']
llm_output = item['prediction']
correct_answer = item['reference'] if item['reference'] else item[
'gold']
template['correct_answer'] = correct_answer
template['model_name'] = model_name
template['dataset'] = dataset_name
template['question'] = question
template['llm_output'] = llm_output
try:
assert typ in list(xfinder_template.keys())
if typ == 'alphabet_option':
options = parse_options(question)
template['standard_answer_range'] = options
elif typ == 'short_text':
template['standard_answer_range'] = item['gold']
elif typ == 'categorical_label':
pass
except Exception as e:
print(f'Error when parsing question options: {e}, skipping...')
continue
format_data.append(template)
return format_data
if __name__ == '__main__':
# Test
example_data = {
'origin_prompt': [{
'role':
'HUMAN',
'prompt':
'Alice, Bob, Claire, Dave, and Eve are dancers at a square dance. At the start of a song, they each have a partner: Alice is dancing with Ophelia, Bob is dancing with Jamie, Claire is dancing with Melissa, Dave is dancing with Rodrigo, and Eve is dancing with Patrick.\nThroughout the song, the dancers often trade partners. First, Claire and Bob switch partners. Then, Claire and Eve switch partners. Then, Claire and Bob switch partners. Then, Eve and Dave switch partners. Finally, Claire and Alice switch partners. At the end of the dance, Alice is dancing with\nOptions:\n(A) Ophelia\n(B) Jamie\n(C) Melissa\n(D) Rodrigo\n(E) Patrick' # noqa
}],
'origin_prediction':
'\n 答案: B) 前者小于后者',
'prediction':
'B',
'reference':
'A'
}
example_data = convert_to_xfinder_format('alphabet_option', [example_data],
'GPT-3', 'OpenAI')
print(json.dumps(example_data, indent=4, ensure_ascii=False))

View File

@ -1,24 +0,0 @@
import ast
class DataProcessor:
def __init__(self):
pass
def read_data(self, data):
for item in data:
if isinstance(item['standard_answer_range'],
str) and item['key_answer_type'] != 'math':
try:
item['standard_answer_range'] = ast.literal_eval(
item['standard_answer_range'])
except Exception as e:
print(f'Error: {e}')
print('Please check the form of standard_answer_range')
exit(0)
item['standard_answer_range'] = str(item['standard_answer_range'])
item['key_answer_type'] = str(item['key_answer_type'])
return data
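# Illustrative note (assumption, not part of the original file): an item whose
# standard_answer_range arrives as the string "[['A', 'work'], ['B', 'paper']]"
# is parsed with ast.literal_eval and then re-serialized via str(), so every
# item leaves read_data with string-typed standard_answer_range and
# key_answer_type fields.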

View File

@ -4,7 +4,7 @@ alpaca-eval==0.6
antlr4-python3-runtime==4.11
cn2an
# Dingo
dingo-python==1.1.2
dingo-python==1.5.0
# Icl topk retriever
faiss_gpu==1.7.2
# Humaneval, Humaneval X