Merge branch 'open-compass:main' into main

bittersweet1999 2024-12-27 14:36:48 +08:00 committed by GitHub
commit b5724fc242
41 changed files with 1862 additions and 1106 deletions


@@ -66,6 +66,8 @@ with read_base():
     from opencompass.configs.summarizers.groups.mmlu_pro import \
         mmlu_pro_summary_groups  # noqa: F401, E501
+    from ...volc import infer as volc_infer  # noqa: F401, E501

 race_datasets = [race_datasets[1]]  # Only take RACE-High
 humaneval_v2_datasets[0]['abbr'] = 'openai_humaneval_v2'
 bbh_datasets = [
@@ -99,61 +101,66 @@ GaokaoBench_datasets = [
 ]
 datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

+summary_groups = sum(
+    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
+summary_groups.append(
+    {
+        'name': 'Mathbench',
+        'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
+    }, )
 summarizer = dict(
     dataset_abbrs=[
+        'Language',
         ['race-high', 'accuracy'],
         ['ARC-c', 'accuracy'],
         ['BoolQ', 'accuracy'],
-        ['mmlu_pro', 'naive_average'],
-        ['GPQA_diamond', 'accuracy'],
-        ['cmmlu', 'naive_average'],
-        ['mmlu', 'naive_average'],
+        ['triviaqa_wiki_1shot', 'score'],
+        ['nq_open_1shot', 'score'],
+        '',
+        'General Reasoning',
         ['drop', 'accuracy'],
         ['bbh', 'naive_average'],
+        ['GPQA_diamond', 'accuracy'],
+        ['hellaswag', 'accuracy'],
+        ['TheoremQA', 'score'],
+        ['winogrande', 'accuracy'],
+        '',
+        'Math Calculation',
+        ['gsm8k', 'accuracy'],
+        ['GaokaoBench', 'weighted_average'],
+        'GaokaoBench_2010-2022_Math_II_MCQs',
+        'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank',
         ['math', 'accuracy'],
+        ['Mathbench', 'naive_average'],
+        '',
+        'Knowledge',
+        ['wikibench-wiki-single_choice_cncircular', 'perf_4'],
+        ['cmmlu', 'naive_average'],
+        ['mmlu', 'naive_average'],
+        ['mmlu_pro', 'naive_average'],
+        '',
+        'Code',
         ['openai_humaneval', 'humaneval_pass@1'],
         ['openai_humaneval_v2', 'humaneval_pass@1'],
         ['sanitized_mbpp', 'score'],
-        ['wikibench-wiki-single_choice_cncircular', 'perf_4'],
-        ['gsm8k', 'accuracy'],
-        ['GaokaoBench', 'weighted_average'],
-        ['triviaqa_wiki_1shot', 'score'],
-        ['nq_open_1shot', 'score'],
-        ['winogrande', 'accuracy'],
-        ['hellaswag', 'accuracy'],
-        ['TheoremQA', 'score'],
+        '',
         ['dingo_en_192', 'score'],
         ['dingo_zh_170', 'score'],
-        '###### MathBench-A: Application Part ######',
-        'college',
-        'high',
-        'middle',
-        'primary',
-        'arithmetic',
-        'mathbench-a (average)',
-        '###### MathBench-T: Theory Part ######',
-        'college_knowledge',
-        'high_knowledge',
-        'middle_knowledge',
-        'primary_knowledge',
-        'mathbench-t (average)',
-        '###### Overall: Average between MathBench-A and MathBench-T ######',
-        'Overall',
-        '',
-        'bbh-logical_deduction_seven_objects',
-        'bbh-multistep_arithmetic_two',
         '',
         'mmlu',
         'mmlu-stem',
         'mmlu-social-science',
         'mmlu-humanities',
         ['mmlu-other', 'accuracy'],
+        '',
         'cmmlu',
         'cmmlu-stem',
         'cmmlu-social-science',
         'cmmlu-humanities',
         'cmmlu-other',
         ['cmmlu-china-specific', 'accuracy'],
+        '',
         'mmlu_pro',
         'mmlu_pro_biology',
         'mmlu_pro_business',
@@ -169,9 +176,24 @@ summarizer = dict(
         'mmlu_pro_physics',
         'mmlu_pro_psychology',
         'mmlu_pro_other',
+        '',
+        'bbh-logical_deduction_seven_objects',
+        'bbh-multistep_arithmetic_two',
+        '###### MathBench-A: Application Part ######',
+        'college',
+        'high',
+        'middle',
+        'primary',
+        'arithmetic',
+        'mathbench-a (average)',
+        '###### MathBench-T: Theory Part ######',
+        'college_knowledge',
+        'high_knowledge',
+        'middle_knowledge',
+        'primary_knowledge',
+        'mathbench-t (average)',
     ],
-    summary_groups=sum(
-        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
+    summary_groups=summary_groups,
 )

 models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
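Note on the aggregation idiom used throughout these configs: `sum((... for k, v in locals().items() if k.endswith(suffix)), [])` concatenates every list in the module namespace whose name matches a suffix, which is how `datasets`, `summary_groups`, and `models` are assembled from the `read_base()` imports. A minimal standalone sketch of the idiom (the two input group lists are illustrative, not taken from the config):

# Illustrative inputs; real configs get these from `with read_base():` imports.
mmlu_summary_groups = [{'name': 'mmlu', 'subsets': ['mmlu-stem', 'mmlu-other']}]
bbh_summary_groups = [{'name': 'bbh', 'subsets': ['bbh-a', 'bbh-b']}]

# Concatenate every *_summary_groups list found in the module namespace.
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])

# Extra groups can then be appended by hand, as with 'Mathbench' above.
summary_groups.append({
    'name': 'Mathbench',
    'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
})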


@@ -13,12 +13,22 @@ with read_base():
     # read hf models - chat models
     from opencompass.configs.models.chatglm.hf_glm4_9b import \
         models as hf_glm4_9b_model  # noqa: F401, E501
+    from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \
+        models as lmdeploy_glm4_9b_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \
+        models as hf_deepseek_7b_base_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.hf_deepseek_67b_base import \
+        models as hf_deepseek_67b_base_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
         models as hf_deepseek_moe_16b_base_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
         models as hf_deepseek_v2_lite_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
         models as lmdeploy_deepseek_7b_base_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \
+        models as lmdeploy_deepseek_67b_base_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2 import \
+        lmdeploy_deepseek_v2_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \
         models as vllm_deepseek_moe_16b_base_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma2_2b import \
@@ -29,6 +39,8 @@ with read_base():
         models as hf_gemma_2b_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma_7b import \
         models as hf_gemma_7b_model  # noqa: F401, E501
+    from opencompass.configs.models.gemma.lmdeploy_gemma_9b import \
+        models as lmdeploy_gemma_9b_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.vllm_gemma_2b import \
         models as vllm_gemma_2b_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.vllm_gemma_7b import \
@@ -59,10 +71,14 @@ with read_base():
         models as hf_llama3_1_8b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.hf_llama3_8b import \
         models as hf_llama3_8b_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_llama.hf_llama3_70b import \
+        models as hf_llama3_70b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
         models as lmdeploy_llama3_1_8b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
         models as lmdeploy_llama3_8b_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b import \
+        models as lmdeploy_llama3_70b_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \
         models as hf_mistral_7b_v0_2_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \
@@ -73,10 +89,16 @@ with read_base():
         models as hf_qwen_2_5_7b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.hf_qwen_2_5_14b import \
         models as hf_qwen_2_5_14b_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.hf_qwen_2_5_32b import \
+        models as hf_qwen_2_5_32b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \
         models as lmdeploy_qwen2_5_1_5b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import \
         models as lmdeploy_qwen2_5_7b_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b import \
+        models as lmdeploy_qwen2_5_32b_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b import \
+        models as lmdeploy_qwen2_5_72b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
         models as hf_qwen1_5_moe_a2_7b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen.hf_qwen2_0_5b import \
@@ -95,6 +117,10 @@ with read_base():
         models as hf_yi_1_5_6b_model  # noqa: F401, E501
     from opencompass.configs.models.yi.hf_yi_1_5_9b import \
         models as hf_yi_1_5_9b_model  # noqa: F401, E501
+    from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b import \
+        models as lmdeploy_yi_1_5_9b_model  # noqa: F401, E501
+    from ...volc import infer as volc_infer  # noqa: F401, E501

 race_datasets = [race_datasets[1]]
 models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
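For readers unfamiliar with the import style in these files: `with read_base():` is mmengine's config-inheritance mechanism; imports made under it are intercepted and the referenced config module's variables are merged into this file's namespace rather than executed as ordinary Python imports. A hedged sketch of the pattern, reusing one import from this diff (it only runs inside an OpenCompass config, not as a standalone script):

from mmengine.config import read_base

with read_base():
    # Under read_base(), the `models` list defined in the referenced config
    # becomes available here under the alias `hf_glm4_9b_model`.
    from opencompass.configs.models.chatglm.hf_glm4_9b import \
        models as hf_glm4_9b_model  # noqa: F401

# Every alias ending in `_model` can then be swept up generically:
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])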


@@ -7,8 +7,6 @@ with read_base():
     from opencompass.configs.datasets.race.race_gen import \
         race_datasets  # noqa: F401, E501
     # read hf models - chat models
-    from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import \
-        models as hf_baichuan2_7b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.chatglm.hf_glm4_9b_chat import \
         models as hf_glm4_9b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
@@ -17,22 +15,30 @@ with read_base():
         models as vllm_glm4_9b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
         models as hf_deepseek_7b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import \
+        models as hf_deepseek_67b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
         models as hf_deepseek_moe_16b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \
         models as hf_deepseek_v2_lite_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \
+        models as lmdeploy_deepseek_v2_5_1210_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
         models as vllm_deepseek_7b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma2_2b_it import \
         models as hf_gemma2_2b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma2_9b_it import \
         models as hf_gemma2_9b_it_model  # noqa: F401, E501
+    from opencompass.configs.models.gemma.hf_gemma2_27b_it import \
+        models as hf_gemma2_27b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma_2b_it import \
         models as hf_gemma_2b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma_7b_it import \
         models as hf_gemma_7b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \
         models as lmdeploy_gemma_9b_it_model  # noqa: F401, E501
+    from opencompass.configs.models.gemma.lmdeploy_gemma_27b_it import \
+        models as lmdeploy_gemma_27b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.vllm_gemma_7b_it import \
         models as vllm_gemma_7b_it_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
@@ -65,6 +71,8 @@ with read_base():
         models as lmdeploy_llama3_1_8b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \
         models as lmdeploy_llama3_2_3b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_3_70b_instruct import \
+        models as lmdeploy_llama3_3_70b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
         models as lmdeploy_llama3_8b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
@@ -75,6 +83,13 @@ with read_base():
         models as hf_mistral_nemo_instruct_2407_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
         models as hf_mistral_small_instruct_2409_model  # noqa: F401, E501
+    from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
+        models as hf_mixtral_8x7b_instruct_v0_1_model  # noqa: F401, E501
+    from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \
+        models as hf_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
+    from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
+        models as \
+        lmdeploy_mistral_large_instruct_2411_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.lmdeploy_mistral_nemo_instruct_2407 import \
         models as lmdeploy_mistral_nemo_instruct_2407_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
@@ -84,22 +99,28 @@ with read_base():
         models as vllm_mistral_7b_instruct_v0_1_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
         models as vllm_mistral_7b_instruct_v0_2_model  # noqa: F401, E501
+    from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
+        models as lmdeploy_nemotron_70b_instruct_hf_model  # noqa: F401, E501
     from opencompass.configs.models.openbmb.hf_minicpm3_4b import \
         models as hf_minicpm3_4b_model  # noqa: F401, E501
-    from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
-        models as hf_minicpm_2b_dpo_fp32_model  # noqa: F401, E501
-    from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
-        models as hf_minicpm_2b_sft_bf16_model  # noqa: F401, E501
-    from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_fp32 import \
-        models as hf_minicpm_2b_sft_fp32_model  # noqa: F401, E501
     from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
         models as hf_phi_3_mini_4k_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \
         models as hf_phi_3_mini_8k_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \
+        models as hf_qwen2_5_0_5b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \
+        models as hf_qwen2_5_3b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.hf_qwen2_5_14b_instruct import \
         models as hf_qwen2_5_14b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_0_5b_instruct import \
+        models as lmdeploy_qwen2_5_0_5b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_3b_instruct import \
+        models as lmdeploy_qwen2_5_3b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \
         models as lmdeploy_qwen2_5_14b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \
+        models as lmdeploy_qwen2_5_72b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \
         models as hf_qwen1_5_0_5b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
@@ -116,6 +137,14 @@ with read_base():
         models as hf_yi_1_5_6b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.yi.hf_yi_1_5_9b_chat import \
         models as hf_yi_1_5_9b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.yi.lmdeploy_yi_1_5_6b_chat import \
+        models as lmdeploy_yi_1_5_6b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b_chat import \
+        models as lmdeploy_yi_1_5_9b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.yi.lmdeploy_yi_1_5_34b_chat import \
+        models as lmdeploy_yi_1_5_34b_chat_model  # noqa: F401, E501
+    from ...volc import infer as volc_infer  # noqa: F401, E501

 race_datasets = [race_datasets[1]]
 datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])


@@ -7,8 +7,14 @@ with read_base():
         aime2024_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \
         ARC_c_datasets  # noqa: F401, E501
+    # remove because of oom
+    # from opencompass.configs.datasets.ARC_Prize_Public_Evaluation.arc_prize_public_evaluation_gen_872059 import arc_prize_public_evaluation_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \
         bbh_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_complete_gen_faf748 import \
+        bigcodebench_hard_complete_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_8815eb import \
+        bigcodebench_hard_instruct_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
         cmmlu_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import \
@@ -26,15 +32,17 @@ with read_base():
         gsm8k_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
         hellaswag_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import \
+    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import \
         humaneval_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import \
+    from opencompass.configs.datasets.humanevalx.humanevalx_gen_3d84a3 import \
         humanevalx_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \
+    from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \
         ifeval_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.korbench.korbench_single_0_shot_gen import \
+        korbench_0shot_single_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.livecodebench.livecodebench_gen_b2b0fd import \
         LCB_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.math.math_0shot_gen_393424 import \
+    from opencompass.configs.datasets.math.math_0shot_gen_11c4b5 import \
         math_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
         mathbench_datasets  # noqa: F401, E501
@@ -71,6 +79,7 @@ with read_base():
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
         models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501
     # Summary Groups
+    # Summary Groups
     from opencompass.configs.summarizers.groups.bbh import \
         bbh_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.cmmlu import \
@@ -81,6 +90,8 @@ with read_base():
         GaokaoBench_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.humanevalx import \
         humanevalx_summary_groups  # noqa: F401, E501
+    from opencompass.configs.summarizers.groups.korbench import \
+        korbench_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
         mathbench_2024_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.mmlu import \
@@ -96,6 +107,8 @@ with read_base():
     from opencompass.configs.summarizers.mmmlu_lite import \
         mmmlu_summary_groups  # noqa: F401, E501
+    from ...volc import infer as volc_infer  # noqa: F401, E501

 # For HumanEval-X Evaluation
 # Apply the evaluator ip_address and port
 race_datasets = [race_datasets[1]]
@@ -185,6 +198,8 @@ summarizer = dict(
         ['hellaswag', 'accuracy'],
         ['TheoremQA', 'score'],
         ['musr_average', 'naive_average'],
+        ['korbench_single', 'naive_average'],
+        ['ARC_Prize_Public_Evaluation', 'accuracy'],
         '',
         'Math Calculation',
         ['gsm8k', 'accuracy'],
@@ -208,6 +223,8 @@ summarizer = dict(
         ['lcb_code_generation', 'pass@1'],
         ['lcb_code_execution', 'pass@1'],
         ['lcb_test_output', 'pass@1'],
+        ['bigcodebench_hard_instruct', 'pass@1'],
+        ['bigcodebench_hard_complete', 'pass@1'],
         '',
         'Agent',
         ['teval', 'naive_average'],


@@ -0,0 +1,182 @@
+from copy import deepcopy
+
+from mmengine.config import read_base
+
+from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.runners import LocalRunner
+from opencompass.summarizers import DefaultSubjectiveSummarizer
+from opencompass.tasks.subjective_eval import SubjectiveEvalTask
+
+with read_base():
+    # read hf models - chat models
+    # Dataset
+    from opencompass.configs.datasets.chinese_simpleqa.chinese_simpleqa_gen import \
+        csimpleqa_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.SimpleQA.simpleqa_gen_0283c3 import \
+        simpleqa_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm_new import \
+        alignbench_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_new import \
+        alpacav2_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare_new import \
+        arenahard_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_new import \
+        compassarena_datasets  # noqa: F401, E501
+    # from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge_new import fofo_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.followbench.followbench_llmeval_new import \
+        followbench_llmeval_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.multiround.mtbench101_judge_new import \
+        mtbench101_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge_new import \
+        wildbench_datasets  # noqa: F401, E501
+    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
+        models as hf_internlm2_5_7b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
+        models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501
+    from ...volc import infer as volc_infer  # noqa: F401, E501
+
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')
+                and 'mtbench101' not in k and 'wildbench' not in k), [])
+datasets += mtbench101_datasets  # noqa: F401, E501
+datasets += wildbench_datasets  # noqa: F401, E501
+
+api_meta_template = dict(
+    round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ],
+    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
+)
+
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
+
+for m in models:
+    m['abbr'] = m['abbr'] + '_fullbench'
+    if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
+        m['engine_config']['max_batch_size'] = 1
+        m['batch_size'] = 1
+
+models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
+
+judge_models = deepcopy([models[1]])
+judge_models[0]['abbr'] = judge_models[0]['abbr'] + '-judge'
+
+eval = dict(
+    partitioner=dict(
+        type=SubjectiveNaivePartitioner,
+        models=models,
+        judge_models=judge_models,
+    ),
+    runner=dict(type=LocalRunner,
+                max_num_workers=16,
+                task=dict(type=SubjectiveEvalTask)),
+)
+
+summary_groups = []
+summary_groups.append({
+    'name': 'compassarena_language',
+    'subsets': [
+        ['compassarena_language', '内容总结'],
+    ],
+})
+summary_groups.append({
+    'name': 'compassarena_knowledge',
+    'subsets': [
+        ['compassarena_knowledge', '生活常识_ZH'],
+    ],
+})
+summary_groups.append({
+    'name': 'compassarena_reason_v2',
+    'subsets': [
+        ['compassarena_reason_v2', 'reasoning'],
+    ],
+})
+summary_groups.append({
+    'name': 'compassarena_math_v2',
+    'subsets': [
+        ['compassarena_math_v2', '高等数学_ZH'],
+    ],
+})
+summary_groups.append({
+    'name': 'compassarena_creationv2_zh',
+    'subsets': [
+        ['compassarena_creationv2_zh', '内容扩写_ZH'],
+    ],
+})
+summary_groups.append({
+    'name':
+    'CompassArena',
+    'subsets': [
+        'compassarena_language',
+        'compassarena_knowledge',
+        'compassarena_reason_v2',
+        'compassarena_math_v2',
+        'compassarena_creationv2_zh',
+    ],
+})
+summary_groups.append({
+    'name':
+    'FoFo',
+    'subsets': [['fofo_test_prompts', 'overall'],
+                ['fofo_test_prompts_cn', 'overall']],
+})
+summary_groups.append({
+    'name':
+    'Followbench',
+    'subsets': [
+        ['followbench_llmeval_en', 'HSR_AVG'],
+        ['followbench_llmeval_en', 'SSR_AVG'],
+    ],
+})
+
+# Summarizer
+summarizer = dict(
+    dataset_abbrs=[
+        ['alignment_bench_v1_1', '总分'],
+        ['alpaca_eval', 'total'],
+        ['arenahard', 'score'],
+        ['Followbench', 'naive_average'],
+        ['CompassArena', 'naive_average'],
+        ['FoFo', 'naive_average'],
+        ['mtbench101', 'avg'],
+        ['wildbench', 'average'],
+        ['simpleqa', 'accuracy_given_attempted'],
+        ['chinese_simpleqa', 'given_attempted_accuracy'],
+        '',
+        ['alignment_bench_v1_1', '专业能力'],
+        ['alignment_bench_v1_1', '数学计算'],
+        ['alignment_bench_v1_1', '基本任务'],
+        ['alignment_bench_v1_1', '逻辑推理'],
+        ['alignment_bench_v1_1', '中文理解'],
+        ['alignment_bench_v1_1', '文本写作'],
+        ['alignment_bench_v1_1', '角色扮演'],
+        ['alignment_bench_v1_1', '综合问答'],
+        ['alpaca_eval', 'helpful_base'],
+        ['alpaca_eval', 'koala'],
+        ['alpaca_eval', 'oasst'],
+        ['alpaca_eval', 'selfinstruct'],
+        ['alpaca_eval', 'vicuna'],
+        ['compassarena_language', 'naive_average'],
+        ['compassarena_knowledge', 'naive_average'],
+        ['compassarena_reason_v2', 'naive_average'],
+        ['compassarena_math_v2', 'naive_average'],
+        ['compassarena_creationv2_zh', 'naive_average'],
+        ['fofo_test_prompts', 'overall'],
+        ['fofo_test_prompts_cn', 'overall'],
+        ['followbench_llmeval_en', 'HSR_AVG'],
+        ['followbench_llmeval_en', 'SSR_AVG'],
+        ['followbench_llmeval_en', 'HSR_L1'],
+        ['followbench_llmeval_en', 'HSR_L2'],
+        ['followbench_llmeval_en', 'HSR_L3'],
+        ['followbench_llmeval_en', 'HSR_L4'],
+        ['followbench_llmeval_en', 'HSR_L5'],
+        ['followbench_llmeval_en', 'SSR_L1'],
+        ['followbench_llmeval_en', 'SSR_L2'],
+        ['followbench_llmeval_en', 'SSR_L3'],
+        ['followbench_llmeval_en', 'SSR_L4'],
+        ['followbench_llmeval_en', 'SSR_L5'],
+        ['simpleqa', 'f1'],
+    ],
+    type=DefaultSubjectiveSummarizer,
+    summary_groups=summary_groups,
+)
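A note on the `summary_groups` added in this file: each entry names an aggregate column that the summarizer derives from the listed `subsets`; for a multi-subset group such as `CompassArena`, the `naive_average` referenced in `dataset_abbrs` is, to my understanding, an unweighted mean over the subsets. A rough sketch of that roll-up (scores are placeholders, not baseline values; the real summarizer also handles missing subsets, weights, and nested groups):

# Placeholder per-subset scores, keyed the way the summarizer sees them.
scores = {
    'compassarena_language': 60.0,
    'compassarena_knowledge': 56.0,
    'compassarena_reason_v2': 50.0,
    'compassarena_math_v2': 53.5,
    'compassarena_creationv2_zh': 48.75,
}
group = {'name': 'CompassArena', 'subsets': list(scores)}

# Unweighted mean over the group's subsets -> one aggregate column.
scores[group['name']] = sum(scores[s] for s in group['subsets']) / len(group['subsets'])
print(scores['CompassArena'])  # 53.65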


@@ -1,70 +0,0 @@
-from copy import deepcopy
-
-from mmengine.config import read_base
-
-from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
-from opencompass.runners import LocalRunner
-from opencompass.summarizers import SubjectiveSummarizer
-from opencompass.tasks.subjective_eval import SubjectiveEvalTask
-
-with read_base():
-    # read hf models - chat models
-    # Dataset
-    from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \
-        alignbench_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \
-        alpacav2_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \
-        arenahard_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \
-        compassarena_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \
-        fofo_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.followbench.followbench_llmeval import \
-        followbench_llmeval_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \
-        mtbench101_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \
-        wildbench_datasets  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
-        models as hf_internlm2_5_7b_chat_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
-        models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501
-
-summarizer = dict(type=SubjectiveSummarizer, function='subjective')
-
-datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')
-                and 'mtbench101' not in k and 'wildbench' not in k), [])
-datasets += mtbench101_datasets  # noqa: F401, E501
-datasets += wildbench_datasets  # noqa: F401, E501
-
-api_meta_template = dict(
-    round=[
-        dict(role='HUMAN', api_role='HUMAN'),
-        dict(role='BOT', api_role='BOT', generate=True),
-    ],
-    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
-)
-
-models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
-
-for m in models:
-    m['abbr'] = m['abbr'] + '_fullbench'
-    if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
-        m['engine_config']['max_batch_size'] = 1
-        m['batch_size'] = 1
-
-models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
-
-judge_models = deepcopy([models[1]])
-judge_models[0]['abbr'] = judge_models[0]['abbr'] + '-judge'
-
-eval = dict(
-    partitioner=dict(
-        type=SubjectiveNaivePartitioner,
-        models=models,
-        judge_models=judge_models,
-    ),
-    runner=dict(type=LocalRunner,
-                max_num_workers=16,
-                task=dict(type=SubjectiveEvalTask)),
-)


@@ -6,47 +6,19 @@ import yaml

 output_path = 'regression_result_daily'

-chat_model_list = [
-    'baichuan2-7b-chat-hf', 'glm-4-9b-chat-hf', 'glm-4-9b-chat-turbomind',
-    'glm-4-9b-chat-vllm', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
-    'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf', 'gemma2-9b-it-hf',
-    'gemma-2b-it-hf', 'gemma-7b-it-hf', 'gemma-2-9b-it-turbomind',
-    'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf',
-    'internlm2_5-7b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
-    'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-lmdeploy',
-    'internlm2-chat-7b-sft-turbomind', 'internlm2-chat-7b-vllm',
-    'llama-3_1-8b-instruct-hf', 'llama-3_2-3b-instruct-hf',
-    'llama-3-8b-instruct-hf', 'llama-3_1-8b-instruct-turbomind',
-    'llama-3_2-3b-instruct-turbomind', 'llama-3-8b-instruct-turbomind',
-    'mistral-7b-instruct-v0.2-hf', 'mistral-7b-instruct-v0.3-hf',
-    'mistral-nemo-instruct-2407-hf', 'mistral-nemo-instruct-2407-turbomind',
-    'mistral-7b-instruct-v0.1-vllm', 'mistral-7b-instruct-v0.2-vllm',
-    'MiniCPM3-4B-hf', 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
-    'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf',
-    'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-hf', 'qwen2-7b-instruct-hf',
-    'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind',
-    'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf',
-    'deepseek-v2-lite-chat-hf', 'internlm2_5-20b-chat-hf',
-    'internlm2_5-20b-chat-turbomind', 'mistral-small-instruct-2409-hf',
-    'mistral-small-instruct-2409-turbomind', 'qwen2.5-14b-instruct-hf',
-    'qwen2.5-14b-instruct-turbomind'
-]
-base_model_list = [
-    'glm-4-9b-hf', 'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind',
-    'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf', 'gemma2-9b-hf',
-    'gemma-2b-hf', 'gemma-7b-hf', 'gemma-2b-vllm', 'gemma-7b-vllm',
-    'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf',
-    'internlm2-1.8b-turbomind', 'internlm2_5-7b-turbomind',
-    'internlm2-7b-turbomind', 'internlm2-base-7b-turbomind', 'llama-2-7b-hf',
-    'llama-3_1-8b-hf', 'llama-3-8b-hf', 'llama-3.1-8b-turbomind',
-    'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'mistral-7b-v0.3-hf',
-    'mistral-7b-v0.2-vllm', 'qwen2.5-7b-hf', 'qwen2.5-1.5b-turbomind',
-    'qwen2.5-7b-turbomind', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
-    'qwen2-1.5b-hf', 'qwen2-7b-hf', 'qwen2-1.5b-turbomind',
-    'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf',
-    'deepseek-v2-lite-hf', 'internlm2-20b-hf', 'internlm2-base-20b-hf',
-    'internlm2-20b-turbomind', 'qwen2.5-14b-hf'
-]
+
+def model_list(type):
+    config_path = '.github/scripts/oc_score_baseline_testrange.yaml'
+    with open(config_path) as f:
+        config = yaml.load(f.read(), Loader=yaml.SafeLoader)
+    return config.get(type).keys()
+
+
+def dataset_list(model, type):
+    config_path = '.github/scripts/oc_score_baseline_fullbench.yaml'
+    with open(config_path) as f:
+        config = yaml.load(f.read(), Loader=yaml.SafeLoader)
+    return config.get(model).get(type).keys()


 @pytest.fixture()
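The two helpers above replace the hard-coded `chat_model_list`/`base_model_list` with keys read from the baseline YAML files, so the parametrized tests below automatically track whatever models and datasets the baselines define. A sketch of the layout `model_list('chat')` appears to expect, with the YAML inlined and the model entries and scores invented for illustration:

import yaml

# Assumed shape of .github/scripts/oc_score_baseline_testrange.yaml, inferred
# from how model_list() and the tests index it; entries here are made up.
baseline_testrange = yaml.safe_load("""
chat:
  glm-4-9b-chat-hf:
    gsm8k_accuracy: 70.0
    race-high_accuracy: 88.0
base:
  glm-4-9b-hf:
    gsm8k_accuracy: 65.0
""")

def model_list(type):
    return baseline_testrange.get(type).keys()

print(list(model_list('chat')))  # ['glm-4-9b-chat-hf']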
@@ -88,35 +60,39 @@ def result_scores():

 @pytest.mark.usefixtures('result_scores')
 @pytest.mark.usefixtures('baseline_scores_testrange')
-@pytest.mark.chat
+@pytest.mark.chat_models
 class TestChat:
     """Test cases for chat model."""

-    @pytest.mark.parametrize('model, dataset',
-                             [(p1, p2) for p1 in chat_model_list
-                              for p2 in ['gsm8k', 'race-high']])
+    @pytest.mark.parametrize(
+        'model, dataset', [(p1, p2) for p1 in model_list('chat')
+                           for p2 in ['gsm8k_accuracy', 'race-high_accuracy']])
     def test_model_dataset_score(self, baseline_scores_testrange,
                                  result_scores, model, dataset):
-        base_score = baseline_scores_testrange.get(model).get(dataset)
+        base_score = baseline_scores_testrange.get('chat').get(model).get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)


 @pytest.mark.usefixtures('result_scores')
 @pytest.mark.usefixtures('baseline_scores_testrange')
-@pytest.mark.base
+@pytest.mark.base_models
 class TestBase:
     """Test cases for base model."""

-    @pytest.mark.parametrize(
-        'model, dataset',
-        [(p1, p2) for p1 in base_model_list
-         for p2 in ['gsm8k', 'GPQA_diamond', 'race-high', 'winogrande']])
+    @pytest.mark.parametrize('model, dataset',
+                             [(p1, p2) for p1 in model_list('base') for p2 in [
+                                 'gsm8k_accuracy', 'GPQA_diamond_accuracy',
+                                 'race-high_accuracy', 'winogrande_accuracy'
+                             ]])
     def test_model_dataset_score(self, baseline_scores_testrange,
                                  result_scores, model, dataset):
-        if model in ['gemma-2b-vllm', 'gemma-7b-vllm'] and dataset != 'gsm8k':
+        if model in ['gemma-2b-vllm', 'gemma-7b-vllm'
+                     ] and dataset != 'gsm8k_accuracy':
             return
-        base_score = baseline_scores_testrange.get(model).get(dataset)
+        base_score = baseline_scores_testrange.get('base').get(model).get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)
@@ -130,21 +106,11 @@ class TestChatObjFullbench:

     @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
         'internlm2_5-7b-chat-hf_fullbench',
         'internlm2_5-7b-chat-turbomind_fullbench'
-    ] for p2 in [
-        'race-high', 'ARC-c', 'BoolQ', 'triviaqa_wiki_1shot', 'nq_open_1shot',
-        'IFEval', 'drop', 'GPQA_diamond', 'hellaswag', 'TheoremQA',
-        'musr_average', 'gsm8k', 'math', 'cmo_fib', 'aime2024',
-        'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'ds1000',
-        'lcb_code_generation', 'lcb_code_execution', 'lcb_test_output',
-        'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two',
-        'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math', 'ds1000_Pandas',
-        'ds1000_Numpy', 'ds1000_Tensorflow', 'ds1000_Scipy', 'ds1000_Sklearn',
-        'ds1000_Pytorch', 'ds1000_Matplotlib', 'openai_mmmlu_lite_AR-XY',
-        'college', 'college_knowledge'
-    ]])
+    ] for p2 in dataset_list('internlm2_5-7b-chat-hf_fullbench', 'objective')])
     def test_model_dataset_score(self, baseline_scores_fullbench,
                                  result_scores, model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)
@@ -158,22 +124,12 @@ class TestChatSubFullbench:

     @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
         'internlm2_5-7b-chat-hf_fullbench',
         'internlm2_5-7b-chat-turbomind_fullbench'
-    ] for p2 in [
-        'Alignbench总分', 'Alignbench专业能力', 'AlpacaEvaltotal',
-        'AlpacaEvalhelpful_base', 'CompassArenacompassarena_language',
-        'CompassArenacompassarena_knowledge',
-        'CompassArenacompassarena_reason_v2',
-        'CompassArenacompassarena_math_v2',
-        'CompassArenacompassarena_creationv2_zh', 'Fofofofo_test_prompts',
-        'followbenchHSR_AVG', 'followbenchSSR_AVG', 'followbenchHSR_L1',
-        'followbenchHSR_L2', 'followbenchHSR_L3', 'followbenchHSR_L4',
-        'followbenchHSR_L5', 'followbenchSSR_L1', 'followbenchSSR_L2',
-        'followbenchSSR_L3', 'followbenchSSR_L4', 'followbenchSSR_L5',
-        'MTBench101average', 'Wildbenchscore'
-    ]])
+    ] for p2 in dataset_list('internlm2_5-7b-chat-hf_fullbench', 'subjective')]
+                             )
     def test_model_dataset_score(self, baseline_scores_fullbench,
                                  result_scores, model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        base_score = baseline_scores_fullbench.get(model).get(
+            'subjective').get(dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)
@@ -184,20 +140,15 @@ class TestChatSubFullbench:
 class TestBaseFullbench:
     """Test cases for chat model."""

-    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
-        'internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench'
-    ] for p2 in [
-        'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math',
-        'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'gsm8k',
-        'triviaqa_wiki_1shot', 'nq_open_1shot', 'winogrande', 'hellaswag',
-        'TheoremQA', 'dingo_en_192', 'dingo_zh_170', 'college',
-        'college_knowledge', 'bbh-logical_deduction_seven_objects',
-        'bbh-multistep_arithmetic_two', 'mmlu-other', 'cmmlu-china-specific',
-        'mmlu_pro_math'
-    ]])
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2) for p1 in
+         ['internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench']
+         for p2 in dataset_list('internlm2_5-7b-hf_fullbench', 'objective')])
     def test_model_dataset_score(self, baseline_scores_fullbench,
                                  result_scores, model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)
@@ -209,40 +160,109 @@ class TestApibench:
     """Test cases for chat model."""

     @pytest.mark.parametrize('model, dataset',
-                             [('lmdeploy-api-test', 'race-middle'),
-                              ('lmdeploy-api-test', 'race-high'),
-                              ('lmdeploy-api-test', 'gsm8k')])
+                             [('lmdeploy-api-test', 'race-middle_accuracy'),
+                              ('lmdeploy-api-test', 'race-high_accuracy'),
+                              ('lmdeploy-api-test', 'gsm8k_accuracy')])
     def test_api(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model + '_batch', result_score, base_score)


+@pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores_fullbench')
+@pytest.mark.volc_fullbench
+class TestVolcFullbench:
+    """Test cases for chat model."""
+
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
+         for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'objective')])
+    @pytest.mark.chat_objective
+    def test_chat_objective(self, baseline_scores_fullbench, result_scores,
+                            model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score)
+
+    @pytest.mark.parametrize('model, dataset', [
+        (p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
+        for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'subjective')
+    ])
+    @pytest.mark.chat_subjective
+    def test_chat_subjective(self, baseline_scores_fullbench, result_scores,
+                             model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get(
+            'subjective').get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score)
+
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2) for p1 in ['internlm2_5-7b-turbomind']
+         for p2 in dataset_list('internlm2_5-7b-turbomind', 'objective')])
+    @pytest.mark.base_objective
+    def test_base_objective(self, baseline_scores_fullbench, result_scores,
+                            model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score)
+
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2) for p1 in ['internlm2_5-7b-turbomind']
+         for p2 in dataset_list('internlm2_5-7b-turbomind', 'long_context')])
+    @pytest.mark.base_long_context
+    def test_base_long_context(self, baseline_scores_fullbench, result_scores,
+                               model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get(
+            'long_context').get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score)
+
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2)
+         for p1 in ['internlm2_5-7b-chat-1m-turbomind'] for p2 in dataset_list(
+             'internlm2_5-7b-chat-1m-turbomind', 'long_context')])
+    @pytest.mark.chat_long_context
+    def test_chat_long_context(self, baseline_scores_fullbench, result_scores,
+                               model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get(
+            'long_context').get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score)
+
+
 @pytest.mark.usefixtures('result_scores')
 @pytest.mark.usefixtures('baseline_scores')
 class TestCmdCase:

     @pytest.mark.case1
     @pytest.mark.parametrize('model, dataset',
-                             [('internlm2_5-7b-hf', 'race-middle'),
-                              ('internlm2_5-7b-hf', 'race-high'),
-                              ('internlm2_5-7b-hf', 'demo_gsm8k'),
-                              ('internlm2-1.8b-hf', 'race-middle'),
-                              ('internlm2-1.8b-hf', 'race-high'),
-                              ('internlm2-1.8b-hf', 'demo_gsm8k')])
+                             [('internlm2_5-7b-hf', 'race-middle_accuracy'),
+                              ('internlm2_5-7b-hf', 'race-high_accuracy'),
+                              ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy'),
+                              ('internlm2-1.8b-hf', 'race-middle_accuracy'),
+                              ('internlm2-1.8b-hf', 'race-high_accuracy'),
+                              ('internlm2-1.8b-hf', 'demo_gsm8k_accuracy')])
     def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)

     @pytest.mark.case2
-    @pytest.mark.parametrize('model, dataset',
-                             [('internlm2_5-7b-chat-lmdeploy', 'race-middle'),
-                              ('internlm2_5-7b-chat-lmdeploy', 'race-high'),
-                              ('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k'),
-                              ('internlm2-chat-1.8b-lmdeploy', 'race-middle'),
-                              ('internlm2-chat-1.8b-lmdeploy', 'race-high'),
-                              ('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k')])
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'),
+         ('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'),
+         ('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'),
+         ('internlm2-chat-1.8b-lmdeploy', 'race-middle_accuracy'),
+         ('internlm2-chat-1.8b-lmdeploy', 'race-high_accuracy'),
+         ('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k_accuracy')])
     def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
@@ -250,19 +270,19 @@ class TestCmdCase:

     @pytest.mark.case3
     @pytest.mark.parametrize('model, dataset',
-                             [('internlm2_5-7b_hf', 'race-middle'),
-                              ('internlm2_5-7b_hf', 'race-high'),
-                              ('internlm2_5-7b_hf', 'demo_gsm8k')])
+                             [('internlm2_5-7b_hf', 'race-middle_accuracy'),
+                              ('internlm2_5-7b_hf', 'race-high_accuracy'),
+                              ('internlm2_5-7b_hf', 'demo_gsm8k_accuracy')])
     def test_cmd_case3(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)

     @pytest.mark.case4
-    @pytest.mark.parametrize('model, dataset',
-                             [('internlm2_5-7b-chat_hf', 'race-middle'),
-                              ('internlm2_5-7b-chat_hf', 'race-high'),
-                              ('internlm2_5-7b-chat_hf', 'demo_gsm8k')])
+    @pytest.mark.parametrize(
+        'model, dataset', [('internlm2_5-7b-chat_hf', 'race-middle_accuracy'),
+                           ('internlm2_5-7b-chat_hf', 'race-high_accuracy'),
+                           ('internlm2_5-7b-chat_hf', 'demo_gsm8k_accuracy')])
     def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
@@ -310,8 +330,7 @@ def find_csv_files(directory):
     csv_files = []
     for root, dirs, files in os.walk(directory):
         for file in files:
-            if file.endswith('.csv') and (file.startswith('summary') or
-                                          file.startswith('Subjective_all')):
+            if file.endswith('.csv') and file.startswith('summary'):
                 csv_files.append(os.path.join(root, file))

     csv_files_with_time = {f: os.path.getctime(f) for f in csv_files}
@@ -324,24 +343,15 @@ def read_csv_file(file_path):
     with open(file_path, 'r') as csvfile:
         reader = csv.DictReader(csvfile)
         filtered_data = []
-        if 'Subjective_all' not in file_path:
-            for row in reader:
-                if row['metric'] is not None and 'bpb' not in row['metric']:
-                    filtered_row = {
-                        k: v
-                        for k, v in row.items()
-                        if k not in ['version', 'metric', 'mode']
-                    }
-                    filtered_data.append(filtered_row)
-        else:
-            for row in reader:
-                if row['Detailed Scores'] is not None:
-                    filtered_row = row
-                    filtered_row['dataset'] = filtered_row[
-                        'Dataset'] + filtered_row['Detailed Scores']
-                    del filtered_row['Dataset']
-                    del filtered_row['Detailed Scores']
-                    filtered_data.append(filtered_row)
+        for row in reader:
+            if row['metric'] is not None and 'bpb' not in row[
+                    'metric'] and '_' != row['metric']:
+                filtered_row = row
+                filtered_row['dataset'] = row['dataset'] + '_' + row['metric']
+                del filtered_row['version']
+                del filtered_row['metric']
+                del filtered_row['mode']
+                filtered_data.append(filtered_row)

     result = {}
     for data in filtered_data:
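The reworked `read_csv_file` folds the metric name into the dataset key, which is why every key in the baseline YAML files below gains a metric suffix (`race-high_accuracy`, `demo_gsm8k_accuracy`, ...). A small worked example of the transformation on a made-up summary row (the CSV content and version hash are invented for illustration):

import csv
import io

# Made-up summary CSV in the shape the function consumes.
csv_text = ('dataset,version,metric,mode,internlm2_5-7b-hf\n'
            'race-high,abc123,accuracy,gen,90.02\n')

reader = csv.DictReader(io.StringIO(csv_text))
for row in reader:
    if row['metric'] is not None and 'bpb' not in row['metric'] \
            and '_' != row['metric']:
        # Fold the metric into the dataset key, then drop bookkeeping columns.
        row['dataset'] = row['dataset'] + '_' + row['metric']
        del row['version'], row['metric'], row['mode']
        print(row)
# -> {'dataset': 'race-high_accuracy', 'internlm2_5-7b-hf': '90.02'}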


@ -1,34 +1,34 @@
internlm2_5-7b-hf: internlm2_5-7b-hf:
demo_gsm8k: 42.19 demo_gsm8k_accuracy: 42.19
race-middle: 91.78 race-middle_accuracy: 91.78
race-high: 90.02 race-high_accuracy: 90.02
internlm2_5-7b_hf: internlm2_5-7b_hf:
demo_gsm8k: 42.19 demo_gsm8k_accuracy: 42.19
race-middle: 91.78 race-middle_accuracy: 91.78
race-high: 90.02 race-high_accuracy: 90.02
internlm2-1.8b-hf: internlm2-1.8b-hf:
demo_gsm8k: 15.62 demo_gsm8k_accuracy: 15.62
race-middle: 71.66 race-middle_accuracy: 71.66
race-high: 66.38 race-high_accuracy: 66.38
internlm2_5-7b-chat-lmdeploy: internlm2_5-7b-chat-lmdeploy:
demo_gsm8k: 84.38 demo_gsm8k_accuracy: 89.06
race-middle: 92.76 race-middle_accuracy: 92.76
race-high: 90.54 race-high_accuracy: 90.54
internlm2-chat-1.8b-lmdeploy: internlm2-chat-1.8b-lmdeploy:
demo_gsm8k: 31 demo_gsm8k_accuracy: 32
race-middle: 81.34 race-middle_accuracy: 81.34
race-high: 73.96 race-high_accuracy: 73.96
internlm2_5-7b-chat_hf: internlm2_5-7b-chat_hf:
demo_gsm8k: 87.50 demo_gsm8k_accuracy: 87.50
race-middle: 92.76 race-middle_accuracy: 92.76
race-high: 90.48 race-high_accuracy: 90.48
lmdeploy-api-test: lmdeploy-api-test:
gsm8k: 83.78 gsm8k_accuracy: 83.78
race-middle: 92.41 race-middle_accuracy: 92.41
race-high: 90.37 race-high_accuracy: 90.37
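
Every key in this baseline file is renamed from the bare dataset name to dataset_metric so that it matches the keys read_csv_file now produces. A hedged sketch of how the baseline_scores fixture used by the tests above might load and query it (the file path is an assumption):

import yaml

with open('.github/scripts/oc_score_baseline.yaml') as f:  # path assumed
    baseline_scores = yaml.safe_load(f)

# Lookups must use the suffixed keys after this change:
assert baseline_scores['internlm2_5-7b-chat_hf']['race-high_accuracy'] == 90.48
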

View File

@ -1,173 +1,456 @@
internlm2_5-7b-chat-hf_fullbench: internlm2_5-7b-chat-hf_fullbench:
race-high: 93.75 objective:
ARC-c: 93.75 race-high_accuracy: 93.75
BoolQ: 81.25 ARC-c_accuracy: 93.75
triviaqa_wiki_1shot: 50 BoolQ_accuracy: 81.25
nq_open_1shot: 25 triviaqa_wiki_1shot_score: 50
IFEval: 50 nq_open_1shot_score: 25
drop: 81.25 IFEval_Prompt-level-strict-accuracy: 50
GPQA_diamond: 25 drop_accuracy: 81.25
hellaswag: 87.5 GPQA_diamond_accuracy: 25
TheoremQA: 18.75 hellaswag_accuracy: 87.5
musr_average: 39.58 TheoremQA_score: 18.75
gsm8k: 56.25 musr_average_naive_average: 39.58
math: 75 korbench_single_naive_average: 40
cmo_fib: 6.25 gsm8k_accuracy: 62.50
aime2024: 6.25 math_accuracy: 75
wikibench-wiki-single_choice_cncircular: 50 cmo_fib_accuracy: 6.25
sanitized_mbpp: 68.75 aime2024_accuracy: 6.25
ds1000: 16.96 wikibench-wiki-single_choice_cncircular_perf_4: 50
lcb_code_generation: 12.5 sanitized_mbpp_score: 68.75
lcb_code_execution: 43.75 ds1000_naive_average: 16.96
lcb_test_output: 18.75 lcb_code_generation_pass@1: 12.5
bbh-logical_deduction_seven_objects: 50 lcb_code_execution_pass@1: 43.75
bbh-multistep_arithmetic_two: 68.75 lcb_test_output_pass@1: 18.75
mmlu-other: 72.6 bbh-logical_deduction_seven_objects_score: 50
cmmlu-china-specific: 76.25 bbh-multistep_arithmetic_two_score: 68.75
mmlu_pro_math: 25 mmlu-other_naive_average: 72.6
ds1000_Pandas: 12.5 cmmlu-china-specific_naive_average: 76.25
ds1000_Numpy: 0 mmlu_pro_math_accuracy: 25
ds1000_Tensorflow: 12.5 ds1000_Pandas_accuracy: 12.5
ds1000_Scipy: 18.75 ds1000_Numpy_accuracy: 0
ds1000_Sklearn: 18.75 ds1000_Tensorflow_accuracy: 12.5
ds1000_Pytorch: 12.5 ds1000_Scipy_accuracy: 18.75
ds1000_Matplotlib: 43.75 ds1000_Sklearn_accuracy: 18.75
openai_mmmlu_lite_AR-XY: 37.5 ds1000_Pytorch_accuracy: 12.5
college: 12.5 ds1000_Matplotlib_accuracy: 43.75
college_knowledge: 87.5 openai_mmmlu_lite_AR-XY_accuracy: 37.5
Alignbench总分: 0.65 college_naive_average: 12.5
Alignbench专业能力: 7.83 college_knowledge_naive_average: 87.5
AlpacaEvaltotal: 0 subjective:
AlpacaEvalhelpful_base: 0 alignment_bench_v1_1_总分: 0.66
CompassArenacompassarena_language: 60 alpaca_eval_total: 20
CompassArenacompassarena_knowledge: 56 arenahard_score: 50
CompassArenacompassarena_reason_v2: 50 Followbench_naive_average: 1
CompassArenacompassarena_math_v2: 53.5 CompassArena_naive_average: 44.00
CompassArenacompassarena_creationv2_zh: 48.75 mtbench101_avg: 7.8
Fofofofo_test_prompts: 1 wildbench_average: -12.78
followbenchHSR_AVG: 1 simpleqa_accuracy_given_attempted: 0
followbenchSSR_AVG: 1 chinese_simpleqa_given_attempted_accuracy: 1
followbenchHSR_L1: 1 alignment_bench_v1_1_专业能力: 7.90
followbenchHSR_L2: 1 alignment_bench_v1_1_数学计算: 0
followbenchHSR_L3: 1 alignment_bench_v1_1_基本任务: 0
followbenchHSR_L4: 1 alignment_bench_v1_1_逻辑推理: 0
followbenchHSR_L5: 1 alignment_bench_v1_1_中文理解: 0
followbenchSSR_L1: 1 alignment_bench_v1_1_文本写作: 0
followbenchSSR_L2: 1 alignment_bench_v1_1_角色扮演: 0
followbenchSSR_L3: 1 alignment_bench_v1_1_综合问答: 0
followbenchSSR_L4: 1 alpaca_eval_helpful_base: 20
followbenchSSR_L5: 1 compassarena_language_naive_average: 35
MTBench101average: 8.1 compassarena_knowledge_naive_average: 55
Wildbenchscore: -3.3333333333333335 compassarena_reason_v2_naive_average: 45.00
compassarena_math_v2_naive_average: 55
compassarena_creationv2_zh_naive_average: 30
followbench_llmeval_en_HSR_AVG: 1
followbench_llmeval_en_SSR_AVG: 1
followbench_llmeval_en_HSR_L1: 1
followbench_llmeval_en_HSR_L2: 1
followbench_llmeval_en_HSR_L3: 1
followbench_llmeval_en_HSR_L4: 1
followbench_llmeval_en_HSR_L5: 1
followbench_llmeval_en_SSR_L1: 1
followbench_llmeval_en_SSR_L2: 1
followbench_llmeval_en_SSR_L3: 1
followbench_llmeval_en_SSR_L4: 1
followbench_llmeval_en_SSR_L5: 1
simpleqa_f1: 0
internlm2_5-7b-chat-turbomind_fullbench: internlm2_5-7b-chat-turbomind_fullbench:
race-high: 93.75 objective:
ARC-c: 87.5 race-high_accuracy: 93.75
BoolQ: 68.75 ARC-c_accuracy: 93.75
triviaqa_wiki_1shot: 50 BoolQ_accuracy: 68.75
nq_open_1shot: 25 triviaqa_wiki_1shot_score: 50
IFEval: 50 nq_open_1shot_score: 25
drop: 75 IFEval_Prompt-level-strict-accuracy: 56.25
hellaswag: 81.25 drop_accuracy: 81.25
TheoremQA: 6.25 GPQA_diamond_accuracy: 31.25
musr_average: 37.5 hellaswag_accuracy: 81.25
gsm8k: 68.75 TheoremQA_score: 6.25
math: 75 musr_average_naive_average: 39.58
GPQA_diamond: 25 korbench_single_naive_average: 37.50
cmo_fib: 6.25 gsm8k_accuracy: 68.75
aime2024: 6.25 math_accuracy: 68.75
wikibench-wiki-single_choice_cncircular: 25 cmo_fib_accuracy: 6.25
sanitized_mbpp: 68.75 aime2024_accuracy: 6.25
ds1000: 13.39 wikibench-wiki-single_choice_cncircular_perf_4: 50.00
lcb_code_generation: 12.5 sanitized_mbpp_score: 68.75
lcb_code_execution: 43.75 ds1000_naive_average: 16.96
lcb_test_output: 12.5 lcb_code_generation_pass@1: 12.5
bbh-logical_deduction_seven_objects: 56.25 lcb_code_execution_pass@1: 43.75
bbh-multistep_arithmetic_two: 68.75 lcb_test_output_pass@1: 25.00
mmlu-other: 74.04 bbh-logical_deduction_seven_objects_score: 50.00
cmmlu-china-specific: 76.25 bbh-multistep_arithmetic_two_score: 68.75
mmlu_pro_math: 25 mmlu-other_naive_average: 69.71
ds1000_Pandas: 0 cmmlu-china-specific_naive_average: 75.83
ds1000_Numpy: 0 mmlu_pro_math_accuracy: 31.25
ds1000_Tensorflow: 12.5 ds1000_Pandas_accuracy: 0
ds1000_Scipy: 18.75 ds1000_Numpy_accuracy: 0
ds1000_Sklearn: 18.75 ds1000_Tensorflow_accuracy: 12.5
ds1000_Pytorch: 6.25 ds1000_Scipy_accuracy: 18.75
ds1000_Matplotlib: 37.5 ds1000_Sklearn_accuracy: 18.75
openai_mmmlu_lite_AR-XY: 37.5 ds1000_Pytorch_accuracy: 18.75
college: 0 ds1000_Matplotlib_accuracy: 50.00
college_knowledge: 87.5 openai_mmmlu_lite_AR-XY_accuracy: 37.5
Alignbench总分: 0.64 college_naive_average: 12.50
Alignbench专业能力: 7.6 college_knowledge_naive_average: 87.5
AlpacaEvaltotal: 10 subjective:
AlpacaEvalhelpful_base: 10 alignment_bench_v1_1_总分: 0.70
CompassArenacompassarena_language: 59 alpaca_eval_total: 0
CompassArenacompassarena_knowledge: 57 arenahard_score: 50
CompassArenacompassarena_reason_v2: 49.5 Followbench_naive_average: 1
CompassArenacompassarena_math_v2: 51 CompassArena_naive_average: 38
CompassArenacompassarena_creationv2_zh: 43.75 mtbench101_avg: 7.80
Fofofofo_test_prompts: 1 wildbench_average: -4.86
followbenchHSR_AVG: 1 simpleqa_accuracy_given_attempted: 0
followbenchSSR_AVG: 1 chinese_simpleqa_given_attempted_accuracy: 1
followbenchHSR_L1: 1 alignment_bench_v1_1_专业能力: 8.4
followbenchHSR_L2: 1 alignment_bench_v1_1_数学计算: 0
followbenchHSR_L3: 1 alignment_bench_v1_1_基本任务: 0
followbenchHSR_L4: 1 alignment_bench_v1_1_逻辑推理: 0
followbenchHSR_L5: 1 alignment_bench_v1_1_中文理解: 0
followbenchSSR_L1: 1 alignment_bench_v1_1_文本写作: 0
followbenchSSR_L2: 1 alignment_bench_v1_1_角色扮演: 0
followbenchSSR_L3: 1 alignment_bench_v1_1_综合问答: 0
followbenchSSR_L4: 1 alpaca_eval_helpful_base: 0
followbenchSSR_L5: 1 compassarena_language_naive_average: 35
MTBench101average: 8.1 compassarena_knowledge_naive_average: 50
Wildbenchscore: -8.333333333333334 compassarena_reason_v2_naive_average: 30
compassarena_math_v2_naive_average: 50
compassarena_creationv2_zh_naive_average: 25
followbench_llmeval_en_HSR_AVG: 1
followbench_llmeval_en_SSR_AVG: 1
followbench_llmeval_en_HSR_L1: 1
followbench_llmeval_en_HSR_L2: 1
followbench_llmeval_en_HSR_L3: 1
followbench_llmeval_en_HSR_L4: 1
followbench_llmeval_en_HSR_L5: 1
followbench_llmeval_en_SSR_L1: 1
followbench_llmeval_en_SSR_L2: 1
followbench_llmeval_en_SSR_L3: 1
followbench_llmeval_en_SSR_L4: 1
followbench_llmeval_en_SSR_L5: 1
simpleqa_f1: 0
internlm2_5-7b-hf_fullbench: internlm2_5-7b-hf_fullbench:
race-high: 100 objective:
ARC-c: 68.75 race-high_accuracy: 100
BoolQ: 87.5 ARC-c_accuracy: 68.75
GPQA_diamond: 62.5 BoolQ_accuracy: 87.5
drop: 62.5 triviaqa_wiki_1shot_score: 43.75
math: 12.5 nq_open_1shot_score: 43.75
wikibench-wiki-single_choice_cncircular: 25 drop_accuracy: 62.5
sanitized_mbpp: 56.25 GPQA_diamond_accuracy: 62.5
gsm8k: 37.5 hellaswag_accuracy: 93.75
triviaqa_wiki_1shot: 43.75 TheoremQA_score: 25
nq_open_1shot: 43.75 winogrande_accuracy: 75
winogrande: 75 gsm8k_accuracy: 37.5
hellaswag: 93.75 GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
TheoremQA: 25 GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
dingo_en_192: 37.5 math_accuracy: 12.5
dingo_zh_170: 100 wikibench-wiki-single_choice_cncircular_perf_4: 25
college: 12.5 sanitized_mbpp_score: 56.25
college_knowledge: 87.5 dingo_en_192_score: 37.5
bbh-logical_deduction_seven_objects: 43.75 dingo_zh_170_score: 100
bbh-multistep_arithmetic_two: 56.25 mmlu-other_accuracy: 76.92
mmlu-other: 76.92 cmmlu-china-specific_accuracy: 84.17
cmmlu-china-specific: 84.17 mmlu_pro_math_accuracy: 18.75
mmlu_pro_math: 18.75 bbh-logical_deduction_seven_objects_score: 43.75
bbh-multistep_arithmetic_two_score: 56.25
college_naive_average: 12.5
college_knowledge_naive_average: 87.5
internlm2_5-7b-turbomind_fullbench: internlm2_5-7b-turbomind_fullbench:
race-high: 100 objective:
ARC-c: 68.75 race-high_accuracy: 100
BoolQ: 87.5 ARC-c_accuracy: 68.75
GPQA_diamond: 62.5 BoolQ_accuracy: 87.5
drop: 62.5 triviaqa_wiki_1shot_score: 43.75
math: 18.75 nq_open_1shot_score: 43.75
wikibench-wiki-single_choice_cncircular: 25 drop_accuracy: 62.5
sanitized_mbpp: 56.25 GPQA_diamond_accuracy: 62.5
gsm8k: 68.75 hellaswag_accuracy: 93.75
triviaqa_wiki_1shot: 43.75 TheoremQA_score: 25.00
nq_open_1shot: 43.75 winogrande_accuracy: 87.5
winogrande: 87.5 gsm8k_accuracy: 62.50
hellaswag: 93.75 GaokaoBench_2010-2022_Math_II_MCQs_score: 81.25
TheoremQA: 31.25 GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
dingo_en_192: 43.75 math_accuracy: 18.75
dingo_zh_170: 100 wikibench-wiki-single_choice_cncircular_perf_4: 25
college: 12.5 sanitized_mbpp_score: 62.50
college_knowledge: 87.5 dingo_en_192_score: 31.25
bbh-logical_deduction_seven_objects: 50 dingo_zh_170_score: 93.75
bbh-multistep_arithmetic_two: 56.25 mmlu-other_accuracy: 76.92
mmlu-other: 76.92 cmmlu-china-specific_accuracy: 84.17
cmmlu-china-specific: 84.17 mmlu_pro_math_accuracy: 18.75
mmlu_pro_math: 18.75 bbh-logical_deduction_seven_objects_score: 50
bbh-multistep_arithmetic_two_score: 56.25
college_naive_average: 12.5
college_knowledge_naive_average: 87.5
internlm2_5-7b-turbomind:
objective:
race-high_accuracy: 89.28
ARC-c_accuracy: 52.2
BoolQ_accuracy: 89.72
triviaqa_wiki_1shot_score: 65.88
nq_open_1shot_score: 34.82
drop_accuracy: 68.1
bbh_naive_average: 72.15
GPQA_diamond_accuracy: 32.83
hellaswag_accuracy: 88.36
TheoremQA_score: 25
winogrande_accuracy: 81.29
gsm8k_accuracy: 74.68
GaokaoBench_weighted_average: 58.19
math_accuracy: 33.98
Mathbench_naive_average: 48.38
wikibench-wiki-single_choice_cncircular_perf_4: 29.1
cmmlu_naive_average: 78.94
mmlu_naive_average: 71.44
mmlu_pro_naive_average: 38.18
openai_humaneval_humaneval_pass@1: 59.76
openai_humaneval_v2_humaneval_pass@1: 51.22
sanitized_mbpp_score: 55.25
dingo_en_192_score: 60.94
dingo_zh_170_score: 67.65
mmlu-stem_naive_average: 63.72
mmlu-social-science_naive_average: 80.15
mmlu-humanities_naive_average: 74.27
mmlu-other_naive_average: 71.85
cmmlu-stem_naive_average: 67.07
cmmlu-social-science_naive_average: 81.49
cmmlu-humanities_naive_average: 85.84
cmmlu-other_naive_average: 82.69
cmmlu-china-specific_naive_average: 79.88
mmlu_pro_biology_accuracy: 58.58
mmlu_pro_business_accuracy: 28.01
mmlu_pro_chemistry_accuracy: 22.79
mmlu_pro_computer_science_accuracy: 39.02
mmlu_pro_economics_accuracy: 53.08
mmlu_pro_engineering_accuracy: 25.7
mmlu_pro_health_accuracy: 46.94
mmlu_pro_history_accuracy: 43.04
mmlu_pro_law_accuracy: 29.7
mmlu_pro_math_accuracy: 24.2
mmlu_pro_philosophy_accuracy: 42.48
mmlu_pro_physics_accuracy: 26.02
mmlu_pro_psychology_accuracy: 52.76
mmlu_pro_other_accuracy: 42.21
college_naive_average: 10.67
high_naive_average: 6.67
middle_naive_average: 26.67
primary_naive_average: 60
arithmetic_naive_average: 55
mathbench-a (average)_naive_average: 31.8
college_knowledge_naive_average: 62.34
high_knowledge_naive_average: 59.83
middle_knowledge_naive_average: 71.15
primary_knowledge_naive_average: 66.55
mathbench-t (average)_naive_average: 64.97
long_context:
Single-Needle-Retrieval(S-RT)-32000_naive_average: 100
Single-Needle-Retrieval-EN-32000_naive_average: 100
Single-Needle-Retrieval-ZH-32000_naive_average: 100
Single-Needle-Retrieval(S-RT)-100000_naive_average: 100
Single-Needle-Retrieval-EN-100000_naive_average: 100
Single-Needle-Retrieval-ZH-100000_naive_average: 100
Single-Needle-Retrieval(S-RT)-200000_naive_average: 100
Single-Needle-Retrieval-EN-200000_naive_average: 100
Single-Needle-Retrieval-ZH-200000_naive_average: 100
longbench_naive_average: 46.19
longbench_zh_naive_average: 49.3
longbench_en_naive_average: 43.97
longbench_single-document-qa_naive_average: 42.84
longbench_multi-document-qa_naive_average: 37.29
longbench_summarization_naive_average: 23.21
longbench_few-shot-learning_naive_average: 61.67
longbench_synthetic-tasks_naive_average: 60.05
longbench_code-completion_naive_average: 52.09
internlm2_5-7b-chat-turbomind:
objective:
race-high_accuracy: 86.16
ARC-c_accuracy: 90.17
BoolQ_accuracy: 87.89
triviaqa_wiki_1shot_score: 64.91
nq_open_1shot_score: 22.69
mmmlu_lite_naive_average: 44.96
IFEval_Prompt-level-strict-accuracy: 58.04
drop_accuracy: 77.68
bbh_naive_average: 73.14
GPQA_diamond_accuracy: 25.76
hellaswag_accuracy: 94.79
TheoremQA_score: 21.5
musr_average_naive_average: 51.03
korbench_single_naive_average: 31.92
ARC_Prize_Public_Evaluation_accuracy: 0.01
gsm8k_accuracy: 86.73
GaokaoBench_weighted_average: 77.89
math_accuracy: 61.5
cmo_fib_accuracy: 12.5
aime2024_accuracy: 3.33
Mathbench_naive_average: 65.17
wikibench-wiki-single_choice_cncircular_perf_4: 31.55
cmmlu_naive_average: 74.14
mmlu_naive_average: 70.52
mmlu_pro_naive_average: 44.98
openai_humaneval_humaneval_pass@1: 70.73
sanitized_mbpp_score: 63.81
humanevalx_naive_average: 38.17
ds1000_naive_average: 14.15
lcb_code_generation_pass@1: 17.75
lcb_code_execution_pass@1: 32.57
lcb_test_output_pass@1: 24.89
bigcodebench_hard_instruct_pass@1: 0.08
bigcodebench_hard_complete_pass@1: 0.06
teval_naive_average: 80.03
qa_dingo_cn_score: 99.01
mmlu-stem_naive_average: 68.2
mmlu-social-science_naive_average: 76.11
mmlu-humanities_naive_average: 68.71
mmlu-other_naive_average: 70.56
cmmlu-stem_naive_average: 66.27
cmmlu-social-science_naive_average: 75.7
cmmlu-humanities_naive_average: 77.7
cmmlu-other_naive_average: 77.71
cmmlu-china-specific_naive_average: 72.94
mmlu_pro_biology_accuracy: 66.25
mmlu_pro_business_accuracy: 48.42
mmlu_pro_chemistry_accuracy: 35.25
mmlu_pro_computer_science_accuracy: 47.56
mmlu_pro_economics_accuracy: 55.92
mmlu_pro_engineering_accuracy: 30.44
mmlu_pro_health_accuracy: 45.97
mmlu_pro_history_accuracy: 41.21
mmlu_pro_law_accuracy: 25.79
mmlu_pro_math_accuracy: 54.03
mmlu_pro_philosophy_accuracy: 36.47
mmlu_pro_physics_accuracy: 37.41
mmlu_pro_psychology_accuracy: 58.77
mmlu_pro_other_accuracy: 46.21
humanevalx-python_pass@1: 53.66
humanevalx-cpp_pass@1: 24.39
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 57.93
humanevalx-js_pass@1: 54.88
ds1000_Pandas_accuracy: 12.03
ds1000_Numpy_accuracy: 4.09
ds1000_Tensorflow_accuracy: 11.11
ds1000_Scipy_accuracy: 8.49
ds1000_Sklearn_accuracy: 6.96
ds1000_Pytorch_accuracy: 7.35
ds1000_Matplotlib_accuracy: 49.03
openai_mmmlu_lite_AR-XY_accuracy: 17.89
openai_mmmlu_lite_BN-BD_accuracy: 27.58
openai_mmmlu_lite_DE-DE_accuracy: 51.16
openai_mmmlu_lite_ES-LA_accuracy: 56.84
openai_mmmlu_lite_FR-FR_accuracy: 57.96
openai_mmmlu_lite_HI-IN_accuracy: 33.68
openai_mmmlu_lite_ID-ID_accuracy: 51.02
openai_mmmlu_lite_IT-IT_accuracy: 50.46
openai_mmmlu_lite_JA-JP_accuracy: 50.53
openai_mmmlu_lite_KO-KR_accuracy: 45.05
openai_mmmlu_lite_PT-BR_accuracy: 57.68
openai_mmmlu_lite_SW-KE_accuracy: 32.77
openai_mmmlu_lite_YO-NG_accuracy: 31.79
openai_mmmlu_lite_ZH-CN_accuracy: 65.05
college_naive_average: 20.33
high_naive_average: 47.67
middle_naive_average: 62
primary_naive_average: 72
arithmetic_naive_average: 62.33
mathbench-a (average)_naive_average: 52.87
college_knowledge_naive_average: 70.57
high_knowledge_naive_average: 70.13
middle_knowledge_naive_average: 81.17
primary_knowledge_naive_average: 88.01
mathbench-t (average)_naive_average: 77.47
subjective:
alignment_bench_v1_1_总分: 5.68
alpaca_eval_total: 25.96
arenahard_score: 17.15
Followbench_naive_average: 0.81
CompassArena_naive_average: 34.61
FoFo_naive_average: 0.38
mtbench101_avg: 8.01
wildbench_average: -15.69
simpleqa_accuracy_given_attempted: 0.04
chinese_simpleqa_given_attempted_accuracy: 0.34
alignment_bench_v1_1_专业能力: 6.05
alignment_bench_v1_1_数学计算: 5.87
alignment_bench_v1_1_基本任务: 6.01
alignment_bench_v1_1_逻辑推理: 4.48
alignment_bench_v1_1_中文理解: 6.17
alignment_bench_v1_1_文本写作: 6.06
alignment_bench_v1_1_角色扮演: 6.3
alignment_bench_v1_1_综合问答: 6.45
alpaca_eval_helpful_base: 17.83
alpaca_eval_koala: 28.21
alpaca_eval_oasst: 23.4
alpaca_eval_selfinstruct: 30.95
alpaca_eval_vicuna: 25
compassarena_language_naive_average: 52.5
compassarena_knowledge_naive_average: 36
compassarena_reason_v2_naive_average: 35
compassarena_math_v2_naive_average: 19.91
compassarena_creationv2_zh_naive_average: 29.64
fofo_test_prompts_overall: 0.35
fofo_test_prompts_cn_overall: 0.41
followbench_llmeval_en_HSR_AVG: 0.73
followbench_llmeval_en_SSR_AVG: 0.88
followbench_llmeval_en_HSR_L1: 0.94
followbench_llmeval_en_HSR_L2: 0.77
followbench_llmeval_en_HSR_L3: 0.73
followbench_llmeval_en_HSR_L4: 0.68
followbench_llmeval_en_HSR_L5: 0.54
followbench_llmeval_en_SSR_L1: 0.94
followbench_llmeval_en_SSR_L2: 0.88
followbench_llmeval_en_SSR_L3: 0.87
followbench_llmeval_en_SSR_L4: 0.87
followbench_llmeval_en_SSR_L5: 0.85
simpleqa_f1: 0.04
internlm2_5-7b-chat-1m-turbomind:
long_context:
ruler_8k_naive_average: 88.53
ruler_32k_naive_average: 83.84
ruler_128k_naive_average: 70.94
NeedleBench-Overall-Score-8K_weighted_average: 91.89
NeedleBench-Overall-Score-32K_weighted_average: 91.42
NeedleBench-Overall-Score-128K_weighted_average: 88.57
longbench_naive_average: 46.44
longbench_zh_naive_average: 45.19
longbench_en_naive_average: 45.71
babilong_0k_naive_average: 79.3
babilong_4k_naive_average: 67
babilong_16k_naive_average: 52.7
babilong_32k_naive_average: 48.9
babilong_128k_naive_average: 40.8
babilong_256k_naive_average: 23.5
longbench_single-document-qa_naive_average: 43.56
longbench_multi-document-qa_naive_average: 46.24
longbench_summarization_naive_average: 24.32
longbench_few-shot-learning_naive_average: 51.67
longbench_synthetic-tasks_naive_average: 66.83
longbench_code-completion_naive_average: 45.99
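
Besides the key renaming, the fullbench baselines gain a level of nesting: each model's scores are now grouped under capability buckets (objective, subjective, long_context). Any consumer that previously read baseline[model][dataset] needs the bucket as well; a minimal sketch, with the helper name and default purely illustrative:

def get_baseline(baseline_scores, model, dataset, bucket='objective'):
    # `bucket` is one of 'objective', 'subjective' or 'long_context'.
    # This helper is an assumption, not part of the PR.
    return baseline_scores.get(model, {}).get(bucket, {}).get(dataset)
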

View File

@ -1,459 +1,468 @@
baichuan2-7b-chat-hf: chat:
gsm8k: 18.75 glm-4-9b-chat-hf:
race-high: 78.12 gsm8k_accuracy: 68.75
race-high_accuracy: 90.62
glm-4-9b-chat-hf: glm-4-9b-chat-turbomind:
gsm8k: 68.75 gsm8k_accuracy: 71.88
race-high: 90.62 race-high_accuracy: 90.62
glm-4-9b-chat-vllm:
glm-4-9b-chat-turbomind: gsm8k_accuracy: 65.62
gsm8k: 75.00 race-high_accuracy: 90.62
race-high: 90.62 deepseek-7b-chat-hf:
gsm8k_accuracy: 46.88
glm-4-9b-chat-vllm: race-high_accuracy: 81.25
gsm8k: 65.62 deepseek-moe-16b-chat-hf:
race-high: 90.62 gsm8k_accuracy: 50
race-high_accuracy: 68.75
deepseek-7b-chat-hf: deepseek-7b-chat-vllm:
gsm8k: 46.88 gsm8k_accuracy: 43.75
race-high: 81.25 race-high_accuracy: 75
gemma2-2b-it-hf:
deepseek-moe-16b-chat-hf: gsm8k_accuracy: 50
gsm8k: 50 race-high_accuracy: 71.88
race-high: 68.75 gemma2-9b-it-hf:
gsm8k_accuracy: 71.88
deepseek-7b-chat-vllm: race-high_accuracy: 84.38
gsm8k: 43.75 gemma-2b-it-hf:
race-high: 75 gsm8k_accuracy: 3.12
race-high_accuracy: 40.62
gemma2-2b-it-hf: gemma-7b-it-hf:
gsm8k: 50 gsm8k_accuracy: 40.62
race-high: 71.88 race-high_accuracy: 68.75
gemma-2-9b-it-turbomind:
gemma2-9b-it-hf: gsm8k_accuracy: 71.88
gsm8k: 71.88 race-high_accuracy: 84.38
race-high: 84.38 gemma-2-27b-it-turbomind:
gsm8k_accuracy: 78.12
gemma-2b-it-hf: race-high_accuracy: 93.75
gsm8k: 3.12 gemma-7b-it-vllm:
race-high: 40.62 gsm8k_accuracy: 34.38
race-high_accuracy: 68.75
gemma-7b-it-hf: internlm2_5-7b-chat-hf:
gsm8k: 40.62 gsm8k_accuracy: 84.38
race-high: 68.75 race-high_accuracy: 90.62
internlm2_5-7b-chat-turbomind:
gemma-2-9b-it-turbomind: gsm8k_accuracy: 87.50
gsm8k: 65.62 race-high_accuracy: 90.62
race-high: 84.38 internlm2-chat-1.8b-turbomind:
gsm8k_accuracy: 28.12
gemma-7b-it-vllm: race-high_accuracy: 84.38
gsm8k: 34.38 internlm2-chat-1.8b-sft-turbomind:
race-high: 68.75 gsm8k_accuracy: 21.88
race-high_accuracy: 84.38
internlm2_5-7b-chat-hf: internlm2-chat-7b-lmdeploy:
gsm8k: 84.38 gsm8k_accuracy: 53.12
race-high: 90.62 race-high_accuracy: 84.38
internlm2-chat-7b-sft-turbomind:
internlm2_5-7b-chat-turbomind: gsm8k_accuracy: 53.12
gsm8k: 84.38 race-high_accuracy: 90.62
race-high: 90.62 internlm2-chat-7b-vllm:
gsm8k_accuracy: 56.25
internlm2-chat-1.8b-turbomind: race-high_accuracy: 84.38
gsm8k: 25 llama-3_1-8b-instruct-hf:
race-high: 84.38 gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
internlm2-chat-1.8b-sft-turbomind: llama-3_2-3b-instruct-hf:
gsm8k: 21.88 gsm8k_accuracy: 68.75
race-high: 84.38 race-high_accuracy: 81.25
llama-3-8b-instruct-hf:
internlm2-chat-7b-lmdeploy: gsm8k_accuracy: 68.75
gsm8k: 53.12 race-high_accuracy: 87.5
race-high: 84.38 llama-2-7b-chat-turbomind:
gsm8k_accuracy: 18.75
internlm2-chat-7b-sft-turbomind: race-high_accuracy: 46.88
gsm8k: 50 llama-3_1-8b-instruct-turbomind:
race-high: 90.62 gsm8k_accuracy: 78.12
race-high_accuracy: 90.62
internlm2-chat-7b-vllm: llama-3_2-3b-instruct-turbomind:
gsm8k: 43.75 gsm8k_accuracy: 71.88
race-high: 87.5 race-high_accuracy: 81.25
llama-3-8b-instruct-turbomind:
llama-3_1-8b-instruct-hf: gsm8k_accuracy: 71.88
gsm8k: 84.38 race-high_accuracy: 87.5
race-high: 90.62 mistral-7b-instruct-v0.2-hf:
gsm8k_accuracy: 40.62
llama-3_2-3b-instruct-hf: race-high_accuracy: 75
gsm8k: 65.62 mistral-7b-instruct-v0.3-hf:
race-high: 81.25 gsm8k_accuracy: 40.62
race-high_accuracy: 75
llama-3-8b-instruct-hf: mistral-nemo-instruct-2407-hf:
gsm8k: 68.75 gsm8k_accuracy: 75
race-high: 87.5 race-high_accuracy: 81.25
mistral-nemo-instruct-2407-turbomind:
llama-3_1-8b-instruct-turbomind: gsm8k_accuracy: 65.62
gsm8k: 78.12 race-high_accuracy: 87.50
race-high: 90.62 mistral-7b-instruct-v0.1-vllm:
gsm8k_accuracy: 34.38
llama-3_2-3b-instruct-turbomind: race-high_accuracy: 68.75
gsm8k: 62.50 mistral-7b-instruct-v0.2-vllm:
race-high: 81.25 gsm8k_accuracy: 43.75
race-high_accuracy: 75
llama-3-8b-instruct-turbomind: MiniCPM3-4B-hf:
gsm8k: 68.75 gsm8k_accuracy: 68.75
race-high: 87.5 race-high_accuracy: 84.38
phi-3-mini-4k-instruct-hf:
mistral-7b-instruct-v0.2-hf: gsm8k_accuracy: 56.25
gsm8k: 40.62 race-high_accuracy: 84.38
race-high: 75 phi-3-small-8k-instruct-hf:
gsm8k_accuracy: 0
mistral-7b-instruct-v0.3-hf: race-high_accuracy: 0
gsm8k: 40.62 qwen2.5-0.5b-instruct-hf:
race-high: 75 gsm8k_accuracy: 34.38
race-high_accuracy: 46.88
mistral-nemo-instruct-2407-hf: qwen2.5-3b-instruct-hf:
gsm8k: 75 gsm8k_accuracy: 53.12
race-high: 81.25 race-high_accuracy: 90.62
qwen2.5-0.5b-instruct-turbomind:
mistral-nemo-instruct-2407-turbomind: gsm8k_accuracy: 28.12
gsm8k: 68.75 race-high_accuracy: 50
race-high: 87.50 qwen2.5-3b-instruct-turbomind:
gsm8k_accuracy: 59.38
mistral-7b-instruct-v0.1-vllm: race-high_accuracy: 90.62
gsm8k: 34.38 qwen1.5-0.5b-chat-hf:
race-high: 68.75 gsm8k_accuracy: 0
race-high_accuracy: 53.12
mistral-7b-instruct-v0.2-vllm: qwen2-1.5b-instruct-hf:
gsm8k: 43.75 gsm8k_accuracy: 62.5
race-high: 75 race-high_accuracy: 84.38
qwen2-7b-instruct-hf:
MiniCPM3-4B-hf: gsm8k_accuracy: 68.75
gsm8k: 68.75 race-high_accuracy: 90.62
race-high: 84.38 qwen2-1.5b-instruct-turbomind:
gsm8k_accuracy: 53.12
minicpm-2b-dpo-fp32-hf: race-high_accuracy: 84.38
gsm8k: 56.25 qwen2-7b-instruct-turbomind:
race-high: 53.12 gsm8k_accuracy: 81.25
race-high_accuracy: 90.62
minicpm-2b-sft-bf16-hf: qwen1.5-0.5b-chat-vllm:
gsm8k: 46.88 gsm8k_accuracy: 3.12
race-high: 65.62 race-high_accuracy: 53.12
yi-1.5-6b-chat-hf:
minicpm-2b-sft-fp32-hf: gsm8k_accuracy: 65.62
gsm8k: 46.88 race-high_accuracy: 84.38
race-high: 65.62 yi-1.5-9b-chat-hf:
gsm8k_accuracy: 75
phi-3-mini-4k-instruct-hf: race-high_accuracy: 93.75
gsm8k: 56.25 yi-1.5-6b-chat-turbomind:
race-high: 84.38 gsm8k_accuracy: 62.5
race-high_accuracy: 84.38
qwen1.5-0.5b-chat-hf: yi-1.5-9b-chat-turbomind:
gsm8k: 0 gsm8k_accuracy: 71.88
race-high: 53.12 race-high_accuracy: 93.75
deepseek-v2-lite-chat-hf:
qwen2-1.5b-instruct-hf: gsm8k_accuracy: 46.88
gsm8k: 62.5 race-high_accuracy: 71.88
race-high: 84.38 gemma2-27b-it-hf:
gsm8k_accuracy: 75
qwen2-7b-instruct-hf: race-high_accuracy: 93.75
gsm8k: 68.75 internlm2_5-20b-chat-hf:
race-high: 90.62 gsm8k_accuracy: 84.38
race-high_accuracy: 87.5
qwen2-1.5b-instruct-turbomind: internlm2_5-20b-chat-turbomind:
gsm8k: 62.50 gsm8k_accuracy: 87.50
race-high: 84.38 race-high_accuracy: 87.5
mistral-small-instruct-2409-hf:
qwen2-7b-instruct-turbomind: gsm8k_accuracy: 81.25
gsm8k: 81.25 race-high_accuracy: 87.50
race-high: 87.5 mistral-small-instruct-2409-turbomind:
gsm8k_accuracy: 81.25
qwen1.5-0.5b-chat-vllm: race-high_accuracy: 87.50
gsm8k: 3.12 qwen2.5-14b-instruct-hf:
race-high: 53.12 gsm8k_accuracy: 71.88
race-high_accuracy: 96.88
yi-1.5-6b-chat-hf: qwen2.5-14b-instruct-turbomind:
gsm8k: 65.62 gsm8k_accuracy: 68.75
race-high: 84.38 race-high_accuracy: 93.75
yi-1.5-34b-chat-turbomind:
yi-1.5-9b-chat-hf: gsm8k_accuracy: 78.12
gsm8k: 75 race-high_accuracy: 93.75
race-high: 93.75 deepseek-67b-chat-hf:
gsm8k_accuracy: 71.88
deepseek-v2-lite-chat-hf: race-high_accuracy: 78.12
gsm8k: 43.75 llama-3_3-70b-instruct-turbomind:
race-high: 71.88 gsm8k_accuracy: 93.75
race-high_accuracy: 87.5
internlm2_5-20b-chat-hf: mixtral-8x7b-instruct-v0.1-hf:
gsm8k: 84.38 gsm8k_accuracy: 56.25
race-high: 87.5 race-high_accuracy: 81.25
mixtral-large-instruct-2411-turbomind:
internlm2_5-20b-chat-turbomind: gsm8k_accuracy: 90.62
gsm8k: 84.38 race-high_accuracy: 93.75
race-high: 87.5 nvidia-3_1-Nemotron-70b-instruct-HF-turbomind:
gsm8k_accuracy: 87.5
mistral-small-instruct-2409-hf: race-high_accuracy: 46.88
gsm8k: 81.25 qwen2.5-72b-instruct-turbomind:
race-high: 87.50 gsm8k_accuracy: 75
race-high_accuracy: 93.75
mistral-small-instruct-2409-turbomind: deepseek-v2_5-1210-turbomind:
gsm8k: 78.12 gsm8k_accuracy: 90.62
race-high: 87.50 race-high_accuracy: 84.38
mixtral-8x22b-instruct-v0.1-hf:
qwen2.5-14b-instruct-hf: gsm8k_accuracy: 81.25
gsm8k: 71.88 race-high_accuracy: 81.25
race-high: 96.88 base:
glm-4-9b-hf:
qwen2.5-14b-instruct-turbomind: gsm8k_accuracy: 68.75
gsm8k: 71.88 GPQA_diamond_accuracy: 31.25
race-high: 93.75 race-high_accuracy: 93.75
winogrande_accuracy: 84.38
glm-4-9b-hf: glm-4-9b-turbomind:
gsm8k: 68.75 gsm8k_accuracy: 62.5
GPQA_diamond: 31.25 GPQA_diamond_accuracy: 28.12
race-high: 93.75 race-high_accuracy: 93.75
winogrande: 84.38 winogrande_accuracy: 84.38
deepseek-7b-base-hf:
deepseek-moe-16b-base-hf: gsm8k_accuracy: 25
gsm8k: 21.88 GPQA_diamond_accuracy: 0
GPQA_diamond: 0 race-high_accuracy: 46.88
race-high: 21.88 winogrande_accuracy: 71.88
winogrande: 65.62 deepseek-moe-16b-base-hf:
gsm8k_accuracy: 21.88
deepseek-7b-base-turbomind: GPQA_diamond_accuracy: 0
gsm8k: 21.88 race-high_accuracy: 21.88
GPQA_diamond: 0 winogrande_accuracy: 65.62
race-high: 46.88 deepseek-7b-base-turbomind:
winogrande: 84.38 gsm8k_accuracy: 21.88
GPQA_diamond_accuracy: 0
deepseek-moe-16b-base-vllm: race-high_accuracy: 46.88
gsm8k: 21.88 winogrande_accuracy: 84.38
GPQA_diamond: 0 deepseek-moe-16b-base-vllm:
race-high: 25 gsm8k_accuracy: 21.88
winogrande: 68.75 GPQA_diamond_accuracy: 0
race-high_accuracy: 25
gemma2-2b-hf: winogrande_accuracy: 68.75
gsm8k: 31.25 gemma2-2b-hf:
GPQA_diamond: 3.12 gsm8k_accuracy: 28.12
race-high: 56.25 GPQA_diamond_accuracy: 3.12
winogrande: 71.88 race-high_accuracy: 56.25
winogrande_accuracy: 71.88
gemma2-9b-hf: gemma2-9b-hf:
gsm8k: 68.75 gsm8k_accuracy: 68.75
GPQA_diamond: 0 GPQA_diamond_accuracy: 0
race-high: 81.25 race-high_accuracy: 81.25
winogrande: 84.38 winogrande_accuracy: 84.38
gemma-2b-hf:
gemma-2b-hf: gsm8k_accuracy: 18.75
gsm8k: 18.75 GPQA_diamond_accuracy: 3.12
GPQA_diamond: 3.12 race-high_accuracy: 25
race-high: 25 winogrande_accuracy: 53.12
winogrande: 53.12 gemma-7b-hf:
gsm8k_accuracy: 56.25
gemma-7b-hf: GPQA_diamond_accuracy: 6.25
gsm8k: 56.25 race-high_accuracy: 65.62
GPQA_diamond: 6.25 winogrande_accuracy: 78.12
race-high: 65.62 gemma-2b-vllm:
winogrande: 78.12 gsm8k_accuracy: 15.62
GPQA_diamond_accuracy: 3.12
gemma-2b-vllm: race-high_accuracy:
gsm8k: 15.62 winogrande_accuracy:
GPQA_diamond: 6.25 gemma-7b-vllm:
race-high: gsm8k_accuracy: 53.12
winogrande: GPQA_diamond_accuracy: 9.38
race-high_accuracy:
gemma-7b-vllm: winogrande_accuracy:
gsm8k: 53.12 internlm2_5-7b-hf:
GPQA_diamond: 6.25 gsm8k_accuracy: 37.5
race-high: GPQA_diamond_accuracy: 25
winogrande: race-high_accuracy: 93.75
winogrande_accuracy: 71.88
internlm2_5-7b-hf: internlm2-7b-hf:
gsm8k: 37.5 gsm8k_accuracy: 53.12
GPQA_diamond: 25 GPQA_diamond_accuracy: 18.75
race-high: 93.75 race-high_accuracy: 62.5
winogrande: 71.88 winogrande_accuracy: 78.12
internlm2-base-7b-hf:
internlm2-7b-hf: gsm8k_accuracy: 3.12
gsm8k: 53.12 GPQA_diamond_accuracy: 21.88
GPQA_diamond: 18.75 race-high_accuracy: 75
race-high: 62.5 winogrande_accuracy: 65.62
winogrande: 78.12 internlm2-1.8b-turbomind:
gsm8k_accuracy: 12.5
internlm2-base-7b-hf: GPQA_diamond_accuracy: 9.38
gsm8k: 3.12 race-high_accuracy: 71.88
GPQA_diamond: 21.88 winogrande_accuracy: 78.12
race-high: 75 internlm2_5-7b-turbomind:
winogrande: 65.62 gsm8k_accuracy: 62.50
GPQA_diamond_accuracy: 34.38
internlm2-1.8b-turbomind: race-high_accuracy: 93.75
gsm8k: 12.5 winogrande_accuracy: 87.50
GPQA_diamond: 12.5 internlm2-7b-turbomind:
race-high: 71.88 gsm8k_accuracy: 53.12
winogrande: 75 GPQA_diamond_accuracy: 21.88
race-high_accuracy: 71.88
internlm2_5-7b-turbomind: winogrande_accuracy: 84.38
gsm8k: 68.75 internlm2-base-7b-turbomind:
GPQA_diamond: 31.25 gsm8k_accuracy: 37.50
race-high: 93.75 GPQA_diamond_accuracy: 28.12
winogrande: 84.38 race-high_accuracy: 81.25
winogrande_accuracy: 75
internlm2-7b-turbomind: llama-2-7b-hf:
gsm8k: 56.25 gsm8k_accuracy: 21.88
GPQA_diamond: 21.88 GPQA_diamond_accuracy: 21.88
race-high: 75 race-high_accuracy: 40.62
winogrande: 81.25 winogrande_accuracy: 71.88
llama-3_1-8b-hf:
internlm2-base-7b-turbomind: gsm8k_accuracy: 78.12
gsm8k: 40.62 GPQA_diamond_accuracy: 25
GPQA_diamond: 28.12 race-high_accuracy: 90.62
race-high: 84.38 winogrande_accuracy: 62.5
winogrande: 71.88 llama-3-8b-hf:
gsm8k_accuracy: 46.88
llama-2-7b-hf: GPQA_diamond_accuracy: 6.25
gsm8k: 21.88 race-high_accuracy: 65.62
GPQA_diamond: 21.88 winogrande_accuracy: 65.62
race-high: 40.62 llama-3.1-8b-turbomind:
winogrande: 71.88 gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 9.38
llama-3_1-8b-hf: race-high_accuracy: 78.12
gsm8k: 78.12 winogrande_accuracy: 78.12
GPQA_diamond: 25 llama-3-8b-turbomind:
race-high: 90.62 gsm8k_accuracy: 50
winogrande: 62.5 GPQA_diamond_accuracy: 12.50
race-high_accuracy: 65.62
llama-3-8b-hf: winogrande_accuracy: 78.12
gsm8k: 46.88 mistral-7b-v0.2-hf:
GPQA_diamond: 6.25 gsm8k_accuracy: 31.25
race-high: 65.62 GPQA_diamond_accuracy: 6.25
winogrande: 65.62 race-high_accuracy: 62.5
winogrande_accuracy: 59.38
llama-3.1-8b-turbomind: mistral-7b-v0.3-hf:
gsm8k: 56.25 gsm8k_accuracy: 31.25
GPQA_diamond: 6.25 GPQA_diamond_accuracy: 6.25
race-high: 78.12 race-high_accuracy: 62.5
winogrande: 78.12 winogrande_accuracy: 59.38
mistral-7b-v0.2-vllm:
llama-3-8b-turbomind: gsm8k_accuracy: 34.38
gsm8k: 50 GPQA_diamond_accuracy: 6.25
GPQA_diamond: 9.38 race-high_accuracy: 62.5
race-high: 65.62 winogrande_accuracy: 65.62
winogrande: 78.12 qwen2.5-7b-hf:
gsm8k_accuracy: 81.25
mistral-7b-v0.2-hf: GPQA_diamond_accuracy: 18.75
gsm8k: 31.25 race-high_accuracy: 87.5
GPQA_diamond: 6.25 winogrande_accuracy: 71.88
race-high: 62.5 qwen2.5-1.5b-turbomind:
winogrande: 59.38 gsm8k_accuracy: 62.50
GPQA_diamond_accuracy: 12.50
mistral-7b-v0.3-hf: race-high_accuracy: 78.12
gsm8k: 31.25 winogrande_accuracy: 68.75
GPQA_diamond: 6.25 qwen2.5-7b-turbomind:
race-high: 62.5 gsm8k_accuracy: 75.00
winogrande: 59.38 GPQA_diamond_accuracy: 25
race-high_accuracy: 87.5
mistral-7b-v0.2-vllm: winogrande_accuracy: 71.88
gsm8k: 34.38 qwen1.5-moe-a2.7b-hf:
GPQA_diamond: 6.25 gsm8k_accuracy: 62.5
race-high: 62.5 GPQA_diamond_accuracy: 18.75
winogrande: 65.62 race-high_accuracy: 84.38
winogrande_accuracy: 75
qwen2.5-7b-hf: qwen2-0.5b-hf:
gsm8k: 81.25 gsm8k_accuracy: 25
GPQA_diamond: 18.75 GPQA_diamond_accuracy: 0
race-high: 87.5 race-high_accuracy: 40.62
winogrande: 71.88 winogrande_accuracy: 62.5
qwen2-1.5b-hf:
qwen2.5-1.5b-turbomind: gsm8k_accuracy: 59.38
gsm8k: 71.88 GPQA_diamond_accuracy: 9.38
GPQA_diamond: 15.62 race-high_accuracy: 81.25
race-high: 78.12 winogrande_accuracy: 62.5
winogrande: 71.88 qwen2-7b-hf:
gsm8k_accuracy: 68.75
qwen2.5-7b-turbomind: GPQA_diamond_accuracy: 9.38
gsm8k: 71.88 race-high_accuracy: 87.5
GPQA_diamond: 25 winogrande_accuracy: 68.75
race-high: 87.5 qwen2-1.5b-turbomind:
winogrande: 71.88 gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 9.38
qwen1.5-moe-a2.7b-hf: race-high_accuracy: 81.25
gsm8k: 62.5 winogrande_accuracy: 75
GPQA_diamond: 18.75 qwen2-7b-turbomind:
race-high: 84.38 gsm8k_accuracy: 75.00
winogrande: 75 GPQA_diamond_accuracy: 12.5
race-high_accuracy: 87.5
qwen2-0.5b-hf: winogrande_accuracy: 71.88
gsm8k: 25 qwen1.5-0.5b-vllm:
GPQA_diamond: 0 gsm8k_accuracy: 9.38
race-high: 40.62 GPQA_diamond_accuracy: 0
winogrande: 62.5 race-high_accuracy: 56.25
winogrande_accuracy: 62.5
qwen2-1.5b-hf: yi-1.5-6b-hf:
gsm8k: 59.38 gsm8k_accuracy: 62.5
GPQA_diamond: 9.38 GPQA_diamond_accuracy: 3.12
race-high: 81.25 race-high_accuracy: 87.5
winogrande: 62.5 winogrande_accuracy: 62.5
yi-1.5-9b-hf:
qwen2-7b-hf: gsm8k_accuracy: 75
gsm8k: 68.75 GPQA_diamond_accuracy: 40.62
GPQA_diamond: 9.38 race-high_accuracy: 87.5
race-high: 87.5 winogrande_accuracy: 59.38
winogrande: 68.75 yi-1.5-9b-turbomind:
gsm8k_accuracy: 78.12
qwen2-1.5b-turbomind: GPQA_diamond_accuracy: 40.62
gsm8k: 62.50 race-high_accuracy: 87.5
GPQA_diamond: 6.25 winogrande_accuracy: 71.88
race-high: 81.25 deepseek-v2-lite-hf:
winogrande: 75 gsm8k_accuracy: 31.25
GPQA_diamond_accuracy: 28.12
qwen2-7b-turbomind: race-high_accuracy: 59.38
gsm8k: 68.75 winogrande_accuracy: 71.88
GPQA_diamond: 12.5 internlm2-20b-hf:
race-high: 87.5 gsm8k_accuracy: 56.25
winogrande: 71.88 GPQA_diamond_accuracy: 15.62
race-high_accuracy: 68.75
qwen1.5-0.5b-vllm: winogrande_accuracy: 75
gsm8k: 9.38 internlm2-base-20b-hf:
GPQA_diamond: 0 gsm8k_accuracy: 12.5
race-high: 56.25 GPQA_diamond_accuracy: 9.38
winogrande: 62.5 race-high_accuracy: 84.38
winogrande_accuracy: 65.62
yi-1.5-6b-hf: internlm2-20b-turbomind:
gsm8k: 62.5 gsm8k_accuracy: 71.88
GPQA_diamond: 3.12 GPQA_diamond_accuracy: 15.62
race-high: 87.5 race-high_accuracy: 68.75
winogrande: 62.5 winogrande_accuracy: 81.25
qwen2.5-14b-hf:
yi-1.5-9b-hf: gsm8k_accuracy: 75
gsm8k: 75 GPQA_diamond_accuracy: 37.5
GPQA_diamond: 40.62 race-high_accuracy: 93.75
race-high: 87.5 winogrande_accuracy: 84.38
winogrande: 59.38 qwen2.5-32b-hf:
gsm8k_accuracy: 87.5
deepseek-v2-lite-hf: GPQA_diamond_accuracy: 31.25
gsm8k: 28.12 race-high_accuracy: 93.75
GPQA_diamond: 21.88 winogrande_accuracy: 78.12
race-high: 59.38 qwen2.5-32b-turbomind:
winogrande: 75 gsm8k_accuracy: 84.38
GPQA_diamond_accuracy: 28.12
internlm2-20b-hf: race-high_accuracy: 93.75
gsm8k: 56.25 winogrande_accuracy: 81.25
GPQA_diamond: 15.62 deepseek-67b-base-hf:
race-high: 68.75 gsm8k_accuracy: 59.38
winogrande: 75 GPQA_diamond_accuracy: 31.25
race-high_accuracy: 81.25
internlm2-base-20b-hf: winogrande_accuracy: 90.62
gsm8k: 12.5 deepseek-67b-base-turbomind:
GPQA_diamond: 9.38 gsm8k_accuracy: 56.25
race-high: 84.38 GPQA_diamond_accuracy: 28.12
winogrande: 65.62 race-high_accuracy: 81.25
winogrande_accuracy: 84.38
internlm2-20b-turbomind: llama-3-70b-turbomind:
gsm8k: 68.75 gsm8k_accuracy: 59.38
GPQA_diamond: 15.62 GPQA_diamond_accuracy: 9.38
race-high: 68.75 race-high_accuracy: 93.75
winogrande: 81.25 winogrande_accuracy: 84.38
qwen2.5-72b-turbomind:
qwen2.5-14b-hf: gsm8k_accuracy: 84.38
gsm8k: 75 GPQA_diamond_accuracy: 34.38
GPQA_diamond: 37.5 race-high_accuracy: 93.75
race-high: 93.75 winogrande_accuracy: 87.5
winogrande: 84.38 deepseek-v2-turbomind:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 3.12
race-high_accuracy: 81.25
winogrande_accuracy: 68.75
llama-3-70b-hf:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 3.12
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
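
This file gets the same restructuring at the top level: the flat model list is regrouped under chat: and base: headings, and every score key picks up its metric suffix. A short sketch of flattening the groups back into a single model-keyed dict, assuming model names stay unique across the two groups:

def flatten_model_groups(baseline_scores):
    # Merge the chat/base groups into one {model: scores} dict (sketch).
    flat = {}
    for group in ('chat', 'base'):
        flat.update(baseline_scores.get(group, {}))
    return flat
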

View File

@ -28,39 +28,39 @@ on:
description: 'Set branch or tag or commit id. Default is "main"' description: 'Set branch or tag or commit id. Default is "main"'
type: string type: string
default: 'main' default: 'main'
regression_func: regression_func_volc:
required: true required: true
description: 'regression functions' description: 'regression functions'
type: string type: string
default: "['chat_models','base_models', 'chat_obj_fullbench', 'chat_sub_fullbench', 'base_fullbench','cmd', 'api']" default: "['chat_models','base_models', 'chat_obj_fullbench', 'base_fullbench']"
cuda_env: regression_func_local:
required: true required: true
description: "regression conda env, eg. ['dsw_cu11','dsw_cu12']" description: 'regression functions'
type: string type: string
default: "['dsw_cu12']" default: "['cmd', 'api', 'chat_sub_fullbench']"
fullbench_eval:
required: true
description: 'fullbench volc functions'
type: string
default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']"
schedule: schedule:
- cron: '15 16 * * *' - cron: '15 14 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env: env:
CONDA_ENV: opencompass_regression
PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache
HF_DATASETS_OFFLINE: 1 HF_DATASETS_OFFLINE: 1
HF_EVALUATE_OFFLINE: 1 HF_EVALUATE_OFFLINE: 1
TRANSFORMERS_OFFLINE: 1 TRANSFORMERS_OFFLINE: 1
VLLM_USE_MODELSCOPE: false VLLM_USE_MODELSCOPE: false
LMDEPLOY_USE_MODELSCOPE: false LMDEPLOY_USE_MODELSCOPE: false
HF_HUB_OFFLINE: 1 HF_HUB_OFFLINE: 1
TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas
REPORT_ROOT: /cpfs01/shared/public/qa-llm-cicd/report
OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }} OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
CONDA_PATH: /fs-computility/llm/qa-llm-cicd/miniconda3
PIP_CACHE_PATH: /fs-computility/llm/qa-llm-cicd/.cache/pip
REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/regression
COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache
HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
CONDA_ENV: regression_test
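
The workflow's inputs are reorganized to match the new runner split: regression_func becomes regression_func_volc (jobs that run on the volc cluster) plus regression_func_local (cmd, api and the subjective fullbench, which stay on a local runner), and a new fullbench_eval input drives a separate fullbench matrix. Each input is a JSON-encoded list that the matrix expands via fromJSON with a default fallback; a hedged Python sketch of that expansion:

import json

def expand_matrix(user_input, default):
    # Mimics `fromJSON(github.event.inputs.x || 'default')` (sketch).
    return json.loads(user_input) if user_input else json.loads(default)

funcs = expand_matrix(
    None, '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')
# -> one matrix job per entry: chat_models, base_models, ...
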
jobs: jobs:
build-pypi: build-pypi:
@ -124,11 +124,7 @@ jobs:
prepare_env: prepare_env:
if: ${{!cancelled()}} if: ${{!cancelled()}}
needs: ['build-pypi', 'build-pypi-lmdeploy'] needs: ['build-pypi', 'build-pypi-lmdeploy']
strategy: runs-on: volc_cu12
fail-fast: false
matrix:
cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}}
runs-on: ${{ matrix.cuda_env }}
environment: 'prod' environment: 'prod'
timeout-minutes: 240 #4hours timeout-minutes: 240 #4hours
steps: steps:
@ -144,71 +140,52 @@ jobs:
- name: Remove Conda Env - name: Remove Conda Env
if: always() if: always()
run: | run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda env remove -y --name ${{env.CONDA_ENV}}
conda info --envs conda info --envs
- name: Prepare - create conda env and install torch - cu11
if: ${{matrix.cuda_env == 'dsw_cu11'}}
uses: nick-fields/retry@v3
id: retry1
with:
max_attempts: 3
timeout_minutes: 40
command: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu11.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip uninstall torch torchvision torchaudio -y
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu11torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
- name: Prepare - create conda env and install torch - cu12 - name: Prepare - create conda env and install torch - cu12
if: ${{matrix.cuda_env == 'dsw_cu12'}}
uses: nick-fields/retry@v3 uses: nick-fields/retry@v3
id: retry2
with: with:
max_attempts: 3 max_attempts: 1
timeout_minutes: 40 timeout_minutes: 240
command: | command: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate . ${{env.CONDA_PATH}}/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10 conda create -y --name ${{env.CONDA_ENV}} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda activate ${{env.CONDA_ENV}}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu12.txt --cache-dir ${{env.PIP_CACHE_PATH}} pip install -r /fs-computility/llm/qa-llm-cicd/config/requirements.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}} pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}} pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}} pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}} pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /fs-computility/llm/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
pip list
- name: Prepare - reinstall lmdeploy - cu12 - name: Prepare - reinstall lmdeploy - cu12
if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}} if: ${{inputs.build_lmdeploy}}
uses: actions/download-artifact@v4 uses: actions/download-artifact@v4
with: with:
name: my-artifact-${{ github.run_id }}-py310 name: my-artifact-${{ github.run_id }}-py310
- name: Prepare - reinstall lmdeploy - cu12 - name: Prepare - reinstall lmdeploy - cu12
if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}} if: ${{inputs.build_lmdeploy}}
run: | run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate . ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda activate ${{env.CONDA_ENV}}
pip install lmdeploy-*.whl --no-deps pip install lmdeploy-*.whl --no-deps
- name: conda env
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
pip list
daily_run_test: daily_run_test_volc:
if: ${{!cancelled()}} if: ${{!cancelled()}}
needs: prepare_env needs: prepare_env
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}} regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
regression_func: ${{fromJSON(github.event.inputs.regression_func || '["chat_models","base_models","chat_obj_fullbench","chat_sub_fullbench","base_fullbench","cmd","api"]')}} runs-on: volc_cu12_daily
runs-on: ${{ matrix.cuda_env }}
environment: 'prod' environment: 'prod'
timeout-minutes: 240 #4hours timeout-minutes: 240 #4hours
steps: steps:
@ -217,97 +194,147 @@ jobs:
with: with:
repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }} repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
ref: ${{github.event.inputs.repo_ref || 'main'}} ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Prepare - prepare data and hf model - name: conda env
run: | run: |
rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p . ${{env.CONDA_PATH}}/bin/activate
ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub conda activate ${{env.CONDA_ENV}}
conda info --envs
pip list
- name: modify config
if: matrix.regression_func != 'chat_sub_fullbench'
run: |
cp -r /fs-computility/llm/qa-llm-cicd/ocplayground/template/configs_cluster/volc.py .
cat /fs-computility/llm/qa-llm-cicd/config/test_config.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
- name: Run test
uses: nick-fields/retry@v3
with:
max_attempts: 1
timeout_minutes: 120
command: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
opencompass .github/scripts/eval_regression_${{matrix.regression_func}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} --reuse --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}}/*/summary regression_result_daily
python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py
daily_run_test_local:
if: ${{!cancelled()}}
needs: prepare_env
strategy:
fail-fast: false
matrix:
regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}}
runs-on: volc_cu12_local
environment: 'prod'
timeout-minutes: 240 #4hours
steps:
- name: Clone repository
uses: actions/checkout@v2
with:
repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: conda env
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
pip list
- name: modify config
if: matrix.regression_func == 'chat_sub_fullbench'
run: |
cp -r /fs-computility/llm/qa-llm-cicd/ocplayground/template/configs_cluster/volc.py .
cat /fs-computility/llm/qa-llm-cicd/config/test_config_sub.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
- name: Run command testcase - name: Run command testcase
if: matrix.regression_func == 'cmd' if: matrix.regression_func == 'cmd'
run: | run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate . ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda activate ${{env.CONDA_ENV}}
conda info --envs conda info --envs
export from_tf=TRUE export from_tf=TRUE
python tools/list_configs.py internlm2_5 mmlu python tools/list_configs.py internlm2_5 mmlu
opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
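
Each cmd case repeats the same three-step loop: run opencompass into a per-case work dir (the _${{ matrix.cuda_env }} suffixes are gone now that the matrix only varies the function), re-point the regression_result_daily symlink at the run's summary directory, then assert scores via the matching pytest marker. A condensed Python sketch of one iteration; paths and arguments mirror the step above, while the subprocess wrapper itself is illustrative:

import glob
import os
import subprocess

def run_case(case, oc_args, report_root):
    work_dir = os.path.join(report_root, case)
    subprocess.run(['opencompass', *oc_args, '--work-dir', work_dir,
                    '--reuse', '--max-num-workers', '2',
                    '--dump-eval-details'], check=True)
    # Re-point the symlink that oc_score_assert.py reads from.
    summary = glob.glob(os.path.join(work_dir, '*', 'summary'))[0]
    if os.path.lexists('regression_result_daily'):
        os.remove('regression_result_daily')
    os.symlink(summary, 'regression_result_daily')
    subprocess.run(['python', '-m', 'pytest', '-m', case, '-s', '-v',
                    '--color=yes', '.github/scripts/oc_score_assert.py'],
                   check=True)
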
- name: Run chat model test
if: matrix.regression_func == 'chat_models'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_chat.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run base model test
if: matrix.regression_func == 'base_models'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_base.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run chat model test - fullbench
if: matrix.regression_func == 'chat_obj_fullbench'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_chat_objective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m chat_obj_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run chat model test - fullbench
if: matrix.regression_func == 'chat_sub_fullbench'
env:
COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache_subset
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_chat_subjective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m chat_sub_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run base model test - fullbench
if: matrix.regression_func == 'base_fullbench'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_base_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m base_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run model test - api - name: Run model test - api
if: matrix.regression_func == 'api' if: matrix.regression_func == 'api'
run: | run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate . ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda activate ${{env.CONDA_ENV}}
conda info --envs conda info --envs
lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 & lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
echo "restful_pid=$!" >> "$GITHUB_ENV" echo "restful_pid=$!" >> "$GITHUB_ENV"
sleep 120s sleep 120s
opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }}/*/summary regression_result_daily rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run model test - api kill - name: Run model test - api kill
if: always() && matrix.regression_func == 'api' if: always() && matrix.regression_func == 'api'
run: | run: |
kill -15 "$restful_pid" kill -15 "$restful_pid"
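
The api job starts an lmdeploy server in the background, waits a fixed 120 seconds, runs the api config against it, and finally kills the server by PID. The fixed sleep could be replaced by a readiness poll; a sketch, assuming the server exposes an OpenAI-style /v1/models route (lmdeploy's default port 23333 is also an assumption here):

import time
import urllib.request

def wait_for_server(base_url='http://127.0.0.1:23333', timeout=600):
    # Poll until the API server answers instead of sleeping blindly (sketch).
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(f'{base_url}/v1/models', timeout=5):
                return
        except OSError:
            time.sleep(5)
    raise TimeoutError(f'server at {base_url} not ready after {timeout}s')
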
- name: Run testcase
if: matrix.regression_func == 'chat_sub_fullbench'
env:
COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache_subset
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
export from_tf=TRUE
opencompass .github/scripts/eval_regression_${{matrix.regression_func}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} --reuse --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}}/*/summary regression_result_daily
python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py
fullbench_run_test:
if: ${{!cancelled()}}
needs: prepare_env
strategy:
fail-fast: false
matrix:
function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}}
runs-on: volc_cu12
environment: 'prod'
        timeout-minutes: 360 # 6 hours
steps:
- name: Clone repository
uses: actions/checkout@v2
with:
repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: conda env
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
pip list
- name: Run testcase
uses: nick-fields/retry@v3
with:
max_attempts: 1
timeout_minutes: 240
command: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
export from_tf=TRUE
opencompass /fs-computility/llm/qa-llm-cicd/ocplayground/template/regression/eval_${{ matrix.function_type }}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }} --reuse
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }}/*/summary regression_result_daily
python -m pytest -m ${{ matrix.function_type }} -s -v --color=yes .github/scripts/oc_score_assert.py
notify_to_feishu: notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }} if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
needs: [daily_run_test] needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
environment: 'prod' environment: 'prod'
timeout-minutes: 5 timeout-minutes: 5
runs-on: self-hosted runs-on: self-hosted
View File
@ -18,18 +18,23 @@ concurrency:
cancel-in-progress: true cancel-in-progress: true
env: env:
CONDA_ENV: opencompass_ CONDA_ENV: pr_test
USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HF_DATASETS_OFFLINE: 1 HF_DATASETS_OFFLINE: 1
HF_EVALUATE_OFFLINE: 1
TRANSFORMERS_OFFLINE: 1 TRANSFORMERS_OFFLINE: 1
HF_HUB_OFFLINE: 1
VLLM_USE_MODELSCOPE: false VLLM_USE_MODELSCOPE: false
LMDEPLOY_USE_MODELSCOPE: false LMDEPLOY_USE_MODELSCOPE: false
HF_HUB_OFFLINE: 1
CONDA_PATH: /fs-computility/llm/qa-llm-cicd/miniconda3
PIP_CACHE_PATH: /fs-computility/llm/qa-llm-cicd/.cache/pip
REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/prtest
COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache
HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
jobs: jobs:
pr_run_test: pr_run_test:
runs-on: self-hosted runs-on: volc_cu12_local
environment: 'prod' environment: 'prod'
timeout-minutes: 30 timeout-minutes: 30
steps: steps:
@ -37,54 +42,55 @@ jobs:
uses: actions/checkout@v2 uses: actions/checkout@v2
- name: Prepare - Install opencompass - name: Prepare - Install opencompass
run: | run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate . ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}${{ runner.name }} conda activate ${{env.CONDA_ENV}}
python3 -m pip uninstall opencompass -y python3 -m pip uninstall opencompass -y
python3 -m pip install -e . --cache-dir ${{env.USERSPACE_PREFIX}}/.cache/pip python3 -m pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs conda info --envs
- name: Prepare - prepare data and hf model - name: conda env
run: | run: |
cp -r ${{env.USERSPACE_PREFIX}}/data . . ${{env.CONDA_PATH}}/bin/activate
rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p conda activate ${{env.CONDA_ENV}}
ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub conda info --envs
pip list
lmdeploy check_env
- name: Run test - name: Run test
run: | run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate . ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}${{ runner.name }} conda activate ${{env.CONDA_ENV}}
conda info --envs conda info --envs
rm -rf regression_result rm -rf regression_result
opencompass --models hf_internlm2_5_20b_chat --datasets demo_gsm8k_chat_gen --work-dir regression_result1 --debug opencompass --models hf_internlm2_5_20b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1 --debug
opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen --work-dir regression_result2 --debug --max-num-workers 2 opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2 --debug --max-num-workers 2
opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy --work-dir regression_result3 --debug --max-num-workers 2 opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3 --debug --max-num-workers 2
- name: Get result - name: Get result
run: | run: |
score=$(sed -n '$p' regression_result1/*/summary/*.csv | awk -F ',' '{print $NF}') score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1/*/summary/*.csv | awk -F ',' '{print $NF}')
if (( ${score%.*} >= 88 && ${score%.*} <= 89 )); then if (( ${score%.*} >= 88 && ${score%.*} <= 89 )); then
echo "score is $score between 88 and 89" echo "score is $score between 88 and 89"
else else
echo "score is $score not between 88 and 89" echo "score is $score not between 88 and 89"
exit 1 exit 1
fi fi
score=$(sed -n '$p' regression_result2/*/summary/*.csv | awk -F ',' '{print $NF}') score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2/*/summary/*.csv | awk -F ',' '{print $NF}')
if (( ${score%.*} >= 87 && ${score%.*} <= 88 )); then if (( ${score%.*} >= 87 && ${score%.*} <= 88 )); then
echo "score is $score between 87 and 88" echo "score is $score between 87 and 88"
else else
echo "score is $score not between 87 and 88" echo "score is $score not between 87 and 88"
exit 1 exit 1
fi fi
score=$(sed -n '$p' regression_result3/*/summary/*.csv | awk -F ',' '{print $NF}') score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3/*/summary/*.csv | awk -F ',' '{print $NF}')
if (( ${score%.*} >= 87 && ${score%.*} <= 89 )); then if (( ${score%.*} >= 87 && ${score%.*} <= 91 )); then
echo "score is $score between 87 and 89" echo "score is $score between 87 and 91"
else else
echo "score is $score not between 87 and 89" echo "score is $score not between 87 and 91"
exit 1 exit 1
fi fi
rm -rf regression_result1 & rm -rf regression_result2 & rm -rf regression_result3
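The Get result step asserts that the final column of the summary CSV's last row falls inside a fixed score band. A minimal Python sketch of the same check, assuming (as the shell pipeline does) that the last row's final field is the overall score:

import csv
import glob
import sys

# Find the summary CSV and read its final row, like `sed -n '$p' .../summary/*.csv`.
path = glob.glob('regression_result1/*/summary/*.csv')[0]
with open(path, newline='') as f:
    last_row = list(csv.reader(f))[-1]

score = float(last_row[-1])  # `awk -F ',' '{print $NF}'`
if not 88 <= int(score) <= 89:  # the shell compares the integer part of the score
    sys.exit(f'score is {score} not between 88 and 89')
print(f'score is {score} between 88 and 89')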
- name: Uninstall opencompass - name: Uninstall opencompass
if: always() if: always()
run: | run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate . ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}${{ runner.name }} conda activate ${{env.CONDA_ENV}}
python3 -m pip uninstall opencompass -y python3 -m pip uninstall opencompass -y
conda info --envs conda info --envs
View File
@ -1,21 +1,26 @@
name: deploy name: deploy
on: push on:
push:
concurrency: workflow_dispatch:
group: ${{ github.workflow }}-${{ github.ref }} inputs:
cancel-in-progress: true confirm_publish:
description: 'Type YES to confirm publishing to PyPI'
required: true
type: string
jobs: jobs:
build-n-publish: build-n-publish:
runs-on: ubuntu-latest runs-on: ubuntu-latest
if: startsWith(github.event.ref, 'refs/tags') if: |
github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') ||
(github.event_name == 'workflow_dispatch' && inputs.confirm_publish == 'YES')
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v2
- name: Set up Python 3.7 - name: Set up Python 3.10
uses: actions/setup-python@v1 uses: actions/setup-python@v4
with: with:
python-version: 3.7 python-version: '3.10'
- name: Build lagent - name: Build lagent
run: | run: |
pip install wheel pip install wheel
View File
@ -79,6 +79,8 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
We provide [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `opencompass@pjlab.org.cn`. We provide [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `opencompass@pjlab.org.cn`.
You can also refer to [CompassAcademic](configs/eval_academic_leaderboard_202412.py) to quickly reproduce the leaderboard results. The currently selected datasets include Knowledge Reasoning (MMLU-Pro/GPQA Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Generation (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
<p align="right"><a href="#top">🔝Back to top</a></p> <p align="right"><a href="#top">🔝Back to top</a></p>
## 🛠️ Installation ## 🛠️ Installation
View File
@ -77,6 +77,8 @@
We will progressively release performance leaderboards for open-source and API models; see the [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home). To join the evaluation, please provide the model repository URL or a standard API interface to `opencompass@pjlab.org.cn`.
You can also refer to [CompassAcademic](configs/eval_academic_leaderboard_202412.py) to quickly reproduce the leaderboard results. The currently selected datasets include Knowledge Reasoning (MMLU-Pro/GPQA Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Generation (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
<p align="right"><a href="#top">🔝返回顶部</a></p> <p align="right"><a href="#top">🔝返回顶部</a></p>
## 🛠️ 安装指南 ## 🛠️ 安装指南
View File
@ -1,3 +1,5 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
@ -12,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 128] max_seq_lens = [1024 * 128]
abbr_suffixs = ['128k'] abbr_suffixs = ['128k']
@ -25,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
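After this patch, every RULER length config follows the same shape: read an optional TOKENIZER_MODEL override from the environment, then stamp the suffix, sample count, sequence length, and tokenizer onto each imported dataset. A consolidated sketch of the resulting loop for the 128k case; the dataset dicts normally arrive via read_base(), so the literal entries and the plain dict copy are stand-ins (the diff elides the copy step):

import os

# Stand-ins for the dataset dicts pulled in via read_base() in the real config.
import_ds = [{'abbr': 'ruler_niah'}, {'abbr': 'ruler_qa'}]  # hypothetical entries

NUM_SAMPLES = 100
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')  # override added by this patch
max_seq_lens = [1024 * 128]
abbr_suffixs = ['128k']

ruler_datasets = []
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
    for dataset in import_ds:
        tmp_dataset = dict(dataset)  # copy so each length gets its own config
        tmp_dataset['abbr'] += '_' + abbr_suffix
        tmp_dataset['num_samples'] = NUM_SAMPLES
        tmp_dataset['max_seq_length'] = max_seq_len
        tmp_dataset['tokenizer_model'] = tokenizer_model  # new field wired to the env var
        ruler_datasets.append(tmp_dataset)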
View File
@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 16] max_seq_lens = [1024 * 16]
abbr_suffixs = ['16k'] abbr_suffixs = ['16k']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
View File

@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 1024] max_seq_lens = [1024 * 1024]
abbr_suffixs = ['1m'] abbr_suffixs = ['1m']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
View File
@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 32] max_seq_lens = [1024 * 32]
abbr_suffixs = ['32k'] abbr_suffixs = ['32k']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
View File

@ -1,3 +1,5 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
@ -12,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 4] max_seq_lens = [1024 * 4]
abbr_suffixs = ['4k'] abbr_suffixs = ['4k']
@ -25,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
View File

@ -1,3 +1,5 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
@ -12,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 64] max_seq_lens = [1024 * 64]
abbr_suffixs: list[str] = ['64k'] abbr_suffixs: list[str] = ['64k']
@ -25,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
View File
@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 8] max_seq_lens = [1024 * 8]
abbr_suffixs = ['8k'] abbr_suffixs = ['8k']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
View File
@ -1,9 +1,7 @@
from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset, RulerNiahEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset
from opencompass.datasets.ruler.ruler_niah import RulerNiahEvaluator
# Ruler Dataset settings # Ruler Dataset settings
niah_configurations = [ niah_configurations = [
@ -92,10 +90,7 @@ for index, config in enumerate(niah_configurations):
'type': RulerNiahDataset, 'type': RulerNiahDataset,
'base_path': base_path, 'base_path': base_path,
'file_path': file_path, 'file_path': file_path,
# 'tokenizer_model': model_path,
'tokens_to_generate': 128, 'tokens_to_generate': 128,
# 'max_seq_length': max_seq_len,
# 'num_samples': NUM_SAMPLES,
'type_haystack': config['type_haystack'], 'type_haystack': config['type_haystack'],
'type_needle_k': config['type_needle_k'], 'type_needle_k': config['type_needle_k'],
'type_needle_v': config['type_needle_v'], 'type_needle_v': config['type_needle_v'],
View File
@ -10,12 +10,10 @@ from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
####################################################################### #######################################################################
with read_base(): with read_base():
# Datasets Part # Datasets Part
## Core Set
# Knowledge # Knowledge
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import ( from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import (
mmlu_pro_datasets, mmlu_pro_datasets,
) )
# General Reasoning # General Reasoning
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import ( from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import (
gpqa_datasets, gpqa_datasets,
@ -23,22 +21,19 @@ with read_base():
from opencompass.configs.datasets.bbh.bbh_0shot_nocot_gen_925fc4 import ( from opencompass.configs.datasets.bbh.bbh_0shot_nocot_gen_925fc4 import (
bbh_datasets, bbh_datasets,
) )
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import ( from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import (
humaneval_datasets, humaneval_datasets,
) )
# Instruction Following # Instruction Following
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ( from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import (
ifeval_datasets, ifeval_datasets,
) )
from opencompass.configs.datasets.livecodebench.livecodebench_gen_6966bc import ( from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import (
LCBCodeGeneration_dataset, LCBCodeGeneration_dataset,
) )
# Math # Math
from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import (
cmo_fib_datasets,
)
from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import ( from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import (
aime2024_datasets, aime2024_datasets,
) )
@ -77,7 +72,6 @@ core_summary_groups = [
['IFEval', 'Prompt-level-strict-accuracy'], ['IFEval', 'Prompt-level-strict-accuracy'],
['bbh', 'naive_average'], ['bbh', 'naive_average'],
['math_prm800k_500', 'accuracy'], ['math_prm800k_500', 'accuracy'],
['cmo_fib', 'accuracy'],
['aime2024', 'accuracy'], ['aime2024', 'accuracy'],
['GPQA_diamond', 'accuracy'], ['GPQA_diamond', 'accuracy'],
['mmlu_pro', 'naive_average'], ['mmlu_pro', 'naive_average'],
@ -101,7 +95,6 @@ summarizer = dict(
'', '',
'Math Calculation', 'Math Calculation',
['math_prm800k_500', 'accuracy'], ['math_prm800k_500', 'accuracy'],
['cmo_fib', 'accuracy'],
['aime2024', 'accuracy'], ['aime2024', 'accuracy'],
'', '',
'Knowledge', 'Knowledge',
View File
@ -0,0 +1,164 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
LCBCodeGenerationDataset,
LCBCodeExecutionDataset,
LCBTestOutputPredictionDataset,
LCBCodeGenerationEvaluator,
LCBCodeExecutionEvaluator,
LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants
lcb_code_generation_reader_cfg = dict(
input_columns=[
'question_content',
'format_prompt',
],
# output_column='evaluation_sample',
output_column='question_id',
)
SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
'### Answer: (use the provided format with backticks)\n\n'
# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=prompt_template
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
lcb_code_generation_eval_cfg = dict(
evaluator=dict(
type=LCBCodeGenerationEvaluator,
num_process_evaluate=4,
timeout=6,
),
pred_role='BOT',
)
LCBCodeGeneration_dataset = dict(
type=LCBCodeGenerationDataset,
abbr='lcb_code_generation',
path='opencompass/code_generation_lite',
reader_cfg=lcb_code_generation_reader_cfg,
infer_cfg=lcb_code_generation_infer_cfg,
eval_cfg=lcb_code_generation_eval_cfg
)
# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
input_columns=[
'prompt',
],
output_column='evaluation_sample',
)
lcb_code_execution_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
),
],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
lcb_code_execution_eval_cfg = dict(
evaluator=dict(
type=LCBCodeExecutionEvaluator,
),
pred_role='BOT',
)
LCBCodeExecution_dataset = dict(
type=LCBCodeExecutionDataset,
abbr='lcb_code_execution',
path='opencompass/execution-v2',
reader_cfg=lcb_code_execution_reader_cfg,
infer_cfg=lcb_code_execution_infer_cfg,
eval_cfg=lcb_code_execution_eval_cfg,
)
# Test Output Prediction Dataset
lcb_test_output_reader_cfg = dict(
input_columns=[
'prompt',
],
output_column='evaluation_sample',
)
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
lcb_test_output_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
# begin=[
# dict(
# role='SYSTEM',
# prompt=system_prompt
# ),
# ],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
lcb_test_output_eval_cfg = dict(
evaluator=dict(
type=LCBTestOutputEvaluator,
),
pred_role='BOT',
)
LCBTestOutput_dataset = dict(
type=LCBTestOutputPredictionDataset,
abbr='lcb_test_output',
path='opencompass/test_generation',
reader_cfg=lcb_test_output_reader_cfg,
infer_cfg=lcb_test_output_infer_cfg,
eval_cfg=lcb_test_output_eval_cfg,
)
LCB_datasets = [
LCBCodeGeneration_dataset,
LCBCodeExecution_dataset,
LCBTestOutput_dataset,
]
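The three dataset definitions are aggregated into LCB_datasets; the fullbench config earlier in this patch consumes only the generation split by importing LCBCodeGeneration_dataset from this livecodebench_gen_a4f90b module. A minimal sketch of such a top-level config (the model and summarizer settings it would normally carry are omitted):

from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import (
        LCBCodeGeneration_dataset,  # defined in the file above
    )

datasets = [LCBCodeGeneration_dataset]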
View File
@ -23,7 +23,7 @@ math_infer_cfg = dict(
), ),
), ),
retriever=dict(type=ZeroRetriever), retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024), inferencer=dict(type=GenInferencer),
) )
# postprocess v2 # postprocess v2
View File
@ -1,3 +1,5 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
@ -12,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 128] max_seq_lens = [1024 * 128]
abbr_suffixs = ['128k'] abbr_suffixs = ['128k']
@ -25,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
View File
@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 16] max_seq_lens = [1024 * 16]
abbr_suffixs = ['16k'] abbr_suffixs = ['16k']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
View File
@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 1024] max_seq_lens = [1024 * 1024]
abbr_suffixs = ['1m'] abbr_suffixs = ['1m']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)

View File
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 32] max_seq_lens = [1024 * 32]
abbr_suffixs = ['32k'] abbr_suffixs = ['32k']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
View File
@ -1,3 +1,5 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
@ -12,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 4] max_seq_lens = [1024 * 4]
abbr_suffixs = ['4k'] abbr_suffixs = ['4k']
@ -25,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
View File
@ -1,3 +1,5 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
@ -12,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 64] max_seq_lens = [1024 * 64]
abbr_suffixs: list[str] = ['64k'] abbr_suffixs: list[str] = ['64k']
@ -25,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
View File
@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 8] max_seq_lens = [1024 * 8]
abbr_suffixs = ['8k'] abbr_suffixs = ['8k']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)
View File
@ -1,9 +1,7 @@
from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset, RulerNiahEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset
from opencompass.datasets.ruler.ruler_niah import RulerNiahEvaluator
# Ruler Dataset settings # Ruler Dataset settings
niah_configurations = [ niah_configurations = [
@ -92,10 +90,7 @@ for index, config in enumerate(niah_configurations):
'type': RulerNiahDataset, 'type': RulerNiahDataset,
'base_path': base_path, 'base_path': base_path,
'file_path': file_path, 'file_path': file_path,
# 'tokenizer_model': model_path,
'tokens_to_generate': 128, 'tokens_to_generate': 128,
# 'max_seq_length': max_seq_len,
# 'num_samples': NUM_SAMPLES,
'type_haystack': config['type_haystack'], 'type_haystack': config['type_haystack'],
'type_needle_k': config['type_needle_k'], 'type_needle_k': config['type_needle_k'],
'type_needle_v': config['type_needle_v'], 'type_needle_v': config['type_needle_v'],
View File
@ -163,6 +163,8 @@ class BigCodeBenchEvaluator(BaseEvaluator):
logger.info('Read timeout error. Retrying in 4s...') logger.info('Read timeout error. Retrying in 4s...')
time.sleep(4) time.sleep(4)
if 'pass@1' in pass_at_k.keys():
pass_at_k['pass@1'] *= 100
dump_results = {'details': results} dump_results = {'details': results}
dump_results.update(pass_at_k) dump_results.update(pass_at_k)
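The added lines rescale pass@1 from a 0-1 fraction to a percentage before it is merged into the dumped results, so it reads on the same 0-100 scale as the other metrics. A minimal sketch of that post-processing, with illustrative values:

# Hypothetical evaluator output: pass@k rates as fractions.
pass_at_k = {'pass@1': 0.375, 'pass@5': 0.62}
results = [{'task_id': 0, 'passed': True}]  # illustrative detail records

if 'pass@1' in pass_at_k:
    pass_at_k['pass@1'] *= 100  # report pass@1 on a 0-100 scale

dump_results = {'details': results}
dump_results.update(pass_at_k)
assert dump_results['pass@1'] == 37.5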
View File
@ -147,7 +147,7 @@ class OpenAI(BaseAPIModel):
self.path = path self.path = path
self.max_completion_tokens = max_completion_tokens self.max_completion_tokens = max_completion_tokens
self.logger.warning( self.logger.warning(
f'Max Completion tokens for {path} is :{max_completion_tokens}') f'Max Completion tokens for {path} is {max_completion_tokens}')
def generate(self, def generate(self,
inputs: List[PromptType], inputs: List[PromptType],
@ -278,7 +278,7 @@ class OpenAI(BaseAPIModel):
self.logger.warning( self.logger.warning(
f"'max_token' is unsupported for model {self.path}") f"'max_token' is unsupported for model {self.path}")
self.logger.warning( self.logger.warning(
f'We use max_completion_tokens:' f'We use max_completion_tokens: '
f'{self.max_completion_tokens}for this query') f'{self.max_completion_tokens}for this query')
data = dict( data = dict(
model=self.path, model=self.path,
@ -588,13 +588,12 @@ class OpenAISDK(OpenAI):
self.logger.warning( self.logger.warning(
f"'max_token' is unsupported for model {self.path}") f"'max_token' is unsupported for model {self.path}")
self.logger.warning( self.logger.warning(
f'We use max_completion_tokens:' f'We use max_completion_tokens: '
f'{self.max_completion_tokens}for this query') f'{self.max_completion_tokens}for this query')
query_data = dict( query_data = dict(
model=self.path, model=self.path,
max_completion_tokens=self.max_completion_tokens, max_completion_tokens=self.max_completion_tokens,
n=1, n=1,
temperature=self.temperature,
messages=messages, messages=messages,
extra_body=self.extra_body, extra_body=self.extra_body,
) )
@ -636,8 +635,8 @@ class OpenAISDK(OpenAI):
if (status_code is not None if (status_code is not None
and status_code in self.status_code_mappings): and status_code in self.status_code_mappings):
error_message = self.status_code_mappings[status_code] error_message = self.status_code_mappings[status_code]
self.logger.info(f'Status Code: {status_code},\n' self.logger.info(f'Status Code: {status_code}, \n'
f'Original Error Message: {e},\n' f'Original Error Message: {e}, \n'
f'Return Message: {error_message} ') f'Return Message: {error_message} ')
return error_message return error_message
else: else:
View File
@ -335,7 +335,7 @@ class DLCRunner(BaseRunner):
pass pass
# Lark Report when failed # Lark Report when failed
if return_code == -1: if return_code == -1 and self.lark_reporter is not None:
content = f'DLC job failed. Task name: {task_name}' content = f'DLC job failed. Task name: {task_name}'
self.lark_reporter.post(title='DLC job failed', content=content) self.lark_reporter.post(title='DLC job failed', content=content)
View File
@ -207,9 +207,14 @@ class LocalRunner(BaseRunner):
task_name = task.name task_name = task.name
pwd = os.getcwd()
# Dump task config to file # Dump task config to file
mmengine.mkdir_or_exist('tmp/') mmengine.mkdir_or_exist('tmp/')
param_file = f'tmp/{os.getpid()}_{index}_params.py' # Using uuid to avoid filename conflict
import uuid
uuid_str = str(uuid.uuid4())
param_file = f'{pwd}/tmp/{uuid_str}_params.py'
try: try:
task.cfg.dump(param_file) task.cfg.dump(param_file)
tmpl = get_command_template(gpu_ids) tmpl = get_command_template(gpu_ids)
@ -236,5 +241,8 @@ class LocalRunner(BaseRunner):
logger.error(f'task {task_name} fail, see\n{out_path}') logger.error(f'task {task_name} fail, see\n{out_path}')
finally: finally:
# Clean up # Clean up
os.remove(param_file) if not self.keep_tmp_file:
os.remove(param_file)
else:
pass
return task_name, result.returncode return task_name, result.returncode
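The LocalRunner change replaces the PID-and-index file name with a UUID so concurrent runners cannot collide on the same temp file, and leaves the file in place when keep_tmp_file is set. A condensed sketch of the new lifecycle; the config dump is faked with a plain write, and keep_tmp_file mirrors the runner flag:

import os
import uuid

keep_tmp_file = False  # mirrors the runner option; kept files aid debugging

os.makedirs('tmp', exist_ok=True)
param_file = f'{os.getcwd()}/tmp/{uuid.uuid4()}_params.py'  # UUID avoids conflicts
try:
    with open(param_file, 'w') as f:  # stands in for task.cfg.dump(param_file)
        f.write('# dumped task config\n')
    # ... build the command from param_file and launch the task here ...
finally:
    if not keep_tmp_file:
        os.remove(param_file)  # clean up unless asked to keep it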
View File
@ -227,20 +227,20 @@ class VOLCRunner(BaseRunner):
task_status = os.popen(ask_cmd).read() task_status = os.popen(ask_cmd).read()
pattern = r'(?<=\[{"Status":").*(?="}\])' pattern = r'(?<=\[{"Status":").*(?="}\])'
match = re.search(pattern, task_status) match = re.search(pattern, task_status)
if match:
task_status = match.group()
else:
task_status = 'Exception'
if self.debug: if self.debug:
print(task_status) print(task_status)
logs = os.popen(log_cmd).read() logs = os.popen(log_cmd).read()
with open(log_path, 'w', encoding='utf-8') as f: with open(log_path, 'w', encoding='utf-8') as f:
f.write(logs) f.write(logs)
if task_status in [ if match:
'Success', 'Failed', 'Cancelled', 'Exception', task_status = match.group()
'Killing', 'SuccessHolding', 'FailedHolding' if task_status in [
]: 'Success', 'Failed', 'Cancelled', 'Exception',
break 'Killing', 'SuccessHolding', 'FailedHolding',
'Killed'
]:
break
# If pattern not found or command failed, sleep and retry
time.sleep(poll_interval) time.sleep(poll_interval)
else: else:
task_status = 'Exception' task_status = 'Exception'
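The reworked loop only inspects the status once the regex has actually matched, and retries after poll_interval otherwise instead of treating a failed match as terminal; 'Killed' also joins the terminal states. A simplified sketch of the control flow, with the volc CLI query stubbed out:

import re
import time

TERMINAL = {'Success', 'Failed', 'Cancelled', 'Exception',
            'Killing', 'SuccessHolding', 'FailedHolding', 'Killed'}

def query_status():
    # stands in for `os.popen(ask_cmd).read()` against the volc CLI
    return '[{"Status":"Success"}]'

poll_interval, max_polls = 1, 5
for _ in range(max_polls):
    match = re.search(r'(?<=\[{"Status":").*(?="}\])', query_status())
    if match:
        task_status = match.group()
        if task_status in TERMINAL:
            break
    # pattern not found or job still running: sleep and retry
    time.sleep(poll_interval)
else:
    task_status = 'Exception'  # polling budget exhausted without a terminal state

print(task_status)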
View File
@ -4,6 +4,7 @@ import functools
import getpass import getpass
import math import math
import os.path as osp import os.path as osp
from collections import OrderedDict
from datetime import datetime from datetime import datetime
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
@ -110,6 +111,7 @@ class DefaultSubjectiveSummarizer:
if not osp.exists(filepath): if not osp.exists(filepath):
continue continue
result = mmengine.load(filepath) result = mmengine.load(filepath)
result = OrderedDict(sorted(result.items()))
result.pop('details', None) result.pop('details', None)
if idx == 0: if idx == 0:
raw_results[model_abbr][dataset_abbr] = result raw_results[model_abbr][dataset_abbr] = result
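Sorting the loaded keys makes the summary's column order deterministic across runs regardless of how the metrics were serialized. The one-line normalization in context, with a toy dict standing in for the mmengine.load output:

from collections import OrderedDict

# Toy stand-in for `mmengine.load(filepath)` output.
result = {'naive_average': 55.2, 'details': [{'id': 0}], 'accuracy': 61.0}

result = OrderedDict(sorted(result.items()))  # deterministic key order
result.pop('details', None)                   # drop per-sample details, as above
print(list(result))  # ['accuracy', 'naive_average']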
View File
@ -2,7 +2,7 @@ absl-py
accelerate>=0.19.0 accelerate>=0.19.0
cpm_kernels cpm_kernels
datasets>=2.12.0 datasets>=2.12.0
einops==0.5.0 einops>=0.5.0
evaluate>=0.3.0 evaluate>=0.3.0
func_timeout func_timeout
fuzzywuzzy fuzzywuzzy
@ -16,7 +16,7 @@ jieba
json5 json5
jsonlines jsonlines
mmengine-lite mmengine-lite
nltk==3.8 nltk>=3.7
numpy>=1.23.4,<2.0.0 numpy>=1.23.4,<2.0.0
openai openai
OpenCC OpenCC