[ci] react daily test (#1668)

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* refactor summarize

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* Update daily-run-test.yml

* Update daily-run-test.yml

* update

* update

* update

* update

* update

* Update daily-run-test.yml

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* Update daily-run-test.yml

* Update daily-run-test.yml

* update

* update

* Update daily-run-test.yml

* update

* update

* update

---------

Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn>
This commit is contained in:
zhulinJulia24 2024-11-12 18:40:27 +08:00 committed by GitHub
parent 3ec178f4a9
commit a9d6b6461f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 1710 additions and 521 deletions

39
.github/scripts/eval_regression_api.py vendored Normal file
View File

@ -0,0 +1,39 @@
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAISDK
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_gen import \
race_datasets # noqa: F401, E501
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
models = [
dict(
abbr='lmdeploy-api-test',
type=OpenAISDK,
key='EMPTY',
openai_api_base='http://localhost:23333/v1',
path='internlm2',
tokenizer_path='internlm/internlm2_5-7b-chat',
rpm_verbose=True,
meta_template=api_meta_template,
query_per_second=128,
max_out_len=1024,
max_seq_len=4096,
temperature=0.01,
batch_size=128,
retry=20,
)
]

View File

@ -2,15 +2,21 @@ from mmengine.config import read_base
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
gpqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_ppl import \
race_datasets # noqa: F401, E501
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
winogrande_datasets # noqa: F401, E501
# read hf models - chat models
from opencompass.configs.models.chatglm.hf_glm4_9b import \
models as hf_glm4_9b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
models as hf_deepseek_moe_16b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
models as hf_deepseek_v2_lite_model # noqa: F401, E501
# read hf models - chat models
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \
@ -19,34 +25,58 @@ with read_base():
models as hf_gemma2_2b_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma2_9b import \
models as hf_gemma2_9b_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_2b import \
models as hf_gemma_2b_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_7b import \
models as hf_gemma_7b_model # noqa: F401, E501
from opencompass.configs.models.gemma.vllm_gemma_2b import \
models as vllm_gemma_2b_model # noqa: F401, E501
from opencompass.configs.models.gemma.vllm_gemma_7b import \
models as vllm_gemma_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
models as hf_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
models as hf_internlm2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
models as hf_internlm2_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \
models as hf_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_20b import \
models as hf_internlm2_base_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
models as lmdeploy_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_7b import \
models as lmdeploy_internlm2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_20b import \
models as lmdeploy_internlm2_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama2_7b import \
models as hf_llama2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \
models as hf_llama3_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_8b import \
models as hf_llama3_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
models as lmdeploy_llama3_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
models as lmdeploy_llama3_8b_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \
models as hf_mistral_7b_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \
models as hf_mistral_7b_v0_3_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \
models as vllm_mistral_7b_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mixtral_8x7b_v0_1 import \
models as vllm_mixtral_8x7b_v0_1_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen_2_5_7b import \
models as hf_qwen_2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen_2_5_14b import \
models as hf_qwen_2_5_14b_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \
models as lmdeploy_qwen2_5_1_5b_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import \
models as lmdeploy_qwen2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
models as hf_qwen1_5_moe_a2_7b_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen2_0_5b import \
@ -65,11 +95,27 @@ with read_base():
models as hf_yi_1_5_6b_model # noqa: F401, E501
from opencompass.configs.models.yi.hf_yi_1_5_9b import \
models as hf_yi_1_5_9b_model # noqa: F401, E501
from opencompass.configs.summarizers.medium import \
summarizer # noqa: F401, E501
race_datasets = [race_datasets[1]]
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
for d in datasets:
d['reader_cfg']['test_range'] = '[0:100]'
d['reader_cfg']['test_range'] = '[0:32]'
for m in models:
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
summarizer = dict(
dataset_abbrs=[
['gsm8k', 'accuracy'],
['GPQA_diamond', 'accuracy'],
['race-high', 'accuracy'],
['winogrande', 'accuracy'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)

View File

@ -0,0 +1,184 @@
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.ARC_c.ARC_c_few_shot_ppl import \
ARC_c_datasets # noqa: F401, E501
from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import \
bbh_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
cmmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.drop.drop_gen_a2697c import \
drop_datasets # noqa: F401, E501
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \
GaokaoBench_datasets # noqa: F401, E501
from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import \
gpqa_datasets # noqa: F401, E501
# Corebench v1.7
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \
hellaswag_datasets # noqa: F401, E501
from opencompass.configs.datasets.humaneval.internal_humaneval_gen_ce6b06 import \
humaneval_datasets as humaneval_v2_datasets # noqa: F401, E501
from opencompass.configs.datasets.humaneval.internal_humaneval_gen_d2537e import \
humaneval_datasets # noqa: F401, E501
from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import \
math_datasets # noqa: F401, E501
from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \
mathbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import \
sanitized_mbpp_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import \
mmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \
mmlu_pro_datasets # noqa: F401, E501
from opencompass.configs.datasets.nq.nq_open_1shot_gen_20a989 import \
nq_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_few_shot_ppl import \
race_datasets # noqa: F401, E501
from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_few_shot_ppl import \
BoolQ_datasets # noqa: F401, E501
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
TheoremQA_datasets # noqa: F401, E501
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import \
triviaqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.wikibench.wikibench_few_shot_ppl_c23d79 import \
wikibench_datasets # noqa: F401, E501
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
winogrande_datasets # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
models as hf_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
models as lmdeploy_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.summarizers.groups.bbh import \
bbh_summary_groups # noqa: F401, E501
# Summary Groups
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.GaokaoBench import \
GaokaoBench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
mathbench_2024_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu import \
mmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups # noqa: F401, E501
race_datasets = [race_datasets[1]] # Only take RACE-High
humaneval_v2_datasets[0]['abbr'] = 'openai_humaneval_v2'
bbh_datasets = [
x for x in bbh_datasets if 'logical_deduction_seven_objects' in x['abbr']
or 'multistep_arithmetic_two' in x['abbr']
]
cmmlu_datasets = [
x for x in cmmlu_datasets if x['abbr'].replace('cmmlu-', '') in [
'ancient_chinese', 'chinese_civil_service_exam',
'chinese_driving_rule', 'chinese_food_culture',
'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
'chinese_teacher_qualification', 'construction_project_management',
'elementary_chinese', 'elementary_commonsense', 'ethnology',
'high_school_politics', 'modern_chinese',
'traditional_chinese_medicine'
]
]
mmlu_datasets = [
x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [
'business_ethics', 'clinical_knowledge', 'college_medicine',
'global_facts', 'human_aging', 'management', 'marketing',
'medical_genetics', 'miscellaneous', 'nutrition',
'professional_accounting', 'professional_medicine', 'virology'
]
]
mmlu_pro_datasets = [mmlu_pro_datasets[0]]
mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']]
GaokaoBench_datasets = [
x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr']
or '2010-2022_Math_II_Fill-in-the-Blank' in x['abbr']
]
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
summarizer = dict(
dataset_abbrs=[
['race-high', 'accuracy'],
['ARC-c', 'accuracy'],
['BoolQ', 'accuracy'],
['mmlu_pro', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['cmmlu', 'naive_average'],
['mmlu', 'naive_average'],
['drop', 'accuracy'],
['bbh', 'naive_average'],
['math', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['openai_humaneval_v2', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['wikibench-wiki-single_choice_cncircular', 'perf_4'],
['gsm8k', 'accuracy'],
['GaokaoBench', 'weighted_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['winogrande', 'accuracy'],
['hellaswag', 'accuracy'],
['TheoremQA', 'score'],
'###### MathBench-A: Application Part ######',
'college',
'high',
'middle',
'primary',
'arithmetic',
'mathbench-a (average)',
'###### MathBench-T: Theory Part ######',
'college_knowledge',
'high_knowledge',
'middle_knowledge',
'primary_knowledge',
'mathbench-t (average)',
'###### Overall: Average between MathBench-A and MathBench-T ######',
'Overall',
'',
'bbh-logical_deduction_seven_objects',
'bbh-multistep_arithmetic_two',
'',
'mmlu',
'mmlu-stem',
'mmlu-social-science',
'mmlu-humanities',
['mmlu-other', 'accuracy'],
'cmmlu',
'cmmlu-stem',
'cmmlu-social-science',
'cmmlu-humanities',
'cmmlu-other',
['cmmlu-china-specific', 'accuracy'],
'mmlu_pro',
'mmlu_pro_biology',
'mmlu_pro_business',
'mmlu_pro_chemistry',
'mmlu_pro_computer_science',
'mmlu_pro_economics',
'mmlu_pro_engineering',
'mmlu_pro_health',
'mmlu_pro_history',
'mmlu_pro_law',
'mmlu_pro_math',
'mmlu_pro_philosophy',
'mmlu_pro_physics',
'mmlu_pro_psychology',
'mmlu_pro_other',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
for d in datasets:
d['reader_cfg']['test_range'] = '[0:16]'
for m in models:
m['abbr'] = m['abbr'] + '_fullbench'
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])

View File

@ -1,7 +1,5 @@
from mmengine.config import read_base
from opencompass.models import OpenAISDK
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
@ -29,6 +27,12 @@ with read_base():
models as hf_gemma2_2b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma2_9b_it import \
models as hf_gemma2_9b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_2b_it import \
models as hf_gemma_2b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_7b_it import \
models as hf_gemma_7b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \
models as lmdeploy_gemma_9b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.vllm_gemma_7b_it import \
models as vllm_gemma_7b_it_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
@ -51,18 +55,35 @@ with read_base():
models as vllm_internlm2_chat_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \
models as hf_llama3_1_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_2_3b_instruct import \
models as hf_llama3_2_3b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
models as hf_llama3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
models as lmdeploy_llama3_1_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \
models as lmdeploy_llama3_2_3b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
models as lmdeploy_llama3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
models as hf_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_3 import \
models as hf_mistral_7b_instruct_v0_3_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_nemo_instruct_2407 import \
models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
models as hf_mistral_small_instruct_2409_model # noqa: F401, E501
from opencompass.configs.models.mistral.lmdeploy_mistral_nemo_instruct_2407 import \
models as lmdeploy_mistral_nemo_instruct_2407_model # noqa: F401, E501
from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
models as \
lmdeploy_mistral_small_instruct_2409_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \
models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mixtral_8x7b_instruct_v0_1 import \
models as vllm_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.openbmb.hf_minicpm3_4b import \
models as hf_minicpm3_4b_model # noqa: F401, E501
from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
models as hf_minicpm_2b_dpo_fp32_model # noqa: F401, E501
from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
@ -73,6 +94,10 @@ with read_base():
models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \
models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen2_5_14b_instruct import \
models as hf_qwen2_5_14b_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \
models as lmdeploy_qwen2_5_14b_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \
models as hf_qwen1_5_0_5b_chat_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
@ -89,10 +114,8 @@ with read_base():
models as hf_yi_1_5_6b_chat_model # noqa: F401, E501
from opencompass.configs.models.yi.hf_yi_1_5_9b_chat import \
models as hf_yi_1_5_9b_chat_model # noqa: F401, E501
from opencompass.configs.summarizers.medium import \
summarizer # noqa: F401, E501
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
race_datasets = [race_datasets[1]]
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
api_meta_template = dict(
@ -103,25 +126,24 @@ api_meta_template = dict(
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
model_name = ''
models.append(
dict(
abbr='lmdeploy-api-test',
type=OpenAISDK,
key='EMPTY',
openai_api_base='http://judgemodel:10001/v1',
path='compass_judger_internlm2_102b_0508',
tokenizer_path='internlm/internlm2_5-20b-chat',
rpm_verbose=True,
meta_template=api_meta_template,
query_per_second=50,
max_out_len=1024,
max_seq_len=4096,
temperature=0.01,
batch_size=128,
retry=3,
))
for d in datasets:
d['reader_cfg']['test_range'] = '[0:100]'
d['reader_cfg']['test_range'] = '[0:32]'
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for m in models:
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
summarizer = dict(
dataset_abbrs=[
'gsm8k',
'race-middle',
'race-high',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)

View File

@ -0,0 +1,246 @@
from mmengine.config import read_base
with read_base():
# read hf models - chat models
# Dataset
from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \
ARC_c_datasets # noqa: F401, E501
from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \
bbh_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
cmmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \
drop_datasets # noqa: F401, E501
from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import \
ds1000_datasets # noqa: F401, E501
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \
GaokaoBench_datasets # noqa: F401, E501
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
gpqa_datasets # noqa: F401, E501
# new datasets in Fullbench v1.1
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
hellaswag_datasets # noqa: F401, E501
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import \
humaneval_datasets # noqa: F401, E501
from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import \
humanevalx_datasets # noqa: F401, E501
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \
ifeval_datasets # noqa: F401, E501
from opencompass.configs.datasets.math.math_0shot_gen_393424 import \
math_datasets # noqa: F401, E501
from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
mathbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \
sanitized_mbpp_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import \
mmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
mmlu_pro_datasets # noqa: F401, E501
from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \
nq_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_cot_gen_d95929 import \
race_datasets # noqa: F401, E501
from opencompass.configs.datasets.scicode.scicode_gen_085b98 import \
SciCode_datasets # noqa: F401, E501
from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_cot_gen_1d56df import \
BoolQ_datasets # noqa: F401, E501
from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import \
teval_datasets as teval_en_datasets # noqa: F401, E501
from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import \
teval_datasets as teval_zh_datasets # noqa: F401, E501
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
TheoremQA_datasets # noqa: F401, E501
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \
triviaqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.wikibench.wikibench_gen_0978ad import \
wikibench_datasets # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
# Summary Groups
from opencompass.configs.summarizers.groups.bbh import \
bbh_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.ds1000 import \
ds1000_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.GaokaoBench import \
GaokaoBench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.humanevalx import \
humanevalx_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
mathbench_2024_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu import \
mmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.scicode import \
scicode_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.teval import \
teval_summary_groups # noqa: F401, E501
# For HumanEval-X Evaluation
# Apply the evaluator ip_address and port
race_datasets = [race_datasets[1]]
for item in humanevalx_datasets:
item['eval_cfg']['evaluator'][
'ip_address'] = 'codeeval.opencompass.org.cn/humanevalx'
item['eval_cfg']['evaluator']['port'] = ''
# For DS-1000 Evaluation
# Apply the evaluator ip_address and port
for item in ds1000_datasets:
item['eval_cfg']['evaluator'][
'ip_address'] = 'codeeval.opencompass.org.cn/ds1000'
item['eval_cfg']['evaluator']['port'] = ''
bbh_datasets = [
x for x in bbh_datasets if 'logical_deduction_seven_objects' in x['abbr']
or 'multistep_arithmetic_two' in x['abbr']
]
cmmlu_datasets = [
x for x in cmmlu_datasets if x['abbr'].replace('cmmlu-', '') in [
'ancient_chinese', 'chinese_civil_service_exam',
'chinese_driving_rule', 'chinese_food_culture',
'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
'chinese_teacher_qualification', 'construction_project_management',
'elementary_chinese', 'elementary_commonsense', 'ethnology',
'high_school_politics', 'modern_chinese',
'traditional_chinese_medicine'
]
]
mmlu_datasets = [
x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [
'business_ethics', 'clinical_knowledge', 'college_medicine',
'global_facts', 'human_aging', 'management', 'marketing',
'medical_genetics', 'miscellaneous', 'nutrition',
'professional_accounting', 'professional_medicine', 'virology'
]
]
mmlu_pro_datasets = [mmlu_pro_datasets[0]]
mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']]
GaokaoBench_datasets = [
x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr']
or '2010-2022_Math_II_Fill-in-the-Blank' in x['abbr']
]
datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')
and 'scicode' not in k.lower() and 'teval' not in k),
[],
)
datasets += teval_en_datasets
datasets += teval_zh_datasets
# datasets += SciCode_datasets
summarizer = dict(
dataset_abbrs=[
['race-high', 'accuracy'],
['ARC-c', 'accuracy'],
['BoolQ', 'accuracy'],
['mmlu_pro', 'naive_average'],
['drop', 'accuracy'],
['bbh', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['math', 'accuracy'],
['wikibench-wiki-single_choice_cncircular', 'perf_4'],
['openai_humaneval', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['cmmlu', 'naive_average'],
['mmlu', 'naive_average'],
['teval', 'naive_average'],
['SciCode', 'accuracy'],
['SciCode', 'sub_accuracy'],
['humanevalx', 'naive_average'],
['ds1000', 'naive_average'],
['IFEval', 'Prompt-level-strict-accuracy'],
['gsm8k', 'accuracy'],
['GaokaoBench', 'weighted_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['hellaswag', 'accuracy'],
['TheoremQA', 'score'],
'###### MathBench-A: Application Part ######',
'college',
'high',
'middle',
'primary',
'arithmetic',
'mathbench-a (average)',
'###### MathBench-T: Theory Part ######',
'college_knowledge',
'high_knowledge',
'middle_knowledge',
'primary_knowledge',
'mathbench-t (average)',
'###### Overall: Average between MathBench-A and MathBench-T ######',
'Overall',
'',
'bbh-logical_deduction_seven_objects',
'bbh-multistep_arithmetic_two',
''
'mmlu',
'mmlu-stem',
'mmlu-social-science',
'mmlu-humanities',
'mmlu-other',
'',
'cmmlu',
'cmmlu-stem',
'cmmlu-social-science',
'cmmlu-humanities',
'cmmlu-other',
'cmmlu-china-specific',
'',
'mmlu_pro',
'mmlu_pro_biology',
'mmlu_pro_business',
'mmlu_pro_chemistry',
'mmlu_pro_computer_science',
'mmlu_pro_economics',
'mmlu_pro_engineering',
'mmlu_pro_health',
'mmlu_pro_history',
'mmlu_pro_law',
'mmlu_pro_math',
'mmlu_pro_philosophy',
'mmlu_pro_physics',
'mmlu_pro_psychology',
'mmlu_pro_other',
'',
'GaokaoBench_2010-2022_Math_II_MCQs',
'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank',
'',
'humanevalx-python',
'humanevalx-cpp',
'humanevalx-go',
'humanevalx-java',
'humanevalx-js',
'',
'ds1000_Pandas',
'ds1000_Numpy',
'ds1000_Tensorflow',
'ds1000_Scipy',
'ds1000_Sklearn',
'ds1000_Pytorch',
'ds1000_Matplotlib',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
for d in datasets:
d['reader_cfg']['test_range'] = '[0:16]'
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for m in models:
m['abbr'] = m['abbr'] + '_fullbench'
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])

View File

@ -0,0 +1,70 @@
from copy import deepcopy
from mmengine.config import read_base
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import SubjectiveSummarizer
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
with read_base():
# read hf models - chat models
# Dataset
from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \
alignbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \
alpacav2_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \
arenahard_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \
compassarena_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \
fofo_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.followbench.followbench_llmeval import \
followbench_llmeval_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \
mtbench101_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \
wildbench_datasets # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
summarizer = dict(type=SubjectiveSummarizer, function='subjective')
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')
and 'mtbench101' not in k and 'wildbench' not in k), [])
datasets += mtbench101_datasets # noqa: F401, E501
datasets += wildbench_datasets # noqa: F401, E501
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for m in models:
m['abbr'] = m['abbr'] + '_fullbench'
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
judge_models = deepcopy([models[1]])
judge_models[0]['abbr'] = judge_models[0]['abbr'] + '-judge'
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)

View File

@ -7,36 +7,56 @@ import yaml
output_path = 'regression_result_daily'
chat_model_list = [
'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
'deepseek-v2-lite-chat-hf', 'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf',
'gemma2-9b-it-hf', 'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf',
'internlm2_5-20b-chat-hf', 'internlm2_5-7b-chat-turbomind',
'internlm2_5-20b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
'baichuan2-7b-chat-hf', 'glm-4-9b-chat-hf', 'glm-4-9b-chat-turbomind',
'glm-4-9b-chat-vllm', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf', 'gemma2-9b-it-hf',
'gemma-2b-it-hf', 'gemma-7b-it-hf', 'gemma-2-9b-it-turbomind',
'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf',
'internlm2_5-7b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-lmdeploy',
'internlm2-chat-7b-sft-turbomind', 'internlm2-chat-7b-vllm',
'llama-3_1-8b-instruct-hf', 'llama-3-8b-instruct-hf',
'llama-3_1-8b-instruct-turbomind', 'llama-3-8b-instruct-turbomind',
'mistral-7b-instruct-v0.3-hf', 'mistral-7b-instruct-v0.2-vllm',
'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
'llama-3_1-8b-instruct-hf', 'llama-3_2-3b-instruct-hf',
'llama-3-8b-instruct-hf', 'llama-3_1-8b-instruct-turbomind',
'llama-3_2-3b-instruct-turbomind', 'llama-3-8b-instruct-turbomind',
'mistral-7b-instruct-v0.2-hf', 'mistral-7b-instruct-v0.3-hf',
'mistral-nemo-instruct-2407-hf', 'mistral-nemo-instruct-2407-turbomind',
'mistral-7b-instruct-v0.1-vllm', 'mistral-7b-instruct-v0.2-vllm',
'MiniCPM3-4B-hf', 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf',
'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-hf', 'qwen2-7b-instruct-hf',
'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind',
'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf',
'lmdeploy-api-test'
'deepseek-v2-lite-chat-hf', 'internlm2_5-20b-chat-hf',
'internlm2_5-20b-chat-turbomind', 'mistral-small-instruct-2409-hf',
'mistral-small-instruct-2409-turbomind', 'qwen2.5-14b-instruct-hf',
'qwen2.5-14b-instruct-turbomind'
]
base_model_list = [
'deepseek-moe-16b-base-hf', 'deepseek-v2-lite-hf',
'deepseek-7b-base-turbomind', 'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf',
'gemma2-9b-hf', 'internlm2_5-7b-hf', 'internlm2-7b-hf',
'internlm2-base-7b-hf', 'internlm2-1.8b-turbomind',
'internlm2_5-7b-turbomind', 'internlm2-7b-turbomind',
'internlm2-base-7b-turbomind', 'llama-2-7b-hf', 'llama-3-8b-hf',
'llama-3.1-8b-turbomind', 'llama-3-8b-turbomind', 'mistral-7b-v0.3-hf',
'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
'glm-4-9b-hf', 'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind',
'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf', 'gemma2-9b-hf',
'gemma-2b-hf', 'gemma-7b-hf', 'gemma-2b-vllm', 'gemma-7b-vllm',
'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf',
'internlm2-1.8b-turbomind', 'internlm2_5-7b-turbomind',
'internlm2-7b-turbomind', 'internlm2-base-7b-turbomind', 'llama-2-7b-hf',
'llama-3_1-8b-hf', 'llama-3-8b-hf', 'llama-3.1-8b-turbomind',
'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'mistral-7b-v0.3-hf',
'mistral-7b-v0.2-vllm', 'qwen2.5-7b-hf', 'qwen2.5-1.5b-turbomind',
'qwen2.5-7b-turbomind', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
'qwen2-1.5b-hf', 'qwen2-7b-hf', 'qwen2-1.5b-turbomind',
'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf',
'deepseek-v2-lite-hf', 'internlm2-20b-hf', 'internlm2-base-20b-hf',
'internlm2-20b-turbomind', 'qwen2.5-14b-hf'
]
dataset_list = ['gsm8k', 'race-middle', 'race-high']
@pytest.fixture()
def baseline_scores_testrange(request):
config_path = os.path.join(
request.config.rootdir,
'.github/scripts/oc_score_baseline_testrange.yaml')
with open(config_path) as f:
config = yaml.load(f.read(), Loader=yaml.SafeLoader)
return config
@pytest.fixture()
@ -48,6 +68,16 @@ def baseline_scores(request):
return config
@pytest.fixture()
def baseline_scores_fullbench(request):
config_path = os.path.join(
request.config.rootdir,
'.github/scripts/oc_score_baseline_fullbench.yaml')
with open(config_path) as f:
config = yaml.load(f.read(), Loader=yaml.SafeLoader)
return config
@pytest.fixture()
def result_scores():
file = find_csv_files(output_path)
@ -57,100 +87,228 @@ def result_scores():
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
@pytest.mark.usefixtures('baseline_scores_testrange')
@pytest.mark.chat
class TestChat:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2)
for p1 in chat_model_list
for p2 in dataset_list])
def test_model_dataset_score(self, baseline_scores, result_scores, model,
dataset):
base_score = baseline_scores.get(model).get(dataset)
@pytest.mark.parametrize('model, dataset',
[(p1, p2) for p1 in chat_model_list
for p2 in ['gsm8k', 'race-high']])
def test_model_dataset_score(self, baseline_scores_testrange,
result_scores, model, dataset):
base_score = baseline_scores_testrange.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, base_score)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
@pytest.mark.usefixtures('baseline_scores_testrange')
@pytest.mark.base
class TestBase:
"""Test cases for base model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2)
for p1 in base_model_list
for p2 in dataset_list])
def test_model_dataset_score(self, baseline_scores, result_scores, model,
dataset):
if model == 'mistral-7b-v0.2-vllm' and dataset == 'race-high':
@pytest.mark.parametrize(
'model, dataset',
[(p1, p2) for p1 in base_model_list
for p2 in ['gsm8k', 'GPQA_diamond', 'race-high', 'winogrande']])
def test_model_dataset_score(self, baseline_scores_testrange,
result_scores, model, dataset):
if model in ['gemma-2b-vllm', 'gemma-7b-vllm'] and dataset != 'gsm8k':
return
base_score = baseline_scores.get(model).get(dataset)
base_score = baseline_scores_testrange.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, base_score)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.chat_obj_fullbench
class TestChatObjFullbench:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
'internlm2_5-7b-chat-hf_fullbench',
'internlm2_5-7b-chat-turbomind_fullbench'
] for p2 in [
'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math',
'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'ds1000',
'gsm8k', 'triviaqa_wiki_1shot', 'nq_open_1shot', 'hellaswag',
'TheoremQA', 'college', 'college_knowledge',
'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two',
'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math', 'ds1000_Pandas',
'ds1000_Numpy', 'ds1000_Tensorflow', 'ds1000_Scipy', 'ds1000_Sklearn',
'ds1000_Pytorch', 'ds1000_Matplotlib'
]])
def test_model_dataset_score(self, baseline_scores_fullbench,
result_scores, model, dataset):
base_score = baseline_scores_fullbench.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.chat_sub_fullbench
class TestChatSubFullbench:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
'internlm2_5-7b-chat-hf_fullbench',
'internlm2_5-7b-chat-turbomind_fullbench'
] for p2 in [
'Alignbench总分', 'Alignbench专业能力', 'AlpacaEvaltotal',
'AlpacaEvalhelpful_base', 'CompassArenacompassarena_language',
'CompassArenacompassarena_knowledge',
'CompassArenacompassarena_reason_v2',
'CompassArenacompassarena_math_v2',
'CompassArenacompassarena_creationv2_zh', 'Fofofofo_test_prompts',
'followbenchHSR_AVG', 'followbenchSSR_AVG', 'followbenchHSR_L1',
'followbenchHSR_L2', 'followbenchHSR_L3', 'followbenchHSR_L4',
'followbenchHSR_L5', 'followbenchSSR_L1', 'followbenchSSR_L2',
'followbenchSSR_L3', 'followbenchSSR_L4', 'followbenchSSR_L5',
'MTBench101average', 'Wildbenchscore'
]])
def test_model_dataset_score(self, baseline_scores_fullbench,
result_scores, model, dataset):
base_score = baseline_scores_fullbench.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.base_fullbench
class TestBaseFullbench:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
'internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench'
] for p2 in [
'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math',
'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'gsm8k',
'triviaqa_wiki_1shot', 'nq_open_1shot', 'winogrande', 'hellaswag',
'TheoremQA', 'college', 'college_knowledge',
'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two',
'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math'
]])
def test_model_dataset_score(self, baseline_scores_fullbench,
result_scores, model, dataset):
base_score = baseline_scores_fullbench.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
@pytest.mark.api
class TestApibench:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset',
[('lmdeploy-api-test', 'race-middle'),
('lmdeploy-api-test', 'race-high'),
('lmdeploy-api-test', 'gsm8k')])
def test_api(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
class TestCmdCase:
@pytest.mark.case1
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-hf', 'race-middle'),
('internlm2_5-7b-hf', 'race-high')])
def test_cmd_case1(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
('internlm2_5-7b-hf', 'race-high'),
('internlm2_5-7b-hf', 'demo_gsm8k'),
('internlm2-1.8b-hf', 'race-middle'),
('internlm2-1.8b-hf', 'race-high'),
('internlm2-1.8b-hf', 'demo_gsm8k')])
def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
assert_score(model, result_score, base_score)
@pytest.mark.case2
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-chat-lmdeploy', 'race-middle'),
('internlm2_5-7b-chat-lmdeploy', 'race-high')])
def test_cmd_case2(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
('internlm2_5-7b-chat-lmdeploy', 'race-high'),
('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k'),
('internlm2-chat-1.8b-lmdeploy', 'race-middle'),
('internlm2-chat-1.8b-lmdeploy', 'race-high'),
('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k')])
def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
assert_score(model + '_batch', result_score, base_score)
@pytest.mark.case3
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b_hf', 'race-middle'),
('internlm2_5-7b_hf', 'race-high')])
def test_cmd_case3(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
('internlm2_5-7b_hf', 'race-high'),
('internlm2_5-7b_hf', 'demo_gsm8k')])
def test_cmd_case3(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
assert_score(model, result_score, base_score)
@pytest.mark.case4
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-chat_hf', 'race-middle'),
('internlm2_5-7b-chat_hf', 'race-high')])
def test_cmd_case4(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
('internlm2_5-7b-chat_hf', 'race-high'),
('internlm2_5-7b-chat_hf', 'demo_gsm8k')])
def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
assert_score(model, result_score, base_score)
def assert_score(score, baseline):
THRESHOLD = 3
def assert_score(model_type, score, baseline):
if score is None or score == '-':
assert False, 'value is none'
if float(score) <= (baseline + 5) and float(score) >= (baseline - 5):
print(score + ' between ' + str(baseline - 5) + ' and ' +
str(baseline + 5))
assert True
if 'batch' not in model_type:
if float(score) <= (baseline + 0.01) and float(score) >= (baseline -
0.01):
print(' '.join([score, 'is equal', str(baseline)]))
assert True
else:
print(' '.join([score, 'is not equal', str(baseline)]))
assert False, ' '.join([score, 'is not equal', str(baseline)])
else:
assert False, score + ' not between ' + str(
baseline - 5) + ' and ' + str(baseline + 5)
if float(score) <= (baseline + THRESHOLD) and float(score) >= (
baseline - THRESHOLD):
print(' '.join([
score, 'is between',
str(baseline - THRESHOLD), 'and',
str(baseline + THRESHOLD)
]))
assert True
else:
print(' '.join([
score, 'is not etween',
str(baseline - THRESHOLD), 'and',
str(baseline + THRESHOLD)
]))
assert False, ' '.join([
score, 'is not etween',
str(baseline - THRESHOLD), 'and',
str(baseline + THRESHOLD)
])
def find_csv_files(directory):
csv_files = []
for root, dirs, files in os.walk(directory):
for file in files:
if file.endswith('.csv'):
if file.endswith('.csv') and (file.startswith('summary') or
file.startswith('Subjective_all')):
csv_files.append(os.path.join(root, file))
csv_files_with_time = {f: os.path.getctime(f) for f in csv_files}
@ -163,14 +321,24 @@ def read_csv_file(file_path):
with open(file_path, 'r') as csvfile:
reader = csv.DictReader(csvfile)
filtered_data = []
for row in reader:
filtered_row = {
k: v
for k, v in row.items()
if k not in ['version', 'metric', 'mode']
}
filtered_data.append(filtered_row)
if 'Subjective_all' not in file_path:
for row in reader:
if row['metric'] is not None and 'bpb' not in row['metric']:
filtered_row = {
k: v
for k, v in row.items()
if k not in ['version', 'metric', 'mode']
}
filtered_data.append(filtered_row)
else:
for row in reader:
if row['Detailed Scores'] is not None:
filtered_row = row
filtered_row['dataset'] = filtered_row[
'Dataset'] + filtered_row['Detailed Scores']
del filtered_row['Dataset']
del filtered_row['Detailed Scores']
filtered_data.append(filtered_row)
result = {}
for data in filtered_data:

View File

@ -1,369 +1,34 @@
baichuan2-7b-chat-hf:
gsm8k: 30
race-middle: 74
race-high: 79
internlm2_5-7b-hf:
demo_gsm8k: 42.19
race-middle: 91.78
race-high: 90.02
glm-4-9b-chat-hf:
gsm8k: 75
race-middle: 88
race-high: 88
internlm2_5-7b_hf:
demo_gsm8k: 42.19
race-middle: 91.78
race-high: 90.02
glm-4-9b-chat-turbomind:
gsm8k: 69
race-middle: 82
race-high: 77
internlm2-1.8b-hf:
demo_gsm8k: 15.62
race-middle: 71.66
race-high: 66.38
glm-4-9b-chat-vllm:
gsm8k: 73
race-middle: 87
race-high: 87
internlm2_5-7b-chat-lmdeploy:
demo_gsm8k: 84.38
race-middle: 92.76
race-high: 90.54
deepseek-7b-chat-hf:
gsm8k: 60
race-middle: 74
race-high: 80
internlm2-chat-1.8b-lmdeploy:
demo_gsm8k: 31
race-middle: 81.34
race-high: 73.96
deepseek-moe-16b-chat-hf:
gsm8k: 62
race-middle: 62
race-high: 70
deepseek-v2-lite-chat-hf:
gsm8k: 59
race-middle: 82
race-high: 79
deepseek-7b-chat-vllm:
gsm8k: 63
race-middle: 74
race-high: 79
gemma-2b-it-hf:
gsm8k: 14
race-middle: 62
race-high: 52
gemma-7b-it-hf:
gsm8k: 39
race-middle: 74
race-high: 71
gemma-7b-it-vllm:
gsm8k: 38
race-middle: 75
race-high: 70
gemma2-2b-it-hf:
gsm8k: 62
race-middle: 75
race-high: 67
gemma2-9b-it-hf:
gsm8k: 80
race-middle: 89
race-high: 85
internlm2_5-7b-chat-hf:
gsm8k: 86
race-middle: 92
race-high: 93
internlm2_5-20b-chat-hf:
gsm8k: 91
race-middle: 95
race-high: 91
internlm2_5-7b-chat-turbomind:
gsm8k: 87
race-middle: 92
race-high: 93
internlm2_5-20b-chat-turbomind:
gsm8k: 91
race-middle: 95
race-high: 91
internlm2-chat-1.8b-turbomind:
gsm8k: 40
race-middle: 82
race-high: 83
internlm2-chat-1.8b-sft-turbomind:
gsm8k: 34
race-middle: 81
race-high: 83
internlm2-chat-7b-lmdeploy:
gsm8k: 69
race-middle: 90
race-high: 88
internlm2-chat-7b-sft-turbomind:
gsm8k: 71
race-middle: 91
race-high: 92
internlm2-chat-7b-vllm:
gsm8k: 63
race-middle: 90
race-high: 91
llama-3_1-8b-instruct-hf:
gsm8k: 82
race-middle: 82
race-high: 88
llama-3-8b-instruct-hf:
gsm8k: 77
race-middle: 85
race-high: 87
llama-3_1-8b-instruct-turbomind:
gsm8k: 79
race-middle: 82
race-high: 88
llama-3-8b-instruct-turbomind:
gsm8k: 77
race-middle: 85
race-high: 89
mistral-7b-instruct-v0.2-hf:
gsm8k: 48
race-middle: 82
race-high: 78
mistral-7b-instruct-v0.3-hf:
gsm8k: 53
race-middle: 80
race-high: 78
mistral-7b-instruct-v0.2-vllm:
gsm8k: 49
race-middle: 81
race-high: 77
minicpm-2b-dpo-fp32-hf:
gsm8k: 58
race-middle: 66
race-high: 74
minicpm-2b-sft-bf16-hf:
gsm8k: 58
race-middle: 75
race-high: 81
minicpm-2b-sft-fp32-hf:
gsm8k: 58
race-middle: 75
race-high: 81
phi-3-mini-4k-instruct-hf:
gsm8k: 67
race-middle: 81
race-high: 84
phi-3-small-8k-instruct-hf:
gsm8k: 88
race-middle: 89
race-high: 88
qwen1.5-0.5b-chat-hf:
gsm8k: 5
race-middle: 55
race-high: 50
qwen2-1.5b-instruct-hf:
gsm8k: 63
race-middle: 77
race-high: 86
qwen2-1.5b-instruct-turbomind:
gsm8k: 60
race-middle: 77
race-high: 86
qwen2-7b-instruct-turbomind:
gsm8k: 88
race-middle: 87
race-high: 89
qwen2-7b-instruct-hf:
gsm8k: 85
race-middle: 87
race-high: 91
qwen1.5-0.5b-chat-vllm:
gsm8k: 5
race-middle: 57
race-high: 51
yi-1.5-6b-chat-hf:
gsm8k: 72
race-middle: 88
race-high: 86
yi-1.5-9b-chat-hf:
gsm8k: 81
race-middle: 89
race-high: 91
internlm2_5-7b-chat_hf:
demo_gsm8k: 87.50
race-middle: 92.76
race-high: 90.48
lmdeploy-api-test:
gsm8k: 90
race-middle: 95
race-high: 96
deepseek-moe-16b-base-hf:
gsm8k: 25
race-middle: 35
race-high: 23
deepseek-v2-lite-hf:
gsm8k: 37
race-middle: 56
race-high: 62
deepseek-7b-base-turbomind:
gsm8k: 21
race-middle: 42
race-high: 42
deepseek-moe-16b-base-vllm:
gsm8k: 22
race-middle: 35
race-high: 20
gemma-2b-hf:
gsm8k: 19
race-middle: 33
race-high: 26
gemma-7b-hf:
gsm8k: 65
race-middle: 59
race-high: 66
gemma2-2b-hf:
gsm8k: 33
race-middle: 56
race-high: 58
gemma2-9b-hf:
gsm8k: 70
race-middle: 82
race-high: 84
internlm2_5-7b-hf:
gsm8k: 47
race-middle: 92
race-high: 91
internlm2-7b-hf:
gsm8k: 65
race-middle: 77
race-high: 72
internlm2-base-7b-hf:
gsm8k: 5
race-middle: 71
race-high: 74
internlm2_5-7b-turbomind:
gsm8k: 73
race-middle: 90
race-high: 91
internlm2-1.8b-turbomind:
gsm8k: 25
race-middle: 75
race-high: 72
internlm2-7b-turbomind:
gsm8k: 67
race-middle: 78
race-high: 76
internlm2-base-7b-turbomind:
gsm8k: 39
race-middle: 75
race-high: 81
llama-2-7b-hf:
gsm8k: 17
race-middle: 32
race-high: 38
llama-3-8b-hf:
gsm8k: 48
race-middle: 64
race-high: 70
llama-3.1-8b-turbomind:
gsm8k: 57
race-middle: 67
race-high: 75
llama-3-8b-turbomind:
gsm8k: 52
race-middle: 63
race-high: 70
mistral-7b-v0.2-hf:
gsm8k: 43
race-middle: 42
race-high: 60
mistral-7b-v0.3-hf:
gsm8k: 43
race-middle: 42
race-high: 60
mistral-7b-v0.2-vllm:
gsm8k: 45
race-middle: 42
race-high: 58
qwen1.5-moe-a2.7b-hf:
gsm8k: 64
race-middle: 78
race-high: 90
qwen2-1.5b-hf:
gsm8k: 58
race-middle: 65
race-high: 78
qwen2-0.5b-hf:
gsm8k: 35
race-middle: 52
race-high: 48
qwen2-7b-hf:
gsm8k: 82
race-middle: 88
race-high: 89
qwen2-1.5b-turbomind:
gsm8k: 57
race-middle: 64
race-high: 78
qwen2-7b-turbomind:
gsm8k: 83
race-middle: 88
race-high: 88
qwen1.5-0.5b-vllm:
gsm8k: 12
race-middle: 54
race-high: 59
yi-1.5-6b-hf:
gsm8k: 59
race-middle: 81
race-high: 89
yi-1.5-9b-hf:
gsm8k: 77
race-middle: 90
race-high: 90
gsm8k: 83.78
race-middle: 92.41
race-high: 90.37

View File

@ -0,0 +1,153 @@
internlm2_5-7b-chat-hf_fullbench:
race-high: 93.75
ARC-c: 87.5
BoolQ: 81.25
drop: 81.25
GPQA_diamond: 25
math: 75
wikibench-wiki-single_choice_cncircular: 50
sanitized_mbpp: 68.75
ds1000: 16.96
gsm8k: 56.25
triviaqa_wiki_1shot: 50
nq_open_1shot: 25
hellaswag: 87.5
TheoremQA: 18.75
college: 12.5
college_knowledge: 87.5
bbh-logical_deduction_seven_objects: 50
bbh-multistep_arithmetic_two: 68.75
mmlu-other: 72.6
cmmlu-china-specific: 76.25
mmlu_pro_math: 25
ds1000_Pandas: 12.5
ds1000_Numpy: 0
ds1000_Tensorflow: 12.5
ds1000_Scipy: 18.75
ds1000_Sklearn: 18.75
ds1000_Pytorch: 12.5
ds1000_Matplotlib: 43.75
Alignbench总分: 0.65
Alignbench专业能力: 7.83
AlpacaEvaltotal: 0
AlpacaEvalhelpful_base: 0
CompassArenacompassarena_language: 60
CompassArenacompassarena_knowledge: 56
CompassArenacompassarena_reason_v2: 50
CompassArenacompassarena_math_v2: 53.5
CompassArenacompassarena_creationv2_zh: 48.75
Fofofofo_test_prompts: 1
followbenchHSR_AVG: 1
followbenchSSR_AVG: 1
followbenchHSR_L1: 1
followbenchHSR_L2: 1
followbenchHSR_L3: 1
followbenchHSR_L4: 1
followbenchHSR_L5: 1
followbenchSSR_L1: 1
followbenchSSR_L2: 1
followbenchSSR_L3: 1
followbenchSSR_L4: 1
followbenchSSR_L5: 1
MTBench101average: 8.1
Wildbenchscore: -3.3333333333333335
internlm2_5-7b-chat-turbomind_fullbench:
race-high: 93.75
ARC-c: 87.5
BoolQ: 68.75
drop: 75
GPQA_diamond: 25
math: 75
wikibench-wiki-single_choice_cncircular: 25
sanitized_mbpp: 68.75
ds1000: 13.39
gsm8k: 68.75
triviaqa_wiki_1shot: 50
nq_open_1shot: 25
hellaswag: 81.25
TheoremQA: 6.25
college: 0
college_knowledge: 87.5
bbh-logical_deduction_seven_objects: 56.25
bbh-multistep_arithmetic_two: 68.75
mmlu-other: 74.04
cmmlu-china-specific: 76.25
mmlu_pro_math: 25
ds1000_Pandas: 0
ds1000_Numpy: 0
ds1000_Tensorflow: 12.5
ds1000_Scipy: 18.75
ds1000_Sklearn: 18.75
ds1000_Pytorch: 6.25
ds1000_Matplotlib: 37.5
Alignbench总分: 0.64
Alignbench专业能力: 7.6
AlpacaEvaltotal: 10
AlpacaEvalhelpful_base: 10
CompassArenacompassarena_language: 59
CompassArenacompassarena_knowledge: 57
CompassArenacompassarena_reason_v2: 49.5
CompassArenacompassarena_math_v2: 51
CompassArenacompassarena_creationv2_zh: 43.75
Fofofofo_test_prompts: 1
followbenchHSR_AVG: 1
followbenchSSR_AVG: 1
followbenchHSR_L1: 1
followbenchHSR_L2: 1
followbenchHSR_L3: 1
followbenchHSR_L4: 1
followbenchHSR_L5: 1
followbenchSSR_L1: 1
followbenchSSR_L2: 1
followbenchSSR_L3: 1
followbenchSSR_L4: 1
followbenchSSR_L5: 1
MTBench101average: 8.1
Wildbenchscore: -8.333333333333334
internlm2_5-7b-hf_fullbench:
race-high: 100
ARC-c: 68.75
BoolQ: 87.5
GPQA_diamond: 62.5
drop: 62.5
math: 12.5
wikibench-wiki-single_choice_cncircular: 25
sanitized_mbpp: 56.25
gsm8k: 37.5
triviaqa_wiki_1shot: 43.75
nq_open_1shot: 43.75
winogrande: 75
hellaswag: 93.75
TheoremQA: 25
college: 12.5
college_knowledge: 87.5
bbh-logical_deduction_seven_objects: 43.75
bbh-multistep_arithmetic_two: 56.25
mmlu-other: 76.92
cmmlu-china-specific: 84.17
mmlu_pro_math: 18.75
internlm2_5-7b-turbomind_fullbench:
race-high: 100
ARC-c: 68.75
BoolQ: 87.5
GPQA_diamond: 62.5
drop: 62.5
math: 18.75
wikibench-wiki-single_choice_cncircular: 25
sanitized_mbpp: 56.25
gsm8k: 68.75
triviaqa_wiki_1shot: 43.75
nq_open_1shot: 43.75
winogrande: 87.5
hellaswag: 93.75
TheoremQA: 31.25
college: 12.5
college_knowledge: 87.5
bbh-logical_deduction_seven_objects: 50
bbh-multistep_arithmetic_two: 56.25
mmlu-other: 76.92
cmmlu-china-specific: 84.17
mmlu_pro_math: 18.75

View File

@ -0,0 +1,459 @@
baichuan2-7b-chat-hf:
gsm8k: 18.75
race-high: 78.12
glm-4-9b-chat-hf:
gsm8k: 68.75
race-high: 90.62
glm-4-9b-chat-turbomind:
gsm8k: 75.00
race-high: 90.62
glm-4-9b-chat-vllm:
gsm8k: 65.62
race-high: 90.62
deepseek-7b-chat-hf:
gsm8k: 46.88
race-high: 81.25
deepseek-moe-16b-chat-hf:
gsm8k: 50
race-high: 68.75
deepseek-7b-chat-vllm:
gsm8k: 43.75
race-high: 75
gemma2-2b-it-hf:
gsm8k: 50
race-high: 71.88
gemma2-9b-it-hf:
gsm8k: 71.88
race-high: 84.38
gemma-2b-it-hf:
gsm8k: 3.12
race-high: 40.62
gemma-7b-it-hf:
gsm8k: 40.62
race-high: 68.75
gemma-2-9b-it-turbomind:
gsm8k: 68.75
race-high: 81.25
gemma-7b-it-vllm:
gsm8k: 28.12
race-high: 68.75
internlm2_5-7b-chat-hf:
gsm8k: 84.38
race-high: 90.62
internlm2_5-7b-chat-turbomind:
gsm8k: 84.38
race-high: 90.62
internlm2-chat-1.8b-turbomind:
gsm8k: 25
race-high: 84.38
internlm2-chat-1.8b-sft-turbomind:
gsm8k: 21.88
race-high: 84.38
internlm2-chat-7b-lmdeploy:
gsm8k: 53.12
race-high: 84.38
internlm2-chat-7b-sft-turbomind:
gsm8k: 50
race-high: 90.62
internlm2-chat-7b-vllm:
gsm8k: 43.75
race-high: 87.5
llama-3_1-8b-instruct-hf:
gsm8k: 84.38
race-high: 90.62
llama-3_2-3b-instruct-hf:
gsm8k: 65.62
race-high: 81.25
llama-3-8b-instruct-hf:
gsm8k: 68.75
race-high: 87.5
llama-3_1-8b-instruct-turbomind:
gsm8k: 78.12
race-high: 90.62
llama-3_2-3b-instruct-turbomind:
gsm8k: 65.62
race-high: 81.25
llama-3-8b-instruct-turbomind:
gsm8k: 68.75
race-high: 87.5
mistral-7b-instruct-v0.2-hf:
gsm8k: 40.62
race-high: 75
mistral-7b-instruct-v0.3-hf:
gsm8k: 40.62
race-high: 75
mistral-nemo-instruct-2407-hf:
gsm8k: 75
race-high: 81.25
mistral-nemo-instruct-2407-turbomind:
gsm8k: 75
race-high: 81.25
mistral-7b-instruct-v0.1-vllm:
gsm8k: 37.5
race-high: 71.88
mistral-7b-instruct-v0.2-vllm:
gsm8k: 43.75
race-high: 75
MiniCPM3-4B-hf:
gsm8k: 68.75
race-high: 84.38
minicpm-2b-dpo-fp32-hf:
gsm8k: 56.25
race-high: 56.25
minicpm-2b-sft-bf16-hf:
gsm8k: 46.88
race-high: 65.62
minicpm-2b-sft-fp32-hf:
gsm8k: 46.88
race-high: 65.62
phi-3-mini-4k-instruct-hf:
gsm8k: 56.25
race-high: 78.12
qwen1.5-0.5b-chat-hf:
gsm8k: 0
race-high: 53.12
qwen2-1.5b-instruct-hf:
gsm8k: 62.5
race-high: 84.38
qwen2-7b-instruct-hf:
gsm8k: 68.75
race-high: 90.62
qwen2-1.5b-instruct-turbomind:
gsm8k: 62.50
race-high: 84.38
qwen2-7b-instruct-turbomind:
gsm8k: 81.25
race-high: 87.5
qwen1.5-0.5b-chat-vllm:
gsm8k: 3.12
race-high: 53.12
yi-1.5-6b-chat-hf:
gsm8k: 65.62
race-high: 84.38
yi-1.5-9b-chat-hf:
gsm8k: 75
race-high: 93.75
deepseek-v2-lite-chat-hf:
gsm8k: 43.75
race-high: 71.88
internlm2_5-20b-chat-hf:
gsm8k: 84.38
race-high: 87.5
internlm2_5-20b-chat-turbomind:
gsm8k: 84.38
race-high: 87.5
mistral-small-instruct-2409-hf:
gsm8k: 81.25
race-high: 90.62
mistral-small-instruct-2409-turbomind:
gsm8k: 78.12
race-high: 90.62
qwen2.5-14b-instruct-hf:
gsm8k: 71.88
race-high: 93.75
qwen2.5-14b-instruct-turbomind:
gsm8k: 71.88
race-high: 93.75
glm-4-9b-hf:
gsm8k: 68.75
GPQA_diamond: 31.25
race-high: 93.75
winogrande: 84.38
deepseek-moe-16b-base-hf:
gsm8k: 21.88
GPQA_diamond: 0
race-high: 21.88
winogrande: 65.62
deepseek-7b-base-turbomind:
gsm8k: 21.88
GPQA_diamond: 0
race-high: 46.88
winogrande: 84.38
deepseek-moe-16b-base-vllm:
gsm8k: 21.88
GPQA_diamond: 0
race-high: 25
winogrande: 68.75
gemma2-2b-hf:
gsm8k: 31.25
GPQA_diamond: 3.12
race-high: 56.25
winogrande: 71.88
gemma2-9b-hf:
gsm8k: 68.75
GPQA_diamond: 0
race-high: 81.25
winogrande: 84.38
gemma-2b-hf:
gsm8k: 18.75
GPQA_diamond: 3.12
race-high: 25
winogrande: 53.12
gemma-7b-hf:
gsm8k: 56.25
GPQA_diamond: 6.25
race-high: 65.62
winogrande: 78.12
gemma-2b-vllm:
gsm8k: 18.75
GPQA_diamond: 6.25
race-high:
winogrande:
gemma-7b-vllm:
gsm8k: 59.38
GPQA_diamond: 6.25
race-high:
winogrande:
internlm2_5-7b-hf:
gsm8k: 37.5
GPQA_diamond: 25
race-high: 93.75
winogrande: 71.88
internlm2-7b-hf:
gsm8k: 53.12
GPQA_diamond: 18.75
race-high: 62.5
winogrande: 78.12
internlm2-base-7b-hf:
gsm8k: 3.12
GPQA_diamond: 21.88
race-high: 75
winogrande: 65.62
internlm2-1.8b-turbomind:
gsm8k: 12.5
GPQA_diamond: 12.5
race-high: 71.88
winogrande: 75
internlm2_5-7b-turbomind:
gsm8k: 68.75
GPQA_diamond: 31.25
race-high: 93.75
winogrande: 84.38
internlm2-7b-turbomind:
gsm8k: 56.25
GPQA_diamond: 21.88
race-high: 75
winogrande: 81.25
internlm2-base-7b-turbomind:
gsm8k: 40.62
GPQA_diamond: 28.12
race-high: 84.38
winogrande: 71.88
llama-2-7b-hf:
gsm8k: 21.88
GPQA_diamond: 21.88
race-high: 40.62
winogrande: 71.88
llama-3_1-8b-hf:
gsm8k: 78.12
GPQA_diamond: 25
race-high: 90.62
winogrande: 62.5
llama-3-8b-hf:
gsm8k: 46.88
GPQA_diamond: 6.25
race-high: 65.62
winogrande: 65.62
llama-3.1-8b-turbomind:
gsm8k: 56.25
GPQA_diamond: 6.25
race-high: 78.12
winogrande: 78.12
llama-3-8b-turbomind:
gsm8k: 50
GPQA_diamond: 9.38
race-high: 65.62
winogrande: 78.12
mistral-7b-v0.2-hf:
gsm8k: 31.25
GPQA_diamond: 6.25
race-high: 62.5
winogrande: 59.38
mistral-7b-v0.3-hf:
gsm8k: 31.25
GPQA_diamond: 6.25
race-high: 62.5
winogrande: 59.38
mistral-7b-v0.2-vllm:
gsm8k: 34.38
GPQA_diamond: 6.25
race-high: 62.5
winogrande: 65.62
qwen2.5-7b-hf:
gsm8k: 81.25
GPQA_diamond: 18.75
race-high: 87.5
winogrande: 71.88
qwen2.5-1.5b-turbomind:
gsm8k: 71.88
GPQA_diamond: 15.62
race-high: 78.12
winogrande: 71.88
qwen2.5-7b-turbomind:
gsm8k: 71.88
GPQA_diamond: 25
race-high: 87.5
winogrande: 71.88
qwen1.5-moe-a2.7b-hf:
gsm8k: 62.5
GPQA_diamond: 18.75
race-high: 84.38
winogrande: 75
qwen2-0.5b-hf:
gsm8k: 25
GPQA_diamond: 0
race-high: 40.62
winogrande: 62.5
qwen2-1.5b-hf:
gsm8k: 59.38
GPQA_diamond: 9.38
race-high: 81.25
winogrande: 62.5
qwen2-7b-hf:
gsm8k: 68.75
GPQA_diamond: 9.38
race-high: 87.5
winogrande: 68.75
qwen2-1.5b-turbomind:
gsm8k: 62.50
GPQA_diamond: 6.25
race-high: 81.25
winogrande: 75
qwen2-7b-turbomind:
gsm8k: 68.75
GPQA_diamond: 12.5
race-high: 87.5
winogrande: 71.88
qwen1.5-0.5b-vllm:
gsm8k: 9.38
GPQA_diamond: 0
race-high: 56.25
winogrande: 62.5
yi-1.5-6b-hf:
gsm8k: 62.5
GPQA_diamond: 3.12
race-high: 87.5
winogrande: 62.5
yi-1.5-9b-hf:
gsm8k: 75
GPQA_diamond: 40.62
race-high: 87.5
winogrande: 59.38
deepseek-v2-lite-hf:
gsm8k: 28.12
GPQA_diamond: 21.88
race-high: 59.38
winogrande: 75
internlm2-20b-hf:
gsm8k: 56.25
GPQA_diamond: 15.62
race-high: 68.75
winogrande: 75
internlm2-base-20b-hf:
gsm8k: 12.5
GPQA_diamond: 9.38
race-high: 84.38
winogrande: 65.62
internlm2-20b-turbomind:
gsm8k: 68.75
GPQA_diamond: 15.62
race-high: 68.75
winogrande: 81.25
qwen2.5-14b-hf:
gsm8k: 75
GPQA_diamond: 37.5
race-high: 93.75
winogrande: 84.38

View File

@ -13,11 +13,31 @@ on:
description: 'Set branch or tag or commit id. Default is "main"'
type: string
default: 'main'
build_lmdeploy:
required: false
description: 'whether to build lmdeploy'
type: boolean
default: false
repo_org_lmdeploy:
required: false
description: 'Tested repository organization name. Default is internlm/lmdeploy'
type: string
default: 'InternLM/lmdeploy'
repo_ref_lmdeploy:
required: false
description: 'Set branch or tag or commit id. Default is "main"'
type: string
default: 'main'
regression_func:
required: true
description: 'regression functions'
type: string
default: "['chat','base','cmd']"
default: "['chat_models','base_models', 'chat_obj_fullbench', 'chat_sub_fullbench', 'base_fullbench','cmd', 'api']"
cuda_env:
required: true
description: "regression conda env, eg. ['dsw_cu11','dsw_cu12']"
type: string
default: "['dsw_cu12']"
schedule:
- cron: '56 16 * * *'
@@ -31,7 +51,7 @@ env:
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
DATEASET_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/llm-evaluation-datasets
COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache
HF_DATASETS_OFFLINE: 1
HF_EVALUATE_OFFLINE: 1
TRANSFORMERS_OFFLINE: 1
@@ -39,6 +59,8 @@ env:
LMDEPLOY_USE_MODELSCOPE: false
HF_HUB_OFFLINE: 1
TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas
REPORT_ROOT: /cpfs01/shared/public/qa-llm-cicd/report
OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
jobs:
build-pypi:
@@ -64,16 +86,51 @@ jobs:
retention-days: 1
name: my-artifact-${{ github.run_id }}
daily_run_test:
build-pypi-lmdeploy:
if: ${{!cancelled() && (github.event_name != 'schedule' && inputs.build_lmdeploy)}}
strategy:
matrix:
pyver: [py310]
runs-on: ubuntu-latest
env:
PYTHON_VERSION: ${{ matrix.pyver }}
PLAT_NAME: manylinux2014_x86_64
DOCKER_TAG: cuda12.1
steps:
- name: Checkout repository
uses: actions/checkout@v3
with:
repository: ${{ github.event.inputs.repo_org_lmdeploy || 'InternLM/lmdeploy' }}
ref: ${{github.event.inputs.repo_ref_lmdeploy || 'main'}}
- name: Build
run: |
echo ${PYTHON_VERSION}
echo ${PLAT_NAME}
echo ${DOCKER_TAG}
echo ${OUTPUT_FOLDER}
echo ${GITHUB_RUN_ID}
# remove -it
sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh
bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
- name: Upload Artifacts
uses: actions/upload-artifact@v4
with:
if-no-files-found: error
path: builder/manywheel/${{ env.OUTPUT_FOLDER }}
retention-days: 1
name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }}
prepare_env:
if: ${{!cancelled()}}
needs: build-pypi
needs: ['build-pypi', 'build-pypi-lmdeploy']
strategy:
fail-fast: false
matrix:
cuda_env: [dsw_cu11, dsw_cu12]
cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}}
runs-on: ${{ matrix.cuda_env }}
environment: 'prod'
timeout-minutes: 600 #10hours
timeout-minutes: 240 #4hours
steps:
- name: Clone repository
uses: actions/checkout@v2
@@ -84,89 +141,169 @@ jobs:
uses: actions/download-artifact@v4
with:
name: my-artifact-${{ github.run_id }}
- name: Remove Conda Env
if: always()
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
- name: Prepare - create conda env and install torch - cu11
if: ${{matrix.cuda_env == 'dsw_cu11'}}
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu11.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip uninstall torch torchvision torchaudio -y
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
uses: nick-fields/retry@v3
id: retry1
with:
max_attempts: 3
timeout_minutes: 40
command: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu11.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip uninstall torch torchvision torchaudio -y
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
- name: Prepare - create conda env and install torch - cu12
if: ${{matrix.cuda_env == 'dsw_cu12'}}
uses: nick-fields/retry@v3
id: retry2
with:
max_attempts: 3
timeout_minutes: 40
command: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu12.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
- name: Prepare - reinstall lmdeploy - cu12
if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
uses: actions/download-artifact@v4
with:
name: my-artifact-${{ github.run_id }}-py310
- name: Prepare - reinstall lmdeploy - cu12
if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu12.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
pip uninstall torch torchvision torchaudio -y
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
pip install lmdeploy-*.whl --no-deps
daily_run_test:
if: ${{!cancelled()}}
needs: prepare_env
strategy:
fail-fast: false
matrix:
cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}}
regression_func: ${{fromJSON(github.event.inputs.regression_func || '["chat_models","base_models","chat_obj_fullbench","chat_sub_fullbench","base_fullbench","cmd","api"]')}}
runs-on: ${{ matrix.cuda_env }}
environment: 'prod'
timeout-minutes: 240 #4hours
steps:
- name: Clone repository
uses: actions/checkout@v2
with:
repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Prepare - prepare data and hf model
run: |
ln -s ${{env.DATEASET_CACHE_PATH}} data
rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
- name: Run command testcase
if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'cmd')
if: matrix.regression_func == 'cmd'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
export from_tf=TRUE
python tools/list_configs.py internlm2_5 mmlu
opencompass --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run chat model test
if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'chat')
if: matrix.regression_func == 'chat_models'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
sed -i 's/judgemodel/'$(tail -n 1 /cpfs01/shared/public/llmeval/share_info/compassjuder_ip.txt)'/g' .github/scripts/eval_regression_chat.py
opencompass .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass .github/scripts/eval_regression_chat.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run base model test
if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'base')
if: matrix.regression_func == 'base_models'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass .github/scripts/eval_regression_base.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Remove Conda Env
if: always()
- name: Run chat model test - objective fullbench
if: matrix.regression_func == 'chat_obj_fullbench'
run: |
rm -rf regression_result_daily
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_chat_objective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m chat_obj_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run chat model test - subjective fullbench
if: matrix.regression_func == 'chat_sub_fullbench'
env:
COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache_subset
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_chat_subjective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m chat_sub_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run base model test - fullbench
if: matrix.regression_func == 'base_fullbench'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_base_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m base_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run model test - api
if: matrix.regression_func == 'api'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
echo "restful_pid=$!" >> "$GITHUB_ENV"
sleep 120s
opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run model test - api kill
if: always() && matrix.regression_func == 'api'
run: |
kill -15 "$restful_pid"
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}