mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[ci] react daily test (#1668)
* updaste * update * update * update * update * update * update * update * update * update * updaste * update * update * refactor summarize * update * update * update * update * update * updaste * update * update * update * update * updaste * update * update * update * update * update * updaste * updaste * update * update * update * update * update * update * update * update * update * update * update * Update daily-run-test.yml * Update daily-run-test.yml * update * update * update * update * update * Update daily-run-test.yml * update * update * update * update * update * update * update * update * update * update * update * Update daily-run-test.yml * Update daily-run-test.yml * update * update * Update daily-run-test.yml * update * update * update --------- Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn>
This commit is contained in:
parent
3ec178f4a9
commit
a9d6b6461f
39
.github/scripts/eval_regression_api.py
vendored
Normal file
39
.github/scripts/eval_regression_api.py
vendored
Normal file
@ -0,0 +1,39 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
from opencompass.models.openai_api import OpenAISDK
|
||||
|
||||
with read_base():
|
||||
# choose a list of datasets
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
|
||||
gsm8k_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.race.race_gen import \
|
||||
race_datasets # noqa: F401, E501
|
||||
|
||||
# Collect every list bound to a name ending in ``_datasets`` in this
# namespace and concatenate them into the single ``datasets`` list.
datasets = sum(
    [entries for name, entries in locals().items()
     if name.endswith('_datasets')],
    [],
)

# Role layout used by the API model: a round is a HUMAN entry followed by a
# BOT entry (the BOT entry is flagged ``generate=True``); SYSTEM is listed as
# a reserved role.
api_meta_template = {
    'round': [
        {'role': 'HUMAN', 'api_role': 'HUMAN'},
        {'role': 'BOT', 'api_role': 'BOT', 'generate': True},
    ],
    'reserved_roles': [{'role': 'SYSTEM', 'api_role': 'SYSTEM'}],
}
|
||||
|
||||
# Single model entry: an ``OpenAISDK`` client pointed at the
# OpenAI-compatible endpoint given in ``openai_api_base``.
models = [
    {
        'abbr': 'lmdeploy-api-test',
        'type': OpenAISDK,
        'key': 'EMPTY',
        'openai_api_base': 'http://localhost:23333/v1',
        'path': 'internlm2',
        'tokenizer_path': 'internlm/internlm2_5-7b-chat',
        'rpm_verbose': True,
        'meta_template': api_meta_template,
        'query_per_second': 128,
        'max_out_len': 1024,
        'max_seq_len': 4096,
        'temperature': 0.01,
        'batch_size': 128,
        'retry': 20,
    },
]
|
58
.github/scripts/eval_regression_base.py
vendored
58
.github/scripts/eval_regression_base.py
vendored
@ -2,15 +2,21 @@ from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
# choose a list of datasets
|
||||
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
|
||||
gpqa_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
|
||||
gsm8k_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.race.race_ppl import \
|
||||
race_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
|
||||
winogrande_datasets # noqa: F401, E501
|
||||
# read hf models - chat models
|
||||
from opencompass.configs.models.chatglm.hf_glm4_9b import \
|
||||
models as hf_glm4_9b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
|
||||
models as hf_deepseek_moe_16b_base_model # noqa: F401, E501
|
||||
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
|
||||
models as hf_deepseek_v2_lite_model # noqa: F401, E501
|
||||
# read hf models - chat models
|
||||
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
|
||||
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
|
||||
from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \
|
||||
@ -19,34 +25,58 @@ with read_base():
|
||||
models as hf_gemma2_2b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma2_9b import \
|
||||
models as hf_gemma2_9b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma_2b import \
|
||||
models as hf_gemma_2b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma_7b import \
|
||||
models as hf_gemma_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.vllm_gemma_2b import \
|
||||
models as vllm_gemma_2b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.vllm_gemma_7b import \
|
||||
models as vllm_gemma_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
|
||||
models as hf_internlm2_5_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
|
||||
models as hf_internlm2_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
|
||||
models as hf_internlm2_20b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \
|
||||
models as hf_internlm2_base_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.hf_internlm2_base_20b import \
|
||||
models as hf_internlm2_base_20b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
|
||||
models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
|
||||
models as lmdeploy_internlm2_5_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_7b import \
|
||||
models as lmdeploy_internlm2_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_20b import \
|
||||
models as lmdeploy_internlm2_20b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
|
||||
models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.hf_llama2_7b import \
|
||||
models as hf_llama2_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \
|
||||
models as hf_llama3_1_8b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.hf_llama3_8b import \
|
||||
models as hf_llama3_8b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
|
||||
models as lmdeploy_llama3_1_8b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
|
||||
models as lmdeploy_llama3_8b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \
|
||||
models as hf_mistral_7b_v0_2_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \
|
||||
models as hf_mistral_7b_v0_3_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \
|
||||
models as vllm_mistral_7b_v0_2_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.vllm_mixtral_8x7b_v0_1 import \
|
||||
models as vllm_mixtral_8x7b_v0_1_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen2_5.hf_qwen_2_5_7b import \
|
||||
models as hf_qwen_2_5_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen2_5.hf_qwen_2_5_14b import \
|
||||
models as hf_qwen_2_5_14b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \
|
||||
models as lmdeploy_qwen2_5_1_5b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import \
|
||||
models as lmdeploy_qwen2_5_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
|
||||
models as hf_qwen1_5_moe_a2_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.hf_qwen2_0_5b import \
|
||||
@ -65,11 +95,27 @@ with read_base():
|
||||
models as hf_yi_1_5_6b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.yi.hf_yi_1_5_9b import \
|
||||
models as hf_yi_1_5_9b_model # noqa: F401, E501
|
||||
from opencompass.configs.summarizers.medium import \
|
||||
summarizer # noqa: F401, E501
|
||||
|
||||
race_datasets = [race_datasets[1]]
|
||||
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
||||
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
|
||||
|
||||
for d in datasets:
|
||||
d['reader_cfg']['test_range'] = '[0:100]'
|
||||
d['reader_cfg']['test_range'] = '[0:32]'
|
||||
|
||||
for m in models:
|
||||
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
|
||||
m['engine_config']['max_batch_size'] = 1
|
||||
m['batch_size'] = 1
|
||||
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
|
||||
|
||||
summarizer = dict(
|
||||
dataset_abbrs=[
|
||||
['gsm8k', 'accuracy'],
|
||||
['GPQA_diamond', 'accuracy'],
|
||||
['race-high', 'accuracy'],
|
||||
['winogrande', 'accuracy'],
|
||||
],
|
||||
summary_groups=sum(
|
||||
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
|
||||
)
|
||||
|
184
.github/scripts/eval_regression_base_fullbench.py
vendored
Normal file
184
.github/scripts/eval_regression_base_fullbench.py
vendored
Normal file
@ -0,0 +1,184 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from opencompass.configs.datasets.ARC_c.ARC_c_few_shot_ppl import \
|
||||
ARC_c_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import \
|
||||
bbh_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
|
||||
cmmlu_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.drop.drop_gen_a2697c import \
|
||||
drop_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \
|
||||
GaokaoBench_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import \
|
||||
gpqa_datasets # noqa: F401, E501
|
||||
# Corebench v1.7
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
|
||||
gsm8k_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \
|
||||
hellaswag_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.humaneval.internal_humaneval_gen_ce6b06 import \
|
||||
humaneval_datasets as humaneval_v2_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.humaneval.internal_humaneval_gen_d2537e import \
|
||||
humaneval_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import \
|
||||
math_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \
|
||||
mathbench_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import \
|
||||
sanitized_mbpp_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import \
|
||||
mmlu_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \
|
||||
mmlu_pro_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.nq.nq_open_1shot_gen_20a989 import \
|
||||
nq_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.race.race_few_shot_ppl import \
|
||||
race_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_few_shot_ppl import \
|
||||
BoolQ_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
|
||||
TheoremQA_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import \
|
||||
triviaqa_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.wikibench.wikibench_few_shot_ppl_c23d79 import \
|
||||
wikibench_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
|
||||
winogrande_datasets # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
|
||||
models as hf_internlm2_5_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
|
||||
models as lmdeploy_internlm2_5_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.summarizers.groups.bbh import \
|
||||
bbh_summary_groups # noqa: F401, E501
|
||||
# Summary Groups
|
||||
from opencompass.configs.summarizers.groups.cmmlu import \
|
||||
cmmlu_summary_groups # noqa: F401, E501
|
||||
from opencompass.configs.summarizers.groups.GaokaoBench import \
|
||||
GaokaoBench_summary_groups # noqa: F401, E501
|
||||
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
|
||||
mathbench_2024_summary_groups # noqa: F401, E501
|
||||
from opencompass.configs.summarizers.groups.mmlu import \
|
||||
mmlu_summary_groups # noqa: F401, E501
|
||||
from opencompass.configs.summarizers.groups.mmlu_pro import \
|
||||
mmlu_pro_summary_groups # noqa: F401, E501
|
||||
|
||||
# Trim each imported benchmark to a small subset before aggregating.
# NOTE: only comprehension-local names are introduced here; new module-level
# names would presumably end up as extra config keys (verify against mmengine).
race_datasets = [race_datasets[1]]  # Only take RACE-High
humaneval_v2_datasets[0]['abbr'] = 'openai_humaneval_v2'

# BBH: keep the two tasks whose names appear in the entry's abbr.
bbh_datasets = [
    cfg for cfg in bbh_datasets
    if any(task in cfg['abbr'] for task in (
        'logical_deduction_seven_objects',
        'multistep_arithmetic_two',
    ))
]

# CMMLU: keep entries whose abbr, with 'cmmlu-' removed, is listed below.
cmmlu_datasets = [
    cfg for cfg in cmmlu_datasets
    if cfg['abbr'].replace('cmmlu-', '') in (
        'ancient_chinese',
        'chinese_civil_service_exam',
        'chinese_driving_rule',
        'chinese_food_culture',
        'chinese_foreign_policy',
        'chinese_history',
        'chinese_literature',
        'chinese_teacher_qualification',
        'construction_project_management',
        'elementary_chinese',
        'elementary_commonsense',
        'ethnology',
        'high_school_politics',
        'modern_chinese',
        'traditional_chinese_medicine',
    )
]

# MMLU: keep entries whose abbr, with 'lukaemon_mmlu_' removed, is listed.
mmlu_datasets = [
    cfg for cfg in mmlu_datasets
    if cfg['abbr'].replace('lukaemon_mmlu_', '') in (
        'business_ethics',
        'clinical_knowledge',
        'college_medicine',
        'global_facts',
        'human_aging',
        'management',
        'marketing',
        'medical_genetics',
        'miscellaneous',
        'nutrition',
        'professional_accounting',
        'professional_medicine',
        'virology',
    )
]

# MMLU-Pro: first entry only.  MathBench: entries with 'college' in the abbr.
mmlu_pro_datasets = [mmlu_pro_datasets[0]]
mathbench_datasets = [
    cfg for cfg in mathbench_datasets if 'college' in cfg['abbr']
]

# GaokaoBench: keep the two Math II subsets named below.
GaokaoBench_datasets = [
    cfg for cfg in GaokaoBench_datasets
    if any(subset in cfg['abbr'] for subset in (
        '2010-2022_Math_II_MCQs',
        '2010-2022_Math_II_Fill-in-the-Blank',
    ))
]

# Concatenate every ``*_datasets`` list currently bound in this namespace.
datasets = sum(
    [entries for name, entries in locals().items()
     if name.endswith('_datasets')],
    [],
)
|
||||
|
||||
# Report layout: ``dataset_abbrs`` lists the rows to show, either as a bare
# abbr or as an ``[abbr, metric]`` pair; the '######' strings and empty strings
# are plain entries of the same list.  ``summary_groups`` concatenates every
# ``*_summary_groups`` list bound in this namespace.
summarizer = {
    'dataset_abbrs': [
        ['race-high', 'accuracy'],
        ['ARC-c', 'accuracy'],
        ['BoolQ', 'accuracy'],
        ['mmlu_pro', 'naive_average'],
        ['GPQA_diamond', 'accuracy'],
        ['cmmlu', 'naive_average'],
        ['mmlu', 'naive_average'],
        ['drop', 'accuracy'],
        ['bbh', 'naive_average'],
        ['math', 'accuracy'],
        ['openai_humaneval', 'humaneval_pass@1'],
        ['openai_humaneval_v2', 'humaneval_pass@1'],
        ['sanitized_mbpp', 'score'],
        ['wikibench-wiki-single_choice_cncircular', 'perf_4'],
        ['gsm8k', 'accuracy'],
        ['GaokaoBench', 'weighted_average'],
        ['triviaqa_wiki_1shot', 'score'],
        ['nq_open_1shot', 'score'],
        ['winogrande', 'accuracy'],
        ['hellaswag', 'accuracy'],
        ['TheoremQA', 'score'],
        '###### MathBench-A: Application Part ######',
        'college',
        'high',
        'middle',
        'primary',
        'arithmetic',
        'mathbench-a (average)',
        '###### MathBench-T: Theory Part ######',
        'college_knowledge',
        'high_knowledge',
        'middle_knowledge',
        'primary_knowledge',
        'mathbench-t (average)',
        '###### Overall: Average between MathBench-A and MathBench-T ######',
        'Overall',
        '',
        'bbh-logical_deduction_seven_objects',
        'bbh-multistep_arithmetic_two',
        '',
        'mmlu',
        'mmlu-stem',
        'mmlu-social-science',
        'mmlu-humanities',
        ['mmlu-other', 'accuracy'],
        'cmmlu',
        'cmmlu-stem',
        'cmmlu-social-science',
        'cmmlu-humanities',
        'cmmlu-other',
        ['cmmlu-china-specific', 'accuracy'],
        'mmlu_pro',
        'mmlu_pro_biology',
        'mmlu_pro_business',
        'mmlu_pro_chemistry',
        'mmlu_pro_computer_science',
        'mmlu_pro_economics',
        'mmlu_pro_engineering',
        'mmlu_pro_health',
        'mmlu_pro_history',
        'mmlu_pro_law',
        'mmlu_pro_math',
        'mmlu_pro_philosophy',
        'mmlu_pro_physics',
        'mmlu_pro_psychology',
        'mmlu_pro_other',
    ],
    'summary_groups': sum(
        [groups for name, groups in locals().items()
         if name.endswith('_summary_groups')],
        [],
    ),
}
|
||||
|
||||
# Concatenate every ``*_model`` list and every ``*_datasets`` list bound in
# this namespace.  (``datasets`` was already aggregated the same way before
# the summarizer; it is rebuilt here exactly as in the original script.)
models = sum(
    [cfgs for name, cfgs in locals().items() if name.endswith('_model')],
    [],
)
datasets = sum(
    [cfgs for name, cfgs in locals().items() if name.endswith('_datasets')],
    [],
)

# Set every dataset reader's test range to '[0:16]'.
for d in datasets:
    d['reader_cfg']['test_range'] = '[0:16]'

# Tag each model abbr with '_fullbench'; for abbrs containing 'turbomind' or
# 'lmdeploy', force both the engine max batch size and the batch size to 1.
for m in models:
    m['abbr'] += '_fullbench'
    if any(tag in m['abbr'] for tag in ('turbomind', 'lmdeploy')):
        m['engine_config']['max_batch_size'] = 1
        m['batch_size'] = 1

# Order models by their configured GPU count, ascending.
models = sorted(models, key=lambda cfg: cfg['run_cfg']['num_gpus'])
|
78
.github/scripts/eval_regression_chat.py
vendored
78
.github/scripts/eval_regression_chat.py
vendored
@ -1,7 +1,5 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
from opencompass.models import OpenAISDK
|
||||
|
||||
with read_base():
|
||||
# choose a list of datasets
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
|
||||
@ -29,6 +27,12 @@ with read_base():
|
||||
models as hf_gemma2_2b_it_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma2_9b_it import \
|
||||
models as hf_gemma2_9b_it_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma_2b_it import \
|
||||
models as hf_gemma_2b_it_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma_7b_it import \
|
||||
models as hf_gemma_7b_it_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \
|
||||
models as lmdeploy_gemma_9b_it_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.vllm_gemma_7b_it import \
|
||||
models as vllm_gemma_7b_it_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
|
||||
@ -51,18 +55,35 @@ with read_base():
|
||||
models as vllm_internlm2_chat_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \
|
||||
models as hf_llama3_1_8b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.hf_llama3_2_3b_instruct import \
|
||||
models as hf_llama3_2_3b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
|
||||
models as hf_llama3_8b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
|
||||
models as lmdeploy_llama3_1_8b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \
|
||||
models as lmdeploy_llama3_2_3b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
|
||||
models as lmdeploy_llama3_8b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
|
||||
models as hf_mistral_7b_instruct_v0_2_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_3 import \
|
||||
models as hf_mistral_7b_instruct_v0_3_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.hf_mistral_nemo_instruct_2407 import \
|
||||
models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
|
||||
models as hf_mistral_small_instruct_2409_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.lmdeploy_mistral_nemo_instruct_2407 import \
|
||||
models as lmdeploy_mistral_nemo_instruct_2407_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
|
||||
models as \
|
||||
lmdeploy_mistral_small_instruct_2409_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \
|
||||
models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
|
||||
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.vllm_mixtral_8x7b_instruct_v0_1 import \
|
||||
models as vllm_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501
|
||||
from opencompass.configs.models.openbmb.hf_minicpm3_4b import \
|
||||
models as hf_minicpm3_4b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
|
||||
models as hf_minicpm_2b_dpo_fp32_model # noqa: F401, E501
|
||||
from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
|
||||
@ -73,6 +94,10 @@ with read_base():
|
||||
models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \
|
||||
models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen2_5.hf_qwen2_5_14b_instruct import \
|
||||
models as hf_qwen2_5_14b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \
|
||||
models as lmdeploy_qwen2_5_14b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \
|
||||
models as hf_qwen1_5_0_5b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
|
||||
@ -89,10 +114,8 @@ with read_base():
|
||||
models as hf_yi_1_5_6b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.yi.hf_yi_1_5_9b_chat import \
|
||||
models as hf_yi_1_5_9b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.summarizers.medium import \
|
||||
summarizer # noqa: F401, E501
|
||||
|
||||
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
||||
race_datasets = [race_datasets[1]]
|
||||
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
|
||||
|
||||
api_meta_template = dict(
|
||||
@ -103,25 +126,24 @@ api_meta_template = dict(
|
||||
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
|
||||
)
|
||||
|
||||
model_name = ''
|
||||
|
||||
models.append(
|
||||
dict(
|
||||
abbr='lmdeploy-api-test',
|
||||
type=OpenAISDK,
|
||||
key='EMPTY',
|
||||
openai_api_base='http://judgemodel:10001/v1',
|
||||
path='compass_judger_internlm2_102b_0508',
|
||||
tokenizer_path='internlm/internlm2_5-20b-chat',
|
||||
rpm_verbose=True,
|
||||
meta_template=api_meta_template,
|
||||
query_per_second=50,
|
||||
max_out_len=1024,
|
||||
max_seq_len=4096,
|
||||
temperature=0.01,
|
||||
batch_size=128,
|
||||
retry=3,
|
||||
))
|
||||
|
||||
for d in datasets:
|
||||
d['reader_cfg']['test_range'] = '[0:100]'
|
||||
d['reader_cfg']['test_range'] = '[0:32]'
|
||||
|
||||
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
||||
|
||||
for m in models:
|
||||
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
|
||||
m['engine_config']['max_batch_size'] = 1
|
||||
m['batch_size'] = 1
|
||||
|
||||
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
|
||||
|
||||
summarizer = dict(
|
||||
dataset_abbrs=[
|
||||
'gsm8k',
|
||||
'race-middle',
|
||||
'race-high',
|
||||
],
|
||||
summary_groups=sum(
|
||||
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
|
||||
)
|
||||
|
246
.github/scripts/eval_regression_chat_objective_fullbench.py
vendored
Normal file
246
.github/scripts/eval_regression_chat_objective_fullbench.py
vendored
Normal file
@ -0,0 +1,246 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
# read hf models - chat models
|
||||
# Dataset
|
||||
from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \
|
||||
ARC_c_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \
|
||||
bbh_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
|
||||
cmmlu_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \
|
||||
drop_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import \
|
||||
ds1000_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \
|
||||
GaokaoBench_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
|
||||
gpqa_datasets # noqa: F401, E501
|
||||
# new datasets in Fullbench v1.1
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
|
||||
gsm8k_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
|
||||
hellaswag_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import \
|
||||
humaneval_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import \
|
||||
humanevalx_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \
|
||||
ifeval_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.math.math_0shot_gen_393424 import \
|
||||
math_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
|
||||
mathbench_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \
|
||||
sanitized_mbpp_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import \
|
||||
mmlu_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
|
||||
mmlu_pro_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \
|
||||
nq_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.race.race_cot_gen_d95929 import \
|
||||
race_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.scicode.scicode_gen_085b98 import \
|
||||
SciCode_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_cot_gen_1d56df import \
|
||||
BoolQ_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import \
|
||||
teval_datasets as teval_en_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import \
|
||||
teval_datasets as teval_zh_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
|
||||
TheoremQA_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \
|
||||
triviaqa_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.wikibench.wikibench_gen_0978ad import \
|
||||
wikibench_datasets # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
|
||||
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
|
||||
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
|
||||
# Summary Groups
|
||||
from opencompass.configs.summarizers.groups.bbh import \
|
||||
bbh_summary_groups # noqa: F401, E501
|
||||
from opencompass.configs.summarizers.groups.cmmlu import \
|
||||
cmmlu_summary_groups # noqa: F401, E501
|
||||
from opencompass.configs.summarizers.groups.ds1000 import \
|
||||
ds1000_summary_groups # noqa: F401, E501
|
||||
from opencompass.configs.summarizers.groups.GaokaoBench import \
|
||||
GaokaoBench_summary_groups # noqa: F401, E501
|
||||
from opencompass.configs.summarizers.groups.humanevalx import \
|
||||
humanevalx_summary_groups # noqa: F401, E501
|
||||
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
|
||||
mathbench_2024_summary_groups # noqa: F401, E501
|
||||
from opencompass.configs.summarizers.groups.mmlu import \
|
||||
mmlu_summary_groups # noqa: F401, E501
|
||||
from opencompass.configs.summarizers.groups.mmlu_pro import \
|
||||
mmlu_pro_summary_groups # noqa: F401, E501
|
||||
from opencompass.configs.summarizers.groups.scicode import \
|
||||
scicode_summary_groups # noqa: F401, E501
|
||||
from opencompass.configs.summarizers.groups.teval import \
|
||||
teval_summary_groups # noqa: F401, E501
|
||||
|
||||
# Keep only the second entry of the imported RACE dataset list
# (presumably the 'race-high' split reported by the summarizer - confirm).
race_datasets = [race_datasets[1]]

# For HumanEval-X Evaluation
# Apply the evaluator ip_address and port: every HumanEval-X dataset is
# pointed at the hosted code-evaluation service and the port is left empty.
for item in humanevalx_datasets:
    item['eval_cfg']['evaluator'][
        'ip_address'] = 'codeeval.opencompass.org.cn/humanevalx'
    item['eval_cfg']['evaluator']['port'] = ''

# For DS-1000 Evaluation
# Apply the evaluator ip_address and port: same override as above, using the
# DS-1000 path of the hosted service.
for item in ds1000_datasets:
    item['eval_cfg']['evaluator'][
        'ip_address'] = 'codeeval.opencompass.org.cn/ds1000'
    item['eval_cfg']['evaluator']['port'] = ''
|
||||
|
||||
# Trim the large benchmark suites down to a few sub-tasks each.

# BBH: keep the two sub-tasks whose abbr contains one of these names.
bbh_datasets = [
    cfg for cfg in bbh_datasets
    if any(task in cfg['abbr'] for task in (
        'logical_deduction_seven_objects',
        'multistep_arithmetic_two',
    ))
]

# CMMLU: keep the subjects listed below (abbr with the 'cmmlu-' prefix
# stripped).
cmmlu_datasets = [
    cfg for cfg in cmmlu_datasets
    if cfg['abbr'].replace('cmmlu-', '') in (
        'ancient_chinese',
        'chinese_civil_service_exam',
        'chinese_driving_rule',
        'chinese_food_culture',
        'chinese_foreign_policy',
        'chinese_history',
        'chinese_literature',
        'chinese_teacher_qualification',
        'construction_project_management',
        'elementary_chinese',
        'elementary_commonsense',
        'ethnology',
        'high_school_politics',
        'modern_chinese',
        'traditional_chinese_medicine',
    )
]

# MMLU: keep the subjects listed below (abbr with the 'lukaemon_mmlu_'
# prefix stripped).
mmlu_datasets = [
    cfg for cfg in mmlu_datasets
    if cfg['abbr'].replace('lukaemon_mmlu_', '') in (
        'business_ethics',
        'clinical_knowledge',
        'college_medicine',
        'global_facts',
        'human_aging',
        'management',
        'marketing',
        'medical_genetics',
        'miscellaneous',
        'nutrition',
        'professional_accounting',
        'professional_medicine',
        'virology',
    )
]

# MMLU-Pro: only the first configured dataset is evaluated.
mmlu_pro_datasets = [mmlu_pro_datasets[0]]

# MathBench: only entries whose abbr mentions 'college'.
mathbench_datasets = [
    cfg for cfg in mathbench_datasets if 'college' in cfg['abbr']
]

# GaokaoBench: only the two Math II papers.
GaokaoBench_datasets = [
    cfg for cfg in GaokaoBench_datasets
    if any(paper in cfg['abbr'] for paper in (
        '2010-2022_Math_II_MCQs',
        '2010-2022_Math_II_Fill-in-the-Blank',
    ))
]
|
||||
|
||||
# Concatenate every module-level list whose name ends with '_datasets',
# skipping names that contain 'scicode' (case-insensitive) or 'teval'.
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')
     and 'scicode' not in k.lower() and 'teval' not in k),
    [],
)
# The two T-Eval lists were excluded by the name filter above and are
# appended explicitly here, so they end up last in `datasets`.
datasets += teval_en_datasets
datasets += teval_zh_datasets
# SciCode stays excluded from the run; the line below is left disabled.
# datasets += SciCode_datasets
|
||||
|
||||
# Summary table layout.
# `dataset_abbrs` lists the rows to report: a [abbr, metric] pair selects one
# metric of a dataset or summary group, a plain string selects the row by
# abbr, and an empty string is used as a separator between sections.
# `summary_groups` concatenates every module-level `*_summary_groups` list.
summarizer = dict(
    dataset_abbrs=[
        ['race-high', 'accuracy'],
        ['ARC-c', 'accuracy'],
        ['BoolQ', 'accuracy'],
        ['mmlu_pro', 'naive_average'],
        ['drop', 'accuracy'],
        ['bbh', 'naive_average'],
        ['GPQA_diamond', 'accuracy'],
        ['math', 'accuracy'],
        ['wikibench-wiki-single_choice_cncircular', 'perf_4'],
        ['openai_humaneval', 'humaneval_pass@1'],
        ['sanitized_mbpp', 'score'],
        ['cmmlu', 'naive_average'],
        ['mmlu', 'naive_average'],
        ['teval', 'naive_average'],
        ['SciCode', 'accuracy'],
        ['SciCode', 'sub_accuracy'],
        ['humanevalx', 'naive_average'],
        ['ds1000', 'naive_average'],
        ['IFEval', 'Prompt-level-strict-accuracy'],
        ['gsm8k', 'accuracy'],
        ['GaokaoBench', 'weighted_average'],
        ['triviaqa_wiki_1shot', 'score'],
        ['nq_open_1shot', 'score'],
        ['hellaswag', 'accuracy'],
        ['TheoremQA', 'score'],
        '###### MathBench-A: Application Part ######',
        'college',
        'high',
        'middle',
        'primary',
        'arithmetic',
        'mathbench-a (average)',
        '###### MathBench-T: Theory Part ######',
        'college_knowledge',
        'high_knowledge',
        'middle_knowledge',
        'primary_knowledge',
        'mathbench-t (average)',
        '###### Overall: Average between MathBench-A and MathBench-T ######',
        'Overall',
        '',
        'bbh-logical_deduction_seven_objects',
        'bbh-multistep_arithmetic_two',
        # The trailing comma matters: without it this separator is
        # implicitly concatenated with the following 'mmlu' literal and
        # the blank row disappears from the table.
        '',
        'mmlu',
        'mmlu-stem',
        'mmlu-social-science',
        'mmlu-humanities',
        'mmlu-other',
        '',
        'cmmlu',
        'cmmlu-stem',
        'cmmlu-social-science',
        'cmmlu-humanities',
        'cmmlu-other',
        'cmmlu-china-specific',
        '',
        'mmlu_pro',
        'mmlu_pro_biology',
        'mmlu_pro_business',
        'mmlu_pro_chemistry',
        'mmlu_pro_computer_science',
        'mmlu_pro_economics',
        'mmlu_pro_engineering',
        'mmlu_pro_health',
        'mmlu_pro_history',
        'mmlu_pro_law',
        'mmlu_pro_math',
        'mmlu_pro_philosophy',
        'mmlu_pro_physics',
        'mmlu_pro_psychology',
        'mmlu_pro_other',
        '',
        'GaokaoBench_2010-2022_Math_II_MCQs',
        'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank',
        '',
        'humanevalx-python',
        'humanevalx-cpp',
        'humanevalx-go',
        'humanevalx-java',
        'humanevalx-js',
        '',
        'ds1000_Pandas',
        'ds1000_Numpy',
        'ds1000_Tensorflow',
        'ds1000_Scipy',
        'ds1000_Sklearn',
        'ds1000_Pytorch',
        'ds1000_Matplotlib',
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
|
||||
|
||||
# Restrict every dataset reader to the range '[0:16]' so the regression run
# stays small (presumably the first 16 samples of each dataset - confirm).
for d in datasets:
    d['reader_cfg']['test_range'] = '[0:16]'

# Concatenate every module-level list whose name ends with '_model'.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for m in models:
    # Tag the abbr so results are keyed as '<model>_fullbench'.
    m['abbr'] = m['abbr'] + '_fullbench'
    # For turbomind / lmdeploy backed models, force a batch size of 1 both in
    # the engine config and in the model config.
    if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
        m['engine_config']['max_batch_size'] = 1
        m['batch_size'] = 1

# Order the models by the number of GPUs they request, smallest first.
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
|
70
.github/scripts/eval_regression_chat_subjective_fullbench.py
vendored
Normal file
70
.github/scripts/eval_regression_chat_subjective_fullbench.py
vendored
Normal file
@ -0,0 +1,70 @@
|
||||
from copy import deepcopy
|
||||
|
||||
from mmengine.config import read_base
|
||||
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.runners import LocalRunner
|
||||
from opencompass.summarizers import SubjectiveSummarizer
|
||||
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
||||
|
||||
with read_base():
|
||||
# read hf models - chat models
|
||||
# Dataset
|
||||
from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \
|
||||
alignbench_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \
|
||||
alpacav2_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \
|
||||
arenahard_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \
|
||||
compassarena_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \
|
||||
fofo_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.subjective.followbench.followbench_llmeval import \
|
||||
followbench_llmeval_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \
|
||||
mtbench101_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \
|
||||
wildbench_datasets # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
|
||||
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
|
||||
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
|
||||
|
||||
# Use the subjective summarizer in its 'subjective' mode.
summarizer = dict(type=SubjectiveSummarizer, function='subjective')

# Concatenate every module-level '*_datasets' list except the MTBench101 and
# WildBench ones, which are appended explicitly afterwards and therefore come
# last in `datasets`.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')
                and 'mtbench101' not in k and 'wildbench' not in k), [])
datasets += mtbench101_datasets  # noqa: F401, E501
datasets += wildbench_datasets  # noqa: F401, E501
|
||||
|
||||
# Chat meta template: one HUMAN turn followed by a generated BOT turn, with
# SYSTEM available as a reserved role.
api_meta_template = {
    'round': [
        {'role': 'HUMAN', 'api_role': 'HUMAN'},
        {'role': 'BOT', 'api_role': 'BOT', 'generate': True},
    ],
    'reserved_roles': [
        {'role': 'SYSTEM', 'api_role': 'SYSTEM'},
    ],
}
|
||||
|
||||
# Concatenate every module-level list whose name ends with '_model'.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for m in models:
    # Tag the abbr so results are keyed as '<model>_fullbench'.
    m['abbr'] = m['abbr'] + '_fullbench'
    # For turbomind / lmdeploy backed models, force a batch size of 1 both in
    # the engine config and in the model config.
    if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
        m['engine_config']['max_batch_size'] = 1
        m['batch_size'] = 1

# Order the models by the number of GPUs they request, smallest first.
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])

# The judge is an independent deep copy of the second model in the sorted
# list (presumably the lmdeploy variant - confirm against the imported
# configs); its abbr gets a '-judge' suffix so it is distinguishable from
# the evaluated copy.
judge_models = deepcopy([models[1]])
judge_models[0]['abbr'] = judge_models[0]['abbr'] + '-judge'
|
||||
|
||||
# Evaluation stage: partition the work per (model, judge model) with the
# subjective naive partitioner and run the subjective eval tasks locally
# with at most 16 workers.
eval = {
    'partitioner': {
        'type': SubjectiveNaivePartitioner,
        'models': models,
        'judge_models': judge_models,
    },
    'runner': {
        'type': LocalRunner,
        'max_num_workers': 16,
        'task': {'type': SubjectiveEvalTask},
    },
}
|
314
.github/scripts/oc_score_assert.py
vendored
314
.github/scripts/oc_score_assert.py
vendored
@ -7,36 +7,56 @@ import yaml
|
||||
output_path = 'regression_result_daily'
|
||||
|
||||
chat_model_list = [
|
||||
'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
|
||||
'deepseek-v2-lite-chat-hf', 'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf',
|
||||
'gemma2-9b-it-hf', 'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf',
|
||||
'internlm2_5-20b-chat-hf', 'internlm2_5-7b-chat-turbomind',
|
||||
'internlm2_5-20b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
|
||||
'baichuan2-7b-chat-hf', 'glm-4-9b-chat-hf', 'glm-4-9b-chat-turbomind',
|
||||
'glm-4-9b-chat-vllm', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
|
||||
'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf', 'gemma2-9b-it-hf',
|
||||
'gemma-2b-it-hf', 'gemma-7b-it-hf', 'gemma-2-9b-it-turbomind',
|
||||
'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf',
|
||||
'internlm2_5-7b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
|
||||
'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-lmdeploy',
|
||||
'internlm2-chat-7b-sft-turbomind', 'internlm2-chat-7b-vllm',
|
||||
'llama-3_1-8b-instruct-hf', 'llama-3-8b-instruct-hf',
|
||||
'llama-3_1-8b-instruct-turbomind', 'llama-3-8b-instruct-turbomind',
|
||||
'mistral-7b-instruct-v0.3-hf', 'mistral-7b-instruct-v0.2-vllm',
|
||||
'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
|
||||
'llama-3_1-8b-instruct-hf', 'llama-3_2-3b-instruct-hf',
|
||||
'llama-3-8b-instruct-hf', 'llama-3_1-8b-instruct-turbomind',
|
||||
'llama-3_2-3b-instruct-turbomind', 'llama-3-8b-instruct-turbomind',
|
||||
'mistral-7b-instruct-v0.2-hf', 'mistral-7b-instruct-v0.3-hf',
|
||||
'mistral-nemo-instruct-2407-hf', 'mistral-nemo-instruct-2407-turbomind',
|
||||
'mistral-7b-instruct-v0.1-vllm', 'mistral-7b-instruct-v0.2-vllm',
|
||||
'MiniCPM3-4B-hf', 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
|
||||
'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf',
|
||||
'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-hf', 'qwen2-7b-instruct-hf',
|
||||
'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind',
|
||||
'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf',
|
||||
'lmdeploy-api-test'
|
||||
'deepseek-v2-lite-chat-hf', 'internlm2_5-20b-chat-hf',
|
||||
'internlm2_5-20b-chat-turbomind', 'mistral-small-instruct-2409-hf',
|
||||
'mistral-small-instruct-2409-turbomind', 'qwen2.5-14b-instruct-hf',
|
||||
'qwen2.5-14b-instruct-turbomind'
|
||||
]
|
||||
base_model_list = [
|
||||
'deepseek-moe-16b-base-hf', 'deepseek-v2-lite-hf',
|
||||
'deepseek-7b-base-turbomind', 'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf',
|
||||
'gemma2-9b-hf', 'internlm2_5-7b-hf', 'internlm2-7b-hf',
|
||||
'internlm2-base-7b-hf', 'internlm2-1.8b-turbomind',
|
||||
'internlm2_5-7b-turbomind', 'internlm2-7b-turbomind',
|
||||
'internlm2-base-7b-turbomind', 'llama-2-7b-hf', 'llama-3-8b-hf',
|
||||
'llama-3.1-8b-turbomind', 'llama-3-8b-turbomind', 'mistral-7b-v0.3-hf',
|
||||
'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
|
||||
'glm-4-9b-hf', 'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind',
|
||||
'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf', 'gemma2-9b-hf',
|
||||
'gemma-2b-hf', 'gemma-7b-hf', 'gemma-2b-vllm', 'gemma-7b-vllm',
|
||||
'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf',
|
||||
'internlm2-1.8b-turbomind', 'internlm2_5-7b-turbomind',
|
||||
'internlm2-7b-turbomind', 'internlm2-base-7b-turbomind', 'llama-2-7b-hf',
|
||||
'llama-3_1-8b-hf', 'llama-3-8b-hf', 'llama-3.1-8b-turbomind',
|
||||
'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'mistral-7b-v0.3-hf',
|
||||
'mistral-7b-v0.2-vllm', 'qwen2.5-7b-hf', 'qwen2.5-1.5b-turbomind',
|
||||
'qwen2.5-7b-turbomind', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
|
||||
'qwen2-1.5b-hf', 'qwen2-7b-hf', 'qwen2-1.5b-turbomind',
|
||||
'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
|
||||
'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf',
|
||||
'deepseek-v2-lite-hf', 'internlm2-20b-hf', 'internlm2-base-20b-hf',
|
||||
'internlm2-20b-turbomind', 'qwen2.5-14b-hf'
|
||||
]
|
||||
dataset_list = ['gsm8k', 'race-middle', 'race-high']
|
||||
|
||||
|
||||
@pytest.fixture()
def baseline_scores_testrange(request):
    """Return the parsed test-range baseline scores.

    The baseline is read from
    ``.github/scripts/oc_score_baseline_testrange.yaml`` below the pytest
    root directory and parsed with the safe YAML loader.
    """
    baseline_file = os.path.join(
        request.config.rootdir,
        '.github/scripts/oc_score_baseline_testrange.yaml')
    with open(baseline_file) as stream:
        return yaml.safe_load(stream)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
@ -48,6 +68,16 @@ def baseline_scores(request):
|
||||
return config
|
||||
|
||||
|
||||
@pytest.fixture()
def baseline_scores_fullbench(request):
    """Return the parsed fullbench baseline scores.

    The baseline is read from
    ``.github/scripts/oc_score_baseline_fullbench.yaml`` below the pytest
    root directory and parsed with the safe YAML loader.
    """
    config_path = os.path.join(
        request.config.rootdir,
        '.github/scripts/oc_score_baseline_fullbench.yaml')
    # The fullbench baseline contains non-ASCII keys (e.g. 'Alignbench总分'),
    # so decode it explicitly as UTF-8 instead of relying on the locale.
    with open(config_path, encoding='utf-8') as f:
        config = yaml.load(f.read(), Loader=yaml.SafeLoader)
    return config
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def result_scores():
|
||||
file = find_csv_files(output_path)
|
||||
@ -57,100 +87,228 @@ def result_scores():
|
||||
|
||||
|
||||
@pytest.mark.usefixtures('result_scores')
|
||||
@pytest.mark.usefixtures('baseline_scores')
|
||||
@pytest.mark.usefixtures('baseline_scores_testrange')
|
||||
@pytest.mark.chat
|
||||
class TestChat:
|
||||
"""Test cases for chat model."""
|
||||
|
||||
@pytest.mark.parametrize('model, dataset', [(p1, p2)
|
||||
for p1 in chat_model_list
|
||||
for p2 in dataset_list])
|
||||
def test_model_dataset_score(self, baseline_scores, result_scores, model,
|
||||
dataset):
|
||||
base_score = baseline_scores.get(model).get(dataset)
|
||||
@pytest.mark.parametrize('model, dataset',
|
||||
[(p1, p2) for p1 in chat_model_list
|
||||
for p2 in ['gsm8k', 'race-high']])
|
||||
def test_model_dataset_score(self, baseline_scores_testrange,
|
||||
result_scores, model, dataset):
|
||||
base_score = baseline_scores_testrange.get(model).get(dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(result_score, base_score)
|
||||
assert_score(model, result_score, base_score)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures('result_scores')
|
||||
@pytest.mark.usefixtures('baseline_scores')
|
||||
@pytest.mark.usefixtures('baseline_scores_testrange')
|
||||
@pytest.mark.base
|
||||
class TestBase:
|
||||
"""Test cases for base model."""
|
||||
|
||||
@pytest.mark.parametrize('model, dataset', [(p1, p2)
|
||||
for p1 in base_model_list
|
||||
for p2 in dataset_list])
|
||||
def test_model_dataset_score(self, baseline_scores, result_scores, model,
|
||||
dataset):
|
||||
if model == 'mistral-7b-v0.2-vllm' and dataset == 'race-high':
|
||||
@pytest.mark.parametrize(
|
||||
'model, dataset',
|
||||
[(p1, p2) for p1 in base_model_list
|
||||
for p2 in ['gsm8k', 'GPQA_diamond', 'race-high', 'winogrande']])
|
||||
def test_model_dataset_score(self, baseline_scores_testrange,
|
||||
result_scores, model, dataset):
|
||||
if model in ['gemma-2b-vllm', 'gemma-7b-vllm'] and dataset != 'gsm8k':
|
||||
return
|
||||
base_score = baseline_scores.get(model).get(dataset)
|
||||
base_score = baseline_scores_testrange.get(model).get(dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(result_score, base_score)
|
||||
assert_score(model, result_score, base_score)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.chat_obj_fullbench
class TestChatObjFullbench:
    """Objective fullbench score checks for the chat models."""

    @pytest.mark.parametrize(
        'model, dataset',
        [(model_name, dataset_name)
         for model_name in [
             'internlm2_5-7b-chat-hf_fullbench',
             'internlm2_5-7b-chat-turbomind_fullbench',
         ]
         for dataset_name in [
             'race-high',
             'ARC-c',
             'BoolQ',
             'drop',
             'GPQA_diamond',
             'math',
             'wikibench-wiki-single_choice_cncircular',
             'sanitized_mbpp',
             'ds1000',
             'gsm8k',
             'triviaqa_wiki_1shot',
             'nq_open_1shot',
             'hellaswag',
             'TheoremQA',
             'college',
             'college_knowledge',
             'bbh-logical_deduction_seven_objects',
             'bbh-multistep_arithmetic_two',
             'mmlu-other',
             'cmmlu-china-specific',
             'mmlu_pro_math',
             'ds1000_Pandas',
             'ds1000_Numpy',
             'ds1000_Tensorflow',
             'ds1000_Scipy',
             'ds1000_Sklearn',
             'ds1000_Pytorch',
             'ds1000_Matplotlib',
         ]])
    def test_model_dataset_score(self, baseline_scores_fullbench,
                                 result_scores, model, dataset):
        """Compare one (model, dataset) result with its fullbench baseline."""
        expected = baseline_scores_fullbench.get(model).get(dataset)
        actual = result_scores.get(model).get(dataset)
        assert_score(model, actual, expected)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.chat_sub_fullbench
class TestChatSubFullbench:
    """Subjective fullbench score checks for the chat models."""

    @pytest.mark.parametrize(
        'model, dataset',
        [(model_name, dataset_name)
         for model_name in [
             'internlm2_5-7b-chat-hf_fullbench',
             'internlm2_5-7b-chat-turbomind_fullbench',
         ]
         for dataset_name in [
             'Alignbench总分',
             'Alignbench专业能力',
             'AlpacaEvaltotal',
             'AlpacaEvalhelpful_base',
             'CompassArenacompassarena_language',
             'CompassArenacompassarena_knowledge',
             'CompassArenacompassarena_reason_v2',
             'CompassArenacompassarena_math_v2',
             'CompassArenacompassarena_creationv2_zh',
             'Fofofofo_test_prompts',
             'followbenchHSR_AVG',
             'followbenchSSR_AVG',
             'followbenchHSR_L1',
             'followbenchHSR_L2',
             'followbenchHSR_L3',
             'followbenchHSR_L4',
             'followbenchHSR_L5',
             'followbenchSSR_L1',
             'followbenchSSR_L2',
             'followbenchSSR_L3',
             'followbenchSSR_L4',
             'followbenchSSR_L5',
             'MTBench101average',
             'Wildbenchscore',
         ]])
    def test_model_dataset_score(self, baseline_scores_fullbench,
                                 result_scores, model, dataset):
        """Compare one (model, dataset) result with its fullbench baseline."""
        expected = baseline_scores_fullbench.get(model).get(dataset)
        actual = result_scores.get(model).get(dataset)
        assert_score(model, actual, expected)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.base_fullbench
class TestBaseFullbench:
    """Fullbench score checks for the base (non-chat) models."""

    @pytest.mark.parametrize(
        'model, dataset',
        [(model_name, dataset_name)
         for model_name in [
             'internlm2_5-7b-hf_fullbench',
             'internlm2_5-7b-turbomind_fullbench',
         ]
         for dataset_name in [
             'race-high',
             'ARC-c',
             'BoolQ',
             'drop',
             'GPQA_diamond',
             'math',
             'wikibench-wiki-single_choice_cncircular',
             'sanitized_mbpp',
             'gsm8k',
             'triviaqa_wiki_1shot',
             'nq_open_1shot',
             'winogrande',
             'hellaswag',
             'TheoremQA',
             'college',
             'college_knowledge',
             'bbh-logical_deduction_seven_objects',
             'bbh-multistep_arithmetic_two',
             'mmlu-other',
             'cmmlu-china-specific',
             'mmlu_pro_math',
         ]])
    def test_model_dataset_score(self, baseline_scores_fullbench,
                                 result_scores, model, dataset):
        """Compare one (model, dataset) result with its fullbench baseline."""
        expected = baseline_scores_fullbench.get(model).get(dataset)
        actual = result_scores.get(model).get(dataset)
        assert_score(model, actual, expected)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
@pytest.mark.api
class TestApibench:
    """Score checks for the model served through the lmdeploy API."""

    @pytest.mark.parametrize(
        'model, dataset',
        [('lmdeploy-api-test', dataset_name)
         for dataset_name in ('race-middle', 'race-high', 'gsm8k')])
    def test_api(self, baseline_scores, result_scores, model, dataset):
        """Compare one API result with its baseline score."""
        expected = baseline_scores.get(model).get(dataset)
        actual = result_scores.get(model).get(dataset)
        # The '_batch' suffix is passed on to assert_score as the model type.
        assert_score(f'{model}_batch', actual, expected)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures('result_scores')
|
||||
@pytest.mark.usefixtures('baseline_scores')
|
||||
class TestCmdCase:
|
||||
|
||||
@pytest.mark.case1
|
||||
@pytest.mark.parametrize('model, dataset',
|
||||
[('internlm2_5-7b-hf', 'race-middle'),
|
||||
('internlm2_5-7b-hf', 'race-high')])
|
||||
def test_cmd_case1(self, result_scores, model, dataset):
|
||||
if len(result_scores.keys()) != 1:
|
||||
assert False, 'result is none'
|
||||
('internlm2_5-7b-hf', 'race-high'),
|
||||
('internlm2_5-7b-hf', 'demo_gsm8k'),
|
||||
('internlm2-1.8b-hf', 'race-middle'),
|
||||
('internlm2-1.8b-hf', 'race-high'),
|
||||
('internlm2-1.8b-hf', 'demo_gsm8k')])
|
||||
def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
|
||||
base_score = baseline_scores.get(model).get(dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(result_score, 91)
|
||||
assert_score(model, result_score, base_score)
|
||||
|
||||
@pytest.mark.case2
|
||||
@pytest.mark.parametrize('model, dataset',
|
||||
[('internlm2_5-7b-chat-lmdeploy', 'race-middle'),
|
||||
('internlm2_5-7b-chat-lmdeploy', 'race-high')])
|
||||
def test_cmd_case2(self, result_scores, model, dataset):
|
||||
if len(result_scores.keys()) != 1:
|
||||
assert False, 'result is none'
|
||||
('internlm2_5-7b-chat-lmdeploy', 'race-high'),
|
||||
('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k'),
|
||||
('internlm2-chat-1.8b-lmdeploy', 'race-middle'),
|
||||
('internlm2-chat-1.8b-lmdeploy', 'race-high'),
|
||||
('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k')])
|
||||
def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
|
||||
base_score = baseline_scores.get(model).get(dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(result_score, 91)
|
||||
assert_score(model + '_batch', result_score, base_score)
|
||||
|
||||
@pytest.mark.case3
|
||||
@pytest.mark.parametrize('model, dataset',
|
||||
[('internlm2_5-7b_hf', 'race-middle'),
|
||||
('internlm2_5-7b_hf', 'race-high')])
|
||||
def test_cmd_case3(self, result_scores, model, dataset):
|
||||
if len(result_scores.keys()) != 1:
|
||||
assert False, 'result is none'
|
||||
('internlm2_5-7b_hf', 'race-high'),
|
||||
('internlm2_5-7b_hf', 'demo_gsm8k')])
|
||||
def test_cmd_case3(self, baseline_scores, result_scores, model, dataset):
|
||||
base_score = baseline_scores.get(model).get(dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(result_score, 91)
|
||||
assert_score(model, result_score, base_score)
|
||||
|
||||
@pytest.mark.case4
|
||||
@pytest.mark.parametrize('model, dataset',
|
||||
[('internlm2_5-7b-chat_hf', 'race-middle'),
|
||||
('internlm2_5-7b-chat_hf', 'race-high')])
|
||||
def test_cmd_case4(self, result_scores, model, dataset):
|
||||
if len(result_scores.keys()) != 1:
|
||||
assert False, 'result is none'
|
||||
('internlm2_5-7b-chat_hf', 'race-high'),
|
||||
('internlm2_5-7b-chat_hf', 'demo_gsm8k')])
|
||||
def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
|
||||
base_score = baseline_scores.get(model).get(dataset)
|
||||
result_score = result_scores.get(model).get(dataset)
|
||||
assert_score(result_score, 91)
|
||||
assert_score(model, result_score, base_score)
|
||||
|
||||
|
||||
# Allowed absolute deviation from the baseline for batch-mode results.
THRESHOLD = 3


def assert_score(model_type, score, baseline):
    """Assert that ``score`` matches ``baseline`` for the given model type.

    Args:
        model_type: Model label. When it contains ``'batch'`` the score may
            deviate from the baseline by up to ``THRESHOLD``; otherwise it
            must match within ``0.01``.
        score: Measured score, as a string read from the result CSV or as a
            number. ``None`` and ``'-'`` mean that no result was produced.
        baseline: Expected numeric score.

    Raises:
        AssertionError: If the score is missing or outside the allowed range.
    """
    if score is None or score == '-':
        raise AssertionError('value is none')

    value = float(score)
    # Stringify once so numeric scores can be reported as well.
    shown = str(score)

    if 'batch' not in model_type:
        # Non-batch runs are expected to reproduce the baseline.
        within = baseline - 0.01 <= value <= baseline + 0.01
        ok_msg = ' '.join([shown, 'is equal', str(baseline)])
        fail_msg = ' '.join([shown, 'is not equal', str(baseline)])
    else:
        low = baseline - THRESHOLD
        high = baseline + THRESHOLD
        within = low <= value <= high
        ok_msg = ' '.join([shown, 'is between', str(low), 'and', str(high)])
        fail_msg = ' '.join(
            [shown, 'is not between',
             str(low), 'and',
             str(high)])

    if within:
        print(ok_msg)
        return
    print(fail_msg)
    raise AssertionError(fail_msg)
|
||||
|
||||
|
||||
def find_csv_files(directory):
|
||||
csv_files = []
|
||||
for root, dirs, files in os.walk(directory):
|
||||
for file in files:
|
||||
if file.endswith('.csv'):
|
||||
if file.endswith('.csv') and (file.startswith('summary') or
|
||||
file.startswith('Subjective_all')):
|
||||
csv_files.append(os.path.join(root, file))
|
||||
|
||||
csv_files_with_time = {f: os.path.getctime(f) for f in csv_files}
|
||||
@ -163,14 +321,24 @@ def read_csv_file(file_path):
|
||||
with open(file_path, 'r') as csvfile:
|
||||
reader = csv.DictReader(csvfile)
|
||||
filtered_data = []
|
||||
|
||||
for row in reader:
|
||||
filtered_row = {
|
||||
k: v
|
||||
for k, v in row.items()
|
||||
if k not in ['version', 'metric', 'mode']
|
||||
}
|
||||
filtered_data.append(filtered_row)
|
||||
if 'Subjective_all' not in file_path:
|
||||
for row in reader:
|
||||
if row['metric'] is not None and 'bpb' not in row['metric']:
|
||||
filtered_row = {
|
||||
k: v
|
||||
for k, v in row.items()
|
||||
if k not in ['version', 'metric', 'mode']
|
||||
}
|
||||
filtered_data.append(filtered_row)
|
||||
else:
|
||||
for row in reader:
|
||||
if row['Detailed Scores'] is not None:
|
||||
filtered_row = row
|
||||
filtered_row['dataset'] = filtered_row[
|
||||
'Dataset'] + filtered_row['Detailed Scores']
|
||||
del filtered_row['Dataset']
|
||||
del filtered_row['Detailed Scores']
|
||||
filtered_data.append(filtered_row)
|
||||
|
||||
result = {}
|
||||
for data in filtered_data:
|
||||
|
389
.github/scripts/oc_score_baseline.yaml
vendored
389
.github/scripts/oc_score_baseline.yaml
vendored
@ -1,369 +1,34 @@
|
||||
baichuan2-7b-chat-hf:
|
||||
gsm8k: 30
|
||||
race-middle: 74
|
||||
race-high: 79
|
||||
internlm2_5-7b-hf:
|
||||
demo_gsm8k: 42.19
|
||||
race-middle: 91.78
|
||||
race-high: 90.02
|
||||
|
||||
glm-4-9b-chat-hf:
|
||||
gsm8k: 75
|
||||
race-middle: 88
|
||||
race-high: 88
|
||||
internlm2_5-7b_hf:
|
||||
demo_gsm8k: 42.19
|
||||
race-middle: 91.78
|
||||
race-high: 90.02
|
||||
|
||||
glm-4-9b-chat-turbomind:
|
||||
gsm8k: 69
|
||||
race-middle: 82
|
||||
race-high: 77
|
||||
internlm2-1.8b-hf:
|
||||
demo_gsm8k: 15.62
|
||||
race-middle: 71.66
|
||||
race-high: 66.38
|
||||
|
||||
glm-4-9b-chat-vllm:
|
||||
gsm8k: 73
|
||||
race-middle: 87
|
||||
race-high: 87
|
||||
internlm2_5-7b-chat-lmdeploy:
|
||||
demo_gsm8k: 84.38
|
||||
race-middle: 92.76
|
||||
race-high: 90.54
|
||||
|
||||
deepseek-7b-chat-hf:
|
||||
gsm8k: 60
|
||||
race-middle: 74
|
||||
race-high: 80
|
||||
internlm2-chat-1.8b-lmdeploy:
|
||||
demo_gsm8k: 31
|
||||
race-middle: 81.34
|
||||
race-high: 73.96
|
||||
|
||||
deepseek-moe-16b-chat-hf:
|
||||
gsm8k: 62
|
||||
race-middle: 62
|
||||
race-high: 70
|
||||
|
||||
deepseek-v2-lite-chat-hf:
|
||||
gsm8k: 59
|
||||
race-middle: 82
|
||||
race-high: 79
|
||||
|
||||
deepseek-7b-chat-vllm:
|
||||
gsm8k: 63
|
||||
race-middle: 74
|
||||
race-high: 79
|
||||
|
||||
gemma-2b-it-hf:
|
||||
gsm8k: 14
|
||||
race-middle: 62
|
||||
race-high: 52
|
||||
|
||||
gemma-7b-it-hf:
|
||||
gsm8k: 39
|
||||
race-middle: 74
|
||||
race-high: 71
|
||||
|
||||
gemma-7b-it-vllm:
|
||||
gsm8k: 38
|
||||
race-middle: 75
|
||||
race-high: 70
|
||||
|
||||
gemma2-2b-it-hf:
|
||||
gsm8k: 62
|
||||
race-middle: 75
|
||||
race-high: 67
|
||||
|
||||
gemma2-9b-it-hf:
|
||||
gsm8k: 80
|
||||
race-middle: 89
|
||||
race-high: 85
|
||||
|
||||
internlm2_5-7b-chat-hf:
|
||||
gsm8k: 86
|
||||
race-middle: 92
|
||||
race-high: 93
|
||||
|
||||
internlm2_5-20b-chat-hf:
|
||||
gsm8k: 91
|
||||
race-middle: 95
|
||||
race-high: 91
|
||||
|
||||
internlm2_5-7b-chat-turbomind:
|
||||
gsm8k: 87
|
||||
race-middle: 92
|
||||
race-high: 93
|
||||
|
||||
internlm2_5-20b-chat-turbomind:
|
||||
gsm8k: 91
|
||||
race-middle: 95
|
||||
race-high: 91
|
||||
|
||||
internlm2-chat-1.8b-turbomind:
|
||||
gsm8k: 40
|
||||
race-middle: 82
|
||||
race-high: 83
|
||||
|
||||
internlm2-chat-1.8b-sft-turbomind:
|
||||
gsm8k: 34
|
||||
race-middle: 81
|
||||
race-high: 83
|
||||
|
||||
internlm2-chat-7b-lmdeploy:
|
||||
gsm8k: 69
|
||||
race-middle: 90
|
||||
race-high: 88
|
||||
|
||||
internlm2-chat-7b-sft-turbomind:
|
||||
gsm8k: 71
|
||||
race-middle: 91
|
||||
race-high: 92
|
||||
|
||||
internlm2-chat-7b-vllm:
|
||||
gsm8k: 63
|
||||
race-middle: 90
|
||||
race-high: 91
|
||||
|
||||
llama-3_1-8b-instruct-hf:
|
||||
gsm8k: 82
|
||||
race-middle: 82
|
||||
race-high: 88
|
||||
|
||||
llama-3-8b-instruct-hf:
|
||||
gsm8k: 77
|
||||
race-middle: 85
|
||||
race-high: 87
|
||||
|
||||
llama-3_1-8b-instruct-turbomind:
|
||||
gsm8k: 79
|
||||
race-middle: 82
|
||||
race-high: 88
|
||||
|
||||
llama-3-8b-instruct-turbomind:
|
||||
gsm8k: 77
|
||||
race-middle: 85
|
||||
race-high: 89
|
||||
|
||||
mistral-7b-instruct-v0.2-hf:
|
||||
gsm8k: 48
|
||||
race-middle: 82
|
||||
race-high: 78
|
||||
|
||||
mistral-7b-instruct-v0.3-hf:
|
||||
gsm8k: 53
|
||||
race-middle: 80
|
||||
race-high: 78
|
||||
|
||||
mistral-7b-instruct-v0.2-vllm:
|
||||
gsm8k: 49
|
||||
race-middle: 81
|
||||
race-high: 77
|
||||
|
||||
minicpm-2b-dpo-fp32-hf:
|
||||
gsm8k: 58
|
||||
race-middle: 66
|
||||
race-high: 74
|
||||
|
||||
minicpm-2b-sft-bf16-hf:
|
||||
gsm8k: 58
|
||||
race-middle: 75
|
||||
race-high: 81
|
||||
|
||||
minicpm-2b-sft-fp32-hf:
|
||||
gsm8k: 58
|
||||
race-middle: 75
|
||||
race-high: 81
|
||||
|
||||
phi-3-mini-4k-instruct-hf:
|
||||
gsm8k: 67
|
||||
race-middle: 81
|
||||
race-high: 84
|
||||
|
||||
phi-3-small-8k-instruct-hf:
|
||||
gsm8k: 88
|
||||
race-middle: 89
|
||||
race-high: 88
|
||||
|
||||
qwen1.5-0.5b-chat-hf:
|
||||
gsm8k: 5
|
||||
race-middle: 55
|
||||
race-high: 50
|
||||
|
||||
qwen2-1.5b-instruct-hf:
|
||||
gsm8k: 63
|
||||
race-middle: 77
|
||||
race-high: 86
|
||||
|
||||
qwen2-1.5b-instruct-turbomind:
|
||||
gsm8k: 60
|
||||
race-middle: 77
|
||||
race-high: 86
|
||||
|
||||
qwen2-7b-instruct-turbomind:
|
||||
gsm8k: 88
|
||||
race-middle: 87
|
||||
race-high: 89
|
||||
|
||||
qwen2-7b-instruct-hf:
|
||||
gsm8k: 85
|
||||
race-middle: 87
|
||||
race-high: 91
|
||||
|
||||
qwen1.5-0.5b-chat-vllm:
|
||||
gsm8k: 5
|
||||
race-middle: 57
|
||||
race-high: 51
|
||||
|
||||
yi-1.5-6b-chat-hf:
|
||||
gsm8k: 72
|
||||
race-middle: 88
|
||||
race-high: 86
|
||||
|
||||
yi-1.5-9b-chat-hf:
|
||||
gsm8k: 81
|
||||
race-middle: 89
|
||||
race-high: 91
|
||||
internlm2_5-7b-chat_hf:
|
||||
demo_gsm8k: 87.50
|
||||
race-middle: 92.76
|
||||
race-high: 90.48
|
||||
|
||||
lmdeploy-api-test:
|
||||
gsm8k: 90
|
||||
race-middle: 95
|
||||
race-high: 96
|
||||
|
||||
deepseek-moe-16b-base-hf:
|
||||
gsm8k: 25
|
||||
race-middle: 35
|
||||
race-high: 23
|
||||
|
||||
deepseek-v2-lite-hf:
|
||||
gsm8k: 37
|
||||
race-middle: 56
|
||||
race-high: 62
|
||||
|
||||
deepseek-7b-base-turbomind:
|
||||
gsm8k: 21
|
||||
race-middle: 42
|
||||
race-high: 42
|
||||
|
||||
deepseek-moe-16b-base-vllm:
|
||||
gsm8k: 22
|
||||
race-middle: 35
|
||||
race-high: 20
|
||||
|
||||
gemma-2b-hf:
|
||||
gsm8k: 19
|
||||
race-middle: 33
|
||||
race-high: 26
|
||||
|
||||
gemma-7b-hf:
|
||||
gsm8k: 65
|
||||
race-middle: 59
|
||||
race-high: 66
|
||||
|
||||
gemma2-2b-hf:
|
||||
gsm8k: 33
|
||||
race-middle: 56
|
||||
race-high: 58
|
||||
|
||||
gemma2-9b-hf:
|
||||
gsm8k: 70
|
||||
race-middle: 82
|
||||
race-high: 84
|
||||
|
||||
internlm2_5-7b-hf:
|
||||
gsm8k: 47
|
||||
race-middle: 92
|
||||
race-high: 91
|
||||
|
||||
internlm2-7b-hf:
|
||||
gsm8k: 65
|
||||
race-middle: 77
|
||||
race-high: 72
|
||||
|
||||
internlm2-base-7b-hf:
|
||||
gsm8k: 5
|
||||
race-middle: 71
|
||||
race-high: 74
|
||||
|
||||
internlm2_5-7b-turbomind:
|
||||
gsm8k: 73
|
||||
race-middle: 90
|
||||
race-high: 91
|
||||
|
||||
internlm2-1.8b-turbomind:
|
||||
gsm8k: 25
|
||||
race-middle: 75
|
||||
race-high: 72
|
||||
|
||||
internlm2-7b-turbomind:
|
||||
gsm8k: 67
|
||||
race-middle: 78
|
||||
race-high: 76
|
||||
|
||||
internlm2-base-7b-turbomind:
|
||||
gsm8k: 39
|
||||
race-middle: 75
|
||||
race-high: 81
|
||||
|
||||
llama-2-7b-hf:
|
||||
gsm8k: 17
|
||||
race-middle: 32
|
||||
race-high: 38
|
||||
|
||||
llama-3-8b-hf:
|
||||
gsm8k: 48
|
||||
race-middle: 64
|
||||
race-high: 70
|
||||
|
||||
llama-3.1-8b-turbomind:
|
||||
gsm8k: 57
|
||||
race-middle: 67
|
||||
race-high: 75
|
||||
|
||||
llama-3-8b-turbomind:
|
||||
gsm8k: 52
|
||||
race-middle: 63
|
||||
race-high: 70
|
||||
|
||||
mistral-7b-v0.2-hf:
|
||||
gsm8k: 43
|
||||
race-middle: 42
|
||||
race-high: 60
|
||||
|
||||
mistral-7b-v0.3-hf:
|
||||
gsm8k: 43
|
||||
race-middle: 42
|
||||
race-high: 60
|
||||
|
||||
mistral-7b-v0.2-vllm:
|
||||
gsm8k: 45
|
||||
race-middle: 42
|
||||
race-high: 58
|
||||
|
||||
qwen1.5-moe-a2.7b-hf:
|
||||
gsm8k: 64
|
||||
race-middle: 78
|
||||
race-high: 90
|
||||
|
||||
qwen2-1.5b-hf:
|
||||
gsm8k: 58
|
||||
race-middle: 65
|
||||
race-high: 78
|
||||
|
||||
qwen2-0.5b-hf:
|
||||
gsm8k: 35
|
||||
race-middle: 52
|
||||
race-high: 48
|
||||
|
||||
qwen2-7b-hf:
|
||||
gsm8k: 82
|
||||
race-middle: 88
|
||||
race-high: 89
|
||||
|
||||
qwen2-1.5b-turbomind:
|
||||
gsm8k: 57
|
||||
race-middle: 64
|
||||
race-high: 78
|
||||
|
||||
qwen2-7b-turbomind:
|
||||
gsm8k: 83
|
||||
race-middle: 88
|
||||
race-high: 88
|
||||
|
||||
qwen1.5-0.5b-vllm:
|
||||
gsm8k: 12
|
||||
race-middle: 54
|
||||
race-high: 59
|
||||
|
||||
yi-1.5-6b-hf:
|
||||
gsm8k: 59
|
||||
race-middle: 81
|
||||
race-high: 89
|
||||
|
||||
yi-1.5-9b-hf:
|
||||
gsm8k: 77
|
||||
race-middle: 90
|
||||
race-high: 90
|
||||
gsm8k: 83.78
|
||||
race-middle: 92.41
|
||||
race-high: 90.37
|
||||
|
153
.github/scripts/oc_score_baseline_fullbench.yaml
vendored
Normal file
153
.github/scripts/oc_score_baseline_fullbench.yaml
vendored
Normal file
@ -0,0 +1,153 @@
|
||||
internlm2_5-7b-chat-hf_fullbench:
|
||||
race-high: 93.75
|
||||
ARC-c: 87.5
|
||||
BoolQ: 81.25
|
||||
drop: 81.25
|
||||
GPQA_diamond: 25
|
||||
math: 75
|
||||
wikibench-wiki-single_choice_cncircular: 50
|
||||
sanitized_mbpp: 68.75
|
||||
ds1000: 16.96
|
||||
gsm8k: 56.25
|
||||
triviaqa_wiki_1shot: 50
|
||||
nq_open_1shot: 25
|
||||
hellaswag: 87.5
|
||||
TheoremQA: 18.75
|
||||
college: 12.5
|
||||
college_knowledge: 87.5
|
||||
bbh-logical_deduction_seven_objects: 50
|
||||
bbh-multistep_arithmetic_two: 68.75
|
||||
mmlu-other: 72.6
|
||||
cmmlu-china-specific: 76.25
|
||||
mmlu_pro_math: 25
|
||||
ds1000_Pandas: 12.5
|
||||
ds1000_Numpy: 0
|
||||
ds1000_Tensorflow: 12.5
|
||||
ds1000_Scipy: 18.75
|
||||
ds1000_Sklearn: 18.75
|
||||
ds1000_Pytorch: 12.5
|
||||
ds1000_Matplotlib: 43.75
|
||||
Alignbench总分: 0.65
|
||||
Alignbench专业能力: 7.83
|
||||
AlpacaEvaltotal: 0
|
||||
AlpacaEvalhelpful_base: 0
|
||||
CompassArenacompassarena_language: 60
|
||||
CompassArenacompassarena_knowledge: 56
|
||||
CompassArenacompassarena_reason_v2: 50
|
||||
CompassArenacompassarena_math_v2: 53.5
|
||||
CompassArenacompassarena_creationv2_zh: 48.75
|
||||
Fofofofo_test_prompts: 1
|
||||
followbenchHSR_AVG: 1
|
||||
followbenchSSR_AVG: 1
|
||||
followbenchHSR_L1: 1
|
||||
followbenchHSR_L2: 1
|
||||
followbenchHSR_L3: 1
|
||||
followbenchHSR_L4: 1
|
||||
followbenchHSR_L5: 1
|
||||
followbenchSSR_L1: 1
|
||||
followbenchSSR_L2: 1
|
||||
followbenchSSR_L3: 1
|
||||
followbenchSSR_L4: 1
|
||||
followbenchSSR_L5: 1
|
||||
MTBench101average: 8.1
|
||||
Wildbenchscore: -3.3333333333333335
|
||||
|
||||
internlm2_5-7b-chat-turbomind_fullbench:
|
||||
race-high: 93.75
|
||||
ARC-c: 87.5
|
||||
BoolQ: 68.75
|
||||
drop: 75
|
||||
GPQA_diamond: 25
|
||||
math: 75
|
||||
wikibench-wiki-single_choice_cncircular: 25
|
||||
sanitized_mbpp: 68.75
|
||||
ds1000: 13.39
|
||||
gsm8k: 68.75
|
||||
triviaqa_wiki_1shot: 50
|
||||
nq_open_1shot: 25
|
||||
hellaswag: 81.25
|
||||
TheoremQA: 6.25
|
||||
college: 0
|
||||
college_knowledge: 87.5
|
||||
bbh-logical_deduction_seven_objects: 56.25
|
||||
bbh-multistep_arithmetic_two: 68.75
|
||||
mmlu-other: 74.04
|
||||
cmmlu-china-specific: 76.25
|
||||
mmlu_pro_math: 25
|
||||
ds1000_Pandas: 0
|
||||
ds1000_Numpy: 0
|
||||
ds1000_Tensorflow: 12.5
|
||||
ds1000_Scipy: 18.75
|
||||
ds1000_Sklearn: 18.75
|
||||
ds1000_Pytorch: 6.25
|
||||
ds1000_Matplotlib: 37.5
|
||||
Alignbench总分: 0.64
|
||||
Alignbench专业能力: 7.6
|
||||
AlpacaEvaltotal: 10
|
||||
AlpacaEvalhelpful_base: 10
|
||||
CompassArenacompassarena_language: 59
|
||||
CompassArenacompassarena_knowledge: 57
|
||||
CompassArenacompassarena_reason_v2: 49.5
|
||||
CompassArenacompassarena_math_v2: 51
|
||||
CompassArenacompassarena_creationv2_zh: 43.75
|
||||
Fofofofo_test_prompts: 1
|
||||
followbenchHSR_AVG: 1
|
||||
followbenchSSR_AVG: 1
|
||||
followbenchHSR_L1: 1
|
||||
followbenchHSR_L2: 1
|
||||
followbenchHSR_L3: 1
|
||||
followbenchHSR_L4: 1
|
||||
followbenchHSR_L5: 1
|
||||
followbenchSSR_L1: 1
|
||||
followbenchSSR_L2: 1
|
||||
followbenchSSR_L3: 1
|
||||
followbenchSSR_L4: 1
|
||||
followbenchSSR_L5: 1
|
||||
MTBench101average: 8.1
|
||||
Wildbenchscore: -8.333333333333334
|
||||
|
||||
internlm2_5-7b-hf_fullbench:
|
||||
race-high: 100
|
||||
ARC-c: 68.75
|
||||
BoolQ: 87.5
|
||||
GPQA_diamond: 62.5
|
||||
drop: 62.5
|
||||
math: 12.5
|
||||
wikibench-wiki-single_choice_cncircular: 25
|
||||
sanitized_mbpp: 56.25
|
||||
gsm8k: 37.5
|
||||
triviaqa_wiki_1shot: 43.75
|
||||
nq_open_1shot: 43.75
|
||||
winogrande: 75
|
||||
hellaswag: 93.75
|
||||
TheoremQA: 25
|
||||
college: 12.5
|
||||
college_knowledge: 87.5
|
||||
bbh-logical_deduction_seven_objects: 43.75
|
||||
bbh-multistep_arithmetic_two: 56.25
|
||||
mmlu-other: 76.92
|
||||
cmmlu-china-specific: 84.17
|
||||
mmlu_pro_math: 18.75
|
||||
|
||||
internlm2_5-7b-turbomind_fullbench:
|
||||
race-high: 100
|
||||
ARC-c: 68.75
|
||||
BoolQ: 87.5
|
||||
GPQA_diamond: 62.5
|
||||
drop: 62.5
|
||||
math: 18.75
|
||||
wikibench-wiki-single_choice_cncircular: 25
|
||||
sanitized_mbpp: 56.25
|
||||
gsm8k: 68.75
|
||||
triviaqa_wiki_1shot: 43.75
|
||||
nq_open_1shot: 43.75
|
||||
winogrande: 87.5
|
||||
hellaswag: 93.75
|
||||
TheoremQA: 31.25
|
||||
college: 12.5
|
||||
college_knowledge: 87.5
|
||||
bbh-logical_deduction_seven_objects: 50
|
||||
bbh-multistep_arithmetic_two: 56.25
|
||||
mmlu-other: 76.92
|
||||
cmmlu-china-specific: 84.17
|
||||
mmlu_pro_math: 18.75
|
459
.github/scripts/oc_score_baseline_testrange.yaml
vendored
Normal file
459
.github/scripts/oc_score_baseline_testrange.yaml
vendored
Normal file
@ -0,0 +1,459 @@
|
||||
baichuan2-7b-chat-hf:
|
||||
gsm8k: 18.75
|
||||
race-high: 78.12
|
||||
|
||||
glm-4-9b-chat-hf:
|
||||
gsm8k: 68.75
|
||||
race-high: 90.62
|
||||
|
||||
glm-4-9b-chat-turbomind:
|
||||
gsm8k: 75.00
|
||||
race-high: 90.62
|
||||
|
||||
glm-4-9b-chat-vllm:
|
||||
gsm8k: 65.62
|
||||
race-high: 90.62
|
||||
|
||||
deepseek-7b-chat-hf:
|
||||
gsm8k: 46.88
|
||||
race-high: 81.25
|
||||
|
||||
deepseek-moe-16b-chat-hf:
|
||||
gsm8k: 50
|
||||
race-high: 68.75
|
||||
|
||||
deepseek-7b-chat-vllm:
|
||||
gsm8k: 43.75
|
||||
race-high: 75
|
||||
|
||||
gemma2-2b-it-hf:
|
||||
gsm8k: 50
|
||||
race-high: 71.88
|
||||
|
||||
gemma2-9b-it-hf:
|
||||
gsm8k: 71.88
|
||||
race-high: 84.38
|
||||
|
||||
gemma-2b-it-hf:
|
||||
gsm8k: 3.12
|
||||
race-high: 40.62
|
||||
|
||||
gemma-7b-it-hf:
|
||||
gsm8k: 40.62
|
||||
race-high: 68.75
|
||||
|
||||
gemma-2-9b-it-turbomind:
|
||||
gsm8k: 68.75
|
||||
race-high: 81.25
|
||||
|
||||
gemma-7b-it-vllm:
|
||||
gsm8k: 28.12
|
||||
race-high: 68.75
|
||||
|
||||
internlm2_5-7b-chat-hf:
|
||||
gsm8k: 84.38
|
||||
race-high: 90.62
|
||||
|
||||
internlm2_5-7b-chat-turbomind:
|
||||
gsm8k: 84.38
|
||||
race-high: 90.62
|
||||
|
||||
internlm2-chat-1.8b-turbomind:
|
||||
gsm8k: 25
|
||||
race-high: 84.38
|
||||
|
||||
internlm2-chat-1.8b-sft-turbomind:
|
||||
gsm8k: 21.88
|
||||
race-high: 84.38
|
||||
|
||||
internlm2-chat-7b-lmdeploy:
|
||||
gsm8k: 53.12
|
||||
race-high: 84.38
|
||||
|
||||
internlm2-chat-7b-sft-turbomind:
|
||||
gsm8k: 50
|
||||
race-high: 90.62
|
||||
|
||||
internlm2-chat-7b-vllm:
|
||||
gsm8k: 43.75
|
||||
race-high: 87.5
|
||||
|
||||
llama-3_1-8b-instruct-hf:
|
||||
gsm8k: 84.38
|
||||
race-high: 90.62
|
||||
|
||||
llama-3_2-3b-instruct-hf:
|
||||
gsm8k: 65.62
|
||||
race-high: 81.25
|
||||
|
||||
llama-3-8b-instruct-hf:
|
||||
gsm8k: 68.75
|
||||
race-high: 87.5
|
||||
|
||||
llama-3_1-8b-instruct-turbomind:
|
||||
gsm8k: 78.12
|
||||
race-high: 90.62
|
||||
|
||||
llama-3_2-3b-instruct-turbomind:
|
||||
gsm8k: 65.62
|
||||
race-high: 81.25
|
||||
|
||||
llama-3-8b-instruct-turbomind:
|
||||
gsm8k: 68.75
|
||||
race-high: 87.5
|
||||
|
||||
mistral-7b-instruct-v0.2-hf:
|
||||
gsm8k: 40.62
|
||||
race-high: 75
|
||||
|
||||
mistral-7b-instruct-v0.3-hf:
|
||||
gsm8k: 40.62
|
||||
race-high: 75
|
||||
|
||||
mistral-nemo-instruct-2407-hf:
|
||||
gsm8k: 75
|
||||
race-high: 81.25
|
||||
|
||||
mistral-nemo-instruct-2407-turbomind:
|
||||
gsm8k: 75
|
||||
race-high: 81.25
|
||||
|
||||
mistral-7b-instruct-v0.1-vllm:
|
||||
gsm8k: 37.5
|
||||
race-high: 71.88
|
||||
|
||||
mistral-7b-instruct-v0.2-vllm:
|
||||
gsm8k: 43.75
|
||||
race-high: 75
|
||||
|
||||
MiniCPM3-4B-hf:
|
||||
gsm8k: 68.75
|
||||
race-high: 84.38
|
||||
|
||||
minicpm-2b-dpo-fp32-hf:
|
||||
gsm8k: 56.25
|
||||
race-high: 56.25
|
||||
|
||||
minicpm-2b-sft-bf16-hf:
|
||||
gsm8k: 46.88
|
||||
race-high: 65.62
|
||||
|
||||
minicpm-2b-sft-fp32-hf:
|
||||
gsm8k: 46.88
|
||||
race-high: 65.62
|
||||
|
||||
phi-3-mini-4k-instruct-hf:
|
||||
gsm8k: 56.25
|
||||
race-high: 78.12
|
||||
|
||||
qwen1.5-0.5b-chat-hf:
|
||||
gsm8k: 0
|
||||
race-high: 53.12
|
||||
|
||||
qwen2-1.5b-instruct-hf:
|
||||
gsm8k: 62.5
|
||||
race-high: 84.38
|
||||
|
||||
qwen2-7b-instruct-hf:
|
||||
gsm8k: 68.75
|
||||
race-high: 90.62
|
||||
|
||||
qwen2-1.5b-instruct-turbomind:
|
||||
gsm8k: 62.50
|
||||
race-high: 84.38
|
||||
|
||||
qwen2-7b-instruct-turbomind:
|
||||
gsm8k: 81.25
|
||||
race-high: 87.5
|
||||
|
||||
qwen1.5-0.5b-chat-vllm:
|
||||
gsm8k: 3.12
|
||||
race-high: 53.12
|
||||
|
||||
yi-1.5-6b-chat-hf:
|
||||
gsm8k: 65.62
|
||||
race-high: 84.38
|
||||
|
||||
yi-1.5-9b-chat-hf:
|
||||
gsm8k: 75
|
||||
race-high: 93.75
|
||||
|
||||
deepseek-v2-lite-chat-hf:
|
||||
gsm8k: 43.75
|
||||
race-high: 71.88
|
||||
|
||||
internlm2_5-20b-chat-hf:
|
||||
gsm8k: 84.38
|
||||
race-high: 87.5
|
||||
|
||||
internlm2_5-20b-chat-turbomind:
|
||||
gsm8k: 84.38
|
||||
race-high: 87.5
|
||||
|
||||
mistral-small-instruct-2409-hf:
|
||||
gsm8k: 81.25
|
||||
race-high: 90.62
|
||||
|
||||
mistral-small-instruct-2409-turbomind:
|
||||
gsm8k: 78.12
|
||||
race-high: 90.62
|
||||
|
||||
qwen2.5-14b-instruct-hf:
|
||||
gsm8k: 71.88
|
||||
race-high: 93.75
|
||||
|
||||
qwen2.5-14b-instruct-turbomind:
|
||||
gsm8k: 71.88
|
||||
race-high: 93.75
|
||||
|
||||
glm-4-9b-hf:
|
||||
gsm8k: 68.75
|
||||
GPQA_diamond: 31.25
|
||||
race-high: 93.75
|
||||
winogrande: 84.38
|
||||
|
||||
deepseek-moe-16b-base-hf:
|
||||
gsm8k: 21.88
|
||||
GPQA_diamond: 0
|
||||
race-high: 21.88
|
||||
winogrande: 65.62
|
||||
|
||||
deepseek-7b-base-turbomind:
|
||||
gsm8k: 21.88
|
||||
GPQA_diamond: 0
|
||||
race-high: 46.88
|
||||
winogrande: 84.38
|
||||
|
||||
deepseek-moe-16b-base-vllm:
|
||||
gsm8k: 21.88
|
||||
GPQA_diamond: 0
|
||||
race-high: 25
|
||||
winogrande: 68.75
|
||||
|
||||
gemma2-2b-hf:
|
||||
gsm8k: 31.25
|
||||
GPQA_diamond: 3.12
|
||||
race-high: 56.25
|
||||
winogrande: 71.88
|
||||
|
||||
gemma2-9b-hf:
|
||||
gsm8k: 68.75
|
||||
GPQA_diamond: 0
|
||||
race-high: 81.25
|
||||
winogrande: 84.38
|
||||
|
||||
gemma-2b-hf:
|
||||
gsm8k: 18.75
|
||||
GPQA_diamond: 3.12
|
||||
race-high: 25
|
||||
winogrande: 53.12
|
||||
|
||||
gemma-7b-hf:
|
||||
gsm8k: 56.25
|
||||
GPQA_diamond: 6.25
|
||||
race-high: 65.62
|
||||
winogrande: 78.12
|
||||
|
||||
gemma-2b-vllm:
|
||||
gsm8k: 18.75
|
||||
GPQA_diamond: 6.25
|
||||
race-high:
|
||||
winogrande:
|
||||
|
||||
gemma-7b-vllm:
|
||||
gsm8k: 59.38
|
||||
GPQA_diamond: 6.25
|
||||
race-high:
|
||||
winogrande:
|
||||
|
||||
internlm2_5-7b-hf:
|
||||
gsm8k: 37.5
|
||||
GPQA_diamond: 25
|
||||
race-high: 93.75
|
||||
winogrande: 71.88
|
||||
|
||||
internlm2-7b-hf:
|
||||
gsm8k: 53.12
|
||||
GPQA_diamond: 18.75
|
||||
race-high: 62.5
|
||||
winogrande: 78.12
|
||||
|
||||
internlm2-base-7b-hf:
|
||||
gsm8k: 3.12
|
||||
GPQA_diamond: 21.88
|
||||
race-high: 75
|
||||
winogrande: 65.62
|
||||
|
||||
internlm2-1.8b-turbomind:
|
||||
gsm8k: 12.5
|
||||
GPQA_diamond: 12.5
|
||||
race-high: 71.88
|
||||
winogrande: 75
|
||||
|
||||
internlm2_5-7b-turbomind:
|
||||
gsm8k: 68.75
|
||||
GPQA_diamond: 31.25
|
||||
race-high: 93.75
|
||||
winogrande: 84.38
|
||||
|
||||
internlm2-7b-turbomind:
|
||||
gsm8k: 56.25
|
||||
GPQA_diamond: 21.88
|
||||
race-high: 75
|
||||
winogrande: 81.25
|
||||
|
||||
internlm2-base-7b-turbomind:
|
||||
gsm8k: 40.62
|
||||
GPQA_diamond: 28.12
|
||||
race-high: 84.38
|
||||
winogrande: 71.88
|
||||
|
||||
llama-2-7b-hf:
|
||||
gsm8k: 21.88
|
||||
GPQA_diamond: 21.88
|
||||
race-high: 40.62
|
||||
winogrande: 71.88
|
||||
|
||||
llama-3_1-8b-hf:
|
||||
gsm8k: 78.12
|
||||
GPQA_diamond: 25
|
||||
race-high: 90.62
|
||||
winogrande: 62.5
|
||||
|
||||
llama-3-8b-hf:
|
||||
gsm8k: 46.88
|
||||
GPQA_diamond: 6.25
|
||||
race-high: 65.62
|
||||
winogrande: 65.62
|
||||
|
||||
llama-3.1-8b-turbomind:
|
||||
gsm8k: 56.25
|
||||
GPQA_diamond: 6.25
|
||||
race-high: 78.12
|
||||
winogrande: 78.12
|
||||
|
||||
llama-3-8b-turbomind:
|
||||
gsm8k: 50
|
||||
GPQA_diamond: 9.38
|
||||
race-high: 65.62
|
||||
winogrande: 78.12
|
||||
|
||||
mistral-7b-v0.2-hf:
|
||||
gsm8k: 31.25
|
||||
GPQA_diamond: 6.25
|
||||
race-high: 62.5
|
||||
winogrande: 59.38
|
||||
|
||||
mistral-7b-v0.3-hf:
|
||||
gsm8k: 31.25
|
||||
GPQA_diamond: 6.25
|
||||
race-high: 62.5
|
||||
winogrande: 59.38
|
||||
|
||||
mistral-7b-v0.2-vllm:
|
||||
gsm8k: 34.38
|
||||
GPQA_diamond: 6.25
|
||||
race-high: 62.5
|
||||
winogrande: 65.62
|
||||
|
||||
qwen2.5-7b-hf:
|
||||
gsm8k: 81.25
|
||||
GPQA_diamond: 18.75
|
||||
race-high: 87.5
|
||||
winogrande: 71.88
|
||||
|
||||
qwen2.5-1.5b-turbomind:
|
||||
gsm8k: 71.88
|
||||
GPQA_diamond: 15.62
|
||||
race-high: 78.12
|
||||
winogrande: 71.88
|
||||
|
||||
qwen2.5-7b-turbomind:
|
||||
gsm8k: 71.88
|
||||
GPQA_diamond: 25
|
||||
race-high: 87.5
|
||||
winogrande: 71.88
|
||||
|
||||
qwen1.5-moe-a2.7b-hf:
|
||||
gsm8k: 62.5
|
||||
GPQA_diamond: 18.75
|
||||
race-high: 84.38
|
||||
winogrande: 75
|
||||
|
||||
qwen2-0.5b-hf:
|
||||
gsm8k: 25
|
||||
GPQA_diamond: 0
|
||||
race-high: 40.62
|
||||
winogrande: 62.5
|
||||
|
||||
qwen2-1.5b-hf:
|
||||
gsm8k: 59.38
|
||||
GPQA_diamond: 9.38
|
||||
race-high: 81.25
|
||||
winogrande: 62.5
|
||||
|
||||
qwen2-7b-hf:
|
||||
gsm8k: 68.75
|
||||
GPQA_diamond: 9.38
|
||||
race-high: 87.5
|
||||
winogrande: 68.75
|
||||
|
||||
qwen2-1.5b-turbomind:
|
||||
gsm8k: 62.50
|
||||
GPQA_diamond: 6.25
|
||||
race-high: 81.25
|
||||
winogrande: 75
|
||||
|
||||
qwen2-7b-turbomind:
|
||||
gsm8k: 68.75
|
||||
GPQA_diamond: 12.5
|
||||
race-high: 87.5
|
||||
winogrande: 71.88
|
||||
|
||||
qwen1.5-0.5b-vllm:
|
||||
gsm8k: 9.38
|
||||
GPQA_diamond: 0
|
||||
race-high: 56.25
|
||||
winogrande: 62.5
|
||||
|
||||
yi-1.5-6b-hf:
|
||||
gsm8k: 62.5
|
||||
GPQA_diamond: 3.12
|
||||
race-high: 87.5
|
||||
winogrande: 62.5
|
||||
|
||||
yi-1.5-9b-hf:
|
||||
gsm8k: 75
|
||||
GPQA_diamond: 40.62
|
||||
race-high: 87.5
|
||||
winogrande: 59.38
|
||||
|
||||
deepseek-v2-lite-hf:
|
||||
gsm8k: 28.12
|
||||
GPQA_diamond: 21.88
|
||||
race-high: 59.38
|
||||
winogrande: 75
|
||||
|
||||
internlm2-20b-hf:
|
||||
gsm8k: 56.25
|
||||
GPQA_diamond: 15.62
|
||||
race-high: 68.75
|
||||
winogrande: 75
|
||||
|
||||
internlm2-base-20b-hf:
|
||||
gsm8k: 12.5
|
||||
GPQA_diamond: 9.38
|
||||
race-high: 84.38
|
||||
winogrande: 65.62
|
||||
|
||||
internlm2-20b-turbomind:
|
||||
gsm8k: 68.75
|
||||
GPQA_diamond: 15.62
|
||||
race-high: 68.75
|
||||
winogrande: 81.25
|
||||
|
||||
qwen2.5-14b-hf:
|
||||
gsm8k: 75
|
||||
GPQA_diamond: 37.5
|
||||
race-high: 93.75
|
||||
winogrande: 84.38
|
241
.github/workflows/daily-run-test.yml
vendored
241
.github/workflows/daily-run-test.yml
vendored
@ -13,11 +13,31 @@ on:
|
||||
description: 'Set branch or tag or commit id. Default is "main"'
|
||||
type: string
|
||||
default: 'main'
|
||||
build_lmdeploy:
|
||||
required: false
|
||||
description: 'whether to build lmdeploy'
|
||||
type: boolean
|
||||
default: false
|
||||
repo_org_lmdeploy:
|
||||
required: false
|
||||
description: 'Tested repository organization name. Default is internlm/lmdeploy'
|
||||
type: string
|
||||
default: 'InternLM/lmdeploy'
|
||||
repo_ref_lmdeploy:
|
||||
required: false
|
||||
description: 'Set branch or tag or commit id. Default is "main"'
|
||||
type: string
|
||||
default: 'main'
|
||||
regression_func:
|
||||
required: true
|
||||
description: 'regression functions'
|
||||
type: string
|
||||
default: "['chat','base','cmd']"
|
||||
default: "['chat_models','base_models', 'chat_obj_fullbench', 'chat_sub_fullbench', 'base_fullbench','cmd', 'api']"
|
||||
cuda_env:
|
||||
required: true
|
||||
description: "regression conda env, eg. ['dsw_cu11','dsw_cu12']"
|
||||
type: string
|
||||
default: "['dsw_cu12']"
|
||||
schedule:
|
||||
- cron: '56 16 * * *'
|
||||
|
||||
@ -31,7 +51,7 @@ env:
|
||||
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
|
||||
HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
|
||||
HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
|
||||
DATEASET_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/llm-evaluation-datasets
|
||||
COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache
|
||||
HF_DATASETS_OFFLINE: 1
|
||||
HF_EVALUATE_OFFLINE: 1
|
||||
TRANSFORMERS_OFFLINE: 1
|
||||
@ -39,6 +59,8 @@ env:
|
||||
LMDEPLOY_USE_MODELSCOPE: false
|
||||
HF_HUB_OFFLINE: 1
|
||||
TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas
|
||||
REPORT_ROOT: /cpfs01/shared/public/qa-llm-cicd/report
|
||||
OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
|
||||
|
||||
jobs:
|
||||
build-pypi:
|
||||
@ -64,16 +86,51 @@ jobs:
|
||||
retention-days: 1
|
||||
name: my-artifact-${{ github.run_id }}
|
||||
|
||||
daily_run_test:
|
||||
build-pypi-lmdeploy:
|
||||
if: ${{!cancelled() && (github.event_name != 'schedule' && inputs.build_lmdeploy)}}
|
||||
strategy:
|
||||
matrix:
|
||||
pyver: [py310]
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
PYTHON_VERSION: ${{ matrix.pyver }}
|
||||
PLAT_NAME: manylinux2014_x86_64
|
||||
DOCKER_TAG: cuda12.1
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
repository: ${{ github.event.inputs.repo_org_lmdeploy || 'InternLM/lmdeploy' }}
|
||||
ref: ${{github.event.inputs.repo_ref_lmdeploy || 'main'}}
|
||||
- name: Build
|
||||
run: |
|
||||
echo ${PYTHON_VERSION}
|
||||
echo ${PLAT_NAME}
|
||||
echo ${DOCKER_TAG}
|
||||
echo ${OUTPUT_FOLDER}
|
||||
echo ${GITHUB_RUN_ID}
|
||||
# remove -it
|
||||
sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh
|
||||
bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
|
||||
- name: Upload Artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
if-no-files-found: error
|
||||
path: builder/manywheel/${{ env.OUTPUT_FOLDER }}
|
||||
retention-days: 1
|
||||
name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }}
|
||||
|
||||
|
||||
prepare_env:
|
||||
if: ${{!cancelled()}}
|
||||
needs: build-pypi
|
||||
needs: ['build-pypi', 'build-pypi-lmdeploy']
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
cuda_env: [dsw_cu11, dsw_cu12]
|
||||
cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}}
|
||||
runs-on: ${{ matrix.cuda_env }}
|
||||
environment: 'prod'
|
||||
timeout-minutes: 600 #10hours
|
||||
timeout-minutes: 240 #4hours
|
||||
steps:
|
||||
- name: Clone repository
|
||||
uses: actions/checkout@v2
|
||||
@ -84,89 +141,169 @@ jobs:
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: my-artifact-${{ github.run_id }}
|
||||
- name: Remove Conda Env
|
||||
if: always()
|
||||
run: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
conda info --envs
|
||||
- name: Prepare - create conda env and install torch - cu11
|
||||
if: ${{matrix.cuda_env == 'dsw_cu11'}}
|
||||
run: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu11.txt --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip uninstall torch torchvision torchaudio -y
|
||||
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
|
||||
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
conda info --envs
|
||||
pip list
|
||||
uses: nick-fields/retry@v3
|
||||
id: retry1
|
||||
with:
|
||||
max_attempts: 3
|
||||
timeout_minutes: 40
|
||||
command: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu11.txt --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip uninstall torch torchvision torchaudio -y
|
||||
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
|
||||
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
conda info --envs
|
||||
pip list
|
||||
- name: Prepare - create conda env and install torch - cu12
|
||||
if: ${{matrix.cuda_env == 'dsw_cu12'}}
|
||||
uses: nick-fields/retry@v3
|
||||
id: retry2
|
||||
with:
|
||||
max_attempts: 3
|
||||
timeout_minutes: 40
|
||||
command: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu12.txt --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
conda info --envs
|
||||
pip list
|
||||
- name: Prepare - reinstall lmdeploy - cu12
|
||||
if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: my-artifact-${{ github.run_id }}-py310
|
||||
- name: Prepare - reinstall lmdeploy - cu12
|
||||
if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
|
||||
run: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu12.txt --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip uninstall torch torchvision torchaudio -y
|
||||
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
conda info --envs
|
||||
pip list
|
||||
pip install lmdeploy-*.whl --no-deps
|
||||
|
||||
daily_run_test:
|
||||
if: ${{!cancelled()}}
|
||||
needs: prepare_env
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}}
|
||||
regression_func: ${{fromJSON(github.event.inputs.regression_func || '["chat_models","base_models","chat_obj_fullbench","chat_sub_fullbench","base_fullbench","cmd","api"]')}}
|
||||
runs-on: ${{ matrix.cuda_env }}
|
||||
environment: 'prod'
|
||||
timeout-minutes: 240 #4hours
|
||||
steps:
|
||||
- name: Clone repository
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
|
||||
ref: ${{github.event.inputs.repo_ref || 'main'}}
|
||||
- name: Prepare - prepare data and hf model
|
||||
run: |
|
||||
ln -s ${{env.DATEASET_CACHE_PATH}} data
|
||||
rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
|
||||
ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
|
||||
- name: Run command testcase
|
||||
if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'cmd')
|
||||
if: matrix.regression_func == 'cmd'
|
||||
run: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
conda info --envs
|
||||
export from_tf=TRUE
|
||||
python tools/list_configs.py internlm2_5 mmlu
|
||||
opencompass --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
opencompass --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse --max-num-workers 2
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
opencompass --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse --max-num-workers 2
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
opencompass --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse --max-num-workers 2
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
- name: Run chat model test
|
||||
if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'chat')
|
||||
if: matrix.regression_func == 'chat_models'
|
||||
run: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
conda info --envs
|
||||
sed -i 's/judgemodel/'$(tail -n 1 /cpfs01/shared/public/llmeval/share_info/compassjuder_ip.txt)'/g' .github/scripts/eval_regression_chat.py
|
||||
opencompass .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
opencompass .github/scripts/eval_regression_chat.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
- name: Run base model test
|
||||
if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'base')
|
||||
if: matrix.regression_func == 'base_models'
|
||||
run: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
conda info --envs
|
||||
opencompass .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
opencompass .github/scripts/eval_regression_base.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
- name: Remove Conda Env
|
||||
if: always()
|
||||
- name: Run chat model test - fullbench
|
||||
if: matrix.regression_func == 'chat_obj_fullbench'
|
||||
run: |
|
||||
rm -rf regression_result_daily
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
conda info --envs
|
||||
opencompass .github/scripts/eval_regression_chat_objective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m chat_obj_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
- name: Run chat model test - fullbench
|
||||
if: matrix.regression_func == 'chat_sub_fullbench'
|
||||
env:
|
||||
COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache_subset
|
||||
run: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
conda info --envs
|
||||
opencompass .github/scripts/eval_regression_chat_subjective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m chat_sub_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
- name: Run base model test - fullbench
|
||||
if: matrix.regression_func == 'base_fullbench'
|
||||
run: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
conda info --envs
|
||||
opencompass .github/scripts/eval_regression_base_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m base_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
- name: Run model test - api
|
||||
if: matrix.regression_func == 'api'
|
||||
run: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
conda info --envs
|
||||
lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
|
||||
echo "restful_pid=$!" >> "$GITHUB_ENV"
|
||||
sleep 120s
|
||||
opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
|
||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
- name: Run model test - api kill
|
||||
if: always() && matrix.regression_func == 'api'
|
||||
run: |
|
||||
kill -15 "$restful_pid"
|
||||
|
||||
notify_to_feishu:
|
||||
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
|
||||
|
Loading…
Reference in New Issue
Block a user