This commit is contained in:
zhulinJulia24 2025-05-12 20:51:36 +08:00
parent a685ed7daf
commit c269cc054d
5 changed files with 50 additions and 50 deletions

View File

@ -58,7 +58,7 @@ for m in models:
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus']) models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
judge_models = deepcopy([models[1]]) judge_models = deepcopy([hf_internlm2_5_7b_chat_model])
judge_models[0]['abbr'] = judge_models[0]['abbr'] + '-judge' judge_models[0]['abbr'] = judge_models[0]['abbr'] + '-judge'
eval = dict( eval = dict(

View File

@ -9,7 +9,7 @@ internlm2_5-7b_hf:
race-high_accuracy: 90.02 race-high_accuracy: 90.02
internlm2_5-7b-chat-lmdeploy: internlm2_5-7b-chat-lmdeploy:
demo_gsm8k_accuracy: 87.50 demo_gsm8k_accuracy: 84.38
race-middle_accuracy: 92.76 race-middle_accuracy: 92.76
race-high_accuracy: 90.54 race-high_accuracy: 90.54
@ -34,6 +34,6 @@ internlm2_5-7b-chat_hf:
race-high_accuracy: 90.48 race-high_accuracy: 90.48
lmdeploy-api-test: lmdeploy-api-test:
gsm8k_accuracy: 56.25 gsm8k_accuracy: 68.75
race-middle_accuracy: 93.75 race-middle_accuracy: 93.75
race-high_accuracy: 93.75 race-high_accuracy: 93.75

View File

@ -78,39 +78,39 @@ internlm2_5-7b-chat-hf_fullbench:
internlm2_5-7b-chat-turbomind_fullbench: internlm2_5-7b-chat-turbomind_fullbench:
objective: objective:
race-high_accuracy: 93.75 race-high_accuracy: 93.75
ARC-c_accuracy: 87.50 ARC-c_accuracy: 93.75
BoolQ_accuracy: 68.75 BoolQ_accuracy: 75.00
triviaqa_wiki_1shot_score: 50 triviaqa_wiki_1shot_score: 50
nq_open_1shot_score: 25 nq_open_1shot_score: 25
IFEval_Prompt-level-strict-accuracy: 56.25 IFEval_Prompt-level-strict-accuracy: 56.25
drop_accuracy: 75 drop_accuracy: 75
GPQA_diamond_accuracy: 31.25 GPQA_diamond_accuracy: 37.50
hellaswag_accuracy: 87.5 hellaswag_accuracy: 81.25
TheoremQA_score: 12.5 TheoremQA_score: 12.5
musr_average_naive_average: 39.58 musr_average_naive_average: 39.58
korbench_single_naive_average: 40 korbench_single_naive_average: 40
gsm8k_accuracy: 62.5 gsm8k_accuracy: 68.75
math_accuracy: 75 math_accuracy: 68.75
cmo_fib_accuracy: 6.25 cmo_fib_accuracy: 6.25
aime2024_accuracy: 6.25 aime2024_accuracy: 6.25
wikibench-wiki-single_choice_cncircular_perf_4: 25 wikibench-wiki-single_choice_cncircular_perf_4: 25
sanitized_mbpp_score: 68.75 sanitized_mbpp_score: 68.75
ds1000_naive_average: 17.86 ds1000_naive_average: 15.18
lcb_code_generation_pass@1: 12.5 lcb_code_generation_pass@1: 12.5
lcb_code_execution_pass@1: 43.75 lcb_code_execution_pass@1: 43.75
lcb_test_output_pass@1: 18.75 lcb_test_output_pass@1: 0.00
bbh-logical_deduction_seven_objects_score: 56.25 bbh-logical_deduction_seven_objects_score: 62.50
bbh-multistep_arithmetic_two_score: 75 bbh-multistep_arithmetic_two_score: 62.50
mmlu-other_accuracy: 72.6 mmlu-other_accuracy: 73.08
cmmlu-china-specific_accuracy: 78.33 cmmlu-china-specific_accuracy: 75.42
mmlu_pro_math_accuracy: 31.25 mmlu_pro_math_accuracy: 25.00
ds1000_Pandas_accuracy: 12.5 ds1000_Pandas_accuracy: 0.00
ds1000_Numpy_accuracy: 0 ds1000_Numpy_accuracy: 0
ds1000_Tensorflow_accuracy: 12.5 ds1000_Tensorflow_accuracy: 12.5
ds1000_Scipy_accuracy: 25 ds1000_Scipy_accuracy: 18.75
ds1000_Sklearn_accuracy: 18.75 ds1000_Sklearn_accuracy: 18.75
ds1000_Pytorch_accuracy: 6.25 ds1000_Pytorch_accuracy: 12.50
ds1000_Matplotlib_accuracy: 50.00 ds1000_Matplotlib_accuracy: 43.75
openai_mmmlu_lite_AR-XY_accuracy: 37.5 openai_mmmlu_lite_AR-XY_accuracy: 37.5
college_naive_average: 12.50 college_naive_average: 12.50
college_knowledge_naive_average: 87.5 college_knowledge_naive_average: 87.5
@ -162,7 +162,7 @@ internlm2_5-7b-hf_fullbench:
drop_accuracy: 62.5 drop_accuracy: 62.5
GPQA_diamond_accuracy: 62.5 GPQA_diamond_accuracy: 62.5
hellaswag_accuracy: 93.75 hellaswag_accuracy: 93.75
TheoremQA_score: 12.50 TheoremQA_score: 18.75
winogrande_accuracy: 75 winogrande_accuracy: 75
gsm8k_accuracy: 37.5 gsm8k_accuracy: 37.5
GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5 GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
@ -188,23 +188,23 @@ internlm2_5-7b-turbomind_fullbench:
triviaqa_wiki_1shot_score: 43.75 triviaqa_wiki_1shot_score: 43.75
nq_open_1shot_score: 43.75 nq_open_1shot_score: 43.75
drop_accuracy: 62.5 drop_accuracy: 62.5
GPQA_diamond_accuracy: 62.5 GPQA_diamond_accuracy: 68.75
hellaswag_accuracy: 93.75 hellaswag_accuracy: 93.75
TheoremQA_score: 12.50 TheoremQA_score: 18.75
winogrande_accuracy: 87.5 winogrande_accuracy: 87.5
gsm8k_accuracy: 56.25 gsm8k_accuracy: 62.50
GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75 GaokaoBench_2010-2022_Math_II_MCQs_score: 93.75
GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0 GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
math_accuracy: 18.75 math_accuracy: 6.25
wikibench-wiki-single_choice_cncircular_perf_4: 25 wikibench-wiki-single_choice_cncircular_perf_4: 0.00
sanitized_mbpp_score: 62.50 sanitized_mbpp_score: 62.50
dingo_en_192_score: 50.00 dingo_en_192_score: 37.50
dingo_zh_170_score: 93.75 dingo_zh_170_score: 100.00
mmlu-other_accuracy: 76.92 mmlu-other_accuracy: 78.37
cmmlu-china-specific_accuracy: 84.17 cmmlu-china-specific_accuracy: 83.33
mmlu_pro_math_accuracy: 18.75 mmlu_pro_math_accuracy: 18.75
bbh-logical_deduction_seven_objects_score: 43.75 bbh-logical_deduction_seven_objects_score: 62.50
bbh-multistep_arithmetic_two_score: 56.25 bbh-multistep_arithmetic_two_score: 50.00
college_naive_average: 12.5 college_naive_average: 12.5
college_knowledge_naive_average: 87.5 college_knowledge_naive_average: 87.5
@ -230,7 +230,7 @@ internlm2_5-7b-turbomind:
mmlu_naive_average: 71.44 mmlu_naive_average: 71.44
mmlu_pro_naive_average: 38.18 mmlu_pro_naive_average: 38.18
openai_humaneval_humaneval_pass@1: 59.76 openai_humaneval_humaneval_pass@1: 59.76
openai_humaneval_v2_humaneval_pass@1: 51.22 openai_humaneval_v2_humaneval_pass@1: 57.93
sanitized_mbpp_score: 55.25 sanitized_mbpp_score: 55.25
dingo_en_192_score: 60.94 dingo_en_192_score: 60.94
dingo_zh_170_score: 67.65 dingo_zh_170_score: 67.65
@ -257,17 +257,17 @@ internlm2_5-7b-turbomind:
mmlu_pro_physics_accuracy: 26.02 mmlu_pro_physics_accuracy: 26.02
mmlu_pro_psychology_accuracy: 52.76 mmlu_pro_psychology_accuracy: 52.76
mmlu_pro_other_accuracy: 42.21 mmlu_pro_other_accuracy: 42.21
college_naive_average: 10.67 college_naive_average: 7.00
high_naive_average: 6.67 high_naive_average: 6.67
middle_naive_average: 26.67 middle_naive_average: 26.67
primary_naive_average: 60 primary_naive_average: 64.00
arithmetic_naive_average: 55 arithmetic_naive_average: 55
mathbench-a (average)_naive_average: 31.8 mathbench-a (average)_naive_average: 31.8
college_knowledge_naive_average: 62.34 college_knowledge_naive_average: 58.23
high_knowledge_naive_average: 59.83 high_knowledge_naive_average: 52.51
middle_knowledge_naive_average: 71.15 middle_knowledge_naive_average: 71.15
primary_knowledge_naive_average: 66.55 primary_knowledge_naive_average: 60.48
mathbench-t (average)_naive_average: 64.97 mathbench-t (average)_naive_average: 60.19
long_context: long_context:
Single-Needle-Retrieval(S-RT)-32000_naive_average: 100 Single-Needle-Retrieval(S-RT)-32000_naive_average: 100
Single-Needle-Retrieval-EN-32000_naive_average: 100 Single-Needle-Retrieval-EN-32000_naive_average: 100
@ -409,11 +409,11 @@ internlm2_5-7b-chat-turbomind:
alpaca_eval_koala: 28.21 alpaca_eval_koala: 28.21
alpaca_eval_oasst: 23.4 alpaca_eval_oasst: 23.4
alpaca_eval_selfinstruct: 30.95 alpaca_eval_selfinstruct: 30.95
alpaca_eval_vicuna: 33.75 alpaca_eval_vicuna: 25.00
compassarena_language_naive_average: 58.50 compassarena_language_naive_average: 53.00
compassarena_knowledge_naive_average: 36 compassarena_knowledge_naive_average: 36
compassarena_reason_v2_naive_average: 35 compassarena_reason_v2_naive_average: 35
compassarena_math_v2_naive_average: 25.95 compassarena_math_v2_naive_average: 16.07
compassarena_creationv2_zh_naive_average: 43.64 compassarena_creationv2_zh_naive_average: 43.64
fofo_test_prompts_overall: 0.35 fofo_test_prompts_overall: 0.35
fofo_test_prompts_cn_overall: 0.41 fofo_test_prompts_cn_overall: 0.41

View File

@ -12,10 +12,10 @@ chat:
gsm8k_accuracy: 46.88 gsm8k_accuracy: 46.88
race-high_accuracy: 81.25 race-high_accuracy: 81.25
deepseek-r1-distill-llama-8b-turbomind: deepseek-r1-distill-llama-8b-turbomind:
gsm8k_accuracy: 31.25 gsm8k_accuracy: 34.38
race-high_accuracy: 81.25 race-high_accuracy: 81.25
deepseek-r1-distill-qwen-1_5b-turbomind: deepseek-r1-distill-qwen-1_5b-turbomind:
gsm8k_accuracy: 37.5 gsm8k_accuracy: 28.12
race-high_accuracy: 53.12 race-high_accuracy: 53.12
deepseek-7b-chat-vllm: deepseek-7b-chat-vllm:
gsm8k_accuracy: 43.75 gsm8k_accuracy: 43.75

View File

@ -17,7 +17,7 @@ on:
required: false required: false
description: 'whether to build lmdeploy' description: 'whether to build lmdeploy'
type: boolean type: boolean
default: true default: false
repo_org_lmdeploy: repo_org_lmdeploy:
required: false required: false
description: 'Tested repository organization name. Default is internlm/lmdeploy' description: 'Tested repository organization name. Default is internlm/lmdeploy'
@ -146,7 +146,7 @@ jobs:
- name: Prepare - create conda env and install torch - cu12 - name: Prepare - create conda env and install torch - cu12
uses: nick-fields/retry@v3 uses: nick-fields/retry@v3
with: with:
max_attempts: 1 max_attempts: 3
timeout_minutes: 120 timeout_minutes: 120
command: | command: |
. ${{env.CONDA_PATH}}/bin/activate . ${{env.CONDA_PATH}}/bin/activate
@ -182,7 +182,7 @@ jobs:
pip list pip list
daily_run_test_volc: daily_run_test_volc:
if: ${{!cancelled()}} if: ${{!cancelled() && contains(needs.prepare_env.result, 'success')}}
needs: prepare_env needs: prepare_env
strategy: strategy:
fail-fast: false fail-fast: false
@ -222,7 +222,7 @@ jobs:
daily_run_test_local: daily_run_test_local:
if: ${{!cancelled()}} if: ${{!cancelled() && contains(needs.prepare_env.result, 'success')}}
needs: prepare_env needs: prepare_env
strategy: strategy:
fail-fast: false fail-fast: false
@ -303,7 +303,7 @@ jobs:
python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py
fullbench_run_test: fullbench_run_test:
if: ${{!cancelled()}} if: ${{!cancelled() && contains(needs.prepare_env.result, 'success')}}
needs: prepare_env needs: prepare_env
strategy: strategy:
fail-fast: false fail-fast: false