OpenCompass/.github/scripts/oc_score_baseline_fullbench.yaml
zhulinJulia24 ed81f9df30
[CI] update torch version and add more datasets into daily testcase (#1701)
* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

---------

Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn>
2024-11-21 10:37:33 +08:00

174 lines
4.4 KiB
YAML

internlm2_5-7b-chat-hf_fullbench:
race-high: 93.75
ARC-c: 93.75
BoolQ: 81.25
triviaqa_wiki_1shot: 50
nq_open_1shot: 25
IFEval: 50
drop: 81.25
GPQA_diamond: 25
hellaswag: 87.5
TheoremQA: 18.75
musr_average: 39.58
gsm8k: 56.25
math: 75
cmo_fib: 6.25
aime2024: 6.25
wikibench-wiki-single_choice_cncircular: 50
sanitized_mbpp: 68.75
ds1000: 16.96
lcb_code_generation: 12.5
lcb_code_execution: 43.75
lcb_test_output: 18.75
bbh-logical_deduction_seven_objects: 50
bbh-multistep_arithmetic_two: 68.75
mmlu-other: 72.6
cmmlu-china-specific: 76.25
mmlu_pro_math: 25
ds1000_Pandas: 12.5
ds1000_Numpy: 0
ds1000_Tensorflow: 12.5
ds1000_Scipy: 18.75
ds1000_Sklearn: 18.75
ds1000_Pytorch: 12.5
ds1000_Matplotlib: 43.75
openai_mmmlu_lite_AR-XY: 37.5
college: 12.5
college_knowledge: 87.5
Alignbench总分: 0.65
Alignbench专业能力: 7.83
AlpacaEvaltotal: 0
AlpacaEvalhelpful_base: 0
CompassArenacompassarena_language: 60
CompassArenacompassarena_knowledge: 56
CompassArenacompassarena_reason_v2: 50
CompassArenacompassarena_math_v2: 53.5
CompassArenacompassarena_creationv2_zh: 48.75
Fofofofo_test_prompts: 1
followbenchHSR_AVG: 1
followbenchSSR_AVG: 1
followbenchHSR_L1: 1
followbenchHSR_L2: 1
followbenchHSR_L3: 1
followbenchHSR_L4: 1
followbenchHSR_L5: 1
followbenchSSR_L1: 1
followbenchSSR_L2: 1
followbenchSSR_L3: 1
followbenchSSR_L4: 1
followbenchSSR_L5: 1
MTBench101average: 8.1
Wildbenchscore: -3.3333333333333335
internlm2_5-7b-chat-turbomind_fullbench:
race-high: 93.75
ARC-c: 87.5
BoolQ: 68.75
triviaqa_wiki_1shot: 50
nq_open_1shot: 25
IFEval: 50
drop: 75
hellaswag: 81.25
TheoremQA: 6.25
musr_average: 39.58
gsm8k: 68.75
math: 75
GPQA_diamond: 25
cmo_fib: 6.25
aime2024: 6.25
wikibench-wiki-single_choice_cncircular: 25
sanitized_mbpp: 68.75
ds1000: 13.39
lcb_code_generation: 12.5
lcb_code_execution: 43.75
lcb_test_output: 12.5
bbh-logical_deduction_seven_objects: 56.25
bbh-multistep_arithmetic_two: 68.75
mmlu-other: 74.04
cmmlu-china-specific: 76.25
mmlu_pro_math: 25
ds1000_Pandas: 0
ds1000_Numpy: 0
ds1000_Tensorflow: 12.5
ds1000_Scipy: 18.75
ds1000_Sklearn: 18.75
ds1000_Pytorch: 6.25
ds1000_Matplotlib: 37.5
openai_mmmlu_lite_AR-XY: 37.5
college: 0
college_knowledge: 87.5
Alignbench总分: 0.64
Alignbench专业能力: 7.6
AlpacaEvaltotal: 10
AlpacaEvalhelpful_base: 10
CompassArenacompassarena_language: 59
CompassArenacompassarena_knowledge: 57
CompassArenacompassarena_reason_v2: 49.5
CompassArenacompassarena_math_v2: 51
CompassArenacompassarena_creationv2_zh: 43.75
Fofofofo_test_prompts: 1
followbenchHSR_AVG: 1
followbenchSSR_AVG: 1
followbenchHSR_L1: 1
followbenchHSR_L2: 1
followbenchHSR_L3: 1
followbenchHSR_L4: 1
followbenchHSR_L5: 1
followbenchSSR_L1: 1
followbenchSSR_L2: 1
followbenchSSR_L3: 1
followbenchSSR_L4: 1
followbenchSSR_L5: 1
MTBench101average: 8.1
Wildbenchscore: -8.333333333333334
internlm2_5-7b-hf_fullbench:
race-high: 100
ARC-c: 68.75
BoolQ: 87.5
GPQA_diamond: 62.5
drop: 62.5
math: 12.5
wikibench-wiki-single_choice_cncircular: 25
sanitized_mbpp: 56.25
gsm8k: 37.5
triviaqa_wiki_1shot: 43.75
nq_open_1shot: 43.75
winogrande: 75
hellaswag: 93.75
TheoremQA: 25
dingo_en_192: 37.5
dingo_zh_170: 100
college: 12.5
college_knowledge: 87.5
bbh-logical_deduction_seven_objects: 43.75
bbh-multistep_arithmetic_two: 56.25
mmlu-other: 76.92
cmmlu-china-specific: 84.17
mmlu_pro_math: 18.75
internlm2_5-7b-turbomind_fullbench:
race-high: 100
ARC-c: 68.75
BoolQ: 87.5
GPQA_diamond: 62.5
drop: 62.5
math: 18.75
wikibench-wiki-single_choice_cncircular: 25
sanitized_mbpp: 56.25
gsm8k: 68.75
triviaqa_wiki_1shot: 43.75
nq_open_1shot: 43.75
winogrande: 87.5
hellaswag: 93.75
TheoremQA: 31.25
dingo_en_192: 43.75
dingo_zh_170: 100
college: 12.5
college_knowledge: 87.5
bbh-logical_deduction_seven_objects: 50
bbh-multistep_arithmetic_two: 56.25
mmlu-other: 76.92
cmmlu-china-specific: 84.17
mmlu_pro_math: 18.75