mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
update
parent e3c2521df5
commit ba99868c77
.github/scripts/oc_score_baseline_fullbench.yaml (vendored): 34 changes
@@ -78,38 +78,38 @@ internlm2_5-7b-chat-hf_fullbench:
 internlm2_5-7b-chat-turbomind_fullbench:
   objective:
     race-high_accuracy: 93.75
-    ARC-c_accuracy: 93.75
+    ARC-c_accuracy: 87.50
     BoolQ_accuracy: 68.75
     triviaqa_wiki_1shot_score: 50
     nq_open_1shot_score: 25
     IFEval_Prompt-level-strict-accuracy: 56.25
-    drop_accuracy: 81.25
+    drop_accuracy: 75
     GPQA_diamond_accuracy: 31.25
-    hellaswag_accuracy: 81.25
-    TheoremQA_score: 6.25
+    hellaswag_accuracy: 87.5
+    TheoremQA_score: 12.5
     musr_average_naive_average: 39.58
-    korbench_single_naive_average: 37.50
-    gsm8k_accuracy: 68.75
-    math_accuracy: 68.75
+    korbench_single_naive_average: 40
+    gsm8k_accuracy: 62.5
+    math_accuracy: 75
     cmo_fib_accuracy: 6.25
     aime2024_accuracy: 6.25
-    wikibench-wiki-single_choice_cncircular_perf_4: 50.00
+    wikibench-wiki-single_choice_cncircular_perf_4: 25
     sanitized_mbpp_score: 68.75
-    ds1000_naive_average: 16.96
+    ds1000_naive_average: 17.86
     lcb_code_generation_pass@1: 12.5
     lcb_code_execution_pass@1: 43.75
-    lcb_test_output_pass@1: 25.00
-    bbh-logical_deduction_seven_objects_score: 50.00
-    bbh-multistep_arithmetic_two_score: 68.75
-    mmlu-other_accuracy: 69.71
-    cmmlu-china-specific_accuracy: 75.83
+    lcb_test_output_pass@1: 18.75
+    bbh-logical_deduction_seven_objects_score: 56.25
+    bbh-multistep_arithmetic_two_score: 75
+    mmlu-other_accuracy: 72.6
+    cmmlu-china-specific_accuracy: 78.33
     mmlu_pro_math_accuracy: 31.25
-    ds1000_Pandas_accuracy: 0
+    ds1000_Pandas_accuracy: 12.5
     ds1000_Numpy_accuracy: 0
     ds1000_Tensorflow_accuracy: 12.5
-    ds1000_Scipy_accuracy: 18.75
+    ds1000_Scipy_accuracy: 25
     ds1000_Sklearn_accuracy: 18.75
-    ds1000_Pytorch_accuracy: 18.75
+    ds1000_Pytorch_accuracy: 6.25
     ds1000_Matplotlib_accuracy: 50.00
     openai_mmmlu_lite_AR-XY_accuracy: 37.5
     college_naive_average: 12.50
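Baseline files like this one are normally consumed by a CI score-check step that compares the scores from a regression run against the expected values and flags drift. The snippet below is a minimal sketch of that pattern, assuming a flat results dict keyed by the same metric names as the YAML; it is not the actual .github/scripts checker, and the tolerance, function name, and result shape are illustrative assumptions.

# Minimal sketch of consuming a baseline like oc_score_baseline_fullbench.yaml.
# NOT the actual OpenCompass CI checker; tolerance, function name, and the
# shape of the results dict are assumptions for illustration.
import yaml

BASELINE_PATH = ".github/scripts/oc_score_baseline_fullbench.yaml"
TOLERANCE = 0.5  # assumed allowed drift, in score points


def check_against_baseline(model_key: str, results: dict) -> list:
    """Return mismatch messages for metrics that drift past TOLERANCE or are missing."""
    with open(BASELINE_PATH) as f:
        baseline = yaml.safe_load(f)
    expected = baseline[model_key]["objective"]
    failures = []
    for metric, want in expected.items():
        got = results.get(metric)
        if got is None:
            failures.append(f"{metric}: missing from results")
        elif abs(float(got) - float(want)) > TOLERANCE:
            failures.append(f"{metric}: got {got}, expected {want}")
    return failures


if __name__ == "__main__":
    # Hypothetical partial run output; any baseline metric not present here
    # is reported as missing.
    run = {"race-high_accuracy": 93.75, "ARC-c_accuracy": 87.50}
    for msg in check_against_baseline("internlm2_5-7b-chat-turbomind_fullbench", run):
        print(msg)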