mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
update
This commit is contained in:
parent
c269cc054d
commit
3472ed113d
@ -58,7 +58,7 @@ for m in models:
|
||||
|
||||
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
|
||||
|
||||
judge_models = deepcopy([hf_internlm2_5_7b_chat_model])
|
||||
judge_models = deepcopy(hf_internlm2_5_7b_chat_model)
|
||||
judge_models[0]['abbr'] = judge_models[0]['abbr'] + '-judge'
|
||||
|
||||
eval = dict(
|
||||
|
24
.github/scripts/oc_score_baseline_fullbench.yaml
vendored
24
.github/scripts/oc_score_baseline_fullbench.yaml
vendored
@ -309,7 +309,7 @@ internlm2_5-7b-chat-turbomind:
|
||||
GaokaoBench_weighted_average: 78.6
|
||||
math_accuracy: 61
|
||||
cmo_fib_accuracy: 11
|
||||
aime2024_accuracy: 6.67
|
||||
aime2024_accuracy: 3.33
|
||||
Mathbench_naive_average: 64.23
|
||||
wikibench-wiki-single_choice_cncircular_perf_4: 31.32
|
||||
cmmlu_naive_average: 74.3
|
||||
@ -322,7 +322,7 @@ internlm2_5-7b-chat-turbomind:
|
||||
lcb_code_generation_pass@1: 17.75
|
||||
lcb_code_execution_pass@1: 32.57
|
||||
lcb_test_output_pass@1: 26.13
|
||||
bigcodebench_hard_instruct_pass@1: 8.45
|
||||
bigcodebench_hard_instruct_pass@1: 3.38
|
||||
bigcodebench_hard_complete_pass@1: 5.06
|
||||
teval_naive_average: 80
|
||||
SciCode_sub_accuracy: 5.56
|
||||
@ -384,7 +384,7 @@ internlm2_5-7b-chat-turbomind:
|
||||
college_knowledge_naive_average: 67.1
|
||||
high_knowledge_naive_average: 70
|
||||
middle_knowledge_naive_average: 80
|
||||
primary_knowledge_naive_average: 87
|
||||
primary_knowledge_naive_average: 90.12
|
||||
mathbench-t (average)_naive_average: 76
|
||||
subjective:
|
||||
alignment_bench_v1_1_总分: 5.68
|
||||
@ -524,7 +524,7 @@ qwen2.5-7b-instruct-turbomind:
|
||||
humanevalx-python_pass@1: 50
|
||||
humanevalx-cpp_pass@1: 42.07
|
||||
humanevalx-go_pass@1: 0
|
||||
humanevalx-java_pass@1: 74.39
|
||||
humanevalx-java_pass@1: 53.05
|
||||
humanevalx-js_pass@1: 75
|
||||
ds1000_Pandas_accuracy: 14.09
|
||||
ds1000_Numpy_accuracy: 8.18
|
||||
@ -548,7 +548,7 @@ qwen2.5-7b-instruct-turbomind:
|
||||
openai_mmmlu_lite_SW-KE_accuracy: 36.42
|
||||
openai_mmmlu_lite_YO-NG_accuracy: 32.14
|
||||
openai_mmmlu_lite_ZH-CN_accuracy: 69.61
|
||||
college_naive_average: 48
|
||||
college_naive_average: 44.33
|
||||
high_naive_average: 59
|
||||
middle_naive_average: 78
|
||||
primary_naive_average: 85.67
|
||||
@ -658,7 +658,7 @@ internlm2_5-7b-chat-pytorch:
|
||||
college_naive_average: 21
|
||||
high_naive_average: 47
|
||||
middle_naive_average: 59.67
|
||||
primary_naive_average: 76
|
||||
primary_naive_average: 72.33
|
||||
arithmetic_naive_average: 62
|
||||
mathbench-a (average)_naive_average: 53.13
|
||||
college_knowledge_naive_average: 68.99
|
||||
@ -688,7 +688,7 @@ qwen2.5-7b-instruct-pytorch:
|
||||
gsm8k_accuracy: 91.66
|
||||
GaokaoBench_weighted_average: 80.02
|
||||
math_accuracy: 73.74
|
||||
cmo_fib_accuracy: 26.44
|
||||
cmo_fib_accuracy: 22.60
|
||||
aime2024_accuracy: 13.33
|
||||
Mathbench_naive_average: 77.08
|
||||
wikibench-wiki-single_choice_cncircular_perf_4: 34
|
||||
@ -793,8 +793,8 @@ internlm3-8b-instruct-turbomind:
|
||||
gsm8k_accuracy: 91.28
|
||||
GaokaoBench_weighted_average: 86.59
|
||||
math_accuracy: 76.96
|
||||
cmo_fib_accuracy: 35.1
|
||||
aime2024_accuracy: 16.67
|
||||
cmo_fib_accuracy: 38.46
|
||||
aime2024_accuracy: 13.33
|
||||
Mathbench_naive_average: 78.96
|
||||
wikibench-wiki-single_choice_cncircular_perf_4: 37.45
|
||||
cmmlu_naive_average: 83.33
|
||||
@ -841,7 +841,7 @@ internlm3-8b-instruct-turbomind:
|
||||
humanevalx-python_pass@1: 43.9
|
||||
humanevalx-cpp_pass@1: 20.12
|
||||
humanevalx-go_pass@1: 0
|
||||
humanevalx-java_pass@1: 74.39
|
||||
humanevalx-java_pass@1: 40.85
|
||||
humanevalx-js_pass@1: 65.24
|
||||
ds1000_Pandas_accuracy: 16.49
|
||||
ds1000_Numpy_accuracy: 34.09
|
||||
@ -907,7 +907,7 @@ internlm3-8b-instruct-pytorch:
|
||||
mmlu_pro_naive_average: 58.16
|
||||
openai_humaneval_humaneval_pass@1: 82.32
|
||||
sanitized_mbpp_score: 70.04
|
||||
humanevalx_naive_average: 39.76
|
||||
humanevalx_naive_average: 25.49
|
||||
ds1000_naive_average: 27.84
|
||||
lcb_code_generation_pass@1: 34.5
|
||||
lcb_code_execution_pass@1: 48.02
|
||||
@ -946,7 +946,7 @@ internlm3-8b-instruct-pytorch:
|
||||
humanevalx-python_pass@1: 42.68
|
||||
humanevalx-cpp_pass@1: 19.51
|
||||
humanevalx-go_pass@1: 0
|
||||
humanevalx-java_pass@1: 72.56
|
||||
humanevalx-java_pass@1: 0.00
|
||||
humanevalx-js_pass@1: 64.02
|
||||
ds1000_Pandas_accuracy: 14.09
|
||||
ds1000_Numpy_accuracy: 35
|
||||
|
Loading…
Reference in New Issue
Block a user