# C-Eval
```bash
python3 run.py --models hf_internlm2_7b --datasets ceval_internal_ppl_93e5ce --debug
python3 run.py --models hf_internlm2_chat_7b --datasets ceval_internal_gen_2daf24 --debug
```
## Base Models
| model | ceval-test | ceval-test-hard | ceval-test-stem | ceval-test-social-science | ceval-test-humanities | ceval-test-other | ceval-dev | ceval-dev-hard | ceval-dev-stem | ceval-dev-social-science | ceval-dev-humanities | ceval-dev-other |
|:---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
| llama-7b-turbomind | 26.61 | 27.75 | 27.20 | 26.31 | 25.90 | 26.52 | 27.44 | 27.68 | 27.16 | 29.49 | 24.18 | 29.36 |
| llama-13b-turbomind | 29.18 | 25.59 | 27.66 | 33.86 | 28.29 | 28.58 | 31.75 | 30.32 | 31.39 | 35.22 | 30.16 | 30.82 |
| llama-30b-turbomind | 35.09 | 31.68 | 34.56 | 39.89 | 33.02 | 33.76 | 37.70 | 31.97 | 34.80 | 42.72 | 41.19 | 34.93 |
| llama-65b-turbomind | 37.98 | 29.47 | 36.03 | 45.03 | 36.51 | 36.56 | 40.46 | 33.76 | 36.37 | 46.47 | 42.26 | 40.63 |
| llama-2-7b-turbomind | 30.13 | 26.26 | 29.29 | 33.02 | 31.02 | 28.15 | 32.70 | 25.85 | 28.75 | 39.75 | 37.04 | 29.13 |
| llama-2-13b-turbomind | 37.38 | 30.81 | 35.85 | 43.98 | 36.81 | 34.75 | 40.43 | 31.34 | 35.67 | 45.75 | 45.32 | 39.36 |
| llama-2-70b-turbomind | 49.53 | 33.48 | 44.73 | 60.19 | 50.93 | 47.17 | 50.26 | 32.53 | 44.83 | 59.44 | 54.45 | 47.58 |
| llama-3-8b-turbomind | 48.83 | 34.47 | 46.02 | 56.48 | 49.15 | 46.69 | 50.45 | 33.76 | 45.94 | 58.08 | 50.93 | 51.25 |
| llama-3-70b-turbomind | 66.56 | 54.09 | 64.08 | 76.43 | 64.38 | 64.25 | 67.30 | 52.35 | 62.67 | 77.89 | 69.76 | 63.65 |
| internlm2-1.8b-turbomind | 44.79 | 33.93 | 41.19 | 54.26 | 47.15 | 40.35 | 46.64 | 33.00 | 38.62 | 57.28 | 51.30 | 46.89 |
| internlm2-7b-turbomind | 63.54 | 45.32 | 58.10 | 76.40 | 66.94 | 58.32 | 64.23 | 40.09 | 54.37 | 76.88 | 70.11 | 64.77 |
| internlm2-20b-turbomind | 67.28 | 50.15 | 62.33 | 79.59 | 70.55 | 61.82 | 66.73 | 42.50 | 59.25 | 79.98 | 73.43 | 61.56 |
| qwen-1.8b-turbomind | 54.24 | 38.60 | 50.02 | 68.18 | 55.33 | 48.13 | 53.78 | 33.38 | 46.36 | 68.40 | 57.57 | 50.17 |
| qwen-7b-turbomind | 62.06 | 42.73 | 56.21 | 77.12 | 65.28 | 55.76 | 63.23 | 36.99 | 54.74 | 78.55 | 68.94 | 59.02 |
| qwen-14b-turbomind | 70.33 | 53.61 | 65.25 | 83.19 | 72.85 | 65.37 | 72.05 | 55.03 | 66.07 | 85.59 | 74.91 | 67.78 |
| qwen-72b-turbomind | 83.25 | 66.78 | 78.44 | 91.75 | 83.86 | 83.63 | 83.60 | 63.68 | 78.05 | 90.25 | 87.13 | 84.13 |
| qwen1.5-0.5b-hf | 48.36 | 35.55 | 44.72 | 62.00 | 48.51 | 42.41 | 50.43 | 37.00 | 46.28 | 62.64 | 48.11 | 49.18 |
| qwen1.5-1.8b-hf | 58.67 | 40.98 | 53.91 | 74.52 | 58.51 | 53.06 | 59.38 | 43.02 | 53.45 | 75.88 | 60.06 | 54.47 |
| qwen1.5-4b-hf | 66.55 | 48.50 | 61.45 | 81.12 | 67.90 | 61.22 | 66.46 | 43.12 | 56.76 | 82.89 | 67.61 | 68.03 |
| qwen1.5-7b-hf | 72.49 | 52.90 | 66.77 | 85.50 | 74.37 | 69.19 | 73.57 | 49.16 | 66.32 | 84.23 | 77.30 | 73.34 |
| qwen1.5-14b-hf | 76.93 | 60.50 | 72.08 | 88.81 | 77.95 | 73.94 | 77.86 | 54.81 | 71.55 | 86.79 | 82.86 | 76.23 |
| qwen1.5-32b-hf | 82.50 | 66.67 | 77.97 | 90.93 | 83.66 | 81.88 | 82.79 | 71.06 | 80.01 | 89.02 | 83.36 | 81.62 |
| qwen1.5-72b-hf | 83.03 | 65.09 | 77.90 | 91.47 | 83.85 | 83.86 | 83.72 | 64.09 | 77.26 | 91.87 | 87.64 | 84.14 |
| qwen1.5-moe-a2-7b-hf | 76.67 | 51.37 | 68.89 | 88.33 | 77.15 | 79.73 | 77.90 | 51.25 | 67.27 | 89.28 | 83.16 | 81.60 |
| mistral-7b-v0.1-hf | 43.76 | 33.85 | 42.23 | 49.97 | 41.10 | 43.54 | 47.54 | 33.97 | 44.74 | 54.80 | 51.52 | 42.06 |
| mistral-7b-v0.2-hf | 42.81 | 32.84 | 41.00 | 50.19 | 39.45 | 42.77 | 46.44 | 31.67 | 42.89 | 54.50 | 48.75 | 43.23 |
| mixtral-8x7b-v0.1-hf | 51.15 | 41.46 | 50.93 | 59.19 | 46.69 | 48.72 | 55.31 | 42.04 | 52.78 | 62.00 | 56.44 | 52.71 |
| mixtral-8x22b-v0.1-hf | 58.13 | 48.31 | 58.01 | 66.94 | 53.60 | 54.86 | 60.50 | 45.67 | 57.44 | 71.27 | 61.31 | 55.47 |
| yi-6b-hf | 70.78 | 43.72 | 60.54 | 83.29 | 75.39 | 73.40 | 73.13 | 46.87 | 63.14 | 85.52 | 78.70 | 74.45 |
| yi-34b-hf | 80.93 | 58.51 | 73.48 | 89.24 | 83.65 | 84.18 | 81.62 | 56.95 | 71.64 | 89.73 | 87.49 | 86.53 |
| deepseek-7b-base-hf | 43.68 | 28.90 | 37.03 | 53.55 | 50.14 | 40.34 | 45.07 | 31.94 | 38.81 | 56.68 | 47.10 | 43.85 |
| deepseek-67b-base-hf | 66.66 | 44.25 | 57.89 | 79.02 | 72.36 | 65.66 | 66.65 | 38.62 | 56.65 | 79.56 | 73.72 | 66.01 |
### Details on Test Split
model |
computer_network |
operating_system |
computer_architecture |
college_programming |
college_physics |
college_chemistry |
advanced_mathematics |
probability_and_statistics |
discrete_mathematics |
electrical_engineer |
metrology_engineer |
high_school_mathematics |
llama-7b-turbomind |
29.82 |
25.70 |
26.94 |
30.99 |
32.95 |
23.66 |
26.01 |
22.89 |
27.45 |
30.09 |
26.48 |
33.13 |
llama-13b-turbomind |
33.33 |
37.99 |
31.09 |
29.82 |
22.16 |
27.23 |
31.79 |
27.11 |
24.84 |
28.02 |
33.33 |
30.72 |
llama-30b-turbomind |
40.94 |
48.60 |
40.41 |
34.21 |
32.95 |
35.71 |
36.42 |
32.53 |
27.45 |
31.56 |
36.07 |
30.12 |
llama-65b-turbomind |
41.52 |
50.84 |
44.04 |
40.94 |
27.84 |
29.46 |
28.32 |
30.72 |
29.41 |
35.10 |
42.47 |
30.12 |
llama-2-7b-turbomind |
33.92 |
37.99 |
34.72 |
30.99 |
26.70 |
21.88 |
31.79 |
25.30 |
24.18 |
31.56 |
39.73 |
30.12 |
llama-2-13b-turbomind |
40.94 |
46.93 |
37.82 |
36.26 |
30.68 |
29.46 |
35.84 |
30.72 |
24.84 |
32.74 |
42.92 |
34.94 |
llama-2-70b-turbomind |
55.56 |
58.66 |
53.89 |
47.95 |
34.09 |
33.48 |
32.95 |
27.11 |
34.64 |
37.76 |
57.99 |
29.52 |
llama-3-8b-turbomind |
55.56 |
58.66 |
55.96 |
51.17 |
27.27 |
35.27 |
36.42 |
31.33 |
34.64 |
40.12 |
50.68 |
30.72 |
llama-3-70b-turbomind |
69.59 |
75.98 |
69.95 |
71.64 |
49.43 |
58.04 |
52.02 |
53.01 |
58.82 |
45.72 |
68.95 |
40.96 |
internlm2-1.8b-turbomind |
40.35 |
40.78 |
39.38 |
32.16 |
34.66 |
34.38 |
31.21 |
31.33 |
35.95 |
35.10 |
51.60 |
27.71 |
internlm2-7b-turbomind |
56.14 |
57.54 |
62.69 |
49.42 |
43.75 |
48.21 |
34.68 |
32.53 |
33.33 |
41.00 |
60.27 |
40.36 |
internlm2-20b-turbomind |
62.57 |
65.36 |
66.84 |
58.77 |
43.18 |
51.79 |
39.31 |
40.36 |
35.95 |
42.77 |
66.67 |
47.59 |
qwen-1.8b-turbomind |
46.20 |
41.90 |
46.63 |
36.84 |
40.34 |
36.61 |
27.75 |
28.92 |
32.68 |
36.58 |
57.08 |
30.12 |
qwen-7b-turbomind |
52.63 |
54.75 |
54.40 |
46.20 |
35.80 |
44.20 |
36.99 |
27.71 |
26.80 |
38.35 |
57.99 |
33.13 |
qwen-14b-turbomind |
58.48 |
64.80 |
59.07 |
54.68 |
45.45 |
57.59 |
45.09 |
33.73 |
39.22 |
49.26 |
67.58 |
45.78 |
qwen-72b-turbomind |
83.04 |
73.74 |
79.27 |
76.61 |
75.00 |
64.29 |
49.13 |
44.58 |
46.41 |
66.37 |
85.84 |
68.07 |
qwen1.5-0.5b-hf |
37.43 |
40.22 |
41.45 |
35.09 |
40.91 |
34.82 |
30.06 |
27.11 |
26.80 |
29.79 |
54.34 |
31.93 |
qwen1.5-1.8b-hf |
47.37 |
50.84 |
47.67 |
38.30 |
43.18 |
35.27 |
29.48 |
30.12 |
33.99 |
39.53 |
58.90 |
28.92 |
qwen1.5-4b-hf |
62.57 |
56.98 |
56.99 |
46.78 |
48.30 |
45.98 |
40.46 |
34.34 |
31.37 |
46.61 |
62.10 |
43.37 |
qwen1.5-7b-hf |
66.08 |
62.57 |
66.32 |
55.56 |
54.55 |
47.77 |
41.62 |
31.93 |
35.95 |
49.85 |
74.43 |
49.40 |
qwen1.5-14b-hf |
71.35 |
66.48 |
68.39 |
64.91 |
57.95 |
65.62 |
41.62 |
40.36 |
47.71 |
56.64 |
79.45 |
56.63 |
qwen1.5-32b-hf |
84.80 |
73.18 |
74.61 |
70.18 |
71.59 |
61.61 |
49.13 |
45.78 |
49.02 |
61.95 |
87.67 |
72.89 |
qwen1.5-72b-hf |
85.38 |
73.74 |
78.24 |
78.36 |
72.73 |
63.39 |
43.35 |
40.96 |
49.02 |
65.78 |
85.84 |
66.27 |
qwen1.5-moe-a2-7b-hf |
77.78 |
73.74 |
68.91 |
64.91 |
66.48 |
49.11 |
33.53 |
36.75 |
35.95 |
61.06 |
91.32 |
40.96 |
mistral-7b-v0.1-hf |
55.56 |
55.31 |
56.99 |
48.25 |
39.77 |
39.29 |
33.53 |
25.90 |
31.37 |
35.99 |
45.21 |
27.11 |
mistral-7b-v0.2-hf |
56.14 |
53.63 |
55.44 |
47.66 |
36.36 |
34.38 |
32.37 |
25.30 |
33.33 |
31.86 |
45.21 |
29.52 |
mixtral-8x7b-v0.1-hf |
62.57 |
64.80 |
60.10 |
60.53 |
38.64 |
42.41 |
40.46 |
37.35 |
45.75 |
35.99 |
60.27 |
34.94 |
mixtral-8x22b-v0.1-hf |
65.50 |
74.86 |
63.73 |
65.79 |
46.59 |
52.68 |
52.02 |
45.78 |
52.94 |
42.77 |
62.56 |
39.16 |
yi-6b-hf |
68.42 |
63.13 |
69.43 |
57.89 |
42.05 |
48.66 |
31.79 |
33.13 |
28.76 |
49.85 |
74.89 |
37.35 |
yi-34b-hf |
83.63 |
80.45 |
74.09 |
68.42 |
62.50 |
60.27 |
45.09 |
38.55 |
50.33 |
65.19 |
88.58 |
49.40 |
deepseek-7b-base-hf |
44.44 |
44.13 |
44.56 |
36.26 |
30.68 |
29.02 |
32.37 |
24.70 |
26.14 |
35.99 |
48.86 |
28.31 |
deepseek-67b-base-hf |
63.16 |
70.39 |
65.80 |
59.36 |
42.61 |
45.54 |
35.84 |
38.55 |
42.48 |
44.54 |
68.95 |
33.73 |
model |
high_school_physics |
high_school_chemistry |
high_school_biology |
middle_school_mathematics |
middle_school_biology |
middle_school_physics |
middle_school_chemistry |
veterinary_medicine |
college_economics |
business_administration |
marxism |
mao_zedong_thought |
llama-7b-turbomind |
29.14 |
26.74 |
24.57 |
29.94 |
22.92 |
23.60 |
20.00 |
30.95 |
29.98 |
24.58 |
25.70 |
25.11 |
llama-13b-turbomind |
22.29 |
18.60 |
28.00 |
26.55 |
26.56 |
25.28 |
19.46 |
29.05 |
28.77 |
28.57 |
39.66 |
43.38 |
llama-30b-turbomind |
25.14 |
33.14 |
36.00 |
31.07 |
39.06 |
28.09 |
33.51 |
38.10 |
35.21 |
35.88 |
48.04 |
33.33 |
llama-65b-turbomind |
33.71 |
26.16 |
38.29 |
33.90 |
44.27 |
36.52 |
38.92 |
38.10 |
37.42 |
42.19 |
59.22 |
48.40 |
llama-2-7b-turbomind |
26.86 |
23.26 |
26.86 |
28.81 |
28.12 |
29.78 |
22.70 |
30.48 |
31.79 |
30.56 |
33.52 |
36.07 |
llama-2-13b-turbomind |
28.00 |
31.98 |
36.57 |
36.72 |
38.54 |
36.52 |
37.84 |
46.67 |
37.02 |
36.54 |
57.54 |
41.10 |
llama-2-70b-turbomind |
40.00 |
36.05 |
48.00 |
36.72 |
66.67 |
55.06 |
55.68 |
52.86 |
51.91 |
48.50 |
68.16 |
60.73 |
llama-3-8b-turbomind |
41.71 |
38.37 |
50.86 |
36.16 |
61.98 |
63.48 |
63.78 |
56.19 |
41.65 |
49.17 |
69.27 |
54.34 |
llama-3-70b-turbomind |
63.43 |
56.98 |
69.14 |
59.32 |
84.90 |
75.28 |
78.92 |
79.52 |
68.81 |
59.80 |
86.59 |
79.91 |
internlm2-1.8b-turbomind |
30.29 |
45.93 |
46.29 |
33.33 |
63.02 |
60.11 |
62.70 |
47.62 |
35.61 |
37.87 |
69.27 |
61.64 |
internlm2-7b-turbomind |
64.57 |
65.12 |
76.00 |
54.80 |
91.15 |
85.96 |
90.27 |
74.29 |
57.34 |
50.50 |
86.59 |
83.56 |
internlm2-20b-turbomind |
68.57 |
74.42 |
78.86 |
58.76 |
91.67 |
90.45 |
90.27 |
72.38 |
57.95 |
55.81 |
88.83 |
88.58 |
qwen-1.8b-turbomind |
55.43 |
56.98 |
61.14 |
54.80 |
85.42 |
84.83 |
85.41 |
54.76 |
43.06 |
44.19 |
83.80 |
79.91 |
qwen-7b-turbomind |
68.00 |
69.19 |
82.86 |
57.63 |
93.75 |
87.64 |
92.43 |
63.81 |
47.28 |
57.48 |
86.59 |
82.65 |
qwen-14b-turbomind |
78.86 |
83.14 |
92.57 |
67.23 |
96.88 |
95.51 |
96.76 |
73.33 |
56.94 |
64.45 |
91.62 |
86.76 |
qwen-72b-turbomind |
93.14 |
93.60 |
95.43 |
88.70 |
98.44 |
97.75 |
99.46 |
90.00 |
75.45 |
80.73 |
96.09 |
99.54 |
qwen1.5-0.5b-hf |
48.57 |
44.19 |
60.00 |
40.68 |
73.44 |
69.66 |
78.92 |
49.05 |
34.41 |
40.20 |
79.89 |
74.43 |
qwen1.5-1.8b-hf |
58.86 |
68.02 |
76.00 |
59.32 |
91.15 |
90.45 |
87.03 |
63.81 |
44.87 |
48.50 |
86.03 |
90.41 |
qwen1.5-4b-hf |
66.86 |
77.33 |
82.86 |
68.93 |
95.31 |
92.70 |
97.30 |
71.90 |
51.31 |
61.13 |
91.62 |
94.52 |
qwen1.5-7b-hf |
79.43 |
82.56 |
91.43 |
77.40 |
96.88 |
95.51 |
96.22 |
80.00 |
62.37 |
69.77 |
93.30 |
97.26 |
qwen1.5-14b-hf |
86.29 |
87.79 |
93.14 |
83.05 |
97.92 |
95.51 |
97.84 |
82.86 |
63.78 |
77.08 |
95.53 |
96.35 |
qwen1.5-32b-hf |
88.00 |
95.35 |
94.86 |
91.53 |
97.92 |
99.44 |
100.00 |
90.00 |
73.44 |
78.74 |
94.97 |
98.63 |
qwen1.5-72b-hf |
91.43 |
93.60 |
95.43 |
88.70 |
97.92 |
98.31 |
99.46 |
90.00 |
74.25 |
80.40 |
94.41 |
98.63 |
qwen1.5-moe-a2-7b-hf |
70.86 |
77.33 |
82.86 |
68.36 |
97.92 |
93.26 |
97.30 |
89.52 |
70.22 |
74.75 |
96.09 |
98.17 |
mistral-7b-v0.1-hf |
33.14 |
40.70 |
40.57 |
40.11 |
47.92 |
49.44 |
50.81 |
47.62 |
44.87 |
37.87 |
58.10 |
48.40 |
mistral-7b-v0.2-hf |
34.86 |
36.63 |
45.71 |
36.72 |
46.35 |
46.07 |
48.65 |
43.81 |
43.46 |
39.53 |
57.54 |
48.86 |
mixtral-8x7b-v0.1-hf |
49.71 |
42.44 |
53.71 |
47.46 |
62.50 |
61.24 |
60.00 |
57.62 |
52.52 |
44.52 |
68.72 |
57.99 |
mixtral-8x22b-v0.1-hf |
54.29 |
43.02 |
58.29 |
55.93 |
76.04 |
66.29 |
75.68 |
66.19 |
60.97 |
51.83 |
74.30 |
70.78 |
yi-6b-hf |
58.86 |
69.19 |
78.29 |
43.50 |
92.19 |
89.33 |
90.27 |
83.81 |
59.56 |
70.10 |
93.85 |
97.72 |
yi-34b-hf |
80.00 |
81.98 |
93.14 |
65.54 |
97.40 |
95.51 |
96.76 |
92.86 |
74.04 |
76.08 |
94.97 |
97.26 |
deepseek-7b-base-hf |
29.14 |
30.81 |
33.14 |
24.29 |
53.12 |
45.51 |
48.65 |
50.48 |
38.23 |
44.19 |
62.01 |
65.30 |
deepseek-67b-base-hf |
60.00 |
55.23 |
64.00 |
46.33 |
84.90 |
79.78 |
83.24 |
73.33 |
57.75 |
63.79 |
89.94 |
88.58 |
model |
education_science |
teacher_qualification |
high_school_politics |
high_school_geography |
middle_school_politics |
middle_school_geography |
modern_chinese_history |
ideological_and_moral_cultivation |
logic |
law |
chinese_language_and_literature |
art_studies |
llama-7b-turbomind |
22.96 |
31.58 |
25.57 |
29.78 |
22.80 |
25.00 |
21.70 |
21.51 |
25.00 |
26.24 |
22.49 |
25.84 |
llama-13b-turbomind |
29.26 |
30.83 |
33.52 |
36.52 |
34.72 |
33.33 |
24.06 |
40.12 |
26.47 |
33.48 |
30.14 |
29.87 |
llama-30b-turbomind |
37.41 |
46.37 |
32.95 |
38.20 |
50.78 |
40.74 |
28.77 |
45.93 |
33.33 |
32.13 |
39.23 |
22.82 |
llama-65b-turbomind |
39.63 |
51.13 |
31.82 |
39.89 |
58.03 |
42.59 |
34.91 |
55.23 |
39.71 |
30.32 |
37.80 |
32.89 |
llama-2-7b-turbomind |
27.78 |
34.34 |
31.82 |
34.83 |
35.23 |
34.26 |
28.77 |
38.95 |
32.35 |
33.94 |
27.27 |
30.87 |
llama-2-13b-turbomind |
41.48 |
47.37 |
37.50 |
37.64 |
50.78 |
52.78 |
43.40 |
48.84 |
32.35 |
38.46 |
36.36 |
30.20 |
llama-2-70b-turbomind |
57.78 |
69.17 |
50.57 |
58.43 |
69.95 |
66.67 |
50.94 |
72.09 |
50.98 |
42.53 |
44.98 |
52.01 |
llama-3-8b-turbomind |
56.30 |
65.41 |
47.16 |
56.18 |
64.25 |
61.11 |
55.66 |
67.44 |
41.67 |
40.27 |
45.45 |
50.34 |
llama-3-70b-turbomind |
72.22 |
85.46 |
75.00 |
74.72 |
84.97 |
76.85 |
75.00 |
76.16 |
59.31 |
52.94 |
62.68 |
68.46 |
internlm2-1.8b-turbomind |
47.41 |
61.40 |
55.11 |
47.75 |
61.66 |
64.81 |
61.79 |
63.95 |
32.35 |
32.58 |
48.33 |
36.58 |
internlm2-7b-turbomind |
66.67 |
85.96 |
78.98 |
74.72 |
91.71 |
87.96 |
80.66 |
80.23 |
42.16 |
50.23 |
64.11 |
70.13 |
internlm2-20b-turbomind |
69.26 |
89.22 |
83.52 |
80.34 |
90.67 |
91.67 |
83.02 |
85.47 |
49.02 |
54.30 |
72.25 |
73.15 |
qwen-1.8b-turbomind |
51.11 |
70.68 |
71.02 |
62.36 |
88.60 |
87.04 |
69.81 |
73.26 |
29.90 |
46.15 |
50.24 |
47.32 |
qwen-7b-turbomind |
57.41 |
83.71 |
88.64 |
79.78 |
93.26 |
94.44 |
75.47 |
79.07 |
42.16 |
47.96 |
59.33 |
65.10 |
qwen-14b-turbomind |
72.96 |
89.97 |
93.75 |
83.71 |
96.37 |
95.37 |
86.32 |
87.21 |
50.00 |
60.63 |
66.99 |
72.48 |
qwen-72b-turbomind |
85.56 |
96.24 |
95.45 |
93.26 |
97.93 |
97.22 |
92.45 |
91.86 |
67.65 |
76.92 |
75.12 |
83.89 |
qwen1.5-0.5b-hf |
43.33 |
63.16 |
65.91 |
56.18 |
82.90 |
79.63 |
68.87 |
70.35 |
28.43 |
37.56 |
39.23 |
32.21 |
qwen1.5-1.8b-hf |
57.41 |
76.44 |
81.25 |
75.84 |
92.75 |
91.67 |
79.72 |
81.98 |
34.31 |
47.96 |
47.85 |
43.62 |
qwen1.5-4b-hf |
65.93 |
87.47 |
86.93 |
82.58 |
94.30 |
95.37 |
84.91 |
84.30 |
40.20 |
62.90 |
58.85 |
58.72 |
qwen1.5-7b-hf |
69.26 |
91.98 |
90.91 |
89.89 |
95.85 |
94.44 |
89.15 |
87.21 |
48.04 |
67.87 |
63.16 |
68.12 |
qwen1.5-14b-hf |
78.89 |
94.99 |
94.89 |
91.57 |
96.89 |
98.15 |
91.04 |
88.37 |
57.84 |
69.68 |
66.99 |
73.83 |
qwen1.5-32b-hf |
83.70 |
95.99 |
93.75 |
94.38 |
98.45 |
97.22 |
90.57 |
91.28 |
70.10 |
76.92 |
76.56 |
80.87 |
qwen1.5-72b-hf |
84.44 |
96.49 |
96.59 |
93.82 |
98.45 |
97.22 |
92.92 |
91.28 |
66.67 |
76.92 |
74.16 |
85.23 |
qwen1.5-moe-a2-7b-hf |
80.74 |
95.49 |
89.20 |
89.33 |
94.82 |
94.44 |
92.45 |
91.28 |
52.45 |
75.57 |
67.94 |
79.87 |
mistral-7b-v0.1-hf |
45.19 |
59.15 |
43.75 |
49.44 |
56.48 |
56.48 |
45.28 |
58.14 |
37.75 |
38.91 |
40.67 |
34.56 |
mistral-7b-v0.2-hf |
45.93 |
58.65 |
38.07 |
48.31 |
63.21 |
58.33 |
41.98 |
54.07 |
35.78 |
40.27 |
38.28 |
32.21 |
mixtral-8x7b-v0.1-hf |
57.04 |
67.92 |
53.41 |
55.06 |
69.95 |
64.81 |
47.64 |
70.93 |
42.16 |
38.01 |
46.41 |
36.58 |
mixtral-8x22b-v0.1-hf |
60.37 |
72.68 |
64.77 |
65.17 |
77.20 |
71.30 |
57.08 |
75.00 |
49.51 |
43.44 |
52.63 |
49.33 |
yi-6b-hf |
79.26 |
92.48 |
77.27 |
76.40 |
92.75 |
93.52 |
89.15 |
90.12 |
60.78 |
74.66 |
61.24 |
74.16 |
yi-34b-hf |
84.81 |
96.24 |
88.07 |
88.20 |
96.37 |
96.30 |
91.98 |
91.28 |
75.00 |
78.73 |
80.38 |
82.89 |
deepseek-7b-base-hf |
52.22 |
70.18 |
47.16 |
51.12 |
60.62 |
44.44 |
58.49 |
66.86 |
31.86 |
37.56 |
53.11 |
61.07 |
deepseek-67b-base-hf |
76.67 |
89.22 |
77.27 |
78.65 |
89.64 |
78.70 |
85.85 |
84.30 |
50.00 |
64.25 |
69.38 |
84.23 |
model |
professional_tour_guide |
legal_professional |
high_school_chinese |
high_school_history |
middle_school_history |
civil_servant |
sports_science |
plant_protection |
basic_medicine |
clinical_medicine |
urban_and_rural_planner |
accountant |
llama-7b-turbomind |
29.70 |
23.72 |
27.53 |
30.22 |
30.92 |
27.04 |
22.78 |
28.64 |
28.00 |
25.00 |
26.32 |
29.80 |
llama-13b-turbomind |
25.94 |
20.93 |
25.84 |
29.67 |
24.64 |
29.60 |
26.67 |
29.15 |
33.71 |
25.50 |
28.47 |
28.44 |
llama-30b-turbomind |
29.32 |
27.91 |
30.34 |
36.26 |
37.20 |
36.13 |
36.11 |
38.69 |
34.29 |
29.50 |
38.52 |
29.35 |
llama-65b-turbomind |
28.95 |
30.70 |
30.90 |
44.51 |
35.75 |
36.60 |
45.56 |
39.20 |
37.71 |
30.00 |
39.47 |
37.02 |
llama-2-7b-turbomind |
29.70 |
30.23 |
24.72 |
29.67 |
34.78 |
30.07 |
31.11 |
31.16 |
30.29 |
25.50 |
31.34 |
27.31 |
llama-2-13b-turbomind |
30.83 |
32.56 |
24.16 |
42.31 |
45.41 |
32.87 |
36.67 |
45.23 |
38.29 |
33.50 |
35.17 |
34.31 |
llama-2-70b-turbomind |
53.76 |
38.14 |
30.34 |
58.79 |
65.70 |
43.82 |
51.11 |
58.29 |
49.71 |
42.00 |
49.76 |
46.28 |
llama-3-8b-turbomind |
52.63 |
42.33 |
27.53 |
51.65 |
65.70 |
44.52 |
54.44 |
51.26 |
46.86 |
43.00 |
46.41 |
45.15 |
llama-3-70b-turbomind |
72.93 |
52.56 |
32.58 |
71.98 |
83.57 |
56.88 |
69.44 |
78.89 |
76.00 |
67.50 |
57.89 |
59.14 |
internlm2-1.8b-turbomind |
51.50 |
38.14 |
25.84 |
56.04 |
71.50 |
47.32 |
35.00 |
43.72 |
42.29 |
39.00 |
41.15 |
36.57 |
internlm2-7b-turbomind |
72.56 |
53.49 |
52.25 |
79.67 |
90.82 |
62.00 |
62.78 |
64.32 |
66.86 |
59.50 |
55.74 |
53.50 |
internlm2-20b-turbomind |
74.06 |
54.42 |
56.18 |
81.87 |
92.27 |
61.77 |
68.33 |
69.85 |
68.00 |
63.50 |
60.77 |
58.92 |
qwen-1.8b-turbomind |
54.14 |
43.72 |
39.89 |
69.23 |
85.02 |
49.88 |
45.56 |
48.74 |
48.57 |
51.50 |
46.89 |
45.82 |
qwen-7b-turbomind |
71.05 |
48.37 |
53.93 |
81.87 |
93.72 |
59.67 |
54.44 |
62.31 |
58.29 |
57.50 |
50.24 |
56.66 |
qwen-14b-turbomind |
79.70 |
53.02 |
63.48 |
87.36 |
94.20 |
71.33 |
63.33 |
71.36 |
73.14 |
68.00 |
59.09 |
67.95 |
qwen-72b-turbomind |
90.23 |
77.21 |
79.21 |
91.76 |
96.14 |
77.86 |
86.11 |
85.43 |
91.43 |
90.50 |
76.08 |
86.68 |
qwen1.5-0.5b-hf |
44.36 |
36.74 |
39.33 |
58.24 |
78.26 |
43.36 |
40.00 |
45.23 |
41.71 |
42.50 |
43.54 |
43.12 |
qwen1.5-1.8b-hf |
59.40 |
47.91 |
37.08 |
72.53 |
91.30 |
53.61 |
53.33 |
51.26 |
49.71 |
58.00 |
51.20 |
56.21 |
qwen1.5-4b-hf |
65.04 |
58.60 |
55.62 |
83.52 |
94.20 |
62.00 |
63.89 |
65.33 |
65.71 |
64.00 |
55.26 |
61.40 |
qwen1.5-7b-hf |
78.57 |
66.51 |
66.85 |
87.91 |
94.69 |
68.07 |
65.00 |
64.82 |
77.14 |
77.50 |
60.77 |
74.49 |
qwen1.5-14b-hf |
83.08 |
72.09 |
70.22 |
90.11 |
94.20 |
69.46 |
73.89 |
70.35 |
82.29 |
83.00 |
65.31 |
78.33 |
qwen1.5-32b-hf |
87.59 |
78.14 |
79.78 |
92.86 |
95.65 |
78.32 |
80.56 |
79.90 |
90.29 |
89.00 |
77.27 |
86.68 |
qwen1.5-72b-hf |
91.35 |
76.74 |
79.21 |
91.76 |
96.14 |
79.25 |
85.56 |
86.93 |
92.00 |
90.00 |
75.84 |
86.91 |
qwen1.5-moe-a2-7b-hf |
88.35 |
75.81 |
51.12 |
79.12 |
94.69 |
67.37 |
80.56 |
73.37 |
87.43 |
84.00 |
78.23 |
82.39 |
mistral-7b-v0.1-hf |
40.23 |
39.07 |
24.16 |
41.21 |
52.17 |
41.49 |
45.00 |
52.26 |
45.14 |
42.00 |
42.58 |
44.02 |
mistral-7b-v0.2-hf |
36.84 |
34.88 |
23.03 |
43.96 |
52.66 |
40.79 |
50.00 |
50.75 |
45.14 |
40.50 |
42.58 |
40.86 |
mixtral-8x7b-v0.1-hf |
47.74 |
40.00 |
28.09 |
57.14 |
58.94 |
44.29 |
58.33 |
53.77 |
48.57 |
46.00 |
51.20 |
46.50 |
mixtral-8x22b-v0.1-hf |
59.02 |
41.86 |
29.78 |
60.99 |
71.01 |
50.82 |
57.78 |
67.34 |
62.29 |
52.00 |
53.35 |
55.98 |
yi-6b-hf |
85.34 |
67.91 |
53.93 |
80.22 |
91.79 |
65.97 |
72.22 |
72.36 |
82.29 |
84.50 |
69.86 |
71.56 |
yi-34b-hf |
94.36 |
76.74 |
65.73 |
87.91 |
95.17 |
79.25 |
85.56 |
90.95 |
90.86 |
92.00 |
76.79 |
82.39 |
deepseek-7b-base-hf |
65.79 |
29.30 |
32.58 |
47.80 |
67.15 |
37.76 |
44.44 |
52.26 |
43.43 |
36.50 |
41.15 |
37.02 |
deepseek-67b-base-hf |
83.83 |
58.60 |
45.51 |
79.67 |
90.34 |
62.47 |
70.56 |
70.85 |
81.14 |
71.50 |
61.72 |
60.05 |
| model | fire_engineer | environmental_impact_assessment_engineer | tax_accountant | physician |
|:---:|---:|---:|---:|---:|
| llama-7b-turbomind | 22.34 | 24.91 | 29.12 | 27.77 |
| llama-13b-turbomind | 24.11 | 30.25 | 27.77 | 30.70 |
| llama-30b-turbomind | 28.72 | 31.67 | 31.83 | 36.57 |
| llama-65b-turbomind | 28.37 | 39.15 | 33.63 | 35.44 |
| llama-2-7b-turbomind | 22.70 | 24.91 | 25.51 | 29.80 |
| llama-2-13b-turbomind | 25.53 | 35.94 | 29.35 | 35.44 |
| llama-2-70b-turbomind | 36.52 | 52.67 | 36.12 | 52.60 |
| llama-3-8b-turbomind | 35.46 | 49.82 | 41.31 | 55.30 |
| llama-3-70b-turbomind | 48.58 | 64.41 | 52.60 | 75.40 |
| internlm2-1.8b-turbomind | 32.27 | 42.35 | 39.05 | 45.15 |
| internlm2-7b-turbomind | 46.81 | 55.16 | 47.63 | 67.27 |
| internlm2-20b-turbomind | 45.04 | 62.63 | 51.47 | 69.75 |
| qwen-1.8b-turbomind | 41.84 | 47.69 | 45.60 | 57.34 |
| qwen-7b-turbomind | 41.84 | 54.80 | 48.08 | 69.53 |
| qwen-14b-turbomind | 45.74 | 64.77 | 56.43 | 77.88 |
| qwen-72b-turbomind | 80.50 | 74.73 | 81.04 | 89.62 |
| qwen1.5-0.5b-hf | 39.36 | 41.28 | 38.37 | 48.08 |
| qwen1.5-1.8b-hf | 45.74 | 49.47 | 51.69 | 63.43 |
| qwen1.5-4b-hf | 50.35 | 51.60 | 58.69 | 75.17 |
| qwen1.5-7b-hf | 58.51 | 65.84 | 67.04 | 81.94 |
| qwen1.5-14b-hf | 63.83 | 67.26 | 72.23 | 87.36 |
| qwen1.5-32b-hf | 74.47 | 73.31 | 80.14 | 90.74 |
| qwen1.5-72b-hf | 79.79 | 75.09 | 81.04 | 90.07 |
| qwen1.5-moe-a2-7b-hf | 74.82 | 77.58 | 79.68 | 91.65 |
| mistral-7b-v0.1-hf | 32.27 | 45.91 | 37.70 | 50.56 |
| mistral-7b-v0.2-hf | 32.62 | 44.13 | 36.79 | 46.28 |
| mixtral-8x7b-v0.1-hf | 35.11 | 53.02 | 46.73 | 52.37 |
| mixtral-8x22b-v0.1-hf | 38.65 | 56.23 | 49.21 | 59.82 |
| yi-6b-hf | 67.38 | 68.68 | 69.53 | 83.07 |
| yi-34b-hf | 77.66 | 83.27 | 77.43 | 89.84 |
| deepseek-7b-base-hf | 30.50 | 38.79 | 35.67 | 46.28 |
| deepseek-67b-base-hf | 46.81 | 65.12 | 54.40 | 77.65 |
### Details on Dev Split
## Chat Models
| model | ceval-test | ceval-test-hard | ceval-test-stem | ceval-test-social-science | ceval-test-humanities | ceval-test-other | ceval-dev | ceval-dev-hard | ceval-dev-stem | ceval-dev-social-science | ceval-dev-humanities | ceval-dev-other |
|:---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
| qwen1.5-0.5b-chat-hf | 36.88 | 28.83 | 34.49 | 43.46 | 37.35 | 34.76 | 38.58 | 33.90 | 33.63 | 43.81 | 41.79 | 39.59 |
| qwen1.5-1.8b-chat-hf | 55.17 | 38.21 | 50.63 | 70.26 | 56.04 | 48.82 | 55.93 | 37.60 | 50.31 | 67.59 | 60.90 | 50.59 |
| qwen1.5-4b-chat-hf | 61.54 | 44.79 | 56.86 | 75.84 | 62.13 | 56.46 | 62.76 | 38.32 | 55.39 | 79.53 | 65.67 | 58.00 |
| qwen1.5-7b-chat-hf | 68.71 | 51.77 | 64.27 | 81.23 | 68.22 | 65.88 | 71.10 | 50.13 | 65.42 | 83.99 | 73.77 | 67.02 |
| qwen1.5-14b-chat-hf | 74.80 | 56.54 | 69.46 | 87.47 | 76.46 | 71.32 | 76.35 | 52.08 | 69.68 | 86.70 | 80.56 | 74.87 |
| qwen1.5-32b-chat-hf | 80.47 | 63.17 | 75.66 | 89.58 | 81.98 | 79.43 | 81.27 | 63.51 | 76.64 | 89.39 | 82.97 | 80.59 |
| qwen1.5-72b-chat-hf | 81.53 | 63.62 | 75.86 | 90.74 | 83.18 | 81.84 | 82.88 | 62.44 | 77.54 | 89.80 | 86.11 | 83.07 |
| qwen1.5-110b-chat-hf | 87.33 | 67.27 | 80.70 | 93.58 | 89.67 | 91.35 | 87.59 | 73.64 | 81.94 | 91.47 | 92.12 | 89.80 |
| internlm2-chat-1.8b-hf | 47.04 | 34.81 | 43.28 | 59.34 | 48.24 | 41.50 | 48.51 | 36.75 | 42.23 | 57.79 | 54.83 | 45.15 |
| internlm2-chat-1.8b-sft-hf | 47.19 | 35.34 | 43.49 | 59.56 | 48.30 | 41.58 | 48.75 | 35.83 | 42.04 | 59.80 | 54.84 | 44.83 |
| internlm2-chat-7b-hf | 58.75 | 39.61 | 52.38 | 71.46 | 61.57 | 55.96 | 61.04 | 36.56 | 51.81 | 74.01 | 69.13 | 57.92 |
| internlm2-chat-7b-sft-hf | 58.96 | 40.09 | 52.40 | 71.49 | 62.20 | 56.26 | 61.02 | 37.29 | 52.60 | 74.01 | 68.27 | 57.27 |
| internlm2-chat-20b-hf | 63.12 | 42.65 | 56.21 | 75.64 | 67.15 | 60.27 | 63.45 | 34.96 | 52.84 | 79.27 | 71.50 | 60.32 |
| internlm2-chat-20b-sft-hf | 63.16 | 42.70 | 56.19 | 75.74 | 67.20 | 60.37 | 63.54 | 34.96 | 52.57 | 80.33 | 71.42 | 60.34 |
| llama-3-8b-instruct-hf | 50.90 | 34.54 | 46.73 | 58.73 | 49.24 | 53.04 | 52.55 | 36.37 | 48.47 | 58.03 | 53.26 | 54.26 |
| llama-3-70b-instruct-hf | 67.38 | 54.02 | 65.16 | 76.83 | 62.29 | 67.92 | 67.92 | 54.50 | 66.85 | 76.80 | 65.98 | 63.72 |
| llama-3-8b-instruct-lmdeploy | 49.92 | 34.75 | 46.19 | 58.49 | 47.68 | 51.14 | 50.27 | 33.32 | 46.25 | 56.93 | 49.02 | 52.76 |
| llama-3-70b-instruct-lmdeploy | 66.41 | 52.76 | 64.72 | 75.31 | 61.36 | 66.44 | 68.21 | 52.28 | 65.86 | 75.06 | 68.37 | 66.09 |
| mistral-7b-instruct-v0.1-hf | 36.76 | 27.76 | 35.55 | 42.41 | 34.45 | 36.12 | 40.04 | 30.21 | 35.77 | 45.15 | 40.99 | 42.22 |
| mistral-7b-instruct-v0.2-hf | 40.38 | 30.26 | 38.82 | 47.66 | 37.08 | 39.91 | 43.00 | 25.97 | 38.60 | 47.44 | 48.15 | 41.82 |
| mixtral-8x7b-instruct-v0.1-hf | 49.61 | 37.78 | 47.86 | 58.56 | 46.40 | 47.85 | 51.68 | 37.41 | 49.14 | 59.79 | 52.97 | 47.65 |
### Details on Test Split
model |
computer_network |
operating_system |
computer_architecture |
college_programming |
college_physics |
college_chemistry |
advanced_mathematics |
probability_and_statistics |
discrete_mathematics |
electrical_engineer |
metrology_engineer |
high_school_mathematics |
qwen1.5-0.5b-chat-hf |
35.67 |
36.87 |
33.68 |
33.92 |
35.23 |
28.12 |
27.17 |
26.51 |
24.84 |
28.91 |
40.18 |
25.90 |
qwen1.5-1.8b-chat-hf |
46.78 |
47.49 |
50.78 |
39.18 |
41.48 |
31.25 |
32.95 |
27.71 |
28.10 |
34.81 |
55.71 |
27.11 |
qwen1.5-4b-chat-hf |
54.39 |
54.75 |
54.92 |
44.74 |
46.02 |
43.30 |
39.31 |
31.33 |
28.10 |
45.13 |
58.90 |
43.98 |
qwen1.5-7b-chat-hf |
60.82 |
60.34 |
63.21 |
55.85 |
48.86 |
45.09 |
46.24 |
36.14 |
39.22 |
47.49 |
70.32 |
45.78 |
qwen1.5-14b-chat-hf |
69.59 |
62.57 |
64.77 |
64.91 |
55.68 |
57.14 |
49.13 |
32.53 |
43.14 |
55.16 |
76.71 |
46.99 |
qwen1.5-32b-chat-hf |
81.87 |
74.30 |
73.58 |
71.35 |
63.07 |
60.71 |
50.87 |
46.99 |
47.06 |
59.29 |
83.11 |
60.84 |
qwen1.5-72b-chat-hf |
77.78 |
75.42 |
76.17 |
73.39 |
63.64 |
62.50 |
45.09 |
45.78 |
48.37 |
59.00 |
81.74 |
60.84 |
qwen1.5-110b-chat-hf |
83.63 |
86.03 |
81.87 |
77.49 |
76.70 |
67.86 |
49.13 |
47.59 |
55.56 |
79.94 |
95.89 |
62.05 |
internlm2-chat-1.8b-hf |
42.11 |
43.58 |
44.56 |
35.38 |
32.95 |
34.82 |
32.95 |
28.92 |
32.68 |
34.22 |
53.42 |
31.93 |
internlm2-chat-1.8b-sft-hf |
42.11 |
44.13 |
43.01 |
35.09 |
34.09 |
36.16 |
32.95 |
27.11 |
33.33 |
35.10 |
51.14 |
33.13 |
internlm2-chat-7b-hf |
59.65 |
60.89 |
58.03 |
51.46 |
36.93 |
43.75 |
36.99 |
29.52 |
36.60 |
39.82 |
63.47 |
38.55 |
internlm2-chat-7b-sft-hf |
59.06 |
61.45 |
56.48 |
52.63 |
39.77 |
41.52 |
36.99 |
27.71 |
39.22 |
40.12 |
62.10 |
40.36 |
internlm2-chat-20b-hf |
61.99 |
70.39 |
63.73 |
54.97 |
33.52 |
47.77 |
43.93 |
40.96 |
44.44 |
44.25 |
61.64 |
34.34 |
internlm2-chat-20b-sft-hf |
61.40 |
70.39 |
63.21 |
54.97 |
32.95 |
47.77 |
42.20 |
42.17 |
43.14 |
44.25 |
61.64 |
32.53 |
llama-3-8b-instruct-hf |
57.31 |
58.10 |
57.51 |
51.17 |
28.41 |
35.27 |
39.31 |
32.53 |
35.29 |
38.05 |
55.25 |
27.11 |
llama-3-70b-instruct-hf |
71.93 |
74.86 |
70.98 |
67.54 |
50.57 |
57.14 |
52.60 |
53.01 |
56.21 |
47.79 |
68.95 |
43.98 |
llama-3-8b-instruct-lmdeploy |
55.56 |
57.54 |
55.44 |
48.25 |
30.11 |
33.04 |
35.84 |
31.33 |
33.33 |
38.94 |
53.88 |
31.93 |
llama-3-70b-instruct-lmdeploy |
70.76 |
77.09 |
69.95 |
67.84 |
49.43 |
54.02 |
50.87 |
54.22 |
56.21 |
47.20 |
69.86 |
42.17 |
mistral-7b-instruct-v0.1-hf |
49.12 |
47.49 |
43.52 |
39.18 |
32.39 |
28.57 |
29.48 |
24.10 |
28.10 |
37.46 |
44.29 |
23.49 |
mistral-7b-instruct-v0.2-hf |
47.95 |
53.07 |
52.85 |
42.69 |
28.41 |
26.79 |
40.46 |
30.12 |
29.41 |
33.33 |
42.92 |
24.10 |
mixtral-8x7b-instruct-v0.1-hf |
58.48 |
62.57 |
58.03 |
56.43 |
38.64 |
36.16 |
39.31 |
34.94 |
37.91 |
34.81 |
55.71 |
28.31 |
model |
high_school_physics |
high_school_chemistry |
high_school_biology |
middle_school_mathematics |
middle_school_biology |
middle_school_physics |
middle_school_chemistry |
veterinary_medicine |
college_economics |
business_administration |
marxism |
mao_zedong_thought |
qwen1.5-0.5b-chat-hf |
30.86 |
31.98 |
44.00 |
27.68 |
47.40 |
40.45 |
55.14 |
35.24 |
32.80 |
30.56 |
58.66 |
57.53 |
qwen1.5-1.8b-chat-hf |
54.86 |
62.21 |
69.14 |
53.67 |
82.81 |
83.15 |
85.41 |
58.10 |
44.06 |
49.83 |
82.12 |
82.65 |
qwen1.5-4b-chat-hf |
58.86 |
67.44 |
80.00 |
55.93 |
89.58 |
88.20 |
88.11 |
64.29 |
47.08 |
57.48 |
86.59 |
84.93 |
qwen1.5-7b-chat-hf |
72.00 |
80.81 |
84.00 |
70.06 |
95.31 |
94.94 |
95.14 |
73.81 |
56.94 |
66.11 |
91.62 |
89.04 |
qwen1.5-14b-chat-hf |
84.00 |
83.72 |
90.29 |
80.23 |
97.92 |
94.94 |
98.38 |
81.43 |
63.18 |
74.75 |
93.30 |
96.80 |
qwen1.5-32b-chat-hf |
85.71 |
90.12 |
93.71 |
85.31 |
97.92 |
98.31 |
100.00 |
89.05 |
69.82 |
75.75 |
93.85 |
97.72 |
qwen1.5-72b-chat-hf |
88.57 |
94.19 |
94.86 |
85.31 |
97.92 |
97.75 |
98.38 |
90.48 |
71.63 |
79.73 |
93.85 |
97.72 |
qwen1.5-110b-chat-hf |
86.86 |
92.44 |
94.29 |
85.31 |
98.44 |
98.88 |
98.92 |
95.24 |
78.87 |
86.38 |
95.53 |
99.54 |
internlm2-chat-1.8b-hf |
35.43 |
48.84 |
52.00 |
35.03 |
70.31 |
67.98 |
67.03 |
41.43 |
37.83 |
36.88 |
70.95 |
60.73 |
internlm2-chat-1.8b-sft-hf |
37.71 |
48.26 |
53.14 |
34.46 |
71.35 |
67.98 |
67.57 |
41.90 |
38.63 |
37.54 |
72.63 |
60.27 |
internlm2-chat-7b-hf |
46.29 |
48.26 |
60.57 |
46.89 |
78.65 |
71.91 |
71.35 |
68.10 |
50.30 |
50.83 |
77.09 |
76.26 |
internlm2-chat-7b-sft-hf |
46.86 |
48.26 |
61.14 |
45.76 |
77.60 |
71.91 |
71.35 |
67.62 |
50.10 |
50.50 |
77.09 |
75.80 |
internlm2-chat-20b-hf |
49.71 |
46.51 |
63.43 |
55.37 |
80.73 |
74.72 |
79.46 |
72.38 |
55.73 |
59.80 |
85.47 |
76.26 |
internlm2-chat-20b-sft-hf |
53.71 |
47.09 |
64.00 |
55.37 |
80.73 |
73.60 |
78.92 |
73.81 |
55.53 |
60.13 |
85.47 |
75.80 |
llama-3-8b-instruct-hf |
38.86 |
39.53 |
50.29 |
40.11 |
65.10 |
60.11 |
63.78 |
61.43 |
47.89 |
45.85 |
69.27 |
56.16 |
llama-3-70b-instruct-hf |
63.43 |
55.23 |
69.71 |
68.36 |
85.42 |
80.90 |
78.38 |
86.19 |
69.01 |
65.12 |
83.24 |
82.65 |
llama-3-8b-instruct-lmdeploy |
41.71 |
40.70 |
52.00 |
41.24 |
61.46 |
58.43 |
65.41 |
57.62 |
45.27 |
46.18 |
69.27 |
55.71 |
llama-3-70b-instruct-lmdeploy |
61.71 |
53.49 |
70.86 |
64.97 |
88.02 |
83.71 |
77.30 |
84.76 |
68.21 |
60.80 |
80.45 |
79.91 |
mistral-7b-instruct-v0.1-hf |
27.43 |
28.49 |
36.00 |
28.25 |
40.10 |
42.70 |
43.78 |
37.14 |
32.80 |
37.87 |
41.90 |
48.86 |
mistral-7b-instruct-v0.2-hf |
33.14 |
29.65 |
44.00 |
31.07 |
47.92 |
44.94 |
49.19 |
44.29 |
37.02 |
40.86 |
53.63 |
48.40 |
mixtral-8x7b-instruct-v0.1-hf |
46.29 |
40.70 |
54.86 |
42.37 |
58.85 |
60.67 |
57.84 |
54.29 |
50.10 |
46.51 |
69.27 |
52.51 |
model |
education_science |
teacher_qualification |
high_school_politics |
high_school_geography |
middle_school_politics |
middle_school_geography |
modern_chinese_history |
ideological_and_moral_cultivation |
logic |
law |
chinese_language_and_literature |
art_studies |
qwen1.5-0.5b-chat-hf |
33.33 |
46.12 |
37.50 |
37.08 |
57.51 |
43.52 |
42.45 |
51.74 |
32.84 |
31.22 |
37.32 |
24.50 |
qwen1.5-1.8b-chat-hf |
54.07 |
72.43 |
74.43 |
66.85 |
89.12 |
87.04 |
77.36 |
76.16 |
38.24 |
44.34 |
46.89 |
40.94 |
qwen1.5-4b-chat-hf |
60.00 |
84.71 |
82.39 |
69.66 |
94.82 |
90.74 |
79.72 |
78.49 |
41.67 |
57.47 |
54.07 |
56.38 |
qwen1.5-7b-chat-hf |
66.30 |
90.73 |
84.66 |
80.90 |
94.30 |
91.67 |
82.55 |
84.88 |
38.73 |
60.18 |
60.77 |
63.42 |
qwen1.5-14b-chat-hf |
74.81 |
93.73 |
90.91 |
92.13 |
96.89 |
98.15 |
89.62 |
88.37 |
54.41 |
70.14 |
69.86 |
69.13 |
qwen1.5-32b-chat-hf |
80.37 |
94.49 |
93.75 |
94.94 |
97.93 |
97.22 |
90.09 |
90.70 |
68.63 |
78.73 |
73.21 |
77.52 |
qwen1.5-72b-chat-hf |
84.07 |
96.74 |
95.45 |
94.94 |
97.93 |
95.37 |
92.92 |
91.28 |
63.73 |
80.09 |
73.68 |
83.89 |
qwen1.5-110b-chat-hf |
90.37 |
96.99 |
96.02 |
95.51 |
98.45 |
98.15 |
93.87 |
94.19 |
81.37 |
86.88 |
84.69 |
90.94 |
internlm2-chat-1.8b-hf |
48.15 |
65.41 |
69.32 |
54.49 |
79.27 |
70.37 |
60.85 |
64.53 |
32.35 |
32.58 |
45.45 |
40.60 |
internlm2-chat-1.8b-sft-hf |
48.15 |
64.91 |
69.89 |
53.93 |
79.27 |
70.37 |
61.32 |
63.95 |
33.82 |
29.86 |
45.45 |
39.93 |
internlm2-chat-7b-hf |
66.67 |
85.21 |
73.30 |
66.85 |
91.19 |
76.85 |
70.28 |
75.58 |
42.16 |
50.68 |
60.77 |
70.47 |
internlm2-chat-7b-sft-hf |
67.04 |
85.21 |
73.86 |
66.85 |
90.67 |
77.78 |
71.70 |
75.00 |
42.16 |
51.13 |
60.29 |
72.15 |
internlm2-chat-20b-hf |
74.07 |
85.96 |
75.57 |
77.53 |
89.12 |
76.85 |
72.64 |
83.72 |
51.96 |
56.11 |
68.42 |
73.49 |
internlm2-chat-20b-sft-hf |
73.70 |
85.46 |
76.70 |
78.09 |
89.64 |
76.85 |
72.17 |
84.88 |
50.00 |
56.56 |
66.99 |
75.17 |
llama-3-8b-instruct-hf |
55.93 |
67.42 |
55.68 |
55.06 |
72.02 |
62.04 |
54.25 |
66.86 |
44.12 |
40.72 |
47.37 |
44.63 |
llama-3-70b-instruct-hf |
71.11 |
84.21 |
74.43 |
73.03 |
84.97 |
80.56 |
69.81 |
78.49 |
57.35 |
50.68 |
57.89 |
64.43 |
llama-3-8b-instruct-lmdeploy |
54.81 |
67.17 |
58.52 |
53.37 |
72.54 |
62.04 |
57.08 |
63.95 |
44.12 |
37.56 |
46.89 |
42.62 |
llama-3-70b-instruct-lmdeploy |
70.37 |
82.96 |
72.16 |
71.91 |
83.94 |
82.41 |
69.34 |
77.91 |
55.39 |
50.68 |
56.46 |
64.09 |
mistral-7b-instruct-v0.1-hf |
39.63 |
46.62 |
33.52 |
41.01 |
56.48 |
45.37 |
36.32 |
43.60 |
29.90 |
31.67 |
39.71 |
31.88 |
mistral-7b-instruct-v0.2-hf |
46.30 |
54.39 |
39.20 |
43.26 |
61.66 |
51.85 |
35.38 |
55.23 |
28.92 |
35.29 |
37.80 |
29.19 |
mixtral-8x7b-instruct-v0.1-hf |
58.52 |
66.17 |
56.82 |
57.30 |
66.32 |
62.04 |
48.11 |
66.28 |
41.67 |
37.10 |
46.41 |
35.91 |
model |
professional_tour_guide |
legal_professional |
high_school_chinese |
high_school_history |
middle_school_history |
civil_servant |
sports_science |
plant_protection |
basic_medicine |
clinical_medicine |
urban_and_rural_planner |
accountant |
qwen1.5-0.5b-chat-hf |
36.47 |
39.07 |
27.53 |
41.76 |
45.89 |
39.63 |
35.56 |
31.66 |
37.71 |
34.00 |
32.78 |
37.25 |
qwen1.5-1.8b-chat-hf |
56.02 |
45.58 |
39.33 |
67.03 |
84.54 |
49.42 |
48.89 |
51.76 |
47.43 |
50.50 |
45.69 |
52.14 |
qwen1.5-4b-chat-hf |
61.28 |
52.56 |
42.70 |
73.08 |
85.99 |
55.48 |
59.44 |
55.28 |
60.57 |
57.00 |
50.00 |
58.01 |
qwen1.5-7b-chat-hf |
73.31 |
56.28 |
58.99 |
82.97 |
88.41 |
64.57 |
66.67 |
63.82 |
77.14 |
75.50 |
57.42 |
69.07 |
qwen1.5-14b-chat-hf |
80.83 |
65.12 |
70.79 |
89.56 |
93.24 |
67.60 |
72.78 |
68.34 |
80.57 |
80.00 |
61.72 |
75.62 |
qwen1.5-32b-chat-hf |
87.59 |
72.56 |
76.40 |
90.66 |
95.65 |
74.36 |
80.00 |
80.40 |
86.86 |
84.00 |
74.88 |
85.33 |
qwen1.5-72b-chat-hf |
90.98 |
76.28 |
75.84 |
90.66 |
95.65 |
75.52 |
84.44 |
82.91 |
91.43 |
89.00 |
73.92 |
85.10 |
qwen1.5-110b-chat-hf |
95.11 |
88.37 |
82.58 |
91.76 |
96.62 |
87.65 |
91.67 |
90.95 |
93.71 |
95.00 |
87.08 |
91.87 |
internlm2-chat-1.8b-hf |
54.14 |
40.00 |
27.53 |
62.09 |
70.53 |
44.99 |
41.67 |
51.76 |
45.71 |
39.00 |
40.67 |
39.28 |
internlm2-chat-1.8b-sft-hf |
54.14 |
42.33 |
26.97 |
61.54 |
71.98 |
45.45 |
41.67 |
50.25 |
45.14 |
37.50 |
41.39 |
40.63 |
internlm2-chat-7b-hf |
70.68 |
44.19 |
34.83 |
73.63 |
84.06 |
51.98 |
57.22 |
68.34 |
66.86 |
57.50 |
54.55 |
50.11 |
internlm2-chat-7b-sft-hf |
71.80 |
44.65 |
37.64 |
73.63 |
84.06 |
51.98 |
57.78 |
67.84 |
65.71 |
60.50 |
54.55 |
50.11 |
internlm2-chat-20b-hf |
75.56 |
54.42 |
42.13 |
74.73 |
85.51 |
57.34 |
65.56 |
67.84 |
73.71 |
64.00 |
57.89 |
55.98 |
internlm2-chat-20b-sft-hf |
76.32 |
55.35 |
41.01 |
75.27 |
85.51 |
58.28 |
65.56 |
67.34 |
72.57 |
65.00 |
58.37 |
56.43 |
llama-3-8b-instruct-hf |
53.01 |
44.65 |
33.15 |
46.70 |
66.18 |
45.22 |
58.89 |
61.81 |
62.86 |
57.50 |
48.33 |
49.89 |
llama-3-70b-instruct-hf |
71.43 |
50.70 |
30.90 |
71.43 |
82.13 |
59.67 |
73.33 |
73.37 |
82.86 |
82.00 |
59.09 |
62.08 |
llama-3-8b-instruct-lmdeploy |
51.13 |
45.12 |
29.78 |
43.96 |
62.32 |
47.09 |
56.11 |
54.77 |
56.00 |
56.00 |
49.04 |
47.40 |
llama-3-70b-instruct-lmdeploy |
68.80 |
48.84 |
30.90 |
70.88 |
81.64 |
58.28 |
72.22 |
70.85 |
80.00 |
81.00 |
57.66 |
62.53 |
mistral-7b-instruct-v0.1-hf |
30.45 |
35.81 |
24.72 |
40.11 |
34.78 |
30.77 |
43.89 |
38.69 |
36.57 |
32.50 |
44.74 |
34.09 |
mistral-7b-instruct-v0.2-hf |
36.09 |
38.14 |
23.03 |
43.41 |
45.41 |
35.90 |
50.00 |
41.71 |
42.86 |
36.00 |
45.22 |
42.21 |
mixtral-8x7b-instruct-v0.1-hf |
47.37 |
44.65 |
30.34 |
51.65 |
60.87 |
42.19 |
53.89 |
58.29 |
52.00 |
47.00 |
48.56 |
44.02 |
model |
fire_engineer |
environmental_impact_assessment_engineer |
tax_accountant |
physician |
qwen1.5-0.5b-chat-hf |
27.66 |
38.43 |
32.28 |
35.44 |
qwen1.5-1.8b-chat-hf |
38.65 |
46.62 |
46.73 |
59.14 |
qwen1.5-4b-chat-hf |
49.29 |
54.80 |
51.02 |
70.20 |
qwen1.5-7b-chat-hf |
53.90 |
62.28 |
57.79 |
76.52 |
qwen1.5-14b-chat-hf |
58.87 |
65.12 |
67.27 |
86.68 |
qwen1.5-32b-chat-hf |
74.11 |
70.82 |
74.94 |
88.04 |
qwen1.5-72b-chat-hf |
74.82 |
75.09 |
78.56 |
89.39 |
qwen1.5-110b-chat-hf |
88.30 |
88.97 |
94.13 |
95.49 |
internlm2-chat-1.8b-hf |
30.14 |
41.99 |
34.54 |
46.73 |
internlm2-chat-1.8b-sft-hf |
30.14 |
43.06 |
34.31 |
47.86 |
internlm2-chat-7b-hf |
42.20 |
52.31 |
47.63 |
66.82 |
internlm2-chat-7b-sft-hf |
43.26 |
52.67 |
47.86 |
66.59 |
internlm2-chat-20b-hf |
45.74 |
54.80 |
51.02 |
69.07 |
internlm2-chat-20b-sft-hf |
45.74 |
55.16 |
51.02 |
68.62 |
llama-3-8b-instruct-hf |
37.59 |
50.53 |
42.44 |
68.40 |
llama-3-70b-instruct-hf |
50.71 |
64.06 |
55.53 |
84.42 |
llama-3-8b-instruct-lmdeploy |
37.94 |
50.53 |
41.53 |
66.14 |
llama-3-70b-instruct-lmdeploy |
48.94 |
63.70 |
53.95 |
81.72 |
mistral-7b-instruct-v0.1-hf |
27.66 |
39.15 |
29.35 |
39.95 |
mistral-7b-instruct-v0.2-hf |
32.27 |
37.01 |
32.96 |
42.89 |
mixtral-8x7b-instruct-v0.1-hf |
36.88 |
48.75 |
41.76 |
53.05 |
Details on Dev Split