# GaokaoBench
```bash
python3 run.py --models hf_internlm2_7b --datasets GaokaoBench_no_subjective_gen_d21e37 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets GaokaoBench_no_subjective_gen_4c31db --debug
```
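The same evaluation can also be launched from a config file instead of CLI flags. The snippet below is a minimal sketch, assuming the stock OpenCompass config tree, where names such as `GaokaoBench_no_subjective_gen_d21e37` and `hf_internlm2_7b` map to files under `configs/datasets/` and `configs/models/`; the exact import paths may differ in your checkout.

```python
# eval_gaokao_bench.py -- a minimal sketch, not an official config.
# Assumes the standard OpenCompass layout where the CLI names used above
# correspond to files under configs/datasets/ and configs/models/.
from mmengine.config import read_base

with read_base():
    # objective GaokaoBench splits (subjective questions excluded), as in the first command
    from .datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \
        GaokaoBench_datasets
    # base-model counterpart of `--models hf_internlm2_7b`
    from .models.hf_internlm.hf_internlm2_7b import models as internlm2_7b_models

datasets = GaokaoBench_datasets
models = internlm2_7b_models
```

Saved under `configs/`, it could then be run with `python3 run.py configs/eval_gaokao_bench.py --debug`, which should be equivalent to the first command above.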
## Base Models
| model | GaokaoBench |
|:---:|:---:|
| llama-7b-turbomind | 14.55 |
| llama-13b-turbomind | 16.20 |
| llama-30b-turbomind | 16.14 |
| llama-65b-turbomind | 13.31 |
| llama-2-7b-turbomind | 15.02 |
| llama-2-13b-turbomind | 14.86 |
| llama-2-70b-turbomind | 16.36 |
| llama-3-8b-turbomind | 20.88 |
| llama-3-70b-turbomind | 19.98 |
| internlm2-1.8b-turbomind | 23.78 |
| internlm2-7b-turbomind | 41.41 |
| internlm2-20b-turbomind | 58.99 |
| qwen-1.8b-turbomind | 22.11 |
| qwen-7b-turbomind | 35.32 |
| qwen-14b-turbomind | 54.07 |
| qwen-72b-turbomind | 77.56 |
| qwen1.5-0.5b-hf | 30.67 |
| qwen1.5-1.8b-hf | 35.66 |
| qwen1.5-4b-hf | 54.31 |
| qwen1.5-7b-hf | 65.99 |
| qwen1.5-14b-hf | 66.60 |
| qwen1.5-32b-hf | 79.01 |
| qwen1.5-72b-hf | 80.26 |
| qwen1.5-moe-a2-7b-hf | 52.79 |
| mistral-7b-v0.1-hf | 14.35 |
| mistral-7b-v0.2-hf | 11.10 |
| mixtral-8x7b-v0.1-hf | 8.40 |
| mixtral-8x22b-v0.1-hf | 16.23 |
| yi-6b-hf | 31.70 |
| yi-34b-hf | 30.51 |
| deepseek-7b-base-hf | 17.02 |
| deepseek-67b-base-hf | 10.14 |
### Details
| model | 2010-2022_Math_II_MCQs | 2010-2022_Math_I_MCQs | 2010-2022_History_MCQs | 2010-2022_Biology_MCQs | 2010-2022_Political_Science_MCQs | 2010-2022_Physics_MCQs | 2010-2022_Chemistry_MCQs |
|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
| llama-7b-turbomind | 14.22 | 13.55 | 12.54 | 18.67 | 19.06 | 2.34 | 17.74 |
| llama-13b-turbomind | 18.81 | 15.89 | 21.25 | 22.67 | 15.62 | 1.56 | 25.81 |
| llama-30b-turbomind | 20.64 | 19.16 | 27.18 | 16.67 | 16.56 | 2.34 | 12.10 |
| llama-65b-turbomind | 21.10 | 15.89 | 11.50 | 20.00 | 5.94 | 1.56 | 21.77 |
| llama-2-7b-turbomind | 16.97 | 16.36 | 20.91 | 22.00 | 18.75 | 2.34 | 11.29 |
| llama-2-13b-turbomind | 14.68 | 11.68 | 26.13 | 16.00 | 17.81 | 2.34 | 20.97 |
| llama-2-70b-turbomind | 18.81 | 12.15 | 26.13 | 16.00 | 20.31 | 4.69 | 16.13 |
| llama-3-8b-turbomind | 4.13 | 7.94 | 37.63 | 24.67 | 26.25 | 5.47 | 21.77 |
| llama-3-70b-turbomind | 4.59 | 3.12 | 20.83 | 10.94 | 18.00 | 6.25 | 15.62 |
| internlm2-1.8b-turbomind | 20.64 | 22.90 | 39.72 | 30.00 | 25.94 | 10.94 | 31.45 |
| internlm2-7b-turbomind | 33.94 | 35.51 | 38.33 | 59.33 | 61.56 | 2.34 | 11.29 |
| internlm2-20b-turbomind | 59.17 | 51.40 | 65.16 | 74.00 | 82.19 | 28.91 | 54.03 |
| qwen-1.8b-turbomind | 29.36 | 30.84 | 19.51 | 26.00 | 22.19 | 5.47 | 27.42 |
| qwen-7b-turbomind | 22.48 | 28.04 | 45.64 | 43.33 | 62.19 | 3.91 | 33.87 |
| qwen-14b-turbomind | 54.13 | 56.25 | 82.93 | 72.00 | 85.00 | 4.69 | 65.62 |
| qwen-72b-turbomind | 73.12 | 64.49 | 91.67 | 90.62 | 58.75 | 44.53 | 79.03 |
| qwen1.5-0.5b-hf | 26.61 | 32.71 | 32.40 | 34.67 | 53.44 | 10.94 | 28.23 |
| qwen1.5-1.8b-hf | 36.24 | 33.18 | 56.45 | 36.00 | 49.38 | 6.25 | 33.06 |
| qwen1.5-4b-hf | 45.41 | 37.85 | 68.29 | 62.00 | 87.81 | 5.47 | 47.58 |
| qwen1.5-7b-hf | 56.42 | 53.74 | 85.02 | 69.33 | 86.88 | 28.12 | 70.16 |
| qwen1.5-14b-hf | 69.27 | 63.08 | 54.01 | 79.33 | 76.56 | 40.62 | 79.84 |
| qwen1.5-32b-hf | 71.10 | 61.68 | 92.68 | 93.33 | 95.94 | 45.31 | 83.06 |
| qwen1.5-72b-hf | 71.15 | 68.22 | 94.44 | 96.67 | 95.00 | 38.28 | 75.00 |
| qwen1.5-moe-a2-7b-hf | 35.32 | 29.44 | 68.64 | 44.67 | 75.00 | 17.97 | 59.68 |
| mistral-7b-v0.1-hf | 13.76 | 12.15 | 9.76 | 8.00 | 5.94 | 0.00 | 17.74 |
| mistral-7b-v0.2-hf | 6.88 | 5.61 | 10.45 | 12.00 | 4.06 | 0.78 | 14.52 |
| mixtral-8x7b-v0.1-hf | 3.67 | 1.87 | 0.35 | 0.00 | 0.00 | 0.78 | 0.81 |
| mixtral-8x22b-v0.1-hf | 16.51 | 15.89 | 1.39 | 3.33 | 9.69 | 0.00 | 13.71 |
| yi-6b-hf | 6.25 | 3.12 | 40.74 | 43.75 | 35.94 | 8.59 | 31.25 |
| yi-34b-hf | 12.50 | 4.17 | 31.11 | 5.00 | 20.62 | 2.34 | 0.89 |
| deepseek-7b-base-hf | 14.22 | 13.08 | 25.78 | 20.67 | 20.31 | 5.47 | 18.55 |
| deepseek-67b-base-hf | 3.67 | 4.21 | 8.36 | 7.33 | 4.69 | 1.56 | 4.84 |
| model | 2010-2013_English_MCQs | 2010-2022_Chinese_Modern_Lit | 2010-2022_English_Fill_in_Blanks | 2012-2022_English_Cloze_Test | 2010-2022_Geography_MCQs | 2010-2022_English_Reading_Comp | 2010-2022_Chinese_Lang_and_Usage_MCQs |
|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
| llama-7b-turbomind | 19.05 | 0.00 | 15.00 | 16.15 | 22.11 | 10.43 | 15.00 |
| llama-13b-turbomind | 22.86 | 0.00 | 8.50 | 8.46 | 24.21 | 9.36 | 20.00 |
| llama-30b-turbomind | 28.57 | 0.00 | 6.33 | 13.85 | 23.16 | 12.98 | 12.50 |
| llama-65b-turbomind | 21.90 | 0.00 | 8.00 | 13.85 | 16.84 | 12.34 | 10.00 |
| llama-2-7b-turbomind | 20.95 | 0.00 | 6.17 | 12.31 | 22.11 | 11.28 | 11.25 |
| llama-2-13b-turbomind | 16.19 | 0.00 | 9.83 | 13.08 | 22.11 | 7.66 | 10.00 |
| llama-2-70b-turbomind | 31.43 | 0.00 | 4.17 | 13.08 | 25.26 | 20.43 | 7.50 |
| llama-3-8b-turbomind | 1.90 | 1.15 | 42.00 | 7.69 | 29.47 | 17.66 | 17.50 |
| llama-3-70b-turbomind | 18.75 | 3.45 | 53.67 | 76.15 | 18.60 | 36.76 | 8.75 |
| internlm2-1.8b-turbomind | 33.33 | 3.45 | 15.67 | 13.85 | 32.63 | 10.43 | 25.00 |
| internlm2-7b-turbomind | 61.90 | 20.69 | 57.33 | 20.77 | 61.05 | 40.21 | 47.50 |
| internlm2-20b-turbomind | 72.38 | 37.93 | 62.33 | 19.23 | 74.74 | 38.51 | 48.75 |
| qwen-1.8b-turbomind | 47.62 | 9.20 | 13.50 | 12.31 | 25.26 | 16.38 | 21.25 |
| qwen-7b-turbomind | 42.86 | 12.64 | 35.83 | 26.15 | 51.58 | 17.87 | 30.00 |
| qwen-14b-turbomind | 89.58 | 3.45 | 5.00 | 23.85 | 93.02 | 21.10 | 40.62 |
| qwen-72b-turbomind | 71.43 | 81.25 | 88.17 | 96.25 | 95.79 | 79.57 | 90.00 |
| qwen1.5-0.5b-hf | 40.95 | 22.99 | 21.67 | 21.54 | 38.95 | 17.02 | 22.50 |
| qwen1.5-1.8b-hf | 85.71 | 29.89 | 22.17 | 30.00 | 34.74 | 20.43 | 27.50 |
| qwen1.5-4b-hf | 88.57 | 35.63 | 41.00 | 67.69 | 64.21 | 41.28 | 68.75 |
| qwen1.5-7b-hf | 93.33 | 14.94 | 59.33 | 70.00 | 61.05 | 67.87 | 61.25 |
| qwen1.5-14b-hf | 94.29 | 16.09 | 59.67 | 76.92 | 90.53 | 59.57 | 77.50 |
| qwen1.5-32b-hf | 94.29 | 43.68 | 82.83 | 38.46 | 97.89 | 75.96 | 67.50 |
| qwen1.5-72b-hf | 99.05 | 28.74 | 85.62 | 77.69 | 94.74 | 72.77 | 87.50 |
| qwen1.5-moe-a2-7b-hf | 65.71 | 36.78 | 51.67 | 75.38 | 72.63 | 61.28 | 33.75 |
| mistral-7b-v0.1-hf | 17.14 | 8.05 | 28.33 | 6.92 | 24.21 | 30.43 | 12.50 |
| mistral-7b-v0.2-hf | 7.62 | 9.20 | 23.17 | 6.15 | 25.26 | 19.15 | 7.50 |
| mixtral-8x7b-v0.1-hf | 0.00 | 4.60 | 33.83 | 10.77 | 37.89 | 25.96 | 3.75 |
| mixtral-8x22b-v0.1-hf | 7.62 | 4.17 | 51.33 | 14.62 | 53.68 | 21.91 | 10.00 |
| yi-6b-hf | 17.14 | 52.87 | 50.83 | 36.25 | 36.84 | 48.09 | 36.25 |
| yi-34b-hf | 0.00 | 59.77 | 76.67 | 86.92 | 67.44 | 61.06 | 81.25 |
| deepseek-7b-base-hf | 20.95 | 2.30 | 17.83 | 12.31 | 25.26 | 12.55 | 8.75 |
| deepseek-67b-base-hf | 1.90 | 9.20 | 27.33 | 30.00 | 40.00 | 13.19 | 3.75 |
## Chat Models
| model | GaokaoBench |
|:---:|:---:|
| qwen1.5-0.5b-chat-hf | 21.51 |
| qwen1.5-1.8b-chat-hf | 46.19 |
| qwen1.5-4b-chat-hf | 59.11 |
| qwen1.5-7b-chat-hf | 70.55 |
| qwen1.5-14b-chat-hf | 80.39 |
| qwen1.5-32b-chat-hf | 86.15 |
| qwen1.5-72b-chat-hf | 88.58 |
| qwen1.5-110b-chat-hf | 89.59 |
| internlm2-chat-1.8b-hf | 29.73 |
| internlm2-chat-1.8b-sft-hf | 28.79 |
| internlm2-chat-7b-hf | 54.54 |
| internlm2-chat-7b-sft-hf | 55.39 |
| internlm2-chat-20b-hf | 57.95 |
| internlm2-chat-20b-sft-hf | 57.62 |
| llama-3-8b-instruct-hf | 45.48 |
| llama-3-70b-instruct-hf | 65.91 |
| llama-3-8b-instruct-lmdeploy | 44.48 |
| llama-3-70b-instruct-lmdeploy | 67.06 |
| mistral-7b-instruct-v0.1-hf | 26.21 |
| mistral-7b-instruct-v0.2-hf | 32.17 |
| mixtral-8x7b-instruct-v0.1-hf | 42.46 |
### Details
| model | 2010-2022_Math_II_MCQs | 2010-2022_Math_I_MCQs | 2010-2022_History_MCQs | 2010-2022_Biology_MCQs | 2010-2022_Political_Science_MCQs | 2010-2022_Physics_MCQs | 2010-2022_Chemistry_MCQs |
|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
| qwen1.5-0.5b-chat-hf | 25.23 | 25.70 | 39.02 | 24.67 | 25.00 | 0.78 | 25.00 |
| qwen1.5-1.8b-chat-hf | 30.28 | 26.64 | 61.32 | 55.33 | 77.81 | 11.72 | 40.32 |
| qwen1.5-4b-chat-hf | 38.53 | 35.05 | 70.73 | 70.00 | 83.44 | 25.00 | 41.13 |
| qwen1.5-7b-chat-hf | 49.54 | 39.72 | 81.88 | 82.67 | 90.62 | 46.88 | 61.29 |
| qwen1.5-14b-chat-hf | 64.68 | 54.21 | 87.80 | 90.67 | 94.69 | 44.53 | 69.35 |
| qwen1.5-32b-chat-hf | 70.92 | 66.14 | 98.02 | 97.74 | 96.07 | 57.81 | 72.92 |
| qwen1.5-72b-chat-hf | 76.61 | 68.22 | 95.47 | 96.00 | 97.19 | 64.06 | 86.29 |
| qwen1.5-110b-chat-hf | 80.36 | 66.67 | 100.00 | 100.00 | 96.25 | 65.62 | 75.00 |
| internlm2-chat-1.8b-hf | 28.44 | 28.50 | 46.69 | 39.33 | 44.38 | 10.16 | 26.61 |
| internlm2-chat-1.8b-sft-hf | 23.85 | 20.09 | 55.75 | 40.67 | 53.12 | 14.84 | 30.65 |
| internlm2-chat-7b-hf | 45.87 | 42.52 | 77.70 | 75.33 | 76.56 | 16.41 | 38.71 |
| internlm2-chat-7b-sft-hf | 49.08 | 39.72 | 80.84 | 68.67 | 81.25 | 29.69 | 42.74 |
| internlm2-chat-20b-hf | 53.21 | 46.73 | 80.49 | 74.00 | 85.00 | 31.25 | 37.10 |
| internlm2-chat-20b-sft-hf | 51.83 | 47.20 | 86.06 | 78.00 | 88.12 | 35.16 | 45.16 |
| llama-3-8b-instruct-hf | 37.16 | 31.31 | 60.98 | 48.67 | 51.25 | 11.72 | 39.52 |
| llama-3-70b-instruct-hf | 58.26 | 52.34 | 63.76 | 75.33 | 75.31 | 36.72 | 53.23 |
| llama-3-8b-instruct-lmdeploy | 37.61 | 35.51 | 55.05 | 53.33 | 52.19 | 7.81 | 34.68 |
| llama-3-70b-instruct-lmdeploy | 75.00 | 55.56 | 61.11 | 73.68 | 70.00 | 40.62 | 43.75 |
| mistral-7b-instruct-v0.1-hf | 23.39 | 21.03 | 35.19 | 18.00 | 26.56 | 5.47 | 30.65 |
| mistral-7b-instruct-v0.2-hf | 31.19 | 19.63 | 38.33 | 40.00 | 35.94 | 20.31 | 34.68 |
| mixtral-8x7b-instruct-v0.1-hf | 41.28 | 37.85 | 52.26 | 47.33 | 50.00 | 25.78 | 43.55 |
| model | 2010-2013_English_MCQs | 2010-2022_Chinese_Modern_Lit | 2010-2022_English_Fill_in_Blanks | 2012-2022_English_Cloze_Test | 2010-2022_Geography_MCQs | 2010-2022_English_Reading_Comp | 2010-2022_Chinese_Lang_and_Usage_MCQs |
|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
| qwen1.5-0.5b-chat-hf | 32.38 | 10.34 | 0.00 | 2.31 | 27.37 | 15.11 | 18.75 |
| qwen1.5-1.8b-chat-hf | 69.52 | 42.53 | 56.33 | 2.31 | 61.05 | 32.98 | 35.00 |
| qwen1.5-4b-chat-hf | 70.48 | 58.62 | 82.33 | 16.15 | 68.42 | 68.51 | 47.50 |
| qwen1.5-7b-chat-hf | 83.81 | 71.26 | 85.17 | 57.69 | 81.05 | 78.94 | 66.25 |
| qwen1.5-14b-chat-hf | 93.33 | 78.16 | 97.17 | 71.54 | 91.58 | 94.26 | 81.25 |
| qwen1.5-32b-chat-hf | 100.00 | 81.61 | 95.83 | 90.00 | 97.89 | 92.43 | 92.86 |
| qwen1.5-72b-chat-hf | 98.10 | 83.91 | 98.00 | 90.77 | 94.74 | 96.38 | 96.25 |
| qwen1.5-110b-chat-hf | 100.00 | 91.95 | 98.50 | 97.69 | 95.35 | 98.44 | 100.00 |
| internlm2-chat-1.8b-hf | 38.10 | 6.90 | 0.67 | 1.54 | 56.84 | 23.19 | 30.00 |
| internlm2-chat-1.8b-sft-hf | 50.48 | 0.00 | 0.00 | 0.00 | 27.37 | 11.91 | 32.50 |
| internlm2-chat-7b-hf | 60.95 | 67.82 | 7.00 | 7.69 | 70.53 | 79.79 | 38.75 |
| internlm2-chat-7b-sft-hf | 60.00 | 71.26 | 6.50 | 0.77 | 68.42 | 77.02 | 42.50 |
| internlm2-chat-20b-hf | 60.95 | 43.68 | 34.83 | 4.62 | 71.58 | 62.55 | 43.75 |
| internlm2-chat-20b-sft-hf | 75.24 | 47.13 | 1.00 | 2.31 | 80.00 | 65.96 | 37.50 |
| llama-3-8b-instruct-hf | 50.48 | 36.78 | 30.83 | 21.54 | 57.89 | 81.70 | 28.75 |
| llama-3-70b-instruct-hf | 73.33 | 59.77 | 82.83 | 24.62 | 73.68 | 91.28 | 45.00 |
| llama-3-8b-instruct-lmdeploy | 52.38 | 42.53 | 21.33 | 18.46 | 58.95 | 81.28 | 26.25 |
| llama-3-70b-instruct-lmdeploy | 87.50 | 62.07 | 84.38 | 26.92 | 72.63 | 91.20 | 56.25 |
| mistral-7b-instruct-v0.1-hf | 38.10 | 18.39 | 30.50 | 6.15 | 31.58 | 38.72 | 18.75 |
| mistral-7b-instruct-v0.2-hf | 41.90 | 31.03 | 28.00 | 20.77 | 29.47 | 42.13 | 15.00 |
| mixtral-8x7b-instruct-v0.1-hf | 49.52 | 39.08 | 41.33 | 9.23 | 44.21 | 43.19 | 21.25 |