# MMLU
```bash
python3 run.py --models hf_internlm2_7b --datasets mmlu_ppl_ac766d --debug
python3 run.py --models hf_internlm2_chat_7b --datasets mmlu_gen_4d595a --debug
```
## Base Models
| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other |
|:---|---:|---:|---:|---:|---:|
| llama-7b-turbomind | 35.66 | 31.22 | 37.70 | 38.90 | 37.01 |
| llama-13b-turbomind | 47.76 | 37.68 | 55.36 | 52.43 | 50.83 |
| llama-30b-turbomind | 58.55 | 46.95 | 67.35 | 65.13 | 60.78 |
| llama-65b-turbomind | 63.78 | 52.35 | 73.68 | 70.84 | 64.29 |
| llama-2-7b-turbomind | 46.78 | 37.81 | 52.11 | 51.69 | 50.04 |
| llama-2-13b-turbomind | 55.76 | 44.61 | 63.86 | 62.97 | 57.35 |
| llama-2-70b-turbomind | 69.87 | 58.30 | 79.86 | 75.84 | 71.58 |
| llama-3-8b-turbomind | 66.43 | 55.95 | 76.11 | 70.29 | 68.96 |
| llama-3-70b-turbomind | 79.35 | 70.66 | 87.54 | 83.43 | 80.42 |
| internlm2-1.8b-turbomind | 45.99 | 39.63 | 51.02 | 48.65 | 47.96 |
| internlm2-7b-turbomind | 65.84 | 56.48 | 74.43 | 69.68 | 67.75 |
| internlm2-20b-turbomind | 67.58 | 59.01 | 76.04 | 71.20 | 68.69 |
| qwen-1.8b-turbomind | 46.61 | 38.91 | 51.35 | 49.57 | 50.51 |
| qwen-7b-turbomind | 59.75 | 50.16 | 67.98 | 63.48 | 62.44 |
| qwen-14b-turbomind | 67.85 | 59.13 | 76.18 | 71.62 | 69.12 |
| qwen-72b-turbomind | 77.36 | 68.70 | 85.28 | 80.60 | 79.45 |
| qwen1.5-0.5b-hf | 39.98 | 33.96 | 45.08 | 41.59 | 42.48 |
| qwen1.5-1.8b-hf | 47.14 | 39.47 | 52.70 | 49.01 | 51.33 |
| qwen1.5-4b-hf | 57.03 | 47.80 | 64.86 | 60.10 | 60.20 |
| qwen1.5-7b-hf | 62.15 | 53.22 | 70.25 | 65.62 | 64.26 |
| qwen1.5-14b-hf | 69.10 | 61.46 | 77.57 | 71.25 | 70.29 |
| qwen1.5-32b-hf | 73.88 | 65.60 | 81.41 | 77.10 | 75.79 |
| qwen1.5-72b-hf | 77.02 | 69.00 | 84.55 | 80.60 | 78.21 |
| qwen1.5-moe-a2-7b-hf | 62.09 | 53.27 | 70.74 | 63.80 | 65.28 |
| mistral-7b-v0.1-hf | 64.04 | 53.21 | 73.65 | 68.04 | 67.00 |
| mistral-7b-v0.2-hf | 63.85 | 53.21 | 72.17 | 68.40 | 67.15 |
| mixtral-8x7b-v0.1-hf | 71.80 | 61.70 | 81.03 | 75.51 | 74.35 |
| mixtral-8x22b-v0.1-hf | 77.67 | 68.94 | 86.81 | 81.23 | 78.43 |
| yi-6b-hf | 64.08 | 52.61 | 74.10 | 68.58 | 67.11 |
| yi-34b-hf | 76.26 | 66.73 | 83.74 | 81.78 | 77.77 |
| deepseek-7b-base-hf | 49.22 | 40.17 | 56.73 | 53.46 | 51.26 |
| deepseek-67b-base-hf | 71.95 | 60.57 | 81.69 | 77.11 | 74.42 |
### Details
model |
college_biology |
college_chemistry |
college_computer_science |
college_mathematics |
college_physics |
electrical_engineering |
astronomy |
anatomy |
abstract_algebra |
machine_learning |
clinical_knowledge |
global_facts |
llama-7b-turbomind |
37.50 |
30.00 |
30.00 |
33.00 |
23.53 |
23.45 |
34.87 |
37.78 |
25.00 |
27.68 |
34.34 |
31.00 |
llama-13b-turbomind |
46.53 |
30.00 |
42.00 |
36.00 |
18.63 |
42.76 |
46.71 |
46.67 |
30.00 |
32.14 |
45.66 |
37.00 |
llama-30b-turbomind |
59.03 |
45.00 |
47.00 |
35.00 |
26.47 |
53.10 |
61.18 |
51.85 |
37.00 |
41.07 |
57.36 |
38.00 |
llama-65b-turbomind |
68.75 |
49.00 |
47.00 |
37.00 |
35.29 |
55.17 |
73.03 |
57.78 |
30.00 |
48.21 |
66.04 |
38.00 |
llama-2-7b-turbomind |
46.53 |
34.00 |
33.00 |
34.00 |
22.55 |
47.59 |
40.13 |
47.41 |
29.00 |
38.39 |
46.42 |
32.00 |
llama-2-13b-turbomind |
59.03 |
44.00 |
48.00 |
29.00 |
26.47 |
50.34 |
53.29 |
49.63 |
35.00 |
28.57 |
60.00 |
32.00 |
llama-2-70b-turbomind |
84.72 |
51.00 |
60.00 |
39.00 |
37.25 |
65.52 |
81.58 |
63.70 |
32.00 |
52.68 |
72.08 |
46.00 |
llama-3-8b-turbomind |
77.08 |
46.00 |
51.00 |
31.00 |
51.96 |
62.76 |
67.11 |
68.15 |
34.00 |
52.68 |
74.72 |
35.00 |
llama-3-70b-turbomind |
93.75 |
62.00 |
72.00 |
52.00 |
50.98 |
74.48 |
92.11 |
79.26 |
48.00 |
63.39 |
86.42 |
49.00 |
internlm2-1.8b-turbomind |
38.89 |
37.00 |
44.00 |
35.00 |
30.39 |
49.66 |
50.66 |
44.44 |
25.00 |
35.71 |
51.32 |
32.00 |
internlm2-7b-turbomind |
77.08 |
48.00 |
64.00 |
33.00 |
47.06 |
63.45 |
73.68 |
57.78 |
37.00 |
45.54 |
69.81 |
35.00 |
internlm2-20b-turbomind |
83.33 |
51.00 |
61.00 |
36.00 |
45.10 |
64.83 |
75.00 |
59.26 |
39.00 |
53.57 |
73.58 |
32.00 |
qwen-1.8b-turbomind |
42.36 |
36.00 |
39.00 |
34.00 |
27.45 |
51.03 |
50.66 |
42.96 |
31.00 |
31.25 |
53.21 |
28.00 |
qwen-7b-turbomind |
67.36 |
48.00 |
53.00 |
28.00 |
39.22 |
59.31 |
63.82 |
49.63 |
34.00 |
38.39 |
63.02 |
37.00 |
qwen-14b-turbomind |
78.47 |
51.00 |
62.00 |
42.00 |
49.02 |
65.52 |
71.05 |
60.00 |
37.00 |
58.93 |
71.32 |
40.00 |
qwen-72b-turbomind |
93.75 |
56.00 |
66.00 |
56.00 |
50.98 |
80.69 |
85.53 |
73.33 |
41.00 |
62.50 |
83.77 |
54.00 |
qwen1.5-0.5b-hf |
38.89 |
25.00 |
38.00 |
32.00 |
25.49 |
45.52 |
44.74 |
33.33 |
30.00 |
39.29 |
38.11 |
39.00 |
qwen1.5-1.8b-hf |
43.75 |
34.00 |
45.00 |
38.00 |
28.43 |
47.59 |
47.37 |
40.74 |
32.00 |
31.25 |
53.96 |
37.00 |
qwen1.5-4b-hf |
50.00 |
46.00 |
41.00 |
45.00 |
31.37 |
53.10 |
61.18 |
51.85 |
35.00 |
44.64 |
60.38 |
37.00 |
qwen1.5-7b-hf |
66.67 |
48.00 |
55.00 |
37.00 |
41.18 |
60.69 |
65.79 |
52.59 |
39.00 |
41.07 |
68.68 |
43.00 |
qwen1.5-14b-hf |
75.69 |
49.00 |
58.00 |
49.00 |
49.02 |
71.72 |
73.03 |
65.93 |
39.00 |
52.68 |
73.96 |
49.00 |
qwen1.5-32b-hf |
85.42 |
53.00 |
59.00 |
51.00 |
53.92 |
72.41 |
82.24 |
63.70 |
43.00 |
58.04 |
78.11 |
50.00 |
qwen1.5-72b-hf |
90.97 |
54.00 |
65.00 |
57.00 |
52.94 |
80.00 |
87.50 |
73.33 |
43.00 |
64.29 |
81.89 |
50.00 |
qwen1.5-moe-a2-7b-hf |
62.50 |
44.00 |
54.00 |
41.00 |
49.02 |
58.62 |
69.74 |
57.78 |
37.00 |
38.39 |
66.79 |
38.00 |
mistral-7b-v0.1-hf |
72.92 |
50.00 |
51.00 |
40.00 |
39.22 |
57.93 |
65.79 |
62.96 |
29.00 |
49.11 |
69.43 |
36.00 |
mistral-7b-v0.2-hf |
71.53 |
49.00 |
53.00 |
40.00 |
36.27 |
57.24 |
64.47 |
60.00 |
29.00 |
53.57 |
67.92 |
39.00 |
mixtral-8x7b-v0.1-hf |
85.42 |
54.00 |
62.00 |
43.00 |
46.08 |
68.97 |
82.89 |
70.37 |
37.00 |
56.25 |
79.25 |
51.00 |
mixtral-8x22b-v0.1-hf |
89.58 |
56.00 |
69.00 |
48.00 |
52.94 |
76.55 |
86.18 |
77.04 |
53.00 |
62.50 |
82.26 |
56.00 |
yi-6b-hf |
66.67 |
43.00 |
51.00 |
39.00 |
35.29 |
64.83 |
65.79 |
60.00 |
29.00 |
41.96 |
66.79 |
46.00 |
yi-34b-hf |
88.89 |
52.00 |
66.00 |
44.00 |
48.04 |
80.00 |
89.47 |
74.81 |
44.00 |
58.04 |
78.87 |
52.00 |
deepseek-7b-base-hf |
52.08 |
29.00 |
44.00 |
40.00 |
31.37 |
44.83 |
51.97 |
40.74 |
27.00 |
32.14 |
53.58 |
31.00 |
deepseek-67b-base-hf |
84.72 |
52.00 |
62.00 |
42.00 |
42.16 |
70.34 |
80.92 |
65.19 |
39.00 |
50.00 |
78.11 |
42.00 |
model |
management |
nutrition |
marketing |
professional_accounting |
high_school_geography |
international_law |
moral_scenarios |
computer_security |
high_school_microeconomics |
professional_law |
medical_genetics |
professional_psychology |
llama-7b-turbomind |
33.01 |
39.22 |
45.73 |
26.24 |
33.33 |
51.24 |
24.25 |
45.00 |
31.09 |
30.05 |
37.00 |
35.13 |
llama-13b-turbomind |
66.02 |
51.63 |
71.79 |
34.75 |
55.05 |
64.46 |
30.06 |
63.00 |
47.48 |
37.22 |
53.00 |
48.53 |
llama-30b-turbomind |
76.70 |
62.42 |
84.19 |
44.68 |
71.72 |
75.21 |
40.56 |
66.00 |
57.98 |
46.48 |
66.00 |
63.73 |
llama-65b-turbomind |
82.52 |
68.95 |
87.18 |
48.94 |
79.29 |
81.82 |
47.82 |
79.00 |
68.49 |
50.07 |
68.00 |
66.67 |
llama-2-7b-turbomind |
53.40 |
48.69 |
68.38 |
36.52 |
49.49 |
65.29 |
24.02 |
60.00 |
44.12 |
36.31 |
55.00 |
43.79 |
llama-2-13b-turbomind |
72.82 |
61.76 |
79.49 |
39.72 |
69.19 |
74.38 |
43.80 |
70.00 |
58.40 |
42.50 |
54.00 |
54.90 |
llama-2-70b-turbomind |
83.50 |
77.12 |
91.03 |
56.03 |
86.87 |
87.60 |
44.69 |
77.00 |
77.31 |
52.93 |
74.00 |
75.65 |
llama-3-8b-turbomind |
87.38 |
75.82 |
89.74 |
48.94 |
80.81 |
84.30 |
40.89 |
81.00 |
73.95 |
46.22 |
77.00 |
71.90 |
llama-3-70b-turbomind |
91.26 |
87.25 |
94.87 |
64.18 |
93.94 |
89.26 |
62.91 |
83.00 |
87.82 |
61.80 |
90.00 |
85.78 |
internlm2-1.8b-turbomind |
60.19 |
58.17 |
63.25 |
31.21 |
56.57 |
56.20 |
24.47 |
52.00 |
50.42 |
36.11 |
53.00 |
41.83 |
internlm2-7b-turbomind |
79.61 |
75.49 |
87.61 |
48.23 |
82.83 |
77.69 |
49.39 |
74.00 |
72.27 |
47.65 |
73.00 |
65.03 |
internlm2-20b-turbomind |
79.61 |
75.49 |
91.88 |
50.00 |
87.88 |
85.95 |
35.08 |
81.00 |
70.59 |
49.48 |
78.00 |
70.10 |
qwen-1.8b-turbomind |
66.02 |
60.46 |
73.50 |
38.30 |
56.57 |
66.94 |
23.91 |
56.00 |
42.02 |
33.96 |
51.00 |
39.54 |
qwen-7b-turbomind |
78.64 |
67.32 |
83.33 |
41.49 |
76.77 |
76.03 |
29.72 |
73.00 |
58.40 |
41.72 |
69.00 |
59.64 |
qwen-14b-turbomind |
78.64 |
73.86 |
88.89 |
48.58 |
83.84 |
84.30 |
45.47 |
77.00 |
73.95 |
50.85 |
74.00 |
69.61 |
qwen-72b-turbomind |
90.29 |
84.97 |
94.87 |
65.96 |
92.93 |
88.43 |
65.70 |
79.00 |
84.87 |
61.21 |
86.00 |
82.19 |
qwen1.5-0.5b-hf |
52.43 |
46.41 |
60.68 |
31.21 |
46.46 |
56.20 |
25.70 |
46.00 |
37.39 |
32.79 |
46.00 |
37.75 |
qwen1.5-1.8b-hf |
66.02 |
58.50 |
75.64 |
33.69 |
56.06 |
72.73 |
24.69 |
57.00 |
39.50 |
36.11 |
53.00 |
42.81 |
qwen1.5-4b-hf |
74.76 |
62.75 |
84.19 |
46.81 |
76.77 |
71.07 |
25.03 |
67.00 |
55.04 |
41.33 |
64.00 |
56.05 |
qwen1.5-7b-hf |
78.64 |
70.92 |
86.32 |
44.68 |
81.82 |
77.69 |
32.74 |
76.00 |
64.29 |
45.37 |
68.00 |
61.27 |
qwen1.5-14b-hf |
80.58 |
75.49 |
85.90 |
51.06 |
86.36 |
80.99 |
45.03 |
80.00 |
76.47 |
48.57 |
78.00 |
69.61 |
qwen1.5-32b-hf |
86.41 |
81.37 |
95.30 |
56.38 |
91.41 |
88.43 |
44.02 |
76.00 |
82.77 |
57.89 |
83.00 |
75.33 |
qwen1.5-72b-hf |
87.38 |
85.29 |
94.87 |
64.89 |
92.42 |
90.08 |
62.12 |
83.00 |
84.03 |
60.76 |
86.00 |
81.05 |
qwen1.5-moe-a2-7b-hf |
78.64 |
70.92 |
86.32 |
46.81 |
81.82 |
77.69 |
25.59 |
71.00 |
65.97 |
45.37 |
65.00 |
61.44 |
mistral-7b-v0.1-hf |
82.52 |
75.49 |
87.61 |
48.94 |
76.77 |
77.69 |
32.51 |
77.00 |
66.39 |
44.98 |
74.00 |
67.97 |
mistral-7b-v0.2-hf |
81.55 |
74.18 |
88.46 |
51.06 |
76.77 |
80.99 |
38.77 |
75.00 |
64.71 |
45.37 |
72.00 |
66.34 |
mixtral-8x7b-v0.1-hf |
87.38 |
81.70 |
91.88 |
51.77 |
85.86 |
85.95 |
40.11 |
80.00 |
79.41 |
53.32 |
77.00 |
77.94 |
mixtral-8x22b-v0.1-hf |
89.32 |
85.95 |
91.88 |
62.06 |
91.41 |
90.08 |
64.58 |
83.00 |
87.82 |
60.82 |
84.00 |
83.17 |
yi-6b-hf |
80.58 |
71.57 |
91.03 |
48.23 |
83.33 |
76.86 |
41.34 |
75.00 |
74.79 |
49.35 |
80.00 |
65.69 |
yi-34b-hf |
91.26 |
85.62 |
92.31 |
65.25 |
89.39 |
91.74 |
64.69 |
82.00 |
85.29 |
59.97 |
87.00 |
82.19 |
deepseek-7b-base-hf |
61.17 |
53.59 |
72.22 |
34.04 |
59.09 |
65.29 |
26.37 |
61.00 |
44.96 |
35.53 |
56.00 |
49.18 |
deepseek-67b-base-hf |
88.35 |
79.74 |
91.88 |
57.09 |
89.39 |
85.12 |
46.15 |
76.00 |
82.35 |
55.93 |
72.00 |
79.58 |
model |
jurisprudence |
world_religions |
philosophy |
virology |
high_school_chemistry |
public_relations |
high_school_macroeconomics |
human_sexuality |
elementary_mathematics |
high_school_physics |
high_school_computer_science |
high_school_european_history |
llama-7b-turbomind |
41.67 |
49.12 |
40.84 |
34.94 |
29.56 |
40.00 |
34.10 |
35.11 |
26.46 |
27.81 |
34.00 |
41.82 |
llama-13b-turbomind |
51.85 |
67.84 |
55.31 |
43.37 |
28.57 |
60.91 |
46.15 |
57.25 |
26.98 |
29.80 |
49.00 |
61.21 |
llama-30b-turbomind |
71.30 |
79.53 |
66.24 |
49.40 |
40.39 |
70.00 |
56.67 |
64.89 |
37.30 |
35.10 |
60.00 |
70.91 |
llama-65b-turbomind |
75.00 |
81.29 |
73.63 |
53.01 |
41.38 |
74.55 |
65.90 |
77.86 |
40.21 |
35.76 |
69.00 |
76.36 |
llama-2-7b-turbomind |
53.70 |
69.01 |
60.13 |
41.57 |
36.95 |
54.55 |
45.90 |
55.73 |
27.25 |
31.13 |
40.00 |
59.39 |
llama-2-13b-turbomind |
74.07 |
76.61 |
63.99 |
45.78 |
44.83 |
62.73 |
50.77 |
62.60 |
34.13 |
36.42 |
57.00 |
63.03 |
llama-2-70b-turbomind |
83.33 |
85.96 |
78.46 |
53.61 |
52.22 |
69.09 |
74.87 |
87.02 |
43.39 |
43.71 |
78.00 |
84.24 |
llama-3-8b-turbomind |
75.00 |
83.04 |
74.28 |
56.02 |
54.68 |
71.82 |
64.87 |
79.39 |
42.06 |
45.03 |
68.00 |
76.36 |
llama-3-70b-turbomind |
86.11 |
91.23 |
86.50 |
57.83 |
71.92 |
74.55 |
82.56 |
88.55 |
62.70 |
56.95 |
86.00 |
86.67 |
internlm2-1.8b-turbomind |
55.56 |
59.65 |
51.13 |
40.96 |
43.35 |
52.73 |
43.33 |
47.33 |
30.42 |
33.11 |
47.00 |
56.36 |
internlm2-7b-turbomind |
79.63 |
82.46 |
73.63 |
51.20 |
55.17 |
70.00 |
66.92 |
70.99 |
46.03 |
42.38 |
70.00 |
78.79 |
internlm2-20b-turbomind |
75.93 |
82.46 |
73.95 |
56.02 |
57.64 |
68.18 |
70.51 |
68.70 |
49.21 |
38.41 |
75.00 |
82.42 |
qwen-1.8b-turbomind |
59.26 |
56.14 |
50.80 |
40.96 |
37.93 |
60.00 |
41.03 |
51.15 |
33.33 |
34.44 |
39.00 |
64.24 |
qwen-7b-turbomind |
73.15 |
76.61 |
67.20 |
47.59 |
51.23 |
65.45 |
60.00 |
69.47 |
43.12 |
38.41 |
67.00 |
66.67 |
qwen-14b-turbomind |
76.85 |
84.21 |
72.03 |
53.01 |
65.52 |
66.36 |
66.92 |
78.63 |
51.32 |
41.72 |
72.00 |
82.42 |
qwen-72b-turbomind |
83.33 |
88.30 |
83.28 |
58.43 |
65.52 |
74.55 |
81.54 |
89.31 |
68.52 |
58.28 |
81.00 |
84.24 |
qwen1.5-0.5b-hf |
40.74 |
40.94 |
41.48 |
40.96 |
28.57 |
50.91 |
36.92 |
41.98 |
28.84 |
22.52 |
37.00 |
52.73 |
qwen1.5-1.8b-hf |
55.56 |
57.31 |
49.84 |
40.96 |
36.45 |
56.36 |
43.59 |
56.49 |
35.19 |
27.81 |
45.00 |
61.21 |
qwen1.5-4b-hf |
70.37 |
70.76 |
61.74 |
44.58 |
45.32 |
65.45 |
54.62 |
64.89 |
47.88 |
32.45 |
62.00 |
70.30 |
qwen1.5-7b-hf |
75.93 |
77.19 |
66.24 |
50.60 |
53.20 |
62.73 |
60.00 |
71.76 |
50.26 |
38.41 |
71.00 |
74.55 |
qwen1.5-14b-hf |
74.07 |
83.63 |
70.74 |
46.39 |
58.62 |
64.55 |
73.59 |
76.34 |
59.26 |
49.01 |
75.00 |
83.64 |
qwen1.5-32b-hf |
83.33 |
85.96 |
82.96 |
56.63 |
61.58 |
63.64 |
77.95 |
83.97 |
69.31 |
50.99 |
85.00 |
86.06 |
qwen1.5-72b-hf |
84.26 |
88.89 |
82.32 |
57.23 |
66.01 |
72.73 |
82.05 |
87.02 |
69.31 |
56.95 |
84.00 |
84.24 |
qwen1.5-moe-a2-7b-hf |
70.37 |
80.12 |
66.56 |
51.20 |
47.78 |
64.55 |
62.31 |
70.99 |
46.30 |
45.03 |
59.00 |
69.70 |
mistral-7b-v0.1-hf |
77.78 |
83.04 |
69.45 |
54.82 |
53.20 |
67.27 |
66.15 |
78.63 |
38.10 |
31.79 |
68.00 |
78.79 |
mistral-7b-v0.2-hf |
73.15 |
82.46 |
72.99 |
53.01 |
55.67 |
66.36 |
62.31 |
77.10 |
40.48 |
34.44 |
66.00 |
76.36 |
mixtral-8x7b-v0.1-hf |
82.41 |
88.30 |
78.14 |
51.20 |
62.56 |
70.00 |
70.77 |
80.92 |
48.68 |
48.34 |
71.00 |
80.61 |
mixtral-8x22b-v0.1-hf |
84.26 |
89.47 |
84.57 |
59.04 |
67.49 |
78.18 |
79.23 |
88.55 |
61.64 |
52.98 |
87.00 |
86.06 |
yi-6b-hf |
78.70 |
81.87 |
69.77 |
46.39 |
52.71 |
73.64 |
65.13 |
74.81 |
46.30 |
38.41 |
66.00 |
71.52 |
yi-34b-hf |
89.81 |
86.55 |
83.92 |
57.23 |
64.04 |
73.64 |
79.49 |
85.50 |
66.40 |
52.32 |
81.00 |
86.06 |
deepseek-7b-base-hf |
55.56 |
73.10 |
56.59 |
46.99 |
34.98 |
62.73 |
48.21 |
58.78 |
28.57 |
29.14 |
50.00 |
61.82 |
deepseek-67b-base-hf |
84.26 |
85.96 |
81.03 |
56.02 |
57.64 |
72.73 |
73.85 |
82.44 |
51.59 |
45.03 |
74.00 |
81.82 |
model |
business_ethics |
moral_disputes |
high_school_statistics |
miscellaneous |
formal_logic |
high_school_government_and_politics |
prehistory |
security_studies |
high_school_biology |
logical_fallacies |
high_school_world_history |
professional_medicine |
llama-7b-turbomind |
42.00 |
40.46 |
32.87 |
42.78 |
26.19 |
46.11 |
35.19 |
33.47 |
32.90 |
42.33 |
43.88 |
43.75 |
llama-13b-turbomind |
46.00 |
50.00 |
30.56 |
64.88 |
31.75 |
66.84 |
51.85 |
52.65 |
51.94 |
52.76 |
67.51 |
51.10 |
llama-30b-turbomind |
55.00 |
66.76 |
49.07 |
77.91 |
36.51 |
82.90 |
68.21 |
66.12 |
69.35 |
67.48 |
80.59 |
55.88 |
llama-65b-turbomind |
59.00 |
73.70 |
61.57 |
81.35 |
43.65 |
88.60 |
73.46 |
71.84 |
74.19 |
77.30 |
83.97 |
62.13 |
llama-2-7b-turbomind |
53.00 |
51.16 |
27.78 |
63.60 |
27.78 |
67.36 |
48.77 |
47.76 |
50.97 |
51.53 |
64.56 |
52.57 |
llama-2-13b-turbomind |
54.00 |
64.45 |
45.37 |
74.46 |
36.51 |
80.83 |
64.81 |
62.86 |
67.42 |
66.87 |
72.15 |
54.41 |
llama-2-70b-turbomind |
72.00 |
77.17 |
63.43 |
86.08 |
48.41 |
94.30 |
83.64 |
78.37 |
81.61 |
80.98 |
87.76 |
74.63 |
llama-3-8b-turbomind |
62.00 |
73.70 |
54.17 |
82.76 |
48.41 |
90.16 |
72.53 |
75.51 |
77.74 |
73.01 |
82.70 |
72.06 |
llama-3-70b-turbomind |
83.00 |
85.55 |
72.22 |
92.21 |
66.67 |
97.41 |
91.05 |
84.90 |
90.32 |
87.73 |
94.09 |
87.13 |
internlm2-1.8b-turbomind |
44.00 |
45.95 |
38.89 |
59.39 |
32.54 |
60.62 |
50.31 |
54.29 |
52.58 |
45.40 |
62.87 |
37.87 |
internlm2-7b-turbomind |
69.00 |
66.76 |
57.87 |
80.72 |
50.00 |
90.16 |
73.15 |
75.10 |
79.68 |
68.71 |
81.01 |
70.22 |
internlm2-20b-turbomind |
74.00 |
74.57 |
60.19 |
81.48 |
44.44 |
91.71 |
75.31 |
81.63 |
82.58 |
75.46 |
87.76 |
63.60 |
qwen-1.8b-turbomind |
52.00 |
52.31 |
34.72 |
57.98 |
29.37 |
59.07 |
47.22 |
48.57 |
52.26 |
44.17 |
61.18 |
43.38 |
qwen-7b-turbomind |
68.00 |
64.74 |
45.37 |
77.39 |
43.65 |
83.94 |
68.21 |
70.20 |
72.26 |
65.64 |
75.95 |
58.46 |
qwen-14b-turbomind |
75.00 |
74.86 |
57.87 |
84.04 |
51.59 |
91.71 |
70.99 |
77.14 |
83.55 |
73.01 |
83.12 |
67.65 |
qwen-72b-turbomind |
80.00 |
84.97 |
68.98 |
91.44 |
54.76 |
98.96 |
87.04 |
81.63 |
89.03 |
84.05 |
90.30 |
84.93 |
qwen1.5-0.5b-hf |
47.00 |
46.82 |
23.15 |
48.02 |
29.37 |
48.70 |
40.12 |
38.37 |
40.65 |
35.58 |
53.16 |
31.62 |
qwen1.5-1.8b-hf |
54.00 |
54.91 |
28.70 |
61.69 |
23.81 |
58.03 |
48.15 |
51.84 |
55.48 |
45.40 |
59.92 |
39.71 |
qwen1.5-4b-hf |
65.00 |
66.76 |
44.44 |
73.95 |
35.71 |
78.24 |
60.19 |
65.31 |
66.45 |
65.64 |
71.31 |
50.00 |
qwen1.5-7b-hf |
68.00 |
70.81 |
48.61 |
76.50 |
38.89 |
84.97 |
69.44 |
68.16 |
74.52 |
68.10 |
77.22 |
56.25 |
qwen1.5-14b-hf |
77.00 |
73.70 |
62.96 |
83.40 |
53.17 |
90.67 |
71.60 |
80.82 |
84.52 |
76.69 |
83.54 |
71.69 |
qwen1.5-32b-hf |
77.00 |
78.90 |
68.98 |
88.12 |
54.76 |
94.82 |
81.48 |
80.82 |
88.39 |
82.21 |
86.08 |
80.88 |
qwen1.5-72b-hf |
80.00 |
84.39 |
68.98 |
91.44 |
55.56 |
98.96 |
86.73 |
81.63 |
88.71 |
85.89 |
89.87 |
82.72 |
qwen1.5-moe-a2-7b-hf |
74.00 |
65.90 |
56.48 |
82.25 |
34.13 |
84.46 |
70.68 |
74.29 |
73.23 |
68.10 |
76.79 |
66.91 |
mistral-7b-v0.1-hf |
57.00 |
71.10 |
57.41 |
81.61 |
40.48 |
86.53 |
73.46 |
72.65 |
76.77 |
79.14 |
77.22 |
68.75 |
mistral-7b-v0.2-hf |
61.00 |
71.39 |
52.78 |
80.08 |
40.48 |
88.08 |
69.44 |
72.24 |
76.13 |
77.91 |
78.06 |
70.59 |
mixtral-8x7b-v0.1-hf |
77.00 |
80.06 |
63.43 |
87.87 |
54.76 |
93.26 |
83.95 |
80.00 |
84.19 |
79.14 |
88.61 |
81.25 |
mixtral-8x22b-v0.1-hf |
72.00 |
84.10 |
68.52 |
90.68 |
57.14 |
96.37 |
86.73 |
86.53 |
90.32 |
87.73 |
90.30 |
87.87 |
yi-6b-hf |
67.00 |
69.36 |
52.78 |
80.46 |
44.44 |
89.64 |
70.99 |
74.69 |
77.10 |
78.53 |
78.90 |
65.81 |
yi-34b-hf |
79.00 |
83.82 |
66.67 |
90.29 |
57.14 |
97.93 |
87.65 |
84.90 |
88.39 |
87.73 |
92.83 |
81.99 |
deepseek-7b-base-hf |
49.00 |
52.31 |
41.20 |
66.28 |
30.95 |
63.73 |
55.86 |
51.84 |
52.90 |
58.90 |
62.45 |
45.22 |
deepseek-67b-base-hf |
81.00 |
77.17 |
63.89 |
90.04 |
53.17 |
97.93 |
85.49 |
73.88 |
82.26 |
84.05 |
91.56 |
78.31 |
model |
high_school_mathematics |
college_medicine |
high_school_us_history |
sociology |
econometrics |
high_school_psychology |
human_aging |
us_foreign_policy |
llama-7b-turbomind |
24.81 |
32.95 |
38.73 |
45.77 |
27.19 |
48.07 |
38.12 |
43.00 |
llama-13b-turbomind |
26.30 |
42.20 |
59.80 |
61.19 |
28.95 |
61.28 |
53.36 |
78.00 |
llama-30b-turbomind |
27.41 |
54.91 |
76.96 |
79.10 |
35.96 |
76.15 |
67.71 |
83.00 |
llama-65b-turbomind |
34.44 |
54.34 |
82.84 |
81.09 |
39.47 |
82.39 |
66.37 |
88.00 |
llama-2-7b-turbomind |
29.63 |
43.35 |
60.29 |
62.69 |
27.19 |
62.75 |
56.05 |
64.00 |
llama-2-13b-turbomind |
27.04 |
52.60 |
75.49 |
73.13 |
32.46 |
76.51 |
64.57 |
82.00 |
llama-2-70b-turbomind |
34.07 |
64.16 |
90.69 |
90.55 |
44.74 |
87.52 |
80.27 |
92.00 |
llama-3-8b-turbomind |
38.15 |
64.16 |
83.33 |
86.57 |
47.37 |
84.04 |
70.85 |
87.00 |
llama-3-70b-turbomind |
48.89 |
79.77 |
95.10 |
94.03 |
72.81 |
94.13 |
82.51 |
94.00 |
internlm2-1.8b-turbomind |
30.37 |
41.04 |
55.88 |
51.74 |
28.95 |
61.47 |
51.12 |
63.00 |
internlm2-7b-turbomind |
39.63 |
68.21 |
76.96 |
84.58 |
44.74 |
84.59 |
72.65 |
86.00 |
internlm2-20b-turbomind |
39.63 |
66.47 |
82.84 |
85.07 |
47.37 |
86.79 |
70.85 |
84.00 |
qwen-1.8b-turbomind |
28.52 |
43.35 |
54.90 |
60.70 |
36.84 |
60.73 |
48.43 |
60.00 |
qwen-7b-turbomind |
30.00 |
57.23 |
75.98 |
79.10 |
32.46 |
79.27 |
63.23 |
81.00 |
qwen-14b-turbomind |
37.41 |
70.52 |
81.37 |
85.07 |
50.00 |
84.95 |
73.09 |
86.00 |
qwen-72b-turbomind |
50.00 |
75.72 |
92.16 |
90.05 |
59.65 |
92.66 |
82.51 |
95.00 |
qwen1.5-0.5b-hf |
29.63 |
33.53 |
45.10 |
59.70 |
28.95 |
44.77 |
37.22 |
69.00 |
qwen1.5-1.8b-hf |
34.07 |
39.31 |
47.55 |
63.18 |
32.46 |
59.08 |
53.81 |
73.00 |
qwen1.5-4b-hf |
35.93 |
55.49 |
71.08 |
73.13 |
37.72 |
72.11 |
63.68 |
79.00 |
qwen1.5-7b-hf |
34.81 |
61.85 |
78.92 |
82.09 |
41.23 |
80.73 |
61.88 |
84.00 |
qwen1.5-14b-hf |
45.93 |
68.21 |
80.88 |
83.08 |
55.26 |
86.06 |
73.09 |
88.00 |
qwen1.5-32b-hf |
47.04 |
76.30 |
90.20 |
86.07 |
57.89 |
90.28 |
75.78 |
92.00 |
qwen1.5-72b-hf |
47.78 |
75.14 |
92.65 |
88.56 |
59.65 |
92.48 |
79.82 |
94.00 |
qwen1.5-moe-a2-7b-hf |
46.30 |
54.91 |
78.43 |
79.10 |
38.60 |
82.39 |
66.82 |
83.00 |
mistral-7b-v0.1-hf |
33.70 |
65.32 |
78.92 |
83.08 |
50.00 |
82.39 |
69.51 |
86.00 |
mistral-7b-v0.2-hf |
38.15 |
64.16 |
81.86 |
82.09 |
43.86 |
80.18 |
69.96 |
86.00 |
mixtral-8x7b-v0.1-hf |
40.37 |
69.94 |
86.27 |
88.56 |
65.79 |
88.81 |
79.37 |
91.00 |
mixtral-8x22b-v0.1-hf |
45.93 |
79.19 |
90.20 |
93.03 |
70.18 |
92.29 |
79.37 |
95.00 |
yi-6b-hf |
32.59 |
61.27 |
79.90 |
82.59 |
35.96 |
82.94 |
67.26 |
86.00 |
yi-34b-hf |
45.19 |
71.68 |
91.18 |
88.56 |
55.26 |
91.74 |
78.48 |
91.00 |
deepseek-7b-base-hf |
28.89 |
41.62 |
60.29 |
70.15 |
26.32 |
69.72 |
55.61 |
76.00 |
deepseek-67b-base-hf |
38.89 |
72.25 |
90.69 |
90.05 |
52.63 |
90.46 |
80.72 |
95.00 |
## Chat Models
| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other |
|:---|---:|---:|---:|---:|---:|
| qwen1.5-0.5b-chat-hf | 35.32 | 30.90 | 37.59 | 37.29 | 37.73 |
| qwen1.5-1.8b-chat-hf | 45.62 | 39.20 | 49.21 | 47.67 | 49.63 |
| qwen1.5-4b-chat-hf | 55.90 | 48.07 | 62.67 | 59.70 | 57.31 |
| qwen1.5-7b-chat-hf | 61.79 | 52.68 | 69.41 | 66.41 | 63.45 |
| qwen1.5-14b-chat-hf | 67.96 | 59.79 | 75.46 | 71.23 | 69.72 |
| qwen1.5-32b-chat-hf | 75.36 | 67.04 | 82.11 | 80.44 | 76.23 |
| qwen1.5-72b-chat-hf | 77.24 | 69.59 | 83.95 | 81.58 | 77.87 |
| qwen1.5-110b-chat-hf | 77.95 | 71.56 | 83.77 | 81.44 | 78.41 |
| internlm2-chat-1.8b-hf | 47.58 | 40.88 | 53.33 | 49.92 | 49.74 |
| internlm2-chat-1.8b-sft-hf | 47.44 | 40.55 | 53.31 | 49.67 | 49.89 |
| internlm2-chat-7b-hf | 63.05 | 53.42 | 71.47 | 67.27 | 65.13 |
| internlm2-chat-7b-sft-hf | 63.33 | 53.95 | 71.74 | 67.62 | 65.00 |
| internlm2-chat-20b-hf | 67.37 | 57.39 | 75.75 | 71.63 | 69.95 |
| internlm2-chat-20b-sft-hf | 67.34 | 57.49 | 75.67 | 70.99 | 70.40 |
| llama-3-8b-instruct-hf | 68.37 | 58.01 | 77.82 | 71.22 | 71.94 |
| llama-3-70b-instruct-hf | 80.93 | 73.86 | 87.71 | 83.90 | 82.01 |
| llama-3-8b-instruct-lmdeploy | 67.35 | 56.66 | 75.96 | 70.90 | 71.49 |
| llama-3-70b-instruct-lmdeploy | 80.85 | 74.07 | 87.26 | 83.73 | 81.96 |
| mistral-7b-instruct-v0.1-hf | 54.36 | 43.74 | 62.96 | 58.87 | 57.46 |
| mistral-7b-instruct-v0.2-hf | 59.98 | 49.56 | 69.22 | 64.41 | 62.24 |
| mixtral-8x7b-instruct-v0.1-hf | 70.11 | 60.29 | 79.01 | 74.08 | 72.28 |
### Details
model |
college_biology |
college_chemistry |
college_computer_science |
college_mathematics |
college_physics |
electrical_engineering |
astronomy |
anatomy |
abstract_algebra |
machine_learning |
clinical_knowledge |
global_facts |
qwen1.5-0.5b-chat-hf |
31.25 |
32.00 |
33.00 |
29.00 |
33.33 |
38.62 |
33.55 |
28.89 |
20.00 |
27.68 |
40.38 |
33.00 |
qwen1.5-1.8b-chat-hf |
42.36 |
28.00 |
45.00 |
33.00 |
27.45 |
44.83 |
51.97 |
42.22 |
32.00 |
38.39 |
48.30 |
30.00 |
qwen1.5-4b-chat-hf |
56.25 |
47.00 |
49.00 |
39.00 |
36.27 |
54.48 |
57.89 |
49.63 |
38.00 |
33.04 |
59.62 |
23.00 |
qwen1.5-7b-chat-hf |
64.58 |
51.00 |
59.00 |
37.00 |
41.18 |
53.79 |
66.45 |
53.33 |
43.00 |
41.07 |
67.92 |
36.00 |
qwen1.5-14b-chat-hf |
77.08 |
51.00 |
64.00 |
42.00 |
45.10 |
64.83 |
77.63 |
65.93 |
39.00 |
46.43 |
73.21 |
45.00 |
qwen1.5-32b-chat-hf |
84.72 |
53.00 |
57.00 |
48.00 |
52.94 |
74.48 |
82.24 |
67.41 |
52.00 |
61.61 |
78.11 |
48.00 |
qwen1.5-72b-chat-hf |
90.97 |
57.00 |
66.00 |
55.00 |
55.88 |
80.00 |
88.16 |
72.59 |
56.00 |
59.82 |
80.00 |
51.00 |
qwen1.5-110b-chat-hf |
88.89 |
62.00 |
66.00 |
64.00 |
58.82 |
75.86 |
89.47 |
68.15 |
59.00 |
63.39 |
79.62 |
59.00 |
internlm2-chat-1.8b-hf |
49.31 |
36.00 |
47.00 |
33.00 |
36.27 |
42.76 |
48.03 |
49.63 |
30.00 |
33.93 |
53.58 |
28.00 |
internlm2-chat-1.8b-sft-hf |
51.39 |
37.00 |
50.00 |
33.00 |
33.33 |
42.76 |
46.05 |
49.63 |
31.00 |
32.14 |
53.21 |
29.00 |
internlm2-chat-7b-hf |
68.75 |
47.00 |
62.00 |
32.00 |
38.24 |
57.24 |
69.74 |
58.52 |
29.00 |
53.57 |
70.19 |
41.00 |
internlm2-chat-7b-sft-hf |
71.53 |
47.00 |
63.00 |
34.00 |
37.25 |
57.24 |
69.74 |
57.78 |
29.00 |
52.68 |
69.43 |
34.00 |
internlm2-chat-20b-hf |
76.39 |
51.00 |
61.00 |
37.00 |
40.20 |
62.76 |
78.95 |
67.41 |
33.00 |
46.43 |
75.09 |
42.00 |
internlm2-chat-20b-sft-hf |
77.08 |
49.00 |
60.00 |
39.00 |
39.22 |
64.14 |
79.61 |
68.15 |
35.00 |
46.43 |
75.09 |
42.00 |
llama-3-8b-instruct-hf |
81.94 |
48.00 |
58.00 |
43.00 |
48.04 |
60.69 |
76.32 |
71.11 |
33.00 |
54.46 |
73.58 |
46.00 |
llama-3-70b-instruct-hf |
93.06 |
56.00 |
70.00 |
60.00 |
60.78 |
77.24 |
93.42 |
79.26 |
53.00 |
71.43 |
86.42 |
66.00 |
llama-3-8b-instruct-lmdeploy |
79.17 |
47.00 |
53.00 |
36.00 |
49.02 |
60.00 |
73.68 |
68.89 |
36.00 |
55.36 |
73.96 |
42.00 |
llama-3-70b-instruct-lmdeploy |
93.75 |
57.00 |
66.00 |
61.00 |
65.69 |
77.93 |
92.11 |
78.52 |
55.00 |
70.54 |
86.42 |
64.00 |
mistral-7b-instruct-v0.1-hf |
57.64 |
35.00 |
50.00 |
31.00 |
24.51 |
51.72 |
58.55 |
45.93 |
35.00 |
41.07 |
56.98 |
32.00 |
mistral-7b-instruct-v0.2-hf |
70.14 |
42.00 |
49.00 |
35.00 |
43.14 |
54.48 |
65.79 |
56.30 |
29.00 |
42.86 |
65.28 |
37.00 |
mixtral-8x7b-instruct-v0.1-hf |
81.25 |
57.00 |
57.00 |
40.00 |
50.00 |
60.69 |
80.92 |
65.93 |
45.00 |
50.89 |
76.60 |
41.00 |
model |
management |
nutrition |
marketing |
professional_accounting |
high_school_geography |
international_law |
moral_scenarios |
computer_security |
high_school_microeconomics |
professional_law |
medical_genetics |
professional_psychology |
qwen1.5-0.5b-chat-hf |
41.75 |
38.89 |
49.15 |
26.60 |
48.48 |
50.41 |
24.69 |
42.00 |
32.35 |
31.75 |
31.00 |
32.35 |
qwen1.5-1.8b-chat-hf |
62.14 |
55.56 |
76.92 |
34.40 |
58.08 |
61.16 |
21.90 |
56.00 |
42.44 |
35.14 |
50.00 |
44.93 |
qwen1.5-4b-chat-hf |
73.79 |
58.50 |
82.05 |
47.16 |
74.24 |
71.90 |
32.29 |
69.00 |
58.40 |
40.74 |
58.00 |
53.76 |
qwen1.5-7b-chat-hf |
79.61 |
69.28 |
85.47 |
41.49 |
78.79 |
76.86 |
35.75 |
74.00 |
65.13 |
44.78 |
68.00 |
57.68 |
qwen1.5-14b-chat-hf |
82.52 |
70.26 |
87.18 |
51.77 |
85.86 |
82.64 |
53.74 |
81.00 |
76.05 |
47.98 |
76.00 |
67.48 |
qwen1.5-32b-chat-hf |
84.47 |
77.78 |
94.44 |
60.99 |
90.91 |
87.60 |
72.96 |
79.00 |
83.61 |
58.28 |
83.00 |
77.94 |
qwen1.5-72b-chat-hf |
89.32 |
85.95 |
93.59 |
61.35 |
90.91 |
86.78 |
75.98 |
83.00 |
84.87 |
60.30 |
83.00 |
81.05 |
qwen1.5-110b-chat-hf |
86.41 |
80.72 |
92.74 |
69.15 |
93.94 |
84.30 |
77.88 |
83.00 |
88.66 |
61.73 |
84.00 |
82.19 |
internlm2-chat-1.8b-hf |
72.82 |
50.65 |
69.23 |
35.46 |
56.06 |
56.20 |
27.82 |
60.00 |
49.16 |
33.83 |
54.00 |
43.79 |
internlm2-chat-1.8b-sft-hf |
71.84 |
52.61 |
68.80 |
34.75 |
55.56 |
53.72 |
27.04 |
58.00 |
48.74 |
34.09 |
54.00 |
44.61 |
internlm2-chat-7b-hf |
78.64 |
66.67 |
85.90 |
46.81 |
79.29 |
70.25 |
35.31 |
79.00 |
68.07 |
46.41 |
68.00 |
64.87 |
internlm2-chat-7b-sft-hf |
79.61 |
67.97 |
86.75 |
47.52 |
80.30 |
70.25 |
35.98 |
80.00 |
69.33 |
45.83 |
70.00 |
65.36 |
internlm2-chat-20b-hf |
80.58 |
75.16 |
90.17 |
52.13 |
83.84 |
80.99 |
39.33 |
80.00 |
70.59 |
49.67 |
75.00 |
70.26 |
internlm2-chat-20b-sft-hf |
80.58 |
76.14 |
91.03 |
53.19 |
84.34 |
80.99 |
36.31 |
77.00 |
71.85 |
49.61 |
77.00 |
70.59 |
llama-3-8b-instruct-hf |
82.52 |
79.41 |
91.45 |
52.48 |
80.30 |
79.34 |
46.26 |
75.00 |
76.89 |
49.61 |
85.00 |
72.22 |
llama-3-70b-instruct-hf |
89.32 |
87.58 |
93.16 |
66.67 |
92.42 |
90.08 |
76.20 |
83.00 |
89.50 |
64.67 |
92.00 |
87.09 |
llama-3-8b-instruct-lmdeploy |
87.38 |
79.41 |
90.17 |
52.48 |
79.80 |
78.51 |
44.25 |
75.00 |
74.37 |
48.76 |
84.00 |
69.61 |
llama-3-70b-instruct-lmdeploy |
90.29 |
88.56 |
93.59 |
65.96 |
92.93 |
89.26 |
75.75 |
83.00 |
89.92 |
63.95 |
92.00 |
86.60 |
mistral-7b-instruct-v0.1-hf |
69.90 |
59.80 |
85.47 |
38.65 |
69.70 |
65.29 |
37.54 |
69.00 |
51.26 |
37.81 |
65.00 |
52.45 |
mistral-7b-instruct-v0.2-hf |
74.76 |
66.99 |
88.89 |
43.97 |
75.25 |
76.86 |
42.01 |
73.00 |
62.61 |
42.24 |
67.00 |
62.25 |
mixtral-8x7b-instruct-v0.1-hf |
85.44 |
80.39 |
92.74 |
55.32 |
85.35 |
82.64 |
48.38 |
78.00 |
75.21 |
53.52 |
75.00 |
74.02 |
model |
jurisprudence |
world_religions |
philosophy |
virology |
high_school_chemistry |
public_relations |
high_school_macroeconomics |
human_sexuality |
elementary_mathematics |
high_school_physics |
high_school_computer_science |
high_school_european_history |
qwen1.5-0.5b-chat-hf |
42.59 |
24.56 |
39.87 |
39.76 |
29.06 |
38.18 |
35.64 |
38.93 |
27.78 |
29.80 |
34.00 |
48.48 |
qwen1.5-1.8b-chat-hf |
50.93 |
56.73 |
44.37 |
42.77 |
35.96 |
51.82 |
38.46 |
49.62 |
35.45 |
27.15 |
47.00 |
63.03 |
qwen1.5-4b-chat-hf |
71.30 |
65.50 |
58.20 |
50.00 |
44.33 |
57.27 |
54.10 |
61.83 |
43.65 |
41.06 |
60.00 |
72.12 |
qwen1.5-7b-chat-hf |
76.85 |
76.61 |
68.49 |
48.80 |
51.72 |
64.55 |
59.23 |
68.70 |
48.94 |
37.09 |
69.00 |
79.39 |
qwen1.5-14b-chat-hf |
75.93 |
80.70 |
69.13 |
51.20 |
55.67 |
64.55 |
67.69 |
74.05 |
57.14 |
47.02 |
74.00 |
82.42 |
qwen1.5-32b-chat-hf |
83.33 |
89.47 |
82.64 |
60.84 |
62.56 |
70.00 |
76.67 |
83.21 |
67.46 |
59.60 |
85.00 |
84.85 |
qwen1.5-72b-chat-hf |
86.11 |
89.47 |
80.71 |
59.04 |
68.47 |
72.73 |
80.00 |
87.79 |
67.72 |
52.32 |
79.00 |
85.45 |
qwen1.5-110b-chat-hf |
83.33 |
87.13 |
81.03 |
54.22 |
69.95 |
73.64 |
78.21 |
87.02 |
75.93 |
57.62 |
84.00 |
88.48 |
internlm2-chat-1.8b-hf |
52.78 |
60.82 |
49.20 |
42.77 |
42.36 |
50.00 |
47.18 |
53.44 |
32.54 |
31.79 |
39.00 |
60.00 |
internlm2-chat-1.8b-sft-hf |
53.70 |
61.40 |
50.16 |
42.17 |
40.89 |
50.00 |
47.69 |
51.15 |
32.54 |
29.14 |
40.00 |
59.39 |
internlm2-chat-7b-hf |
73.15 |
81.87 |
67.85 |
47.59 |
49.75 |
62.73 |
61.79 |
66.41 |
44.97 |
33.77 |
71.00 |
81.82 |
internlm2-chat-7b-sft-hf |
73.15 |
81.87 |
66.88 |
48.19 |
48.77 |
63.64 |
62.31 |
65.65 |
45.77 |
33.77 |
72.00 |
81.82 |
internlm2-chat-20b-hf |
80.56 |
81.87 |
72.99 |
55.42 |
54.19 |
70.00 |
67.95 |
71.76 |
48.15 |
39.74 |
75.00 |
80.00 |
internlm2-chat-20b-sft-hf |
81.48 |
79.53 |
72.99 |
54.82 |
54.19 |
69.09 |
67.95 |
71.76 |
48.94 |
41.06 |
75.00 |
80.00 |
llama-3-8b-instruct-hf |
76.85 |
79.53 |
72.35 |
53.61 |
54.19 |
70.91 |
66.41 |
80.92 |
49.47 |
46.36 |
71.00 |
75.15 |
llama-3-70b-instruct-hf |
87.04 |
88.30 |
82.64 |
56.02 |
67.49 |
74.55 |
86.41 |
88.55 |
74.34 |
65.56 |
91.00 |
86.06 |
llama-3-8b-instruct-lmdeploy |
77.78 |
79.53 |
70.74 |
52.41 |
53.20 |
68.18 |
65.38 |
79.39 |
50.79 |
37.75 |
72.00 |
76.97 |
llama-3-70b-instruct-lmdeploy |
87.96 |
90.64 |
83.28 |
54.82 |
69.46 |
73.64 |
86.92 |
87.02 |
74.87 |
66.23 |
92.00 |
85.45 |
mistral-7b-instruct-v0.1-hf |
64.81 |
70.18 |
63.67 |
41.57 |
38.92 |
68.18 |
49.49 |
61.83 |
33.33 |
32.45 |
55.00 |
66.67 |
mistral-7b-instruct-v0.2-hf |
70.37 |
80.12 |
64.95 |
50.60 |
50.74 |
68.18 |
54.36 |
71.76 |
40.74 |
35.10 |
60.00 |
73.33 |
mixtral-8x7b-instruct-v0.1-hf |
79.63 |
87.72 |
73.63 |
54.82 |
61.58 |
67.27 |
69.49 |
83.21 |
52.91 |
47.02 |
74.00 |
80.61 |
model |
business_ethics |
moral_disputes |
high_school_statistics |
miscellaneous |
formal_logic |
high_school_government_and_politics |
prehistory |
security_studies |
high_school_biology |
logical_fallacies |
high_school_world_history |
professional_medicine |
qwen1.5-0.5b-chat-hf |
45.00 |
41.04 |
30.09 |
39.21 |
24.60 |
35.23 |
33.95 |
25.31 |
36.13 |
31.29 |
49.37 |
38.24 |
qwen1.5-1.8b-chat-hf |
54.00 |
50.29 |
34.26 |
58.49 |
24.60 |
55.96 |
47.53 |
39.18 |
47.74 |
44.17 |
64.98 |
40.81 |
qwen1.5-4b-chat-hf |
61.00 |
64.16 |
46.30 |
71.01 |
39.68 |
72.02 |
54.01 |
65.31 |
63.55 |
63.80 |
71.31 |
51.10 |
qwen1.5-7b-chat-hf |
69.00 |
67.05 |
50.93 |
76.25 |
53.17 |
82.38 |
62.96 |
71.02 |
73.23 |
68.10 |
76.79 |
60.29 |
qwen1.5-14b-chat-hf |
74.00 |
75.14 |
58.33 |
82.89 |
51.59 |
88.60 |
69.44 |
77.96 |
84.19 |
73.62 |
82.70 |
71.32 |
qwen1.5-32b-chat-hf |
80.00 |
80.64 |
70.83 |
89.40 |
60.32 |
94.82 |
81.79 |
79.59 |
90.00 |
86.50 |
88.61 |
80.15 |
qwen1.5-72b-chat-hf |
80.00 |
82.95 |
68.98 |
91.83 |
57.14 |
98.45 |
86.73 |
78.78 |
89.03 |
87.12 |
91.14 |
83.82 |
qwen1.5-110b-chat-hf |
79.00 |
78.03 |
67.13 |
92.98 |
62.70 |
97.93 |
87.04 |
74.29 |
88.71 |
82.82 |
91.14 |
84.93 |
internlm2-chat-1.8b-hf |
48.00 |
49.13 |
44.91 |
57.60 |
26.98 |
61.14 |
50.62 |
51.02 |
52.58 |
57.67 |
67.51 |
37.50 |
internlm2-chat-1.8b-sft-hf |
50.00 |
49.13 |
44.91 |
57.73 |
28.57 |
61.66 |
49.69 |
51.02 |
49.68 |
57.67 |
66.67 |
38.60 |
internlm2-chat-7b-hf |
65.00 |
65.61 |
49.54 |
80.84 |
43.65 |
88.08 |
70.99 |
68.98 |
78.39 |
75.46 |
82.28 |
61.76 |
internlm2-chat-7b-sft-hf |
64.00 |
66.18 |
52.31 |
81.35 |
46.03 |
88.08 |
71.60 |
67.76 |
78.39 |
77.30 |
82.28 |
63.60 |
internlm2-chat-20b-hf |
74.00 |
73.70 |
59.72 |
81.86 |
46.83 |
89.12 |
74.69 |
75.92 |
80.65 |
79.14 |
82.70 |
70.59 |
internlm2-chat-20b-sft-hf |
76.00 |
73.12 |
60.19 |
81.99 |
43.65 |
88.60 |
74.38 |
73.88 |
80.32 |
80.37 |
82.70 |
70.59 |
llama-3-8b-instruct-hf |
72.00 |
73.12 |
55.09 |
84.55 |
50.00 |
90.67 |
77.16 |
77.55 |
81.61 |
77.91 |
84.81 |
75.00 |
llama-3-70b-instruct-hf |
85.00 |
85.26 |
75.00 |
92.72 |
69.05 |
97.41 |
90.43 |
82.04 |
91.61 |
87.12 |
94.09 |
89.71 |
llama-3-8b-instruct-lmdeploy |
72.00 |
72.83 |
52.78 |
82.12 |
51.59 |
89.64 |
76.85 |
76.73 |
80.97 |
76.69 |
84.39 |
74.63 |
llama-3-70b-instruct-lmdeploy |
85.00 |
84.39 |
73.61 |
92.72 |
67.46 |
97.93 |
89.81 |
81.63 |
90.65 |
87.12 |
93.25 |
89.34 |
mistral-7b-instruct-v0.1-hf |
55.00 |
57.51 |
39.81 |
74.07 |
39.68 |
75.65 |
57.72 |
62.04 |
59.35 |
69.33 |
67.93 |
55.88 |
mistral-7b-instruct-v0.2-hf |
61.00 |
66.76 |
46.76 |
78.67 |
36.51 |
84.97 |
68.83 |
70.20 |
68.39 |
69.33 |
73.00 |
58.09 |
mixtral-8x7b-instruct-v0.1-hf |
66.00 |
76.59 |
57.87 |
86.59 |
50.00 |
93.78 |
83.02 |
79.18 |
82.58 |
75.46 |
86.50 |
77.94 |
model |
high_school_mathematics |
college_medicine |
high_school_us_history |
sociology |
econometrics |
high_school_psychology |
human_aging |
us_foreign_policy |
qwen1.5-0.5b-chat-hf |
24.44 |
35.26 |
42.16 |
47.26 |
29.82 |
40.55 |
32.29 |
47.00 |
qwen1.5-1.8b-chat-hf |
32.22 |
43.35 |
54.90 |
48.26 |
28.95 |
61.83 |
48.43 |
71.00 |
qwen1.5-4b-chat-hf |
36.30 |
51.45 |
71.08 |
76.62 |
34.21 |
72.29 |
58.30 |
72.00 |
qwen1.5-7b-chat-hf |
31.11 |
61.27 |
76.47 |
79.10 |
42.11 |
81.28 |
61.43 |
83.00 |
qwen1.5-14b-chat-hf |
41.48 |
68.79 |
80.88 |
82.59 |
48.25 |
84.40 |
72.20 |
88.00 |
qwen1.5-32b-chat-hf |
48.52 |
75.72 |
88.73 |
86.07 |
57.02 |
90.46 |
78.03 |
95.00 |
qwen1.5-72b-chat-hf |
51.48 |
73.99 |
90.69 |
87.06 |
59.65 |
92.11 |
79.37 |
94.00 |
qwen1.5-110b-chat-hf |
52.22 |
76.30 |
93.14 |
87.56 |
62.28 |
91.56 |
80.27 |
88.00 |
internlm2-chat-1.8b-hf |
31.48 |
46.82 |
56.37 |
65.17 |
28.07 |
65.87 |
50.22 |
69.00 |
internlm2-chat-1.8b-sft-hf |
30.74 |
47.40 |
54.41 |
64.18 |
29.82 |
66.24 |
48.43 |
69.00 |
internlm2-chat-7b-hf |
33.70 |
67.05 |
79.90 |
81.09 |
48.25 |
84.04 |
67.26 |
84.00 |
internlm2-chat-7b-sft-hf |
35.19 |
67.05 |
79.90 |
80.60 |
48.25 |
84.59 |
65.47 |
85.00 |
internlm2-chat-20b-hf |
36.30 |
66.47 |
88.73 |
85.07 |
51.75 |
85.69 |
70.85 |
87.00 |
internlm2-chat-20b-sft-hf |
35.93 |
65.90 |
87.75 |
85.57 |
52.63 |
84.77 |
70.85 |
87.00 |
llama-3-8b-instruct-hf |
36.67 |
68.79 |
83.82 |
86.57 |
61.40 |
84.95 |
70.85 |
85.00 |
llama-3-70b-instruct-hf |
57.41 |
78.61 |
89.71 |
91.54 |
74.56 |
94.50 |
82.96 |
94.00 |
llama-3-8b-instruct-lmdeploy |
38.52 |
68.79 |
82.84 |
85.57 |
54.39 |
85.50 |
69.96 |
83.00 |
llama-3-70b-instruct-lmdeploy |
54.81 |
79.77 |
90.20 |
92.04 |
71.05 |
94.50 |
82.96 |
93.00 |
mistral-7b-instruct-v0.1-hf |
28.89 |
50.29 |
67.16 |
76.12 |
39.47 |
72.29 |
62.33 |
77.00 |
mistral-7b-instruct-v0.2-hf |
30.74 |
53.18 |
73.04 |
77.11 |
42.11 |
79.82 |
63.68 |
82.00 |
mixtral-8x7b-instruct-v0.1-hf |
35.56 |
73.41 |
85.29 |
87.06 |
60.53 |
86.97 |
74.44 |
86.00 |