# BBH

```bash
python3 run.py --models hf_internlm2_7b --datasets bbh_gen_98fba6 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets bbh_gen_5b92b0 --debug
```
## Base Models

| model | bbh |
|:---|---:|
| llama-7b-turbomind | 33.34 |
| llama-13b-turbomind | 37.99 |
| llama-30b-turbomind | 49.86 |
| llama-65b-turbomind | 58.26 |
| llama-2-7b-turbomind | 38.27 |
| llama-2-13b-turbomind | 45.68 |
| llama-2-70b-turbomind | 64.78 |
| llama-3-8b-turbomind | 59.69 |
| llama-3-70b-turbomind | 79.16 |
| internlm2-1.8b-turbomind | 36.03 |
| internlm2-7b-turbomind | 63.56 |
| internlm2-20b-turbomind | 71.29 |
| qwen-1.8b-turbomind | 22.53 |
| qwen-7b-turbomind | 45.89 |
| qwen-14b-turbomind | 56.75 |
| qwen-72b-turbomind | 63.35 |
| qwen1.5-0.5b-hf | 20.54 |
| qwen1.5-1.8b-hf | 27.01 |
| qwen1.5-4b-hf | 34.81 |
| qwen1.5-7b-hf | 39.87 |
| qwen1.5-14b-hf | 50.38 |
| qwen1.5-32b-hf | 67.47 |
| qwen1.5-72b-hf | 58.81 |
| qwen1.5-moe-a2-7b-hf | 39.46 |
| mistral-7b-v0.1-hf | 56.71 |
| mistral-7b-v0.2-hf | 57.32 |
| mixtral-8x7b-v0.1-hf | 68.46 |
| mixtral-8x22b-v0.1-hf | 79.48 |
| yi-6b-hf | 44.82 |
| yi-34b-hf | 66.37 |
| deepseek-7b-base-hf | 42.88 |
| deepseek-67b-base-hf | 71.86 |
### Details
model |
temporal_sequences |
disambiguation_qa |
date_understanding |
tracking_shuffled_objects_three_objects |
penguins_in_a_table |
geometric_shapes |
snarks |
ruin_names |
tracking_shuffled_objects_seven_objects |
llama-7b-turbomind |
23.60 |
46.00 |
44.80 |
36.40 |
30.14 |
0.00 |
46.07 |
21.60 |
15.20 |
llama-13b-turbomind |
16.80 |
50.00 |
56.80 |
36.40 |
43.15 |
0.00 |
60.67 |
29.20 |
15.20 |
llama-30b-turbomind |
33.60 |
60.00 |
76.40 |
29.20 |
57.53 |
0.00 |
59.55 |
62.40 |
17.20 |
llama-65b-turbomind |
84.00 |
76.00 |
84.40 |
50.00 |
65.75 |
0.00 |
62.92 |
69.60 |
31.60 |
llama-2-7b-turbomind |
12.00 |
46.80 |
60.00 |
34.00 |
32.19 |
0.00 |
49.44 |
32.80 |
18.40 |
llama-2-13b-turbomind |
24.00 |
40.80 |
73.20 |
36.00 |
45.89 |
0.00 |
55.06 |
37.60 |
22.40 |
llama-2-70b-turbomind |
75.60 |
66.80 |
88.80 |
73.60 |
69.86 |
0.00 |
73.60 |
60.80 |
57.60 |
llama-3-8b-turbomind |
65.60 |
42.00 |
78.80 |
56.80 |
69.86 |
0.00 |
56.18 |
66.00 |
30.80 |
llama-3-70b-turbomind |
100.00 |
82.80 |
91.60 |
100.00 |
86.30 |
0.00 |
81.46 |
77.20 |
94.40 |
internlm2-1.8b-turbomind |
31.20 |
44.00 |
60.00 |
36.00 |
35.62 |
0.00 |
44.94 |
27.20 |
12.80 |
internlm2-7b-turbomind |
94.80 |
75.60 |
86.40 |
53.60 |
69.18 |
0.00 |
59.55 |
68.00 |
46.00 |
internlm2-20b-turbomind |
98.40 |
83.60 |
84.00 |
72.00 |
71.92 |
0.00 |
81.46 |
78.40 |
74.40 |
qwen-1.8b-turbomind |
26.40 |
39.60 |
33.20 |
28.40 |
28.08 |
0.00 |
44.94 |
21.60 |
12.40 |
qwen-7b-turbomind |
38.80 |
42.80 |
64.40 |
30.80 |
45.89 |
0.00 |
55.62 |
44.00 |
14.40 |
qwen-14b-turbomind |
57.60 |
59.20 |
67.20 |
46.40 |
67.12 |
0.00 |
51.12 |
63.60 |
30.40 |
qwen-72b-turbomind |
72.00 |
66.80 |
77.60 |
81.20 |
84.93 |
0.00 |
78.09 |
67.20 |
63.60 |
qwen1.5-0.5b-hf |
15.20 |
37.20 |
20.40 |
30.40 |
18.49 |
8.40 |
44.94 |
11.20 |
14.00 |
qwen1.5-1.8b-hf |
27.60 |
40.80 |
36.00 |
24.40 |
32.19 |
0.00 |
50.56 |
20.80 |
11.20 |
qwen1.5-4b-hf |
10.40 |
44.40 |
47.20 |
36.80 |
44.52 |
24.80 |
46.63 |
20.80 |
14.80 |
qwen1.5-7b-hf |
37.20 |
42.40 |
52.00 |
52.40 |
56.85 |
6.80 |
48.31 |
23.60 |
18.40 |
qwen1.5-14b-hf |
38.80 |
62.80 |
73.60 |
24.80 |
69.86 |
26.80 |
66.29 |
52.80 |
2.00 |
qwen1.5-32b-hf |
93.60 |
77.20 |
68.40 |
70.00 |
82.88 |
36.80 |
47.75 |
70.40 |
71.20 |
qwen1.5-72b-hf |
75.60 |
66.00 |
78.80 |
72.80 |
80.82 |
0.00 |
75.84 |
64.80 |
44.40 |
qwen1.5-moe-a2-7b-hf |
23.20 |
59.60 |
43.20 |
27.60 |
46.58 |
25.20 |
48.88 |
16.80 |
13.20 |
mistral-7b-v0.1-hf |
73.60 |
53.60 |
76.40 |
45.20 |
56.85 |
28.00 |
64.04 |
66.00 |
21.60 |
mistral-7b-v0.2-hf |
76.80 |
42.00 |
73.20 |
47.20 |
60.27 |
26.00 |
66.85 |
60.80 |
26.40 |
mixtral-8x7b-v0.1-hf |
89.60 |
70.80 |
84.80 |
81.20 |
70.55 |
25.60 |
66.29 |
71.20 |
58.80 |
mixtral-8x22b-v0.1-hf |
98.80 |
77.60 |
92.00 |
98.80 |
83.56 |
35.60 |
80.34 |
79.20 |
82.00 |
yi-6b-hf |
32.80 |
46.40 |
64.40 |
34.40 |
47.26 |
28.80 |
60.11 |
45.60 |
14.00 |
yi-34b-hf |
86.00 |
76.00 |
84.80 |
54.80 |
67.81 |
24.80 |
73.60 |
66.00 |
65.60 |
deepseek-7b-base-hf |
27.60 |
42.00 |
64.40 |
31.20 |
40.41 |
33.60 |
52.25 |
46.00 |
13.20 |
deepseek-67b-base-hf |
95.60 |
75.60 |
86.40 |
86.40 |
76.71 |
39.20 |
76.40 |
77.20 |
82.00 |
model |
tracking_shuffled_objects_five_objects |
logical_deduction_three_objects |
hyperbaton |
logical_deduction_five_objects |
logical_deduction_seven_objects |
movie_recommendation |
salient_translation_error_detection |
reasoning_about_colored_objects |
multistep_arithmetic_two |
llama-7b-turbomind |
18.40 |
42.80 |
58.00 |
23.20 |
13.20 |
40.00 |
16.40 |
30.40 |
0.00 |
llama-13b-turbomind |
16.00 |
48.80 |
53.60 |
30.40 |
16.40 |
61.60 |
11.20 |
44.80 |
0.80 |
llama-30b-turbomind |
22.40 |
66.40 |
73.20 |
43.60 |
31.60 |
84.40 |
43.60 |
57.60 |
2.80 |
llama-65b-turbomind |
41.60 |
79.20 |
74.40 |
48.40 |
39.20 |
91.20 |
40.40 |
67.20 |
20.00 |
llama-2-7b-turbomind |
17.20 |
54.80 |
51.60 |
32.80 |
23.60 |
74.40 |
19.60 |
45.60 |
1.20 |
llama-2-13b-turbomind |
23.20 |
63.60 |
52.40 |
46.00 |
42.00 |
68.00 |
21.60 |
62.00 |
2.00 |
llama-2-70b-turbomind |
72.40 |
86.40 |
84.40 |
55.20 |
43.20 |
95.60 |
50.80 |
76.80 |
20.80 |
llama-3-8b-turbomind |
40.80 |
76.40 |
93.20 |
45.20 |
36.80 |
88.80 |
53.60 |
72.80 |
30.80 |
llama-3-70b-turbomind |
99.20 |
94.00 |
98.00 |
58.40 |
42.80 |
93.60 |
63.60 |
88.40 |
79.20 |
internlm2-1.8b-turbomind |
16.80 |
47.60 |
63.60 |
21.60 |
12.00 |
69.20 |
16.80 |
45.20 |
5.60 |
internlm2-7b-turbomind |
51.20 |
78.80 |
90.40 |
52.00 |
41.20 |
95.60 |
58.80 |
74.40 |
44.40 |
internlm2-20b-turbomind |
81.20 |
95.60 |
83.60 |
62.40 |
48.00 |
94.80 |
57.60 |
75.60 |
72.80 |
qwen-1.8b-turbomind |
14.80 |
35.60 |
51.20 |
22.40 |
15.20 |
31.20 |
12.40 |
22.00 |
3.20 |
qwen-7b-turbomind |
20.80 |
54.80 |
76.00 |
37.60 |
27.60 |
74.80 |
41.20 |
57.60 |
23.60 |
qwen-14b-turbomind |
35.60 |
81.20 |
78.40 |
45.20 |
40.80 |
80.00 |
44.80 |
70.40 |
65.60 |
qwen-72b-turbomind |
66.40 |
89.20 |
90.40 |
60.00 |
50.80 |
81.60 |
56.40 |
88.00 |
70.40 |
qwen1.5-0.5b-hf |
20.00 |
34.80 |
46.80 |
18.80 |
15.60 |
24.40 |
15.20 |
16.00 |
1.20 |
qwen1.5-1.8b-hf |
18.00 |
32.80 |
66.00 |
18.80 |
11.20 |
24.80 |
13.60 |
27.60 |
4.80 |
qwen1.5-4b-hf |
18.40 |
56.40 |
56.80 |
30.00 |
20.80 |
40.80 |
46.80 |
44.80 |
41.20 |
qwen1.5-7b-hf |
32.40 |
58.40 |
67.20 |
36.00 |
28.00 |
62.80 |
49.20 |
60.40 |
48.00 |
qwen1.5-14b-hf |
7.20 |
78.40 |
75.20 |
41.20 |
27.60 |
74.40 |
46.00 |
81.60 |
8.00 |
qwen1.5-32b-hf |
71.60 |
88.40 |
97.60 |
58.80 |
46.40 |
68.00 |
51.60 |
88.40 |
66.80 |
qwen1.5-72b-hf |
61.20 |
88.40 |
96.00 |
60.40 |
49.20 |
86.40 |
34.80 |
86.80 |
53.60 |
qwen1.5-moe-a2-7b-hf |
22.80 |
49.20 |
68.00 |
28.40 |
22.40 |
58.40 |
40.80 |
42.00 |
33.60 |
mistral-7b-v0.1-hf |
30.40 |
79.60 |
70.80 |
54.40 |
42.80 |
77.60 |
47.20 |
70.00 |
30.40 |
mistral-7b-v0.2-hf |
32.80 |
74.00 |
77.60 |
48.00 |
40.40 |
84.00 |
49.20 |
76.00 |
35.20 |
mixtral-8x7b-v0.1-hf |
66.80 |
86.00 |
94.80 |
50.40 |
40.40 |
86.40 |
53.20 |
82.80 |
60.80 |
mixtral-8x22b-v0.1-hf |
87.60 |
95.20 |
99.60 |
70.00 |
54.00 |
95.20 |
58.40 |
95.20 |
82.00 |
yi-6b-hf |
17.20 |
49.20 |
72.40 |
34.40 |
28.00 |
76.80 |
32.40 |
56.80 |
9.20 |
yi-34b-hf |
67.20 |
85.60 |
79.60 |
49.20 |
39.60 |
86.80 |
56.00 |
81.20 |
33.20 |
deepseek-7b-base-hf |
17.60 |
51.20 |
72.40 |
28.80 |
20.00 |
78.40 |
28.80 |
46.80 |
1.60 |
deepseek-67b-base-hf |
82.40 |
90.00 |
78.80 |
60.40 |
44.80 |
88.80 |
56.80 |
86.40 |
38.00 |
model |
navigate |
dyck_languages |
word_sorting |
sports_understanding |
boolean_expressions |
object_counting |
formal_fallacies |
causal_judgement |
web_of_lies |
llama-7b-turbomind |
45.20 |
1.60 |
8.40 |
81.60 |
66.00 |
47.20 |
46.00 |
40.64 |
57.20 |
llama-13b-turbomind |
59.20 |
0.80 |
14.40 |
76.40 |
69.20 |
46.40 |
47.20 |
53.48 |
66.80 |
llama-30b-turbomind |
64.80 |
2.40 |
17.20 |
93.60 |
78.40 |
71.20 |
43.20 |
55.61 |
98.40 |
llama-65b-turbomind |
72.40 |
6.80 |
21.60 |
98.80 |
81.60 |
70.00 |
40.80 |
55.61 |
99.60 |
llama-2-7b-turbomind |
54.40 |
1.20 |
10.80 |
88.80 |
68.40 |
49.20 |
48.40 |
52.41 |
53.20 |
llama-2-13b-turbomind |
74.40 |
2.80 |
18.80 |
97.60 |
74.40 |
52.80 |
46.40 |
54.55 |
96.00 |
llama-2-70b-turbomind |
82.40 |
13.60 |
30.40 |
98.40 |
81.60 |
83.20 |
43.60 |
63.64 |
100.00 |
llama-3-8b-turbomind |
90.00 |
9.20 |
38.80 |
95.20 |
87.60 |
84.80 |
51.20 |
50.27 |
100.00 |
llama-3-70b-turbomind |
96.80 |
48.40 |
48.80 |
99.60 |
92.40 |
99.60 |
62.40 |
58.29 |
100.00 |
internlm2-1.8b-turbomind |
64.40 |
0.40 |
3.20 |
66.40 |
54.00 |
50.00 |
49.20 |
48.13 |
46.80 |
internlm2-7b-turbomind |
78.80 |
2.40 |
35.20 |
95.60 |
85.60 |
75.60 |
48.00 |
63.10 |
92.00 |
internlm2-20b-turbomind |
88.80 |
15.60 |
36.00 |
96.80 |
88.80 |
76.00 |
50.40 |
56.68 |
100.00 |
qwen-1.8b-turbomind |
50.00 |
0.00 |
0.80 |
62.80 |
29.20 |
2.40 |
6.00 |
12.83 |
1.60 |
qwen-7b-turbomind |
62.80 |
1.60 |
18.00 |
81.60 |
75.20 |
68.80 |
50.00 |
63.64 |
66.80 |
qwen-14b-turbomind |
75.60 |
1.20 |
26.80 |
88.80 |
80.40 |
74.40 |
50.00 |
53.48 |
96.80 |
qwen-72b-turbomind |
56.00 |
14.40 |
35.20 |
87.60 |
91.60 |
81.60 |
5.60 |
31.55 |
62.40 |
qwen1.5-0.5b-hf |
25.60 |
0.00 |
0.40 |
41.60 |
51.60 |
16.80 |
4.40 |
1.07 |
20.00 |
qwen1.5-1.8b-hf |
55.60 |
0.00 |
1.60 |
63.60 |
55.20 |
47.60 |
4.40 |
28.88 |
11.20 |
qwen1.5-4b-hf |
61.60 |
0.40 |
8.80 |
0.80 |
76.00 |
54.40 |
0.80 |
28.34 |
62.40 |
qwen1.5-7b-hf |
63.60 |
2.40 |
20.80 |
72.40 |
69.60 |
26.80 |
0.00 |
40.64 |
0.00 |
qwen1.5-14b-hf |
82.40 |
1.20 |
27.60 |
78.40 |
87.20 |
48.00 |
54.00 |
24.06 |
100.00 |
qwen1.5-32b-hf |
86.80 |
5.60 |
36.80 |
90.00 |
86.40 |
66.40 |
35.60 |
62.57 |
95.60 |
qwen1.5-72b-hf |
48.40 |
13.20 |
34.40 |
87.60 |
8.00 |
67.60 |
13.60 |
39.57 |
99.60 |
qwen1.5-moe-a2-7b-hf |
56.80 |
2.00 |
8.80 |
79.60 |
73.60 |
66.80 |
4.00 |
53.48 |
50.40 |
mistral-7b-v0.1-hf |
73.60 |
4.00 |
26.40 |
97.20 |
82.00 |
67.60 |
43.20 |
48.66 |
100.00 |
mistral-7b-v0.2-hf |
72.80 |
4.00 |
30.40 |
97.20 |
81.20 |
66.80 |
46.00 |
52.41 |
100.00 |
mixtral-8x7b-v0.1-hf |
85.60 |
18.80 |
33.60 |
98.00 |
90.80 |
85.20 |
49.60 |
55.61 |
90.80 |
mixtral-8x22b-v0.1-hf |
92.80 |
51.60 |
40.00 |
98.40 |
91.60 |
95.60 |
54.80 |
56.15 |
100.00 |
yi-6b-hf |
66.40 |
1.20 |
16.00 |
92.80 |
59.60 |
53.20 |
53.20 |
52.41 |
65.20 |
yi-34b-hf |
81.20 |
18.80 |
36.40 |
97.60 |
85.60 |
84.00 |
51.20 |
59.89 |
99.60 |
deepseek-7b-base-hf |
59.20 |
3.20 |
6.40 |
92.00 |
73.20 |
49.60 |
50.80 |
52.41 |
74.80 |
deepseek-67b-base-hf |
85.20 |
30.00 |
33.20 |
99.60 |
84.80 |
82.40 |
46.80 |
56.68 |
99.60 |
## Chat Models

| model | bbh |
|:---|---:|
| qwen1.5-0.5b-chat-hf | 24.12 |
| qwen1.5-1.8b-chat-hf | 26.82 |
| qwen1.5-4b-chat-hf | 43.15 |
| qwen1.5-7b-chat-hf | 38.12 |
| qwen1.5-14b-chat-hf | 55.38 |
| qwen1.5-32b-chat-hf | 69.28 |
| qwen1.5-72b-chat-hf | 72.97 |
| qwen1.5-110b-chat-hf | 71.04 |
| internlm2-chat-1.8b-hf | 37.69 |
| internlm2-chat-1.8b-sft-hf | 37.12 |
| internlm2-chat-7b-hf | 57.83 |
| internlm2-chat-7b-sft-hf | 57.19 |
| internlm2-chat-20b-hf | 68.24 |
| internlm2-chat-20b-sft-hf | 69.38 |
| llama-3-8b-instruct-hf | 52.85 |
| llama-3-70b-instruct-hf | 82.42 |
| llama-3-8b-instruct-lmdeploy | 53.54 |
| llama-3-70b-instruct-lmdeploy | 82.58 |
| mistral-7b-instruct-v0.1-hf | 32.88 |
| mistral-7b-instruct-v0.2-hf | 48.84 |
| mixtral-8x7b-instruct-v0.1-hf | 59.64 |
### Details
model |
temporal_sequences |
disambiguation_qa |
date_understanding |
tracking_shuffled_objects_three_objects |
penguins_in_a_table |
geometric_shapes |
snarks |
ruin_names |
tracking_shuffled_objects_seven_objects |
qwen1.5-0.5b-chat-hf |
25.60 |
42.00 |
20.00 |
31.20 |
15.07 |
14.40 |
46.07 |
24.80 |
13.20 |
qwen1.5-1.8b-chat-hf |
28.80 |
36.00 |
30.40 |
35.20 |
19.18 |
7.60 |
46.63 |
24.00 |
9.60 |
qwen1.5-4b-chat-hf |
8.00 |
56.00 |
64.80 |
28.40 |
48.63 |
19.60 |
60.67 |
34.00 |
14.40 |
qwen1.5-7b-chat-hf |
39.60 |
37.60 |
62.40 |
36.80 |
60.96 |
30.80 |
54.49 |
38.00 |
20.00 |
qwen1.5-14b-chat-hf |
61.60 |
63.60 |
70.00 |
54.00 |
74.66 |
33.60 |
67.42 |
61.20 |
35.60 |
qwen1.5-32b-chat-hf |
94.40 |
77.60 |
78.00 |
66.00 |
93.84 |
46.00 |
82.58 |
73.60 |
61.60 |
qwen1.5-72b-chat-hf |
70.40 |
72.40 |
84.40 |
67.20 |
89.73 |
52.00 |
79.21 |
86.40 |
68.80 |
qwen1.5-110b-chat-hf |
74.80 |
71.20 |
82.80 |
74.80 |
89.04 |
48.00 |
90.45 |
87.60 |
73.60 |
internlm2-chat-1.8b-hf |
35.60 |
52.40 |
48.80 |
29.60 |
39.73 |
24.40 |
51.69 |
27.20 |
13.20 |
internlm2-chat-1.8b-sft-hf |
37.20 |
53.60 |
44.00 |
30.00 |
34.93 |
22.40 |
56.74 |
28.00 |
12.00 |
internlm2-chat-7b-hf |
72.00 |
66.40 |
73.60 |
65.20 |
60.27 |
50.00 |
62.92 |
52.40 |
44.40 |
internlm2-chat-7b-sft-hf |
67.20 |
66.80 |
58.00 |
63.20 |
48.63 |
45.60 |
64.04 |
59.60 |
42.80 |
internlm2-chat-20b-hf |
80.40 |
76.00 |
77.60 |
88.80 |
78.08 |
36.40 |
71.91 |
71.60 |
77.20 |
internlm2-chat-20b-sft-hf |
80.00 |
70.80 |
78.00 |
87.60 |
82.88 |
41.20 |
76.40 |
72.80 |
71.60 |
llama-3-8b-instruct-hf |
70.40 |
42.80 |
28.40 |
81.20 |
13.01 |
49.20 |
44.94 |
73.20 |
42.40 |
llama-3-70b-instruct-hf |
100.00 |
84.00 |
91.60 |
95.60 |
78.08 |
52.40 |
87.08 |
89.60 |
97.60 |
llama-3-8b-instruct-lmdeploy |
73.20 |
45.60 |
34.00 |
79.60 |
31.51 |
48.40 |
47.75 |
76.80 |
47.60 |
llama-3-70b-instruct-lmdeploy |
100.00 |
84.00 |
90.00 |
96.80 |
83.56 |
56.00 |
87.08 |
89.20 |
97.20 |
mistral-7b-instruct-v0.1-hf |
32.00 |
22.40 |
52.40 |
35.20 |
30.82 |
23.20 |
38.76 |
46.00 |
18.40 |
mistral-7b-instruct-v0.2-hf |
66.00 |
58.40 |
50.40 |
48.40 |
48.63 |
37.20 |
65.73 |
40.40 |
29.20 |
mixtral-8x7b-instruct-v0.1-hf |
63.20 |
68.40 |
65.20 |
60.00 |
78.08 |
40.40 |
74.16 |
64.00 |
46.00 |
model |
tracking_shuffled_objects_five_objects |
logical_deduction_three_objects |
hyperbaton |
logical_deduction_five_objects |
logical_deduction_seven_objects |
movie_recommendation |
salient_translation_error_detection |
reasoning_about_colored_objects |
multistep_arithmetic_two |
qwen1.5-0.5b-chat-hf |
20.40 |
34.40 |
51.60 |
21.20 |
13.20 |
26.00 |
20.80 |
17.20 |
1.20 |
qwen1.5-1.8b-chat-hf |
18.00 |
34.80 |
48.40 |
21.20 |
16.40 |
34.80 |
24.00 |
28.80 |
4.40 |
qwen1.5-4b-chat-hf |
19.20 |
56.80 |
65.20 |
36.40 |
35.60 |
51.60 |
40.40 |
55.20 |
29.20 |
qwen1.5-7b-chat-hf |
31.60 |
58.80 |
53.20 |
35.60 |
27.20 |
56.00 |
44.80 |
62.00 |
50.00 |
qwen1.5-14b-chat-hf |
43.20 |
75.20 |
52.80 |
52.40 |
50.80 |
76.40 |
48.80 |
83.60 |
65.20 |
qwen1.5-32b-chat-hf |
68.40 |
84.00 |
81.20 |
57.20 |
46.00 |
78.80 |
54.40 |
86.00 |
86.00 |
qwen1.5-72b-chat-hf |
76.80 |
94.40 |
85.20 |
62.80 |
54.00 |
78.40 |
63.60 |
86.40 |
82.80 |
qwen1.5-110b-chat-hf |
79.20 |
91.60 |
88.80 |
61.20 |
50.00 |
82.40 |
59.60 |
88.80 |
78.00 |
internlm2-chat-1.8b-hf |
20.00 |
48.40 |
56.00 |
24.40 |
26.80 |
65.20 |
18.00 |
39.60 |
7.60 |
internlm2-chat-1.8b-sft-hf |
18.40 |
48.00 |
51.20 |
20.40 |
25.20 |
63.20 |
22.00 |
38.80 |
6.00 |
internlm2-chat-7b-hf |
48.40 |
75.20 |
84.80 |
42.00 |
36.80 |
79.60 |
53.20 |
65.60 |
26.40 |
internlm2-chat-7b-sft-hf |
44.00 |
72.40 |
85.60 |
41.60 |
37.20 |
82.40 |
55.60 |
52.80 |
32.00 |
internlm2-chat-20b-hf |
88.00 |
88.80 |
88.80 |
52.80 |
50.40 |
85.20 |
56.80 |
79.60 |
40.00 |
internlm2-chat-20b-sft-hf |
83.20 |
90.00 |
90.40 |
55.60 |
48.80 |
84.40 |
57.60 |
79.20 |
38.40 |
llama-3-8b-instruct-hf |
49.60 |
85.60 |
76.00 |
54.00 |
29.20 |
57.60 |
46.00 |
44.80 |
52.00 |
llama-3-70b-instruct-hf |
99.20 |
96.80 |
95.20 |
77.20 |
65.20 |
80.00 |
69.60 |
94.80 |
84.00 |
llama-3-8b-instruct-lmdeploy |
57.20 |
78.00 |
75.60 |
36.00 |
13.20 |
59.20 |
53.60 |
54.80 |
52.80 |
llama-3-70b-instruct-lmdeploy |
98.80 |
96.40 |
96.80 |
75.20 |
68.80 |
79.60 |
67.60 |
94.00 |
84.80 |
mistral-7b-instruct-v0.1-hf |
26.00 |
46.00 |
60.00 |
38.00 |
24.00 |
59.20 |
1.20 |
6.00 |
12.40 |
mistral-7b-instruct-v0.2-hf |
39.60 |
63.60 |
64.00 |
44.00 |
33.20 |
56.00 |
42.40 |
68.40 |
14.00 |
mixtral-8x7b-instruct-v0.1-hf |
46.40 |
71.60 |
88.80 |
48.00 |
36.80 |
60.00 |
50.00 |
81.20 |
59.20 |
model |
navigate |
dyck_languages |
word_sorting |
sports_understanding |
boolean_expressions |
object_counting |
formal_fallacies |
causal_judgement |
web_of_lies |
qwen1.5-0.5b-chat-hf |
45.60 |
0.00 |
1.20 |
17.20 |
50.40 |
16.40 |
11.60 |
42.78 |
27.60 |
qwen1.5-1.8b-chat-hf |
58.40 |
0.00 |
2.00 |
34.00 |
44.80 |
30.40 |
11.60 |
24.60 |
50.00 |
qwen1.5-4b-chat-hf |
64.00 |
3.20 |
6.80 |
80.40 |
77.60 |
48.80 |
41.20 |
55.61 |
63.20 |
qwen1.5-7b-chat-hf |
54.40 |
0.40 |
8.00 |
55.60 |
47.60 |
31.20 |
0.00 |
2.14 |
30.00 |
qwen1.5-14b-chat-hf |
74.40 |
6.40 |
26.40 |
72.40 |
76.40 |
61.60 |
0.80 |
25.67 |
81.20 |
qwen1.5-32b-chat-hf |
90.00 |
10.40 |
28.40 |
82.40 |
92.80 |
76.80 |
32.40 |
41.71 |
100.00 |
qwen1.5-72b-chat-hf |
81.20 |
18.40 |
37.60 |
95.20 |
92.80 |
76.00 |
50.40 |
63.64 |
100.00 |
qwen1.5-110b-chat-hf |
91.60 |
18.00 |
39.60 |
82.80 |
80.80 |
75.20 |
22.40 |
35.83 |
100.00 |
internlm2-chat-1.8b-hf |
63.20 |
0.00 |
6.00 |
58.00 |
56.80 |
48.80 |
54.80 |
52.94 |
48.40 |
internlm2-chat-1.8b-sft-hf |
63.20 |
0.00 |
5.60 |
58.00 |
56.80 |
50.00 |
52.40 |
56.68 |
47.60 |
internlm2-chat-7b-hf |
73.60 |
3.60 |
18.00 |
55.20 |
83.60 |
62.80 |
50.00 |
58.29 |
97.20 |
internlm2-chat-7b-sft-hf |
71.60 |
4.40 |
20.00 |
82.00 |
84.00 |
60.00 |
51.60 |
52.94 |
98.00 |
internlm2-chat-20b-hf |
82.40 |
8.00 |
36.00 |
55.60 |
84.40 |
78.00 |
50.40 |
59.36 |
100.00 |
internlm2-chat-20b-sft-hf |
81.60 |
10.40 |
36.40 |
89.20 |
82.40 |
80.40 |
48.40 |
55.61 |
100.00 |
llama-3-8b-instruct-hf |
82.80 |
8.80 |
37.20 |
94.40 |
78.80 |
89.60 |
45.20 |
24.06 |
25.60 |
llama-3-70b-instruct-hf |
95.20 |
18.80 |
49.20 |
98.00 |
94.00 |
90.00 |
73.20 |
68.98 |
100.00 |
llama-3-8b-instruct-lmdeploy |
83.60 |
10.00 |
40.40 |
96.00 |
77.20 |
89.20 |
43.60 |
37.43 |
3.20 |
llama-3-70b-instruct-lmdeploy |
95.60 |
22.40 |
48.80 |
96.80 |
91.60 |
87.20 |
72.00 |
69.52 |
100.00 |
mistral-7b-instruct-v0.1-hf |
70.80 |
0.80 |
5.20 |
68.80 |
69.60 |
51.60 |
3.20 |
12.30 |
33.60 |
mistral-7b-instruct-v0.2-hf |
62.40 |
4.00 |
15.60 |
81.20 |
70.40 |
50.40 |
32.00 |
34.76 |
98.40 |
mixtral-8x7b-instruct-v0.1-hf |
76.40 |
12.80 |
23.20 |
55.20 |
85.60 |
83.60 |
40.00 |
43.32 |
88.80 |