mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
|
||
---|---|---|
.. | ||
mmlu_all_sets.py | ||
mmlu_clean_ppl.py | ||
mmlu_gen_4d595a.py | ||
mmlu_gen_5d1409.py | ||
mmlu_gen_23a9a9.py | ||
mmlu_gen_79e572.py | ||
mmlu_gen_a484b3.py | ||
mmlu_gen.py | ||
mmlu_model_postprocess_gen_4d595a.py | ||
mmlu_openai_simple_evals_gen_b618ea.py | ||
mmlu_ppl_ac766d.py | ||
mmlu_ppl.py | ||
mmlu_xfinder_gen_4d595a.py | ||
mmlu_zero_shot_gen_47e2c0.py | ||
README.md |
# MMLU
```bash
python3 run.py --models hf_internlm2_7b --datasets mmlu_ppl_ac766d --debug
python3 run.py --models hf_internlm2_chat_7b --datasets mmlu_gen_4d595a --debug
```
## Base Models
model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other |
---|---|---|---|---|---|
llama-7b-turbomind | 35.66 | 31.22 | 37.70 | 38.90 | 37.01 |
llama-13b-turbomind | 47.76 | 37.68 | 55.36 | 52.43 | 50.83 |
llama-30b-turbomind | 58.55 | 46.95 | 67.35 | 65.13 | 60.78 |
llama-65b-turbomind | 63.78 | 52.35 | 73.68 | 70.84 | 64.29 |
llama-2-7b-turbomind | 46.78 | 37.81 | 52.11 | 51.69 | 50.04 |
llama-2-13b-turbomind | 55.76 | 44.61 | 63.86 | 62.97 | 57.35 |
llama-2-70b-turbomind | 69.87 | 58.30 | 79.86 | 75.84 | 71.58 |
llama-3-8b-turbomind | 66.43 | 55.95 | 76.11 | 70.29 | 68.96 |
llama-3-70b-turbomind | 79.35 | 70.66 | 87.54 | 83.43 | 80.42 |
internlm2-1.8b-turbomind | 45.99 | 39.63 | 51.02 | 48.65 | 47.96 |
internlm2-7b-turbomind | 65.84 | 56.48 | 74.43 | 69.68 | 67.75 |
internlm2-20b-turbomind | 67.58 | 59.01 | 76.04 | 71.20 | 68.69 |
qwen-1.8b-turbomind | 46.61 | 38.91 | 51.35 | 49.57 | 50.51 |
qwen-7b-turbomind | 59.75 | 50.16 | 67.98 | 63.48 | 62.44 |
qwen-14b-turbomind | 67.85 | 59.13 | 76.18 | 71.62 | 69.12 |
qwen-72b-turbomind | 77.36 | 68.70 | 85.28 | 80.60 | 79.45 |
qwen1.5-0.5b-hf | 39.98 | 33.96 | 45.08 | 41.59 | 42.48 |
qwen1.5-1.8b-hf | 47.14 | 39.47 | 52.70 | 49.01 | 51.33 |
qwen1.5-4b-hf | 57.03 | 47.80 | 64.86 | 60.10 | 60.20 |
qwen1.5-7b-hf | 62.15 | 53.22 | 70.25 | 65.62 | 64.26 |
qwen1.5-14b-hf | 69.10 | 61.46 | 77.57 | 71.25 | 70.29 |
qwen1.5-32b-hf | 73.88 | 65.60 | 81.41 | 77.10 | 75.79 |
qwen1.5-72b-hf | 77.02 | 69.00 | 84.55 | 80.60 | 78.21 |
qwen1.5-moe-a2-7b-hf | 62.09 | 53.27 | 70.74 | 63.80 | 65.28 |
mistral-7b-v0.1-hf | 64.04 | 53.21 | 73.65 | 68.04 | 67.00 |
mistral-7b-v0.2-hf | 63.85 | 53.21 | 72.17 | 68.40 | 67.15 |
mixtral-8x7b-v0.1-hf | 71.80 | 61.70 | 81.03 | 75.51 | 74.35 |
mixtral-8x22b-v0.1-hf | 77.67 | 68.94 | 86.81 | 81.23 | 78.43 |
yi-6b-hf | 64.08 | 52.61 | 74.10 | 68.58 | 67.11 |
yi-34b-hf | 76.26 | 66.73 | 83.74 | 81.78 | 77.77 |
deepseek-7b-base-hf | 49.22 | 40.17 | 56.73 | 53.46 | 51.26 |
deepseek-67b-base-hf | 71.95 | 60.57 | 81.69 | 77.11 | 74.42 |
### Details
model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts |
---|---|---|---|---|---|---|---|---|---|---|---|---|
llama-7b-turbomind | 37.50 | 30.00 | 30.00 | 33.00 | 23.53 | 23.45 | 34.87 | 37.78 | 25.00 | 27.68 | 34.34 | 31.00 |
llama-13b-turbomind | 46.53 | 30.00 | 42.00 | 36.00 | 18.63 | 42.76 | 46.71 | 46.67 | 30.00 | 32.14 | 45.66 | 37.00 |
llama-30b-turbomind | 59.03 | 45.00 | 47.00 | 35.00 | 26.47 | 53.10 | 61.18 | 51.85 | 37.00 | 41.07 | 57.36 | 38.00 |
llama-65b-turbomind | 68.75 | 49.00 | 47.00 | 37.00 | 35.29 | 55.17 | 73.03 | 57.78 | 30.00 | 48.21 | 66.04 | 38.00 |
llama-2-7b-turbomind | 46.53 | 34.00 | 33.00 | 34.00 | 22.55 | 47.59 | 40.13 | 47.41 | 29.00 | 38.39 | 46.42 | 32.00 |
llama-2-13b-turbomind | 59.03 | 44.00 | 48.00 | 29.00 | 26.47 | 50.34 | 53.29 | 49.63 | 35.00 | 28.57 | 60.00 | 32.00 |
llama-2-70b-turbomind | 84.72 | 51.00 | 60.00 | 39.00 | 37.25 | 65.52 | 81.58 | 63.70 | 32.00 | 52.68 | 72.08 | 46.00 |
llama-3-8b-turbomind | 77.08 | 46.00 | 51.00 | 31.00 | 51.96 | 62.76 | 67.11 | 68.15 | 34.00 | 52.68 | 74.72 | 35.00 |
llama-3-70b-turbomind | 93.75 | 62.00 | 72.00 | 52.00 | 50.98 | 74.48 | 92.11 | 79.26 | 48.00 | 63.39 | 86.42 | 49.00 |
internlm2-1.8b-turbomind | 38.89 | 37.00 | 44.00 | 35.00 | 30.39 | 49.66 | 50.66 | 44.44 | 25.00 | 35.71 | 51.32 | 32.00 |
internlm2-7b-turbomind | 77.08 | 48.00 | 64.00 | 33.00 | 47.06 | 63.45 | 73.68 | 57.78 | 37.00 | 45.54 | 69.81 | 35.00 |
internlm2-20b-turbomind | 83.33 | 51.00 | 61.00 | 36.00 | 45.10 | 64.83 | 75.00 | 59.26 | 39.00 | 53.57 | 73.58 | 32.00 |
qwen-1.8b-turbomind | 42.36 | 36.00 | 39.00 | 34.00 | 27.45 | 51.03 | 50.66 | 42.96 | 31.00 | 31.25 | 53.21 | 28.00 |
qwen-7b-turbomind | 67.36 | 48.00 | 53.00 | 28.00 | 39.22 | 59.31 | 63.82 | 49.63 | 34.00 | 38.39 | 63.02 | 37.00 |
qwen-14b-turbomind | 78.47 | 51.00 | 62.00 | 42.00 | 49.02 | 65.52 | 71.05 | 60.00 | 37.00 | 58.93 | 71.32 | 40.00 |
qwen-72b-turbomind | 93.75 | 56.00 | 66.00 | 56.00 | 50.98 | 80.69 | 85.53 | 73.33 | 41.00 | 62.50 | 83.77 | 54.00 |
qwen1.5-0.5b-hf | 38.89 | 25.00 | 38.00 | 32.00 | 25.49 | 45.52 | 44.74 | 33.33 | 30.00 | 39.29 | 38.11 | 39.00 |
qwen1.5-1.8b-hf | 43.75 | 34.00 | 45.00 | 38.00 | 28.43 | 47.59 | 47.37 | 40.74 | 32.00 | 31.25 | 53.96 | 37.00 |
qwen1.5-4b-hf | 50.00 | 46.00 | 41.00 | 45.00 | 31.37 | 53.10 | 61.18 | 51.85 | 35.00 | 44.64 | 60.38 | 37.00 |
qwen1.5-7b-hf | 66.67 | 48.00 | 55.00 | 37.00 | 41.18 | 60.69 | 65.79 | 52.59 | 39.00 | 41.07 | 68.68 | 43.00 |
qwen1.5-14b-hf | 75.69 | 49.00 | 58.00 | 49.00 | 49.02 | 71.72 | 73.03 | 65.93 | 39.00 | 52.68 | 73.96 | 49.00 |
qwen1.5-32b-hf | 85.42 | 53.00 | 59.00 | 51.00 | 53.92 | 72.41 | 82.24 | 63.70 | 43.00 | 58.04 | 78.11 | 50.00 |
qwen1.5-72b-hf | 90.97 | 54.00 | 65.00 | 57.00 | 52.94 | 80.00 | 87.50 | 73.33 | 43.00 | 64.29 | 81.89 | 50.00 |
qwen1.5-moe-a2-7b-hf | 62.50 | 44.00 | 54.00 | 41.00 | 49.02 | 58.62 | 69.74 | 57.78 | 37.00 | 38.39 | 66.79 | 38.00 |
mistral-7b-v0.1-hf | 72.92 | 50.00 | 51.00 | 40.00 | 39.22 | 57.93 | 65.79 | 62.96 | 29.00 | 49.11 | 69.43 | 36.00 |
mistral-7b-v0.2-hf | 71.53 | 49.00 | 53.00 | 40.00 | 36.27 | 57.24 | 64.47 | 60.00 | 29.00 | 53.57 | 67.92 | 39.00 |
mixtral-8x7b-v0.1-hf | 85.42 | 54.00 | 62.00 | 43.00 | 46.08 | 68.97 | 82.89 | 70.37 | 37.00 | 56.25 | 79.25 | 51.00 |
mixtral-8x22b-v0.1-hf | 89.58 | 56.00 | 69.00 | 48.00 | 52.94 | 76.55 | 86.18 | 77.04 | 53.00 | 62.50 | 82.26 | 56.00 |
yi-6b-hf | 66.67 | 43.00 | 51.00 | 39.00 | 35.29 | 64.83 | 65.79 | 60.00 | 29.00 | 41.96 | 66.79 | 46.00 |
yi-34b-hf | 88.89 | 52.00 | 66.00 | 44.00 | 48.04 | 80.00 | 89.47 | 74.81 | 44.00 | 58.04 | 78.87 | 52.00 |
deepseek-7b-base-hf | 52.08 | 29.00 | 44.00 | 40.00 | 31.37 | 44.83 | 51.97 | 40.74 | 27.00 | 32.14 | 53.58 | 31.00 |
deepseek-67b-base-hf | 84.72 | 52.00 | 62.00 | 42.00 | 42.16 | 70.34 | 80.92 | 65.19 | 39.00 | 50.00 | 78.11 | 42.00 |
model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology |
---|---|---|---|---|---|---|---|---|---|---|---|---|
llama-7b-turbomind | 33.01 | 39.22 | 45.73 | 26.24 | 33.33 | 51.24 | 24.25 | 45.00 | 31.09 | 30.05 | 37.00 | 35.13 |
llama-13b-turbomind | 66.02 | 51.63 | 71.79 | 34.75 | 55.05 | 64.46 | 30.06 | 63.00 | 47.48 | 37.22 | 53.00 | 48.53 |
llama-30b-turbomind | 76.70 | 62.42 | 84.19 | 44.68 | 71.72 | 75.21 | 40.56 | 66.00 | 57.98 | 46.48 | 66.00 | 63.73 |
llama-65b-turbomind | 82.52 | 68.95 | 87.18 | 48.94 | 79.29 | 81.82 | 47.82 | 79.00 | 68.49 | 50.07 | 68.00 | 66.67 |
llama-2-7b-turbomind | 53.40 | 48.69 | 68.38 | 36.52 | 49.49 | 65.29 | 24.02 | 60.00 | 44.12 | 36.31 | 55.00 | 43.79 |
llama-2-13b-turbomind | 72.82 | 61.76 | 79.49 | 39.72 | 69.19 | 74.38 | 43.80 | 70.00 | 58.40 | 42.50 | 54.00 | 54.90 |
llama-2-70b-turbomind | 83.50 | 77.12 | 91.03 | 56.03 | 86.87 | 87.60 | 44.69 | 77.00 | 77.31 | 52.93 | 74.00 | 75.65 |
llama-3-8b-turbomind | 87.38 | 75.82 | 89.74 | 48.94 | 80.81 | 84.30 | 40.89 | 81.00 | 73.95 | 46.22 | 77.00 | 71.90 |
llama-3-70b-turbomind | 91.26 | 87.25 | 94.87 | 64.18 | 93.94 | 89.26 | 62.91 | 83.00 | 87.82 | 61.80 | 90.00 | 85.78 |
internlm2-1.8b-turbomind | 60.19 | 58.17 | 63.25 | 31.21 | 56.57 | 56.20 | 24.47 | 52.00 | 50.42 | 36.11 | 53.00 | 41.83 |
internlm2-7b-turbomind | 79.61 | 75.49 | 87.61 | 48.23 | 82.83 | 77.69 | 49.39 | 74.00 | 72.27 | 47.65 | 73.00 | 65.03 |
internlm2-20b-turbomind | 79.61 | 75.49 | 91.88 | 50.00 | 87.88 | 85.95 | 35.08 | 81.00 | 70.59 | 49.48 | 78.00 | 70.10 |
qwen-1.8b-turbomind | 66.02 | 60.46 | 73.50 | 38.30 | 56.57 | 66.94 | 23.91 | 56.00 | 42.02 | 33.96 | 51.00 | 39.54 |
qwen-7b-turbomind | 78.64 | 67.32 | 83.33 | 41.49 | 76.77 | 76.03 | 29.72 | 73.00 | 58.40 | 41.72 | 69.00 | 59.64 |
qwen-14b-turbomind | 78.64 | 73.86 | 88.89 | 48.58 | 83.84 | 84.30 | 45.47 | 77.00 | 73.95 | 50.85 | 74.00 | 69.61 |
qwen-72b-turbomind | 90.29 | 84.97 | 94.87 | 65.96 | 92.93 | 88.43 | 65.70 | 79.00 | 84.87 | 61.21 | 86.00 | 82.19 |
qwen1.5-0.5b-hf | 52.43 | 46.41 | 60.68 | 31.21 | 46.46 | 56.20 | 25.70 | 46.00 | 37.39 | 32.79 | 46.00 | 37.75 |
qwen1.5-1.8b-hf | 66.02 | 58.50 | 75.64 | 33.69 | 56.06 | 72.73 | 24.69 | 57.00 | 39.50 | 36.11 | 53.00 | 42.81 |
qwen1.5-4b-hf | 74.76 | 62.75 | 84.19 | 46.81 | 76.77 | 71.07 | 25.03 | 67.00 | 55.04 | 41.33 | 64.00 | 56.05 |
qwen1.5-7b-hf | 78.64 | 70.92 | 86.32 | 44.68 | 81.82 | 77.69 | 32.74 | 76.00 | 64.29 | 45.37 | 68.00 | 61.27 |
qwen1.5-14b-hf | 80.58 | 75.49 | 85.90 | 51.06 | 86.36 | 80.99 | 45.03 | 80.00 | 76.47 | 48.57 | 78.00 | 69.61 |
qwen1.5-32b-hf | 86.41 | 81.37 | 95.30 | 56.38 | 91.41 | 88.43 | 44.02 | 76.00 | 82.77 | 57.89 | 83.00 | 75.33 |
qwen1.5-72b-hf | 87.38 | 85.29 | 94.87 | 64.89 | 92.42 | 90.08 | 62.12 | 83.00 | 84.03 | 60.76 | 86.00 | 81.05 |
qwen1.5-moe-a2-7b-hf | 78.64 | 70.92 | 86.32 | 46.81 | 81.82 | 77.69 | 25.59 | 71.00 | 65.97 | 45.37 | 65.00 | 61.44 |
mistral-7b-v0.1-hf | 82.52 | 75.49 | 87.61 | 48.94 | 76.77 | 77.69 | 32.51 | 77.00 | 66.39 | 44.98 | 74.00 | 67.97 |
mistral-7b-v0.2-hf | 81.55 | 74.18 | 88.46 | 51.06 | 76.77 | 80.99 | 38.77 | 75.00 | 64.71 | 45.37 | 72.00 | 66.34 |
mixtral-8x7b-v0.1-hf | 87.38 | 81.70 | 91.88 | 51.77 | 85.86 | 85.95 | 40.11 | 80.00 | 79.41 | 53.32 | 77.00 | 77.94 |
mixtral-8x22b-v0.1-hf | 89.32 | 85.95 | 91.88 | 62.06 | 91.41 | 90.08 | 64.58 | 83.00 | 87.82 | 60.82 | 84.00 | 83.17 |
yi-6b-hf | 80.58 | 71.57 | 91.03 | 48.23 | 83.33 | 76.86 | 41.34 | 75.00 | 74.79 | 49.35 | 80.00 | 65.69 |
yi-34b-hf | 91.26 | 85.62 | 92.31 | 65.25 | 89.39 | 91.74 | 64.69 | 82.00 | 85.29 | 59.97 | 87.00 | 82.19 |
deepseek-7b-base-hf | 61.17 | 53.59 | 72.22 | 34.04 | 59.09 | 65.29 | 26.37 | 61.00 | 44.96 | 35.53 | 56.00 | 49.18 |
deepseek-67b-base-hf | 88.35 | 79.74 | 91.88 | 57.09 | 89.39 | 85.12 | 46.15 | 76.00 | 82.35 | 55.93 | 72.00 | 79.58 |
model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history |
---|---|---|---|---|---|---|---|---|---|---|---|---|
llama-7b-turbomind | 41.67 | 49.12 | 40.84 | 34.94 | 29.56 | 40.00 | 34.10 | 35.11 | 26.46 | 27.81 | 34.00 | 41.82 |
llama-13b-turbomind | 51.85 | 67.84 | 55.31 | 43.37 | 28.57 | 60.91 | 46.15 | 57.25 | 26.98 | 29.80 | 49.00 | 61.21 |
llama-30b-turbomind | 71.30 | 79.53 | 66.24 | 49.40 | 40.39 | 70.00 | 56.67 | 64.89 | 37.30 | 35.10 | 60.00 | 70.91 |
llama-65b-turbomind | 75.00 | 81.29 | 73.63 | 53.01 | 41.38 | 74.55 | 65.90 | 77.86 | 40.21 | 35.76 | 69.00 | 76.36 |
llama-2-7b-turbomind | 53.70 | 69.01 | 60.13 | 41.57 | 36.95 | 54.55 | 45.90 | 55.73 | 27.25 | 31.13 | 40.00 | 59.39 |
llama-2-13b-turbomind | 74.07 | 76.61 | 63.99 | 45.78 | 44.83 | 62.73 | 50.77 | 62.60 | 34.13 | 36.42 | 57.00 | 63.03 |
llama-2-70b-turbomind | 83.33 | 85.96 | 78.46 | 53.61 | 52.22 | 69.09 | 74.87 | 87.02 | 43.39 | 43.71 | 78.00 | 84.24 |
llama-3-8b-turbomind | 75.00 | 83.04 | 74.28 | 56.02 | 54.68 | 71.82 | 64.87 | 79.39 | 42.06 | 45.03 | 68.00 | 76.36 |
llama-3-70b-turbomind | 86.11 | 91.23 | 86.50 | 57.83 | 71.92 | 74.55 | 82.56 | 88.55 | 62.70 | 56.95 | 86.00 | 86.67 |
internlm2-1.8b-turbomind | 55.56 | 59.65 | 51.13 | 40.96 | 43.35 | 52.73 | 43.33 | 47.33 | 30.42 | 33.11 | 47.00 | 56.36 |
internlm2-7b-turbomind | 79.63 | 82.46 | 73.63 | 51.20 | 55.17 | 70.00 | 66.92 | 70.99 | 46.03 | 42.38 | 70.00 | 78.79 |
internlm2-20b-turbomind | 75.93 | 82.46 | 73.95 | 56.02 | 57.64 | 68.18 | 70.51 | 68.70 | 49.21 | 38.41 | 75.00 | 82.42 |
qwen-1.8b-turbomind | 59.26 | 56.14 | 50.80 | 40.96 | 37.93 | 60.00 | 41.03 | 51.15 | 33.33 | 34.44 | 39.00 | 64.24 |
qwen-7b-turbomind | 73.15 | 76.61 | 67.20 | 47.59 | 51.23 | 65.45 | 60.00 | 69.47 | 43.12 | 38.41 | 67.00 | 66.67 |
qwen-14b-turbomind | 76.85 | 84.21 | 72.03 | 53.01 | 65.52 | 66.36 | 66.92 | 78.63 | 51.32 | 41.72 | 72.00 | 82.42 |
qwen-72b-turbomind | 83.33 | 88.30 | 83.28 | 58.43 | 65.52 | 74.55 | 81.54 | 89.31 | 68.52 | 58.28 | 81.00 | 84.24 |
qwen1.5-0.5b-hf | 40.74 | 40.94 | 41.48 | 40.96 | 28.57 | 50.91 | 36.92 | 41.98 | 28.84 | 22.52 | 37.00 | 52.73 |
qwen1.5-1.8b-hf | 55.56 | 57.31 | 49.84 | 40.96 | 36.45 | 56.36 | 43.59 | 56.49 | 35.19 | 27.81 | 45.00 | 61.21 |
qwen1.5-4b-hf | 70.37 | 70.76 | 61.74 | 44.58 | 45.32 | 65.45 | 54.62 | 64.89 | 47.88 | 32.45 | 62.00 | 70.30 |
qwen1.5-7b-hf | 75.93 | 77.19 | 66.24 | 50.60 | 53.20 | 62.73 | 60.00 | 71.76 | 50.26 | 38.41 | 71.00 | 74.55 |
qwen1.5-14b-hf | 74.07 | 83.63 | 70.74 | 46.39 | 58.62 | 64.55 | 73.59 | 76.34 | 59.26 | 49.01 | 75.00 | 83.64 |
qwen1.5-32b-hf | 83.33 | 85.96 | 82.96 | 56.63 | 61.58 | 63.64 | 77.95 | 83.97 | 69.31 | 50.99 | 85.00 | 86.06 |
qwen1.5-72b-hf | 84.26 | 88.89 | 82.32 | 57.23 | 66.01 | 72.73 | 82.05 | 87.02 | 69.31 | 56.95 | 84.00 | 84.24 |
qwen1.5-moe-a2-7b-hf | 70.37 | 80.12 | 66.56 | 51.20 | 47.78 | 64.55 | 62.31 | 70.99 | 46.30 | 45.03 | 59.00 | 69.70 |
mistral-7b-v0.1-hf | 77.78 | 83.04 | 69.45 | 54.82 | 53.20 | 67.27 | 66.15 | 78.63 | 38.10 | 31.79 | 68.00 | 78.79 |
mistral-7b-v0.2-hf | 73.15 | 82.46 | 72.99 | 53.01 | 55.67 | 66.36 | 62.31 | 77.10 | 40.48 | 34.44 | 66.00 | 76.36 |
mixtral-8x7b-v0.1-hf | 82.41 | 88.30 | 78.14 | 51.20 | 62.56 | 70.00 | 70.77 | 80.92 | 48.68 | 48.34 | 71.00 | 80.61 |
mixtral-8x22b-v0.1-hf | 84.26 | 89.47 | 84.57 | 59.04 | 67.49 | 78.18 | 79.23 | 88.55 | 61.64 | 52.98 | 87.00 | 86.06 |
yi-6b-hf | 78.70 | 81.87 | 69.77 | 46.39 | 52.71 | 73.64 | 65.13 | 74.81 | 46.30 | 38.41 | 66.00 | 71.52 |
yi-34b-hf | 89.81 | 86.55 | 83.92 | 57.23 | 64.04 | 73.64 | 79.49 | 85.50 | 66.40 | 52.32 | 81.00 | 86.06 |
deepseek-7b-base-hf | 55.56 | 73.10 | 56.59 | 46.99 | 34.98 | 62.73 | 48.21 | 58.78 | 28.57 | 29.14 | 50.00 | 61.82 |
deepseek-67b-base-hf | 84.26 | 85.96 | 81.03 | 56.02 | 57.64 | 72.73 | 73.85 | 82.44 | 51.59 | 45.03 | 74.00 | 81.82 |
model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | logical_fallacies | high_school_world_history | professional_medicine |
---|---|---|---|---|---|---|---|---|---|---|---|---|
llama-7b-turbomind | 42.00 | 40.46 | 32.87 | 42.78 | 26.19 | 46.11 | 35.19 | 33.47 | 32.90 | 42.33 | 43.88 | 43.75 |
llama-13b-turbomind | 46.00 | 50.00 | 30.56 | 64.88 | 31.75 | 66.84 | 51.85 | 52.65 | 51.94 | 52.76 | 67.51 | 51.10 |
llama-30b-turbomind | 55.00 | 66.76 | 49.07 | 77.91 | 36.51 | 82.90 | 68.21 | 66.12 | 69.35 | 67.48 | 80.59 | 55.88 |
llama-65b-turbomind | 59.00 | 73.70 | 61.57 | 81.35 | 43.65 | 88.60 | 73.46 | 71.84 | 74.19 | 77.30 | 83.97 | 62.13 |
llama-2-7b-turbomind | 53.00 | 51.16 | 27.78 | 63.60 | 27.78 | 67.36 | 48.77 | 47.76 | 50.97 | 51.53 | 64.56 | 52.57 |
llama-2-13b-turbomind | 54.00 | 64.45 | 45.37 | 74.46 | 36.51 | 80.83 | 64.81 | 62.86 | 67.42 | 66.87 | 72.15 | 54.41 |
llama-2-70b-turbomind | 72.00 | 77.17 | 63.43 | 86.08 | 48.41 | 94.30 | 83.64 | 78.37 | 81.61 | 80.98 | 87.76 | 74.63 |
llama-3-8b-turbomind | 62.00 | 73.70 | 54.17 | 82.76 | 48.41 | 90.16 | 72.53 | 75.51 | 77.74 | 73.01 | 82.70 | 72.06 |
llama-3-70b-turbomind | 83.00 | 85.55 | 72.22 | 92.21 | 66.67 | 97.41 | 91.05 | 84.90 | 90.32 | 87.73 | 94.09 | 87.13 |
internlm2-1.8b-turbomind | 44.00 | 45.95 | 38.89 | 59.39 | 32.54 | 60.62 | 50.31 | 54.29 | 52.58 | 45.40 | 62.87 | 37.87 |
internlm2-7b-turbomind | 69.00 | 66.76 | 57.87 | 80.72 | 50.00 | 90.16 | 73.15 | 75.10 | 79.68 | 68.71 | 81.01 | 70.22 |
internlm2-20b-turbomind | 74.00 | 74.57 | 60.19 | 81.48 | 44.44 | 91.71 | 75.31 | 81.63 | 82.58 | 75.46 | 87.76 | 63.60 |
qwen-1.8b-turbomind | 52.00 | 52.31 | 34.72 | 57.98 | 29.37 | 59.07 | 47.22 | 48.57 | 52.26 | 44.17 | 61.18 | 43.38 |
qwen-7b-turbomind | 68.00 | 64.74 | 45.37 | 77.39 | 43.65 | 83.94 | 68.21 | 70.20 | 72.26 | 65.64 | 75.95 | 58.46 |
qwen-14b-turbomind | 75.00 | 74.86 | 57.87 | 84.04 | 51.59 | 91.71 | 70.99 | 77.14 | 83.55 | 73.01 | 83.12 | 67.65 |
qwen-72b-turbomind | 80.00 | 84.97 | 68.98 | 91.44 | 54.76 | 98.96 | 87.04 | 81.63 | 89.03 | 84.05 | 90.30 | 84.93 |
qwen1.5-0.5b-hf | 47.00 | 46.82 | 23.15 | 48.02 | 29.37 | 48.70 | 40.12 | 38.37 | 40.65 | 35.58 | 53.16 | 31.62 |
qwen1.5-1.8b-hf | 54.00 | 54.91 | 28.70 | 61.69 | 23.81 | 58.03 | 48.15 | 51.84 | 55.48 | 45.40 | 59.92 | 39.71 |
qwen1.5-4b-hf | 65.00 | 66.76 | 44.44 | 73.95 | 35.71 | 78.24 | 60.19 | 65.31 | 66.45 | 65.64 | 71.31 | 50.00 |
qwen1.5-7b-hf | 68.00 | 70.81 | 48.61 | 76.50 | 38.89 | 84.97 | 69.44 | 68.16 | 74.52 | 68.10 | 77.22 | 56.25 |
qwen1.5-14b-hf | 77.00 | 73.70 | 62.96 | 83.40 | 53.17 | 90.67 | 71.60 | 80.82 | 84.52 | 76.69 | 83.54 | 71.69 |
qwen1.5-32b-hf | 77.00 | 78.90 | 68.98 | 88.12 | 54.76 | 94.82 | 81.48 | 80.82 | 88.39 | 82.21 | 86.08 | 80.88 |
qwen1.5-72b-hf | 80.00 | 84.39 | 68.98 | 91.44 | 55.56 | 98.96 | 86.73 | 81.63 | 88.71 | 85.89 | 89.87 | 82.72 |
qwen1.5-moe-a2-7b-hf | 74.00 | 65.90 | 56.48 | 82.25 | 34.13 | 84.46 | 70.68 | 74.29 | 73.23 | 68.10 | 76.79 | 66.91 |
mistral-7b-v0.1-hf | 57.00 | 71.10 | 57.41 | 81.61 | 40.48 | 86.53 | 73.46 | 72.65 | 76.77 | 79.14 | 77.22 | 68.75 |
mistral-7b-v0.2-hf | 61.00 | 71.39 | 52.78 | 80.08 | 40.48 | 88.08 | 69.44 | 72.24 | 76.13 | 77.91 | 78.06 | 70.59 |
mixtral-8x7b-v0.1-hf | 77.00 | 80.06 | 63.43 | 87.87 | 54.76 | 93.26 | 83.95 | 80.00 | 84.19 | 79.14 | 88.61 | 81.25 |
mixtral-8x22b-v0.1-hf | 72.00 | 84.10 | 68.52 | 90.68 | 57.14 | 96.37 | 86.73 | 86.53 | 90.32 | 87.73 | 90.30 | 87.87 |
yi-6b-hf | 67.00 | 69.36 | 52.78 | 80.46 | 44.44 | 89.64 | 70.99 | 74.69 | 77.10 | 78.53 | 78.90 | 65.81 |
yi-34b-hf | 79.00 | 83.82 | 66.67 | 90.29 | 57.14 | 97.93 | 87.65 | 84.90 | 88.39 | 87.73 | 92.83 | 81.99 |
deepseek-7b-base-hf | 49.00 | 52.31 | 41.20 | 66.28 | 30.95 | 63.73 | 55.86 | 51.84 | 52.90 | 58.90 | 62.45 | 45.22 |
deepseek-67b-base-hf | 81.00 | 77.17 | 63.89 | 90.04 | 53.17 | 97.93 | 85.49 | 73.88 | 82.26 | 84.05 | 91.56 | 78.31 |
model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy |
---|---|---|---|---|---|---|---|---|
llama-7b-turbomind | 24.81 | 32.95 | 38.73 | 45.77 | 27.19 | 48.07 | 38.12 | 43.00 |
llama-13b-turbomind | 26.30 | 42.20 | 59.80 | 61.19 | 28.95 | 61.28 | 53.36 | 78.00 |
llama-30b-turbomind | 27.41 | 54.91 | 76.96 | 79.10 | 35.96 | 76.15 | 67.71 | 83.00 |
llama-65b-turbomind | 34.44 | 54.34 | 82.84 | 81.09 | 39.47 | 82.39 | 66.37 | 88.00 |
llama-2-7b-turbomind | 29.63 | 43.35 | 60.29 | 62.69 | 27.19 | 62.75 | 56.05 | 64.00 |
llama-2-13b-turbomind | 27.04 | 52.60 | 75.49 | 73.13 | 32.46 | 76.51 | 64.57 | 82.00 |
llama-2-70b-turbomind | 34.07 | 64.16 | 90.69 | 90.55 | 44.74 | 87.52 | 80.27 | 92.00 |
llama-3-8b-turbomind | 38.15 | 64.16 | 83.33 | 86.57 | 47.37 | 84.04 | 70.85 | 87.00 |
llama-3-70b-turbomind | 48.89 | 79.77 | 95.10 | 94.03 | 72.81 | 94.13 | 82.51 | 94.00 |
internlm2-1.8b-turbomind | 30.37 | 41.04 | 55.88 | 51.74 | 28.95 | 61.47 | 51.12 | 63.00 |
internlm2-7b-turbomind | 39.63 | 68.21 | 76.96 | 84.58 | 44.74 | 84.59 | 72.65 | 86.00 |
internlm2-20b-turbomind | 39.63 | 66.47 | 82.84 | 85.07 | 47.37 | 86.79 | 70.85 | 84.00 |
qwen-1.8b-turbomind | 28.52 | 43.35 | 54.90 | 60.70 | 36.84 | 60.73 | 48.43 | 60.00 |
qwen-7b-turbomind | 30.00 | 57.23 | 75.98 | 79.10 | 32.46 | 79.27 | 63.23 | 81.00 |
qwen-14b-turbomind | 37.41 | 70.52 | 81.37 | 85.07 | 50.00 | 84.95 | 73.09 | 86.00 |
qwen-72b-turbomind | 50.00 | 75.72 | 92.16 | 90.05 | 59.65 | 92.66 | 82.51 | 95.00 |
qwen1.5-0.5b-hf | 29.63 | 33.53 | 45.10 | 59.70 | 28.95 | 44.77 | 37.22 | 69.00 |
qwen1.5-1.8b-hf | 34.07 | 39.31 | 47.55 | 63.18 | 32.46 | 59.08 | 53.81 | 73.00 |
qwen1.5-4b-hf | 35.93 | 55.49 | 71.08 | 73.13 | 37.72 | 72.11 | 63.68 | 79.00 |
qwen1.5-7b-hf | 34.81 | 61.85 | 78.92 | 82.09 | 41.23 | 80.73 | 61.88 | 84.00 |
qwen1.5-14b-hf | 45.93 | 68.21 | 80.88 | 83.08 | 55.26 | 86.06 | 73.09 | 88.00 |
qwen1.5-32b-hf | 47.04 | 76.30 | 90.20 | 86.07 | 57.89 | 90.28 | 75.78 | 92.00 |
qwen1.5-72b-hf | 47.78 | 75.14 | 92.65 | 88.56 | 59.65 | 92.48 | 79.82 | 94.00 |
qwen1.5-moe-a2-7b-hf | 46.30 | 54.91 | 78.43 | 79.10 | 38.60 | 82.39 | 66.82 | 83.00 |
mistral-7b-v0.1-hf | 33.70 | 65.32 | 78.92 | 83.08 | 50.00 | 82.39 | 69.51 | 86.00 |
mistral-7b-v0.2-hf | 38.15 | 64.16 | 81.86 | 82.09 | 43.86 | 80.18 | 69.96 | 86.00 |
mixtral-8x7b-v0.1-hf | 40.37 | 69.94 | 86.27 | 88.56 | 65.79 | 88.81 | 79.37 | 91.00 |
mixtral-8x22b-v0.1-hf | 45.93 | 79.19 | 90.20 | 93.03 | 70.18 | 92.29 | 79.37 | 95.00 |
yi-6b-hf | 32.59 | 61.27 | 79.90 | 82.59 | 35.96 | 82.94 | 67.26 | 86.00 |
yi-34b-hf | 45.19 | 71.68 | 91.18 | 88.56 | 55.26 | 91.74 | 78.48 | 91.00 |
deepseek-7b-base-hf | 28.89 | 41.62 | 60.29 | 70.15 | 26.32 | 69.72 | 55.61 | 76.00 |
deepseek-67b-base-hf | 38.89 | 72.25 | 90.69 | 90.05 | 52.63 | 90.46 | 80.72 | 95.00 |
## Chat Models
model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other |
---|---|---|---|---|---|
qwen1.5-0.5b-chat-hf | 35.32 | 30.90 | 37.59 | 37.29 | 37.73 |
qwen1.5-1.8b-chat-hf | 45.62 | 39.20 | 49.21 | 47.67 | 49.63 |
qwen1.5-4b-chat-hf | 55.90 | 48.07 | 62.67 | 59.70 | 57.31 |
qwen1.5-7b-chat-hf | 61.79 | 52.68 | 69.41 | 66.41 | 63.45 |
qwen1.5-14b-chat-hf | 67.96 | 59.79 | 75.46 | 71.23 | 69.72 |
qwen1.5-32b-chat-hf | 75.36 | 67.04 | 82.11 | 80.44 | 76.23 |
qwen1.5-72b-chat-hf | 77.24 | 69.59 | 83.95 | 81.58 | 77.87 |
qwen1.5-110b-chat-hf | 77.95 | 71.56 | 83.77 | 81.44 | 78.41 |
internlm2-chat-1.8b-hf | 47.58 | 40.88 | 53.33 | 49.92 | 49.74 |
internlm2-chat-1.8b-sft-hf | 47.44 | 40.55 | 53.31 | 49.67 | 49.89 |
internlm2-chat-7b-hf | 63.05 | 53.42 | 71.47 | 67.27 | 65.13 |
internlm2-chat-7b-sft-hf | 63.33 | 53.95 | 71.74 | 67.62 | 65.00 |
internlm2-chat-20b-hf | 67.37 | 57.39 | 75.75 | 71.63 | 69.95 |
internlm2-chat-20b-sft-hf | 67.34 | 57.49 | 75.67 | 70.99 | 70.40 |
llama-3-8b-instruct-hf | 68.37 | 58.01 | 77.82 | 71.22 | 71.94 |
llama-3-70b-instruct-hf | 80.93 | 73.86 | 87.71 | 83.90 | 82.01 |
llama-3-8b-instruct-lmdeploy | 67.35 | 56.66 | 75.96 | 70.90 | 71.49 |
llama-3-70b-instruct-lmdeploy | 80.85 | 74.07 | 87.26 | 83.73 | 81.96 |
mistral-7b-instruct-v0.1-hf | 54.36 | 43.74 | 62.96 | 58.87 | 57.46 |
mistral-7b-instruct-v0.2-hf | 59.98 | 49.56 | 69.22 | 64.41 | 62.24 |
mixtral-8x7b-instruct-v0.1-hf | 70.11 | 60.29 | 79.01 | 74.08 | 72.28 |
### Details
model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts |
---|---|---|---|---|---|---|---|---|---|---|---|---|
qwen1.5-0.5b-chat-hf | 31.25 | 32.00 | 33.00 | 29.00 | 33.33 | 38.62 | 33.55 | 28.89 | 20.00 | 27.68 | 40.38 | 33.00 |
qwen1.5-1.8b-chat-hf | 42.36 | 28.00 | 45.00 | 33.00 | 27.45 | 44.83 | 51.97 | 42.22 | 32.00 | 38.39 | 48.30 | 30.00 |
qwen1.5-4b-chat-hf | 56.25 | 47.00 | 49.00 | 39.00 | 36.27 | 54.48 | 57.89 | 49.63 | 38.00 | 33.04 | 59.62 | 23.00 |
qwen1.5-7b-chat-hf | 64.58 | 51.00 | 59.00 | 37.00 | 41.18 | 53.79 | 66.45 | 53.33 | 43.00 | 41.07 | 67.92 | 36.00 |
qwen1.5-14b-chat-hf | 77.08 | 51.00 | 64.00 | 42.00 | 45.10 | 64.83 | 77.63 | 65.93 | 39.00 | 46.43 | 73.21 | 45.00 |
qwen1.5-32b-chat-hf | 84.72 | 53.00 | 57.00 | 48.00 | 52.94 | 74.48 | 82.24 | 67.41 | 52.00 | 61.61 | 78.11 | 48.00 |
qwen1.5-72b-chat-hf | 90.97 | 57.00 | 66.00 | 55.00 | 55.88 | 80.00 | 88.16 | 72.59 | 56.00 | 59.82 | 80.00 | 51.00 |
qwen1.5-110b-chat-hf | 88.89 | 62.00 | 66.00 | 64.00 | 58.82 | 75.86 | 89.47 | 68.15 | 59.00 | 63.39 | 79.62 | 59.00 |
internlm2-chat-1.8b-hf | 49.31 | 36.00 | 47.00 | 33.00 | 36.27 | 42.76 | 48.03 | 49.63 | 30.00 | 33.93 | 53.58 | 28.00 |
internlm2-chat-1.8b-sft-hf | 51.39 | 37.00 | 50.00 | 33.00 | 33.33 | 42.76 | 46.05 | 49.63 | 31.00 | 32.14 | 53.21 | 29.00 |
internlm2-chat-7b-hf | 68.75 | 47.00 | 62.00 | 32.00 | 38.24 | 57.24 | 69.74 | 58.52 | 29.00 | 53.57 | 70.19 | 41.00 |
internlm2-chat-7b-sft-hf | 71.53 | 47.00 | 63.00 | 34.00 | 37.25 | 57.24 | 69.74 | 57.78 | 29.00 | 52.68 | 69.43 | 34.00 |
internlm2-chat-20b-hf | 76.39 | 51.00 | 61.00 | 37.00 | 40.20 | 62.76 | 78.95 | 67.41 | 33.00 | 46.43 | 75.09 | 42.00 |
internlm2-chat-20b-sft-hf | 77.08 | 49.00 | 60.00 | 39.00 | 39.22 | 64.14 | 79.61 | 68.15 | 35.00 | 46.43 | 75.09 | 42.00 |
llama-3-8b-instruct-hf | 81.94 | 48.00 | 58.00 | 43.00 | 48.04 | 60.69 | 76.32 | 71.11 | 33.00 | 54.46 | 73.58 | 46.00 |
llama-3-70b-instruct-hf | 93.06 | 56.00 | 70.00 | 60.00 | 60.78 | 77.24 | 93.42 | 79.26 | 53.00 | 71.43 | 86.42 | 66.00 |
llama-3-8b-instruct-lmdeploy | 79.17 | 47.00 | 53.00 | 36.00 | 49.02 | 60.00 | 73.68 | 68.89 | 36.00 | 55.36 | 73.96 | 42.00 |
llama-3-70b-instruct-lmdeploy | 93.75 | 57.00 | 66.00 | 61.00 | 65.69 | 77.93 | 92.11 | 78.52 | 55.00 | 70.54 | 86.42 | 64.00 |
mistral-7b-instruct-v0.1-hf | 57.64 | 35.00 | 50.00 | 31.00 | 24.51 | 51.72 | 58.55 | 45.93 | 35.00 | 41.07 | 56.98 | 32.00 |
mistral-7b-instruct-v0.2-hf | 70.14 | 42.00 | 49.00 | 35.00 | 43.14 | 54.48 | 65.79 | 56.30 | 29.00 | 42.86 | 65.28 | 37.00 |
mixtral-8x7b-instruct-v0.1-hf | 81.25 | 57.00 | 57.00 | 40.00 | 50.00 | 60.69 | 80.92 | 65.93 | 45.00 | 50.89 | 76.60 | 41.00 |
model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology |
---|---|---|---|---|---|---|---|---|---|---|---|---|
qwen1.5-0.5b-chat-hf | 41.75 | 38.89 | 49.15 | 26.60 | 48.48 | 50.41 | 24.69 | 42.00 | 32.35 | 31.75 | 31.00 | 32.35 |
qwen1.5-1.8b-chat-hf | 62.14 | 55.56 | 76.92 | 34.40 | 58.08 | 61.16 | 21.90 | 56.00 | 42.44 | 35.14 | 50.00 | 44.93 |
qwen1.5-4b-chat-hf | 73.79 | 58.50 | 82.05 | 47.16 | 74.24 | 71.90 | 32.29 | 69.00 | 58.40 | 40.74 | 58.00 | 53.76 |
qwen1.5-7b-chat-hf | 79.61 | 69.28 | 85.47 | 41.49 | 78.79 | 76.86 | 35.75 | 74.00 | 65.13 | 44.78 | 68.00 | 57.68 |
qwen1.5-14b-chat-hf | 82.52 | 70.26 | 87.18 | 51.77 | 85.86 | 82.64 | 53.74 | 81.00 | 76.05 | 47.98 | 76.00 | 67.48 |
qwen1.5-32b-chat-hf | 84.47 | 77.78 | 94.44 | 60.99 | 90.91 | 87.60 | 72.96 | 79.00 | 83.61 | 58.28 | 83.00 | 77.94 |
qwen1.5-72b-chat-hf | 89.32 | 85.95 | 93.59 | 61.35 | 90.91 | 86.78 | 75.98 | 83.00 | 84.87 | 60.30 | 83.00 | 81.05 |
qwen1.5-110b-chat-hf | 86.41 | 80.72 | 92.74 | 69.15 | 93.94 | 84.30 | 77.88 | 83.00 | 88.66 | 61.73 | 84.00 | 82.19 |
internlm2-chat-1.8b-hf | 72.82 | 50.65 | 69.23 | 35.46 | 56.06 | 56.20 | 27.82 | 60.00 | 49.16 | 33.83 | 54.00 | 43.79 |
internlm2-chat-1.8b-sft-hf | 71.84 | 52.61 | 68.80 | 34.75 | 55.56 | 53.72 | 27.04 | 58.00 | 48.74 | 34.09 | 54.00 | 44.61 |
internlm2-chat-7b-hf | 78.64 | 66.67 | 85.90 | 46.81 | 79.29 | 70.25 | 35.31 | 79.00 | 68.07 | 46.41 | 68.00 | 64.87 |
internlm2-chat-7b-sft-hf | 79.61 | 67.97 | 86.75 | 47.52 | 80.30 | 70.25 | 35.98 | 80.00 | 69.33 | 45.83 | 70.00 | 65.36 |
internlm2-chat-20b-hf | 80.58 | 75.16 | 90.17 | 52.13 | 83.84 | 80.99 | 39.33 | 80.00 | 70.59 | 49.67 | 75.00 | 70.26 |
internlm2-chat-20b-sft-hf | 80.58 | 76.14 | 91.03 | 53.19 | 84.34 | 80.99 | 36.31 | 77.00 | 71.85 | 49.61 | 77.00 | 70.59 |
llama-3-8b-instruct-hf | 82.52 | 79.41 | 91.45 | 52.48 | 80.30 | 79.34 | 46.26 | 75.00 | 76.89 | 49.61 | 85.00 | 72.22 |
llama-3-70b-instruct-hf | 89.32 | 87.58 | 93.16 | 66.67 | 92.42 | 90.08 | 76.20 | 83.00 | 89.50 | 64.67 | 92.00 | 87.09 |
llama-3-8b-instruct-lmdeploy | 87.38 | 79.41 | 90.17 | 52.48 | 79.80 | 78.51 | 44.25 | 75.00 | 74.37 | 48.76 | 84.00 | 69.61 |
llama-3-70b-instruct-lmdeploy | 90.29 | 88.56 | 93.59 | 65.96 | 92.93 | 89.26 | 75.75 | 83.00 | 89.92 | 63.95 | 92.00 | 86.60 |
mistral-7b-instruct-v0.1-hf | 69.90 | 59.80 | 85.47 | 38.65 | 69.70 | 65.29 | 37.54 | 69.00 | 51.26 | 37.81 | 65.00 | 52.45 |
mistral-7b-instruct-v0.2-hf | 74.76 | 66.99 | 88.89 | 43.97 | 75.25 | 76.86 | 42.01 | 73.00 | 62.61 | 42.24 | 67.00 | 62.25 |
mixtral-8x7b-instruct-v0.1-hf | 85.44 | 80.39 | 92.74 | 55.32 | 85.35 | 82.64 | 48.38 | 78.00 | 75.21 | 53.52 | 75.00 | 74.02 |
model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history |
---|---|---|---|---|---|---|---|---|---|---|---|---|
qwen1.5-0.5b-chat-hf | 42.59 | 24.56 | 39.87 | 39.76 | 29.06 | 38.18 | 35.64 | 38.93 | 27.78 | 29.80 | 34.00 | 48.48 |
qwen1.5-1.8b-chat-hf | 50.93 | 56.73 | 44.37 | 42.77 | 35.96 | 51.82 | 38.46 | 49.62 | 35.45 | 27.15 | 47.00 | 63.03 |
qwen1.5-4b-chat-hf | 71.30 | 65.50 | 58.20 | 50.00 | 44.33 | 57.27 | 54.10 | 61.83 | 43.65 | 41.06 | 60.00 | 72.12 |
qwen1.5-7b-chat-hf | 76.85 | 76.61 | 68.49 | 48.80 | 51.72 | 64.55 | 59.23 | 68.70 | 48.94 | 37.09 | 69.00 | 79.39 |
qwen1.5-14b-chat-hf | 75.93 | 80.70 | 69.13 | 51.20 | 55.67 | 64.55 | 67.69 | 74.05 | 57.14 | 47.02 | 74.00 | 82.42 |
qwen1.5-32b-chat-hf | 83.33 | 89.47 | 82.64 | 60.84 | 62.56 | 70.00 | 76.67 | 83.21 | 67.46 | 59.60 | 85.00 | 84.85 |
qwen1.5-72b-chat-hf | 86.11 | 89.47 | 80.71 | 59.04 | 68.47 | 72.73 | 80.00 | 87.79 | 67.72 | 52.32 | 79.00 | 85.45 |
qwen1.5-110b-chat-hf | 83.33 | 87.13 | 81.03 | 54.22 | 69.95 | 73.64 | 78.21 | 87.02 | 75.93 | 57.62 | 84.00 | 88.48 |
internlm2-chat-1.8b-hf | 52.78 | 60.82 | 49.20 | 42.77 | 42.36 | 50.00 | 47.18 | 53.44 | 32.54 | 31.79 | 39.00 | 60.00 |
internlm2-chat-1.8b-sft-hf | 53.70 | 61.40 | 50.16 | 42.17 | 40.89 | 50.00 | 47.69 | 51.15 | 32.54 | 29.14 | 40.00 | 59.39 |
internlm2-chat-7b-hf | 73.15 | 81.87 | 67.85 | 47.59 | 49.75 | 62.73 | 61.79 | 66.41 | 44.97 | 33.77 | 71.00 | 81.82 |
internlm2-chat-7b-sft-hf | 73.15 | 81.87 | 66.88 | 48.19 | 48.77 | 63.64 | 62.31 | 65.65 | 45.77 | 33.77 | 72.00 | 81.82 |
internlm2-chat-20b-hf | 80.56 | 81.87 | 72.99 | 55.42 | 54.19 | 70.00 | 67.95 | 71.76 | 48.15 | 39.74 | 75.00 | 80.00 |
internlm2-chat-20b-sft-hf | 81.48 | 79.53 | 72.99 | 54.82 | 54.19 | 69.09 | 67.95 | 71.76 | 48.94 | 41.06 | 75.00 | 80.00 |
llama-3-8b-instruct-hf | 76.85 | 79.53 | 72.35 | 53.61 | 54.19 | 70.91 | 66.41 | 80.92 | 49.47 | 46.36 | 71.00 | 75.15 |
llama-3-70b-instruct-hf | 87.04 | 88.30 | 82.64 | 56.02 | 67.49 | 74.55 | 86.41 | 88.55 | 74.34 | 65.56 | 91.00 | 86.06 |
llama-3-8b-instruct-lmdeploy | 77.78 | 79.53 | 70.74 | 52.41 | 53.20 | 68.18 | 65.38 | 79.39 | 50.79 | 37.75 | 72.00 | 76.97 |
llama-3-70b-instruct-lmdeploy | 87.96 | 90.64 | 83.28 | 54.82 | 69.46 | 73.64 | 86.92 | 87.02 | 74.87 | 66.23 | 92.00 | 85.45 |
mistral-7b-instruct-v0.1-hf | 64.81 | 70.18 | 63.67 | 41.57 | 38.92 | 68.18 | 49.49 | 61.83 | 33.33 | 32.45 | 55.00 | 66.67 |
mistral-7b-instruct-v0.2-hf | 70.37 | 80.12 | 64.95 | 50.60 | 50.74 | 68.18 | 54.36 | 71.76 | 40.74 | 35.10 | 60.00 | 73.33 |
mixtral-8x7b-instruct-v0.1-hf | 79.63 | 87.72 | 73.63 | 54.82 | 61.58 | 67.27 | 69.49 | 83.21 | 52.91 | 47.02 | 74.00 | 80.61 |
model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | logical_fallacies | high_school_world_history | professional_medicine |
---|---|---|---|---|---|---|---|---|---|---|---|---|
qwen1.5-0.5b-chat-hf | 45.00 | 41.04 | 30.09 | 39.21 | 24.60 | 35.23 | 33.95 | 25.31 | 36.13 | 31.29 | 49.37 | 38.24 |
qwen1.5-1.8b-chat-hf | 54.00 | 50.29 | 34.26 | 58.49 | 24.60 | 55.96 | 47.53 | 39.18 | 47.74 | 44.17 | 64.98 | 40.81 |
qwen1.5-4b-chat-hf | 61.00 | 64.16 | 46.30 | 71.01 | 39.68 | 72.02 | 54.01 | 65.31 | 63.55 | 63.80 | 71.31 | 51.10 |
qwen1.5-7b-chat-hf | 69.00 | 67.05 | 50.93 | 76.25 | 53.17 | 82.38 | 62.96 | 71.02 | 73.23 | 68.10 | 76.79 | 60.29 |
qwen1.5-14b-chat-hf | 74.00 | 75.14 | 58.33 | 82.89 | 51.59 | 88.60 | 69.44 | 77.96 | 84.19 | 73.62 | 82.70 | 71.32 |
qwen1.5-32b-chat-hf | 80.00 | 80.64 | 70.83 | 89.40 | 60.32 | 94.82 | 81.79 | 79.59 | 90.00 | 86.50 | 88.61 | 80.15 |
qwen1.5-72b-chat-hf | 80.00 | 82.95 | 68.98 | 91.83 | 57.14 | 98.45 | 86.73 | 78.78 | 89.03 | 87.12 | 91.14 | 83.82 |
qwen1.5-110b-chat-hf | 79.00 | 78.03 | 67.13 | 92.98 | 62.70 | 97.93 | 87.04 | 74.29 | 88.71 | 82.82 | 91.14 | 84.93 |
internlm2-chat-1.8b-hf | 48.00 | 49.13 | 44.91 | 57.60 | 26.98 | 61.14 | 50.62 | 51.02 | 52.58 | 57.67 | 67.51 | 37.50 |
internlm2-chat-1.8b-sft-hf | 50.00 | 49.13 | 44.91 | 57.73 | 28.57 | 61.66 | 49.69 | 51.02 | 49.68 | 57.67 | 66.67 | 38.60 |
internlm2-chat-7b-hf | 65.00 | 65.61 | 49.54 | 80.84 | 43.65 | 88.08 | 70.99 | 68.98 | 78.39 | 75.46 | 82.28 | 61.76 |
internlm2-chat-7b-sft-hf | 64.00 | 66.18 | 52.31 | 81.35 | 46.03 | 88.08 | 71.60 | 67.76 | 78.39 | 77.30 | 82.28 | 63.60 |
internlm2-chat-20b-hf | 74.00 | 73.70 | 59.72 | 81.86 | 46.83 | 89.12 | 74.69 | 75.92 | 80.65 | 79.14 | 82.70 | 70.59 |
internlm2-chat-20b-sft-hf | 76.00 | 73.12 | 60.19 | 81.99 | 43.65 | 88.60 | 74.38 | 73.88 | 80.32 | 80.37 | 82.70 | 70.59 |
llama-3-8b-instruct-hf | 72.00 | 73.12 | 55.09 | 84.55 | 50.00 | 90.67 | 77.16 | 77.55 | 81.61 | 77.91 | 84.81 | 75.00 |
llama-3-70b-instruct-hf | 85.00 | 85.26 | 75.00 | 92.72 | 69.05 | 97.41 | 90.43 | 82.04 | 91.61 | 87.12 | 94.09 | 89.71 |
llama-3-8b-instruct-lmdeploy | 72.00 | 72.83 | 52.78 | 82.12 | 51.59 | 89.64 | 76.85 | 76.73 | 80.97 | 76.69 | 84.39 | 74.63 |
llama-3-70b-instruct-lmdeploy | 85.00 | 84.39 | 73.61 | 92.72 | 67.46 | 97.93 | 89.81 | 81.63 | 90.65 | 87.12 | 93.25 | 89.34 |
mistral-7b-instruct-v0.1-hf | 55.00 | 57.51 | 39.81 | 74.07 | 39.68 | 75.65 | 57.72 | 62.04 | 59.35 | 69.33 | 67.93 | 55.88 |
mistral-7b-instruct-v0.2-hf | 61.00 | 66.76 | 46.76 | 78.67 | 36.51 | 84.97 | 68.83 | 70.20 | 68.39 | 69.33 | 73.00 | 58.09 |
mixtral-8x7b-instruct-v0.1-hf | 66.00 | 76.59 | 57.87 | 86.59 | 50.00 | 93.78 | 83.02 | 79.18 | 82.58 | 75.46 | 86.50 | 77.94 |
model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy |
---|---|---|---|---|---|---|---|---|
qwen1.5-0.5b-chat-hf | 24.44 | 35.26 | 42.16 | 47.26 | 29.82 | 40.55 | 32.29 | 47.00 |
qwen1.5-1.8b-chat-hf | 32.22 | 43.35 | 54.90 | 48.26 | 28.95 | 61.83 | 48.43 | 71.00 |
qwen1.5-4b-chat-hf | 36.30 | 51.45 | 71.08 | 76.62 | 34.21 | 72.29 | 58.30 | 72.00 |
qwen1.5-7b-chat-hf | 31.11 | 61.27 | 76.47 | 79.10 | 42.11 | 81.28 | 61.43 | 83.00 |
qwen1.5-14b-chat-hf | 41.48 | 68.79 | 80.88 | 82.59 | 48.25 | 84.40 | 72.20 | 88.00 |
qwen1.5-32b-chat-hf | 48.52 | 75.72 | 88.73 | 86.07 | 57.02 | 90.46 | 78.03 | 95.00 |
qwen1.5-72b-chat-hf | 51.48 | 73.99 | 90.69 | 87.06 | 59.65 | 92.11 | 79.37 | 94.00 |
qwen1.5-110b-chat-hf | 52.22 | 76.30 | 93.14 | 87.56 | 62.28 | 91.56 | 80.27 | 88.00 |
internlm2-chat-1.8b-hf | 31.48 | 46.82 | 56.37 | 65.17 | 28.07 | 65.87 | 50.22 | 69.00 |
internlm2-chat-1.8b-sft-hf | 30.74 | 47.40 | 54.41 | 64.18 | 29.82 | 66.24 | 48.43 | 69.00 |
internlm2-chat-7b-hf | 33.70 | 67.05 | 79.90 | 81.09 | 48.25 | 84.04 | 67.26 | 84.00 |
internlm2-chat-7b-sft-hf | 35.19 | 67.05 | 79.90 | 80.60 | 48.25 | 84.59 | 65.47 | 85.00 |
internlm2-chat-20b-hf | 36.30 | 66.47 | 88.73 | 85.07 | 51.75 | 85.69 | 70.85 | 87.00 |
internlm2-chat-20b-sft-hf | 35.93 | 65.90 | 87.75 | 85.57 | 52.63 | 84.77 | 70.85 | 87.00 |
llama-3-8b-instruct-hf | 36.67 | 68.79 | 83.82 | 86.57 | 61.40 | 84.95 | 70.85 | 85.00 |
llama-3-70b-instruct-hf | 57.41 | 78.61 | 89.71 | 91.54 | 74.56 | 94.50 | 82.96 | 94.00 |
llama-3-8b-instruct-lmdeploy | 38.52 | 68.79 | 82.84 | 85.57 | 54.39 | 85.50 | 69.96 | 83.00 |
llama-3-70b-instruct-lmdeploy | 54.81 | 79.77 | 90.20 | 92.04 | 71.05 | 94.50 | 82.96 | 93.00 |
mistral-7b-instruct-v0.1-hf | 28.89 | 50.29 | 67.16 | 76.12 | 39.47 | 72.29 | 62.33 | 77.00 |
mistral-7b-instruct-v0.2-hf | 30.74 | 53.18 | 73.04 | 77.11 | 42.11 | 79.82 | 63.68 | 82.00 |
mixtral-8x7b-instruct-v0.1-hf | 35.56 | 73.41 | 85.29 | 87.06 | 60.53 | 86.97 | 74.44 | 86.00 |