mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
94 KiB
94 KiB
# C-Eval

## Base Models
model | ceval-test | ceval-test-hard | ceval-test-stem | ceval-test-social-science | ceval-test-humanities | ceval-test-other | ceval-dev | ceval-dev-hard | ceval-dev-stem | ceval-dev-social-science | ceval-dev-humanities | ceval-dev-other |
---|---|---|---|---|---|---|---|---|---|---|---|---|
llama-7b-turbomind | 26.61 | 27.75 | 27.20 | 26.31 | 25.90 | 26.52 | 27.44 | 27.68 | 27.16 | 29.49 | 24.18 | 29.36 |
llama-13b-turbomind | 29.18 | 25.59 | 27.66 | 33.86 | 28.29 | 28.58 | 31.75 | 30.32 | 31.39 | 35.22 | 30.16 | 30.82 |
llama-30b-turbomind | 35.09 | 31.68 | 34.56 | 39.89 | 33.02 | 33.76 | 37.70 | 31.97 | 34.80 | 42.72 | 41.19 | 34.93 |
llama-65b-turbomind | 37.98 | 29.47 | 36.03 | 45.03 | 36.51 | 36.56 | 40.46 | 33.76 | 36.37 | 46.47 | 42.26 | 40.63 |
llama-2-7b-turbomind | 30.13 | 26.26 | 29.29 | 33.02 | 31.02 | 28.15 | 32.70 | 25.85 | 28.75 | 39.75 | 37.04 | 29.13 |
llama-2-13b-turbomind | 37.38 | 30.81 | 35.85 | 43.98 | 36.81 | 34.75 | 40.43 | 31.34 | 35.67 | 45.75 | 45.32 | 39.36 |
llama-2-70b-turbomind | 49.53 | 33.48 | 44.73 | 60.19 | 50.93 | 47.17 | 50.26 | 32.53 | 44.83 | 59.44 | 54.45 | 47.58 |
llama-3-8b-turbomind | 48.83 | 34.47 | 46.02 | 56.48 | 49.15 | 46.69 | 50.45 | 33.76 | 45.94 | 58.08 | 50.93 | 51.25 |
llama-3-70b-turbomind | 66.56 | 54.09 | 64.08 | 76.43 | 64.38 | 64.25 | 67.30 | 52.35 | 62.67 | 77.89 | 69.76 | 63.65 |
internlm2-1.8b-turbomind | 44.79 | 33.93 | 41.19 | 54.26 | 47.15 | 40.35 | 46.64 | 33.00 | 38.62 | 57.28 | 51.30 | 46.89 |
internlm2-7b-turbomind | 63.54 | 45.32 | 58.10 | 76.40 | 66.94 | 58.32 | 64.23 | 40.09 | 54.37 | 76.88 | 70.11 | 64.77 |
internlm2-20b-turbomind | 67.28 | 50.15 | 62.33 | 79.59 | 70.55 | 61.82 | 66.73 | 42.50 | 59.25 | 79.98 | 73.43 | 61.56 |
qwen-1.8b-turbomind | 54.24 | 38.60 | 50.02 | 68.18 | 55.33 | 48.13 | 53.78 | 33.38 | 46.36 | 68.40 | 57.57 | 50.17 |
qwen-7b-turbomind | 62.06 | 42.73 | 56.21 | 77.12 | 65.28 | 55.76 | 63.23 | 36.99 | 54.74 | 78.55 | 68.94 | 59.02 |
qwen-14b-turbomind | 70.33 | 53.61 | 65.25 | 83.19 | 72.85 | 65.37 | 72.05 | 55.03 | 66.07 | 85.59 | 74.91 | 67.78 |
qwen-72b-turbomind | 83.25 | 66.78 | 78.44 | 91.75 | 83.86 | 83.63 | 83.60 | 63.68 | 78.05 | 90.25 | 87.13 | 84.13 |
qwen1.5-0.5b-hf | 48.36 | 35.55 | 44.72 | 62.00 | 48.51 | 42.41 | 50.43 | 37.00 | 46.28 | 62.64 | 48.11 | 49.18 |
qwen1.5-1.8b-hf | 58.67 | 40.98 | 53.91 | 74.52 | 58.51 | 53.06 | 59.38 | 43.02 | 53.45 | 75.88 | 60.06 | 54.47 |
qwen1.5-4b-hf | 66.55 | 48.50 | 61.45 | 81.12 | 67.90 | 61.22 | 66.46 | 43.12 | 56.76 | 82.89 | 67.61 | 68.03 |
qwen1.5-7b-hf | 72.49 | 52.90 | 66.77 | 85.50 | 74.37 | 69.19 | 73.57 | 49.16 | 66.32 | 84.23 | 77.30 | 73.34 |
qwen1.5-14b-hf | 76.93 | 60.50 | 72.08 | 88.81 | 77.95 | 73.94 | 77.86 | 54.81 | 71.55 | 86.79 | 82.86 | 76.23 |
qwen1.5-32b-hf | 82.50 | 66.67 | 77.97 | 90.93 | 83.66 | 81.88 | 82.79 | 71.06 | 80.01 | 89.02 | 83.36 | 81.62 |
qwen1.5-72b-hf | 83.03 | 65.09 | 77.90 | 91.47 | 83.85 | 83.86 | 83.72 | 64.09 | 77.26 | 91.87 | 87.64 | 84.14 |
qwen1.5-moe-a2-7b-hf | 76.67 | 51.37 | 68.89 | 88.33 | 77.15 | 79.73 | 77.90 | 51.25 | 67.27 | 89.28 | 83.16 | 81.60 |
mistral-7b-v0.1-hf | 43.76 | 33.85 | 42.23 | 49.97 | 41.10 | 43.54 | 47.54 | 33.97 | 44.74 | 54.80 | 51.52 | 42.06 |
mistral-7b-v0.2-hf | 42.81 | 32.84 | 41.00 | 50.19 | 39.45 | 42.77 | 46.44 | 31.67 | 42.89 | 54.50 | 48.75 | 43.23 |
mixtral-8x7b-v0.1-hf | 51.15 | 41.46 | 50.93 | 59.19 | 46.69 | 48.72 | 55.31 | 42.04 | 52.78 | 62.00 | 56.44 | 52.71 |
mixtral-8x22b-v0.1-hf | 58.13 | 48.31 | 58.01 | 66.94 | 53.60 | 54.86 | 60.50 | 45.67 | 57.44 | 71.27 | 61.31 | 55.47 |
yi-6b-hf | 70.78 | 43.72 | 60.54 | 83.29 | 75.39 | 73.40 | 73.13 | 46.87 | 63.14 | 85.52 | 78.70 | 74.45 |
yi-34b-hf | 80.93 | 58.51 | 73.48 | 89.24 | 83.65 | 84.18 | 81.62 | 56.95 | 71.64 | 89.73 | 87.49 | 86.53 |
deepseek-7b-base-hf | 43.68 | 28.90 | 37.03 | 53.55 | 50.14 | 40.34 | 45.07 | 31.94 | 38.81 | 56.68 | 47.10 | 43.85 |
deepseek-67b-base-hf | 66.66 | 44.25 | 57.89 | 79.02 | 72.36 | 65.66 | 66.65 | 38.62 | 56.65 | 79.56 | 73.72 | 66.01 |
### Details on Test Split
model | computer_network | operating_system | computer_architecture | college_programming | college_physics | college_chemistry | advanced_mathematics | probability_and_statistics | discrete_mathematics | electrical_engineer | metrology_engineer | high_school_mathematics |
---|---|---|---|---|---|---|---|---|---|---|---|---|
llama-7b-turbomind | 29.82 | 25.70 | 26.94 | 30.99 | 32.95 | 23.66 | 26.01 | 22.89 | 27.45 | 30.09 | 26.48 | 33.13 |
llama-13b-turbomind | 33.33 | 37.99 | 31.09 | 29.82 | 22.16 | 27.23 | 31.79 | 27.11 | 24.84 | 28.02 | 33.33 | 30.72 |
llama-30b-turbomind | 40.94 | 48.60 | 40.41 | 34.21 | 32.95 | 35.71 | 36.42 | 32.53 | 27.45 | 31.56 | 36.07 | 30.12 |
llama-65b-turbomind | 41.52 | 50.84 | 44.04 | 40.94 | 27.84 | 29.46 | 28.32 | 30.72 | 29.41 | 35.10 | 42.47 | 30.12 |
llama-2-7b-turbomind | 33.92 | 37.99 | 34.72 | 30.99 | 26.70 | 21.88 | 31.79 | 25.30 | 24.18 | 31.56 | 39.73 | 30.12 |
llama-2-13b-turbomind | 40.94 | 46.93 | 37.82 | 36.26 | 30.68 | 29.46 | 35.84 | 30.72 | 24.84 | 32.74 | 42.92 | 34.94 |
llama-2-70b-turbomind | 55.56 | 58.66 | 53.89 | 47.95 | 34.09 | 33.48 | 32.95 | 27.11 | 34.64 | 37.76 | 57.99 | 29.52 |
llama-3-8b-turbomind | 55.56 | 58.66 | 55.96 | 51.17 | 27.27 | 35.27 | 36.42 | 31.33 | 34.64 | 40.12 | 50.68 | 30.72 |
llama-3-70b-turbomind | 69.59 | 75.98 | 69.95 | 71.64 | 49.43 | 58.04 | 52.02 | 53.01 | 58.82 | 45.72 | 68.95 | 40.96 |
internlm2-1.8b-turbomind | 40.35 | 40.78 | 39.38 | 32.16 | 34.66 | 34.38 | 31.21 | 31.33 | 35.95 | 35.10 | 51.60 | 27.71 |
internlm2-7b-turbomind | 56.14 | 57.54 | 62.69 | 49.42 | 43.75 | 48.21 | 34.68 | 32.53 | 33.33 | 41.00 | 60.27 | 40.36 |
internlm2-20b-turbomind | 62.57 | 65.36 | 66.84 | 58.77 | 43.18 | 51.79 | 39.31 | 40.36 | 35.95 | 42.77 | 66.67 | 47.59 |
qwen-1.8b-turbomind | 46.20 | 41.90 | 46.63 | 36.84 | 40.34 | 36.61 | 27.75 | 28.92 | 32.68 | 36.58 | 57.08 | 30.12 |
qwen-7b-turbomind | 52.63 | 54.75 | 54.40 | 46.20 | 35.80 | 44.20 | 36.99 | 27.71 | 26.80 | 38.35 | 57.99 | 33.13 |
qwen-14b-turbomind | 58.48 | 64.80 | 59.07 | 54.68 | 45.45 | 57.59 | 45.09 | 33.73 | 39.22 | 49.26 | 67.58 | 45.78 |
qwen-72b-turbomind | 83.04 | 73.74 | 79.27 | 76.61 | 75.00 | 64.29 | 49.13 | 44.58 | 46.41 | 66.37 | 85.84 | 68.07 |
qwen1.5-0.5b-hf | 37.43 | 40.22 | 41.45 | 35.09 | 40.91 | 34.82 | 30.06 | 27.11 | 26.80 | 29.79 | 54.34 | 31.93 |
qwen1.5-1.8b-hf | 47.37 | 50.84 | 47.67 | 38.30 | 43.18 | 35.27 | 29.48 | 30.12 | 33.99 | 39.53 | 58.90 | 28.92 |
qwen1.5-4b-hf | 62.57 | 56.98 | 56.99 | 46.78 | 48.30 | 45.98 | 40.46 | 34.34 | 31.37 | 46.61 | 62.10 | 43.37 |
qwen1.5-7b-hf | 66.08 | 62.57 | 66.32 | 55.56 | 54.55 | 47.77 | 41.62 | 31.93 | 35.95 | 49.85 | 74.43 | 49.40 |
qwen1.5-14b-hf | 71.35 | 66.48 | 68.39 | 64.91 | 57.95 | 65.62 | 41.62 | 40.36 | 47.71 | 56.64 | 79.45 | 56.63 |
qwen1.5-32b-hf | 84.80 | 73.18 | 74.61 | 70.18 | 71.59 | 61.61 | 49.13 | 45.78 | 49.02 | 61.95 | 87.67 | 72.89 |
qwen1.5-72b-hf | 85.38 | 73.74 | 78.24 | 78.36 | 72.73 | 63.39 | 43.35 | 40.96 | 49.02 | 65.78 | 85.84 | 66.27 |
qwen1.5-moe-a2-7b-hf | 77.78 | 73.74 | 68.91 | 64.91 | 66.48 | 49.11 | 33.53 | 36.75 | 35.95 | 61.06 | 91.32 | 40.96 |
mistral-7b-v0.1-hf | 55.56 | 55.31 | 56.99 | 48.25 | 39.77 | 39.29 | 33.53 | 25.90 | 31.37 | 35.99 | 45.21 | 27.11 |
mistral-7b-v0.2-hf | 56.14 | 53.63 | 55.44 | 47.66 | 36.36 | 34.38 | 32.37 | 25.30 | 33.33 | 31.86 | 45.21 | 29.52 |
mixtral-8x7b-v0.1-hf | 62.57 | 64.80 | 60.10 | 60.53 | 38.64 | 42.41 | 40.46 | 37.35 | 45.75 | 35.99 | 60.27 | 34.94 |
mixtral-8x22b-v0.1-hf | 65.50 | 74.86 | 63.73 | 65.79 | 46.59 | 52.68 | 52.02 | 45.78 | 52.94 | 42.77 | 62.56 | 39.16 |
yi-6b-hf | 68.42 | 63.13 | 69.43 | 57.89 | 42.05 | 48.66 | 31.79 | 33.13 | 28.76 | 49.85 | 74.89 | 37.35 |
yi-34b-hf | 83.63 | 80.45 | 74.09 | 68.42 | 62.50 | 60.27 | 45.09 | 38.55 | 50.33 | 65.19 | 88.58 | 49.40 |
deepseek-7b-base-hf | 44.44 | 44.13 | 44.56 | 36.26 | 30.68 | 29.02 | 32.37 | 24.70 | 26.14 | 35.99 | 48.86 | 28.31 |
deepseek-67b-base-hf | 63.16 | 70.39 | 65.80 | 59.36 | 42.61 | 45.54 | 35.84 | 38.55 | 42.48 | 44.54 | 68.95 | 33.73 |

model | high_school_physics | high_school_chemistry | high_school_biology | middle_school_mathematics | middle_school_biology | middle_school_physics | middle_school_chemistry | veterinary_medicine | college_economics | business_administration | marxism | mao_zedong_thought |
---|---|---|---|---|---|---|---|---|---|---|---|---|
llama-7b-turbomind | 29.14 | 26.74 | 24.57 | 29.94 | 22.92 | 23.60 | 20.00 | 30.95 | 29.98 | 24.58 | 25.70 | 25.11 |
llama-13b-turbomind | 22.29 | 18.60 | 28.00 | 26.55 | 26.56 | 25.28 | 19.46 | 29.05 | 28.77 | 28.57 | 39.66 | 43.38 |
llama-30b-turbomind | 25.14 | 33.14 | 36.00 | 31.07 | 39.06 | 28.09 | 33.51 | 38.10 | 35.21 | 35.88 | 48.04 | 33.33 |
llama-65b-turbomind | 33.71 | 26.16 | 38.29 | 33.90 | 44.27 | 36.52 | 38.92 | 38.10 | 37.42 | 42.19 | 59.22 | 48.40 |
llama-2-7b-turbomind | 26.86 | 23.26 | 26.86 | 28.81 | 28.12 | 29.78 | 22.70 | 30.48 | 31.79 | 30.56 | 33.52 | 36.07 |
llama-2-13b-turbomind | 28.00 | 31.98 | 36.57 | 36.72 | 38.54 | 36.52 | 37.84 | 46.67 | 37.02 | 36.54 | 57.54 | 41.10 |
llama-2-70b-turbomind | 40.00 | 36.05 | 48.00 | 36.72 | 66.67 | 55.06 | 55.68 | 52.86 | 51.91 | 48.50 | 68.16 | 60.73 |
llama-3-8b-turbomind | 41.71 | 38.37 | 50.86 | 36.16 | 61.98 | 63.48 | 63.78 | 56.19 | 41.65 | 49.17 | 69.27 | 54.34 |
llama-3-70b-turbomind | 63.43 | 56.98 | 69.14 | 59.32 | 84.90 | 75.28 | 78.92 | 79.52 | 68.81 | 59.80 | 86.59 | 79.91 |
internlm2-1.8b-turbomind | 30.29 | 45.93 | 46.29 | 33.33 | 63.02 | 60.11 | 62.70 | 47.62 | 35.61 | 37.87 | 69.27 | 61.64 |
internlm2-7b-turbomind | 64.57 | 65.12 | 76.00 | 54.80 | 91.15 | 85.96 | 90.27 | 74.29 | 57.34 | 50.50 | 86.59 | 83.56 |
internlm2-20b-turbomind | 68.57 | 74.42 | 78.86 | 58.76 | 91.67 | 90.45 | 90.27 | 72.38 | 57.95 | 55.81 | 88.83 | 88.58 |
qwen-1.8b-turbomind | 55.43 | 56.98 | 61.14 | 54.80 | 85.42 | 84.83 | 85.41 | 54.76 | 43.06 | 44.19 | 83.80 | 79.91 |
qwen-7b-turbomind | 68.00 | 69.19 | 82.86 | 57.63 | 93.75 | 87.64 | 92.43 | 63.81 | 47.28 | 57.48 | 86.59 | 82.65 |
qwen-14b-turbomind | 78.86 | 83.14 | 92.57 | 67.23 | 96.88 | 95.51 | 96.76 | 73.33 | 56.94 | 64.45 | 91.62 | 86.76 |
qwen-72b-turbomind | 93.14 | 93.60 | 95.43 | 88.70 | 98.44 | 97.75 | 99.46 | 90.00 | 75.45 | 80.73 | 96.09 | 99.54 |
qwen1.5-0.5b-hf | 48.57 | 44.19 | 60.00 | 40.68 | 73.44 | 69.66 | 78.92 | 49.05 | 34.41 | 40.20 | 79.89 | 74.43 |
qwen1.5-1.8b-hf | 58.86 | 68.02 | 76.00 | 59.32 | 91.15 | 90.45 | 87.03 | 63.81 | 44.87 | 48.50 | 86.03 | 90.41 |
qwen1.5-4b-hf | 66.86 | 77.33 | 82.86 | 68.93 | 95.31 | 92.70 | 97.30 | 71.90 | 51.31 | 61.13 | 91.62 | 94.52 |
qwen1.5-7b-hf | 79.43 | 82.56 | 91.43 | 77.40 | 96.88 | 95.51 | 96.22 | 80.00 | 62.37 | 69.77 | 93.30 | 97.26 |
qwen1.5-14b-hf | 86.29 | 87.79 | 93.14 | 83.05 | 97.92 | 95.51 | 97.84 | 82.86 | 63.78 | 77.08 | 95.53 | 96.35 |
qwen1.5-32b-hf | 88.00 | 95.35 | 94.86 | 91.53 | 97.92 | 99.44 | 100.00 | 90.00 | 73.44 | 78.74 | 94.97 | 98.63 |
qwen1.5-72b-hf | 91.43 | 93.60 | 95.43 | 88.70 | 97.92 | 98.31 | 99.46 | 90.00 | 74.25 | 80.40 | 94.41 | 98.63 |
qwen1.5-moe-a2-7b-hf | 70.86 | 77.33 | 82.86 | 68.36 | 97.92 | 93.26 | 97.30 | 89.52 | 70.22 | 74.75 | 96.09 | 98.17 |
mistral-7b-v0.1-hf | 33.14 | 40.70 | 40.57 | 40.11 | 47.92 | 49.44 | 50.81 | 47.62 | 44.87 | 37.87 | 58.10 | 48.40 |
mistral-7b-v0.2-hf | 34.86 | 36.63 | 45.71 | 36.72 | 46.35 | 46.07 | 48.65 | 43.81 | 43.46 | 39.53 | 57.54 | 48.86 |
mixtral-8x7b-v0.1-hf | 49.71 | 42.44 | 53.71 | 47.46 | 62.50 | 61.24 | 60.00 | 57.62 | 52.52 | 44.52 | 68.72 | 57.99 |
mixtral-8x22b-v0.1-hf | 54.29 | 43.02 | 58.29 | 55.93 | 76.04 | 66.29 | 75.68 | 66.19 | 60.97 | 51.83 | 74.30 | 70.78 |
yi-6b-hf | 58.86 | 69.19 | 78.29 | 43.50 | 92.19 | 89.33 | 90.27 | 83.81 | 59.56 | 70.10 | 93.85 | 97.72 |
yi-34b-hf | 80.00 | 81.98 | 93.14 | 65.54 | 97.40 | 95.51 | 96.76 | 92.86 | 74.04 | 76.08 | 94.97 | 97.26 |
deepseek-7b-base-hf | 29.14 | 30.81 | 33.14 | 24.29 | 53.12 | 45.51 | 48.65 | 50.48 | 38.23 | 44.19 | 62.01 | 65.30 |
deepseek-67b-base-hf | 60.00 | 55.23 | 64.00 | 46.33 | 84.90 | 79.78 | 83.24 | 73.33 | 57.75 | 63.79 | 89.94 | 88.58 |

model | education_science | teacher_qualification | high_school_politics | high_school_geography | middle_school_politics | middle_school_geography | modern_chinese_history | ideological_and_moral_cultivation | logic | law | chinese_language_and_literature | art_studies |
---|---|---|---|---|---|---|---|---|---|---|---|---|
llama-7b-turbomind | 22.96 | 31.58 | 25.57 | 29.78 | 22.80 | 25.00 | 21.70 | 21.51 | 25.00 | 26.24 | 22.49 | 25.84 |
llama-13b-turbomind | 29.26 | 30.83 | 33.52 | 36.52 | 34.72 | 33.33 | 24.06 | 40.12 | 26.47 | 33.48 | 30.14 | 29.87 |
llama-30b-turbomind | 37.41 | 46.37 | 32.95 | 38.20 | 50.78 | 40.74 | 28.77 | 45.93 | 33.33 | 32.13 | 39.23 | 22.82 |
llama-65b-turbomind | 39.63 | 51.13 | 31.82 | 39.89 | 58.03 | 42.59 | 34.91 | 55.23 | 39.71 | 30.32 | 37.80 | 32.89 |
llama-2-7b-turbomind | 27.78 | 34.34 | 31.82 | 34.83 | 35.23 | 34.26 | 28.77 | 38.95 | 32.35 | 33.94 | 27.27 | 30.87 |
llama-2-13b-turbomind | 41.48 | 47.37 | 37.50 | 37.64 | 50.78 | 52.78 | 43.40 | 48.84 | 32.35 | 38.46 | 36.36 | 30.20 |
llama-2-70b-turbomind | 57.78 | 69.17 | 50.57 | 58.43 | 69.95 | 66.67 | 50.94 | 72.09 | 50.98 | 42.53 | 44.98 | 52.01 |
llama-3-8b-turbomind | 56.30 | 65.41 | 47.16 | 56.18 | 64.25 | 61.11 | 55.66 | 67.44 | 41.67 | 40.27 | 45.45 | 50.34 |
llama-3-70b-turbomind | 72.22 | 85.46 | 75.00 | 74.72 | 84.97 | 76.85 | 75.00 | 76.16 | 59.31 | 52.94 | 62.68 | 68.46 |
internlm2-1.8b-turbomind | 47.41 | 61.40 | 55.11 | 47.75 | 61.66 | 64.81 | 61.79 | 63.95 | 32.35 | 32.58 | 48.33 | 36.58 |
internlm2-7b-turbomind | 66.67 | 85.96 | 78.98 | 74.72 | 91.71 | 87.96 | 80.66 | 80.23 | 42.16 | 50.23 | 64.11 | 70.13 |
internlm2-20b-turbomind | 69.26 | 89.22 | 83.52 | 80.34 | 90.67 | 91.67 | 83.02 | 85.47 | 49.02 | 54.30 | 72.25 | 73.15 |
qwen-1.8b-turbomind | 51.11 | 70.68 | 71.02 | 62.36 | 88.60 | 87.04 | 69.81 | 73.26 | 29.90 | 46.15 | 50.24 | 47.32 |
qwen-7b-turbomind | 57.41 | 83.71 | 88.64 | 79.78 | 93.26 | 94.44 | 75.47 | 79.07 | 42.16 | 47.96 | 59.33 | 65.10 |
qwen-14b-turbomind | 72.96 | 89.97 | 93.75 | 83.71 | 96.37 | 95.37 | 86.32 | 87.21 | 50.00 | 60.63 | 66.99 | 72.48 |
qwen-72b-turbomind | 85.56 | 96.24 | 95.45 | 93.26 | 97.93 | 97.22 | 92.45 | 91.86 | 67.65 | 76.92 | 75.12 | 83.89 |
qwen1.5-0.5b-hf | 43.33 | 63.16 | 65.91 | 56.18 | 82.90 | 79.63 | 68.87 | 70.35 | 28.43 | 37.56 | 39.23 | 32.21 |
qwen1.5-1.8b-hf | 57.41 | 76.44 | 81.25 | 75.84 | 92.75 | 91.67 | 79.72 | 81.98 | 34.31 | 47.96 | 47.85 | 43.62 |
qwen1.5-4b-hf | 65.93 | 87.47 | 86.93 | 82.58 | 94.30 | 95.37 | 84.91 | 84.30 | 40.20 | 62.90 | 58.85 | 58.72 |
qwen1.5-7b-hf | 69.26 | 91.98 | 90.91 | 89.89 | 95.85 | 94.44 | 89.15 | 87.21 | 48.04 | 67.87 | 63.16 | 68.12 |
qwen1.5-14b-hf | 78.89 | 94.99 | 94.89 | 91.57 | 96.89 | 98.15 | 91.04 | 88.37 | 57.84 | 69.68 | 66.99 | 73.83 |
qwen1.5-32b-hf | 83.70 | 95.99 | 93.75 | 94.38 | 98.45 | 97.22 | 90.57 | 91.28 | 70.10 | 76.92 | 76.56 | 80.87 |
qwen1.5-72b-hf | 84.44 | 96.49 | 96.59 | 93.82 | 98.45 | 97.22 | 92.92 | 91.28 | 66.67 | 76.92 | 74.16 | 85.23 |
qwen1.5-moe-a2-7b-hf | 80.74 | 95.49 | 89.20 | 89.33 | 94.82 | 94.44 | 92.45 | 91.28 | 52.45 | 75.57 | 67.94 | 79.87 |
mistral-7b-v0.1-hf | 45.19 | 59.15 | 43.75 | 49.44 | 56.48 | 56.48 | 45.28 | 58.14 | 37.75 | 38.91 | 40.67 | 34.56 |
mistral-7b-v0.2-hf | 45.93 | 58.65 | 38.07 | 48.31 | 63.21 | 58.33 | 41.98 | 54.07 | 35.78 | 40.27 | 38.28 | 32.21 |
mixtral-8x7b-v0.1-hf | 57.04 | 67.92 | 53.41 | 55.06 | 69.95 | 64.81 | 47.64 | 70.93 | 42.16 | 38.01 | 46.41 | 36.58 |
mixtral-8x22b-v0.1-hf | 60.37 | 72.68 | 64.77 | 65.17 | 77.20 | 71.30 | 57.08 | 75.00 | 49.51 | 43.44 | 52.63 | 49.33 |
yi-6b-hf | 79.26 | 92.48 | 77.27 | 76.40 | 92.75 | 93.52 | 89.15 | 90.12 | 60.78 | 74.66 | 61.24 | 74.16 |
yi-34b-hf | 84.81 | 96.24 | 88.07 | 88.20 | 96.37 | 96.30 | 91.98 | 91.28 | 75.00 | 78.73 | 80.38 | 82.89 |
deepseek-7b-base-hf | 52.22 | 70.18 | 47.16 | 51.12 | 60.62 | 44.44 | 58.49 | 66.86 | 31.86 | 37.56 | 53.11 | 61.07 |
deepseek-67b-base-hf | 76.67 | 89.22 | 77.27 | 78.65 | 89.64 | 78.70 | 85.85 | 84.30 | 50.00 | 64.25 | 69.38 | 84.23 |

model | professional_tour_guide | legal_professional | high_school_chinese | high_school_history | middle_school_history | civil_servant | sports_science | plant_protection | basic_medicine | clinical_medicine | urban_and_rural_planner | accountant |
---|---|---|---|---|---|---|---|---|---|---|---|---|
llama-7b-turbomind | 29.70 | 23.72 | 27.53 | 30.22 | 30.92 | 27.04 | 22.78 | 28.64 | 28.00 | 25.00 | 26.32 | 29.80 |
llama-13b-turbomind | 25.94 | 20.93 | 25.84 | 29.67 | 24.64 | 29.60 | 26.67 | 29.15 | 33.71 | 25.50 | 28.47 | 28.44 |
llama-30b-turbomind | 29.32 | 27.91 | 30.34 | 36.26 | 37.20 | 36.13 | 36.11 | 38.69 | 34.29 | 29.50 | 38.52 | 29.35 |
llama-65b-turbomind | 28.95 | 30.70 | 30.90 | 44.51 | 35.75 | 36.60 | 45.56 | 39.20 | 37.71 | 30.00 | 39.47 | 37.02 |
llama-2-7b-turbomind | 29.70 | 30.23 | 24.72 | 29.67 | 34.78 | 30.07 | 31.11 | 31.16 | 30.29 | 25.50 | 31.34 | 27.31 |
llama-2-13b-turbomind | 30.83 | 32.56 | 24.16 | 42.31 | 45.41 | 32.87 | 36.67 | 45.23 | 38.29 | 33.50 | 35.17 | 34.31 |
llama-2-70b-turbomind | 53.76 | 38.14 | 30.34 | 58.79 | 65.70 | 43.82 | 51.11 | 58.29 | 49.71 | 42.00 | 49.76 | 46.28 |
llama-3-8b-turbomind | 52.63 | 42.33 | 27.53 | 51.65 | 65.70 | 44.52 | 54.44 | 51.26 | 46.86 | 43.00 | 46.41 | 45.15 |
llama-3-70b-turbomind | 72.93 | 52.56 | 32.58 | 71.98 | 83.57 | 56.88 | 69.44 | 78.89 | 76.00 | 67.50 | 57.89 | 59.14 |
internlm2-1.8b-turbomind | 51.50 | 38.14 | 25.84 | 56.04 | 71.50 | 47.32 | 35.00 | 43.72 | 42.29 | 39.00 | 41.15 | 36.57 |
internlm2-7b-turbomind | 72.56 | 53.49 | 52.25 | 79.67 | 90.82 | 62.00 | 62.78 | 64.32 | 66.86 | 59.50 | 55.74 | 53.50 |
internlm2-20b-turbomind | 74.06 | 54.42 | 56.18 | 81.87 | 92.27 | 61.77 | 68.33 | 69.85 | 68.00 | 63.50 | 60.77 | 58.92 |
qwen-1.8b-turbomind | 54.14 | 43.72 | 39.89 | 69.23 | 85.02 | 49.88 | 45.56 | 48.74 | 48.57 | 51.50 | 46.89 | 45.82 |
qwen-7b-turbomind | 71.05 | 48.37 | 53.93 | 81.87 | 93.72 | 59.67 | 54.44 | 62.31 | 58.29 | 57.50 | 50.24 | 56.66 |
qwen-14b-turbomind | 79.70 | 53.02 | 63.48 | 87.36 | 94.20 | 71.33 | 63.33 | 71.36 | 73.14 | 68.00 | 59.09 | 67.95 |
qwen-72b-turbomind | 90.23 | 77.21 | 79.21 | 91.76 | 96.14 | 77.86 | 86.11 | 85.43 | 91.43 | 90.50 | 76.08 | 86.68 |
qwen1.5-0.5b-hf | 44.36 | 36.74 | 39.33 | 58.24 | 78.26 | 43.36 | 40.00 | 45.23 | 41.71 | 42.50 | 43.54 | 43.12 |
qwen1.5-1.8b-hf | 59.40 | 47.91 | 37.08 | 72.53 | 91.30 | 53.61 | 53.33 | 51.26 | 49.71 | 58.00 | 51.20 | 56.21 |
qwen1.5-4b-hf | 65.04 | 58.60 | 55.62 | 83.52 | 94.20 | 62.00 | 63.89 | 65.33 | 65.71 | 64.00 | 55.26 | 61.40 |
qwen1.5-7b-hf | 78.57 | 66.51 | 66.85 | 87.91 | 94.69 | 68.07 | 65.00 | 64.82 | 77.14 | 77.50 | 60.77 | 74.49 |
qwen1.5-14b-hf | 83.08 | 72.09 | 70.22 | 90.11 | 94.20 | 69.46 | 73.89 | 70.35 | 82.29 | 83.00 | 65.31 | 78.33 |
qwen1.5-32b-hf | 87.59 | 78.14 | 79.78 | 92.86 | 95.65 | 78.32 | 80.56 | 79.90 | 90.29 | 89.00 | 77.27 | 86.68 |
qwen1.5-72b-hf | 91.35 | 76.74 | 79.21 | 91.76 | 96.14 | 79.25 | 85.56 | 86.93 | 92.00 | 90.00 | 75.84 | 86.91 |
qwen1.5-moe-a2-7b-hf | 88.35 | 75.81 | 51.12 | 79.12 | 94.69 | 67.37 | 80.56 | 73.37 | 87.43 | 84.00 | 78.23 | 82.39 |
mistral-7b-v0.1-hf | 40.23 | 39.07 | 24.16 | 41.21 | 52.17 | 41.49 | 45.00 | 52.26 | 45.14 | 42.00 | 42.58 | 44.02 |
mistral-7b-v0.2-hf | 36.84 | 34.88 | 23.03 | 43.96 | 52.66 | 40.79 | 50.00 | 50.75 | 45.14 | 40.50 | 42.58 | 40.86 |
mixtral-8x7b-v0.1-hf | 47.74 | 40.00 | 28.09 | 57.14 | 58.94 | 44.29 | 58.33 | 53.77 | 48.57 | 46.00 | 51.20 | 46.50 |
mixtral-8x22b-v0.1-hf | 59.02 | 41.86 | 29.78 | 60.99 | 71.01 | 50.82 | 57.78 | 67.34 | 62.29 | 52.00 | 53.35 | 55.98 |
yi-6b-hf | 85.34 | 67.91 | 53.93 | 80.22 | 91.79 | 65.97 | 72.22 | 72.36 | 82.29 | 84.50 | 69.86 | 71.56 |
yi-34b-hf | 94.36 | 76.74 | 65.73 | 87.91 | 95.17 | 79.25 | 85.56 | 90.95 | 90.86 | 92.00 | 76.79 | 82.39 |
deepseek-7b-base-hf | 65.79 | 29.30 | 32.58 | 47.80 | 67.15 | 37.76 | 44.44 | 52.26 | 43.43 | 36.50 | 41.15 | 37.02 |
deepseek-67b-base-hf | 83.83 | 58.60 | 45.51 | 79.67 | 90.34 | 62.47 | 70.56 | 70.85 | 81.14 | 71.50 | 61.72 | 60.05 |

model | fire_engineer | environmental_impact_assessment_engineer | tax_accountant | physician |
---|---|---|---|---|
llama-7b-turbomind | 22.34 | 24.91 | 29.12 | 27.77 |
llama-13b-turbomind | 24.11 | 30.25 | 27.77 | 30.70 |
llama-30b-turbomind | 28.72 | 31.67 | 31.83 | 36.57 |
llama-65b-turbomind | 28.37 | 39.15 | 33.63 | 35.44 |
llama-2-7b-turbomind | 22.70 | 24.91 | 25.51 | 29.80 |
llama-2-13b-turbomind | 25.53 | 35.94 | 29.35 | 35.44 |
llama-2-70b-turbomind | 36.52 | 52.67 | 36.12 | 52.60 |
llama-3-8b-turbomind | 35.46 | 49.82 | 41.31 | 55.30 |
llama-3-70b-turbomind | 48.58 | 64.41 | 52.60 | 75.40 |
internlm2-1.8b-turbomind | 32.27 | 42.35 | 39.05 | 45.15 |
internlm2-7b-turbomind | 46.81 | 55.16 | 47.63 | 67.27 |
internlm2-20b-turbomind | 45.04 | 62.63 | 51.47 | 69.75 |
qwen-1.8b-turbomind | 41.84 | 47.69 | 45.60 | 57.34 |
qwen-7b-turbomind | 41.84 | 54.80 | 48.08 | 69.53 |
qwen-14b-turbomind | 45.74 | 64.77 | 56.43 | 77.88 |
qwen-72b-turbomind | 80.50 | 74.73 | 81.04 | 89.62 |
qwen1.5-0.5b-hf | 39.36 | 41.28 | 38.37 | 48.08 |
qwen1.5-1.8b-hf | 45.74 | 49.47 | 51.69 | 63.43 |
qwen1.5-4b-hf | 50.35 | 51.60 | 58.69 | 75.17 |
qwen1.5-7b-hf | 58.51 | 65.84 | 67.04 | 81.94 |
qwen1.5-14b-hf | 63.83 | 67.26 | 72.23 | 87.36 |
qwen1.5-32b-hf | 74.47 | 73.31 | 80.14 | 90.74 |
qwen1.5-72b-hf | 79.79 | 75.09 | 81.04 | 90.07 |
qwen1.5-moe-a2-7b-hf | 74.82 | 77.58 | 79.68 | 91.65 |
mistral-7b-v0.1-hf | 32.27 | 45.91 | 37.70 | 50.56 |
mistral-7b-v0.2-hf | 32.62 | 44.13 | 36.79 | 46.28 |
mixtral-8x7b-v0.1-hf | 35.11 | 53.02 | 46.73 | 52.37 |
mixtral-8x22b-v0.1-hf | 38.65 | 56.23 | 49.21 | 59.82 |
yi-6b-hf | 67.38 | 68.68 | 69.53 | 83.07 |
yi-34b-hf | 77.66 | 83.27 | 77.43 | 89.84 |
deepseek-7b-base-hf | 30.50 | 38.79 | 35.67 | 46.28 |
deepseek-67b-base-hf | 46.81 | 65.12 | 54.40 | 77.65 |
### Details on Dev Split
## Chat Models
model | ceval-test | ceval-test-hard | ceval-test-stem | ceval-test-social-science | ceval-test-humanities | ceval-test-other | ceval-dev | ceval-dev-hard | ceval-dev-stem | ceval-dev-social-science | ceval-dev-humanities | ceval-dev-other |
---|---|---|---|---|---|---|---|---|---|---|---|---|
qwen1.5-0.5b-chat-hf | 36.88 | 28.83 | 34.49 | 43.46 | 37.35 | 34.76 | 38.58 | 33.90 | 33.63 | 43.81 | 41.79 | 39.59 |
qwen1.5-1.8b-chat-hf | 55.17 | 38.21 | 50.63 | 70.26 | 56.04 | 48.82 | 55.93 | 37.60 | 50.31 | 67.59 | 60.90 | 50.59 |
qwen1.5-4b-chat-hf | 61.54 | 44.79 | 56.86 | 75.84 | 62.13 | 56.46 | 62.76 | 38.32 | 55.39 | 79.53 | 65.67 | 58.00 |
qwen1.5-7b-chat-hf | 68.71 | 51.77 | 64.27 | 81.23 | 68.22 | 65.88 | 71.10 | 50.13 | 65.42 | 83.99 | 73.77 | 67.02 |
qwen1.5-14b-chat-hf | 74.80 | 56.54 | 69.46 | 87.47 | 76.46 | 71.32 | 76.35 | 52.08 | 69.68 | 86.70 | 80.56 | 74.87 |
qwen1.5-32b-chat-hf | 80.47 | 63.17 | 75.66 | 89.58 | 81.98 | 79.43 | 81.27 | 63.51 | 76.64 | 89.39 | 82.97 | 80.59 |
qwen1.5-72b-chat-hf | 81.53 | 63.62 | 75.86 | 90.74 | 83.18 | 81.84 | 82.88 | 62.44 | 77.54 | 89.80 | 86.11 | 83.07 |
qwen1.5-110b-chat-hf | 87.33 | 67.27 | 80.70 | 93.58 | 89.67 | 91.35 | 87.59 | 73.64 | 81.94 | 91.47 | 92.12 | 89.80 |
internlm2-chat-1.8b-hf | 47.04 | 34.81 | 43.28 | 59.34 | 48.24 | 41.50 | 48.51 | 36.75 | 42.23 | 57.79 | 54.83 | 45.15 |
internlm2-chat-1.8b-sft-hf | 47.19 | 35.34 | 43.49 | 59.56 | 48.30 | 41.58 | 48.75 | 35.83 | 42.04 | 59.80 | 54.84 | 44.83 |
internlm2-chat-7b-hf | 58.75 | 39.61 | 52.38 | 71.46 | 61.57 | 55.96 | 61.04 | 36.56 | 51.81 | 74.01 | 69.13 | 57.92 |
internlm2-chat-7b-sft-hf | 58.96 | 40.09 | 52.40 | 71.49 | 62.20 | 56.26 | 61.02 | 37.29 | 52.60 | 74.01 | 68.27 | 57.27 |
internlm2-chat-20b-hf | 63.12 | 42.65 | 56.21 | 75.64 | 67.15 | 60.27 | 63.45 | 34.96 | 52.84 | 79.27 | 71.50 | 60.32 |
internlm2-chat-20b-sft-hf | 63.16 | 42.70 | 56.19 | 75.74 | 67.20 | 60.37 | 63.54 | 34.96 | 52.57 | 80.33 | 71.42 | 60.34 |
llama-3-8b-instruct-hf | 50.90 | 34.54 | 46.73 | 58.73 | 49.24 | 53.04 | 52.55 | 36.37 | 48.47 | 58.03 | 53.26 | 54.26 |
llama-3-70b-instruct-hf | 67.38 | 54.02 | 65.16 | 76.83 | 62.29 | 67.92 | 67.92 | 54.50 | 66.85 | 76.80 | 65.98 | 63.72 |
llama-3-8b-instruct-lmdeploy | 49.92 | 34.75 | 46.19 | 58.49 | 47.68 | 51.14 | 50.27 | 33.32 | 46.25 | 56.93 | 49.02 | 52.76 |
llama-3-70b-instruct-lmdeploy | 66.41 | 52.76 | 64.72 | 75.31 | 61.36 | 66.44 | 68.21 | 52.28 | 65.86 | 75.06 | 68.37 | 66.09 |
mistral-7b-instruct-v0.1-hf | 36.76 | 27.76 | 35.55 | 42.41 | 34.45 | 36.12 | 40.04 | 30.21 | 35.77 | 45.15 | 40.99 | 42.22 |
mistral-7b-instruct-v0.2-hf | 40.38 | 30.26 | 38.82 | 47.66 | 37.08 | 39.91 | 43.00 | 25.97 | 38.60 | 47.44 | 48.15 | 41.82 |
mixtral-8x7b-instruct-v0.1-hf | 49.61 | 37.78 | 47.86 | 58.56 | 46.40 | 47.85 | 51.68 | 37.41 | 49.14 | 59.79 | 52.97 | 47.65 |
### Details on Test Split
model | computer_network | operating_system | computer_architecture | college_programming | college_physics | college_chemistry | advanced_mathematics | probability_and_statistics | discrete_mathematics | electrical_engineer | metrology_engineer | high_school_mathematics |
---|---|---|---|---|---|---|---|---|---|---|---|---|
qwen1.5-0.5b-chat-hf | 35.67 | 36.87 | 33.68 | 33.92 | 35.23 | 28.12 | 27.17 | 26.51 | 24.84 | 28.91 | 40.18 | 25.90 |
qwen1.5-1.8b-chat-hf | 46.78 | 47.49 | 50.78 | 39.18 | 41.48 | 31.25 | 32.95 | 27.71 | 28.10 | 34.81 | 55.71 | 27.11 |
qwen1.5-4b-chat-hf | 54.39 | 54.75 | 54.92 | 44.74 | 46.02 | 43.30 | 39.31 | 31.33 | 28.10 | 45.13 | 58.90 | 43.98 |
qwen1.5-7b-chat-hf | 60.82 | 60.34 | 63.21 | 55.85 | 48.86 | 45.09 | 46.24 | 36.14 | 39.22 | 47.49 | 70.32 | 45.78 |
qwen1.5-14b-chat-hf | 69.59 | 62.57 | 64.77 | 64.91 | 55.68 | 57.14 | 49.13 | 32.53 | 43.14 | 55.16 | 76.71 | 46.99 |
qwen1.5-32b-chat-hf | 81.87 | 74.30 | 73.58 | 71.35 | 63.07 | 60.71 | 50.87 | 46.99 | 47.06 | 59.29 | 83.11 | 60.84 |
qwen1.5-72b-chat-hf | 77.78 | 75.42 | 76.17 | 73.39 | 63.64 | 62.50 | 45.09 | 45.78 | 48.37 | 59.00 | 81.74 | 60.84 |
qwen1.5-110b-chat-hf | 83.63 | 86.03 | 81.87 | 77.49 | 76.70 | 67.86 | 49.13 | 47.59 | 55.56 | 79.94 | 95.89 | 62.05 |
internlm2-chat-1.8b-hf | 42.11 | 43.58 | 44.56 | 35.38 | 32.95 | 34.82 | 32.95 | 28.92 | 32.68 | 34.22 | 53.42 | 31.93 |
internlm2-chat-1.8b-sft-hf | 42.11 | 44.13 | 43.01 | 35.09 | 34.09 | 36.16 | 32.95 | 27.11 | 33.33 | 35.10 | 51.14 | 33.13 |
internlm2-chat-7b-hf | 59.65 | 60.89 | 58.03 | 51.46 | 36.93 | 43.75 | 36.99 | 29.52 | 36.60 | 39.82 | 63.47 | 38.55 |
internlm2-chat-7b-sft-hf | 59.06 | 61.45 | 56.48 | 52.63 | 39.77 | 41.52 | 36.99 | 27.71 | 39.22 | 40.12 | 62.10 | 40.36 |
internlm2-chat-20b-hf | 61.99 | 70.39 | 63.73 | 54.97 | 33.52 | 47.77 | 43.93 | 40.96 | 44.44 | 44.25 | 61.64 | 34.34 |
internlm2-chat-20b-sft-hf | 61.40 | 70.39 | 63.21 | 54.97 | 32.95 | 47.77 | 42.20 | 42.17 | 43.14 | 44.25 | 61.64 | 32.53 |
llama-3-8b-instruct-hf | 57.31 | 58.10 | 57.51 | 51.17 | 28.41 | 35.27 | 39.31 | 32.53 | 35.29 | 38.05 | 55.25 | 27.11 |
llama-3-70b-instruct-hf | 71.93 | 74.86 | 70.98 | 67.54 | 50.57 | 57.14 | 52.60 | 53.01 | 56.21 | 47.79 | 68.95 | 43.98 |
llama-3-8b-instruct-lmdeploy | 55.56 | 57.54 | 55.44 | 48.25 | 30.11 | 33.04 | 35.84 | 31.33 | 33.33 | 38.94 | 53.88 | 31.93 |
llama-3-70b-instruct-lmdeploy | 70.76 | 77.09 | 69.95 | 67.84 | 49.43 | 54.02 | 50.87 | 54.22 | 56.21 | 47.20 | 69.86 | 42.17 |
mistral-7b-instruct-v0.1-hf | 49.12 | 47.49 | 43.52 | 39.18 | 32.39 | 28.57 | 29.48 | 24.10 | 28.10 | 37.46 | 44.29 | 23.49 |
mistral-7b-instruct-v0.2-hf | 47.95 | 53.07 | 52.85 | 42.69 | 28.41 | 26.79 | 40.46 | 30.12 | 29.41 | 33.33 | 42.92 | 24.10 |
mixtral-8x7b-instruct-v0.1-hf | 58.48 | 62.57 | 58.03 | 56.43 | 38.64 | 36.16 | 39.31 | 34.94 | 37.91 | 34.81 | 55.71 | 28.31 |

model | high_school_physics | high_school_chemistry | high_school_biology | middle_school_mathematics | middle_school_biology | middle_school_physics | middle_school_chemistry | veterinary_medicine | college_economics | business_administration | marxism | mao_zedong_thought |
---|---|---|---|---|---|---|---|---|---|---|---|---|
qwen1.5-0.5b-chat-hf | 30.86 | 31.98 | 44.00 | 27.68 | 47.40 | 40.45 | 55.14 | 35.24 | 32.80 | 30.56 | 58.66 | 57.53 |
qwen1.5-1.8b-chat-hf | 54.86 | 62.21 | 69.14 | 53.67 | 82.81 | 83.15 | 85.41 | 58.10 | 44.06 | 49.83 | 82.12 | 82.65 |
qwen1.5-4b-chat-hf | 58.86 | 67.44 | 80.00 | 55.93 | 89.58 | 88.20 | 88.11 | 64.29 | 47.08 | 57.48 | 86.59 | 84.93 |
qwen1.5-7b-chat-hf | 72.00 | 80.81 | 84.00 | 70.06 | 95.31 | 94.94 | 95.14 | 73.81 | 56.94 | 66.11 | 91.62 | 89.04 |
qwen1.5-14b-chat-hf | 84.00 | 83.72 | 90.29 | 80.23 | 97.92 | 94.94 | 98.38 | 81.43 | 63.18 | 74.75 | 93.30 | 96.80 |
qwen1.5-32b-chat-hf | 85.71 | 90.12 | 93.71 | 85.31 | 97.92 | 98.31 | 100.00 | 89.05 | 69.82 | 75.75 | 93.85 | 97.72 |
qwen1.5-72b-chat-hf | 88.57 | 94.19 | 94.86 | 85.31 | 97.92 | 97.75 | 98.38 | 90.48 | 71.63 | 79.73 | 93.85 | 97.72 |
qwen1.5-110b-chat-hf | 86.86 | 92.44 | 94.29 | 85.31 | 98.44 | 98.88 | 98.92 | 95.24 | 78.87 | 86.38 | 95.53 | 99.54 |
internlm2-chat-1.8b-hf | 35.43 | 48.84 | 52.00 | 35.03 | 70.31 | 67.98 | 67.03 | 41.43 | 37.83 | 36.88 | 70.95 | 60.73 |
internlm2-chat-1.8b-sft-hf | 37.71 | 48.26 | 53.14 | 34.46 | 71.35 | 67.98 | 67.57 | 41.90 | 38.63 | 37.54 | 72.63 | 60.27 |
internlm2-chat-7b-hf | 46.29 | 48.26 | 60.57 | 46.89 | 78.65 | 71.91 | 71.35 | 68.10 | 50.30 | 50.83 | 77.09 | 76.26 |
internlm2-chat-7b-sft-hf | 46.86 | 48.26 | 61.14 | 45.76 | 77.60 | 71.91 | 71.35 | 67.62 | 50.10 | 50.50 | 77.09 | 75.80 |
internlm2-chat-20b-hf | 49.71 | 46.51 | 63.43 | 55.37 | 80.73 | 74.72 | 79.46 | 72.38 | 55.73 | 59.80 | 85.47 | 76.26 |
internlm2-chat-20b-sft-hf | 53.71 | 47.09 | 64.00 | 55.37 | 80.73 | 73.60 | 78.92 | 73.81 | 55.53 | 60.13 | 85.47 | 75.80 |
llama-3-8b-instruct-hf | 38.86 | 39.53 | 50.29 | 40.11 | 65.10 | 60.11 | 63.78 | 61.43 | 47.89 | 45.85 | 69.27 | 56.16 |
llama-3-70b-instruct-hf | 63.43 | 55.23 | 69.71 | 68.36 | 85.42 | 80.90 | 78.38 | 86.19 | 69.01 | 65.12 | 83.24 | 82.65 |
llama-3-8b-instruct-lmdeploy | 41.71 | 40.70 | 52.00 | 41.24 | 61.46 | 58.43 | 65.41 | 57.62 | 45.27 | 46.18 | 69.27 | 55.71 |
llama-3-70b-instruct-lmdeploy | 61.71 | 53.49 | 70.86 | 64.97 | 88.02 | 83.71 | 77.30 | 84.76 | 68.21 | 60.80 | 80.45 | 79.91 |
mistral-7b-instruct-v0.1-hf | 27.43 | 28.49 | 36.00 | 28.25 | 40.10 | 42.70 | 43.78 | 37.14 | 32.80 | 37.87 | 41.90 | 48.86 |
mistral-7b-instruct-v0.2-hf | 33.14 | 29.65 | 44.00 | 31.07 | 47.92 | 44.94 | 49.19 | 44.29 | 37.02 | 40.86 | 53.63 | 48.40 |
mixtral-8x7b-instruct-v0.1-hf | 46.29 | 40.70 | 54.86 | 42.37 | 58.85 | 60.67 | 57.84 | 54.29 | 50.10 | 46.51 | 69.27 | 52.51 |

model | education_science | teacher_qualification | high_school_politics | high_school_geography | middle_school_politics | middle_school_geography | modern_chinese_history | ideological_and_moral_cultivation | logic | law | chinese_language_and_literature | art_studies |
---|---|---|---|---|---|---|---|---|---|---|---|---|
qwen1.5-0.5b-chat-hf | 33.33 | 46.12 | 37.50 | 37.08 | 57.51 | 43.52 | 42.45 | 51.74 | 32.84 | 31.22 | 37.32 | 24.50 |
qwen1.5-1.8b-chat-hf | 54.07 | 72.43 | 74.43 | 66.85 | 89.12 | 87.04 | 77.36 | 76.16 | 38.24 | 44.34 | 46.89 | 40.94 |
qwen1.5-4b-chat-hf | 60.00 | 84.71 | 82.39 | 69.66 | 94.82 | 90.74 | 79.72 | 78.49 | 41.67 | 57.47 | 54.07 | 56.38 |
qwen1.5-7b-chat-hf | 66.30 | 90.73 | 84.66 | 80.90 | 94.30 | 91.67 | 82.55 | 84.88 | 38.73 | 60.18 | 60.77 | 63.42 |
qwen1.5-14b-chat-hf | 74.81 | 93.73 | 90.91 | 92.13 | 96.89 | 98.15 | 89.62 | 88.37 | 54.41 | 70.14 | 69.86 | 69.13 |
qwen1.5-32b-chat-hf | 80.37 | 94.49 | 93.75 | 94.94 | 97.93 | 97.22 | 90.09 | 90.70 | 68.63 | 78.73 | 73.21 | 77.52 |
qwen1.5-72b-chat-hf | 84.07 | 96.74 | 95.45 | 94.94 | 97.93 | 95.37 | 92.92 | 91.28 | 63.73 | 80.09 | 73.68 | 83.89 |
qwen1.5-110b-chat-hf | 90.37 | 96.99 | 96.02 | 95.51 | 98.45 | 98.15 | 93.87 | 94.19 | 81.37 | 86.88 | 84.69 | 90.94 |
internlm2-chat-1.8b-hf | 48.15 | 65.41 | 69.32 | 54.49 | 79.27 | 70.37 | 60.85 | 64.53 | 32.35 | 32.58 | 45.45 | 40.60 |
internlm2-chat-1.8b-sft-hf | 48.15 | 64.91 | 69.89 | 53.93 | 79.27 | 70.37 | 61.32 | 63.95 | 33.82 | 29.86 | 45.45 | 39.93 |
internlm2-chat-7b-hf | 66.67 | 85.21 | 73.30 | 66.85 | 91.19 | 76.85 | 70.28 | 75.58 | 42.16 | 50.68 | 60.77 | 70.47 |
internlm2-chat-7b-sft-hf | 67.04 | 85.21 | 73.86 | 66.85 | 90.67 | 77.78 | 71.70 | 75.00 | 42.16 | 51.13 | 60.29 | 72.15 |
internlm2-chat-20b-hf | 74.07 | 85.96 | 75.57 | 77.53 | 89.12 | 76.85 | 72.64 | 83.72 | 51.96 | 56.11 | 68.42 | 73.49 |
internlm2-chat-20b-sft-hf | 73.70 | 85.46 | 76.70 | 78.09 | 89.64 | 76.85 | 72.17 | 84.88 | 50.00 | 56.56 | 66.99 | 75.17 |
llama-3-8b-instruct-hf | 55.93 | 67.42 | 55.68 | 55.06 | 72.02 | 62.04 | 54.25 | 66.86 | 44.12 | 40.72 | 47.37 | 44.63 |
llama-3-70b-instruct-hf | 71.11 | 84.21 | 74.43 | 73.03 | 84.97 | 80.56 | 69.81 | 78.49 | 57.35 | 50.68 | 57.89 | 64.43 |
llama-3-8b-instruct-lmdeploy | 54.81 | 67.17 | 58.52 | 53.37 | 72.54 | 62.04 | 57.08 | 63.95 | 44.12 | 37.56 | 46.89 | 42.62 |
llama-3-70b-instruct-lmdeploy | 70.37 | 82.96 | 72.16 | 71.91 | 83.94 | 82.41 | 69.34 | 77.91 | 55.39 | 50.68 | 56.46 | 64.09 |
mistral-7b-instruct-v0.1-hf | 39.63 | 46.62 | 33.52 | 41.01 | 56.48 | 45.37 | 36.32 | 43.60 | 29.90 | 31.67 | 39.71 | 31.88 |
mistral-7b-instruct-v0.2-hf | 46.30 | 54.39 | 39.20 | 43.26 | 61.66 | 51.85 | 35.38 | 55.23 | 28.92 | 35.29 | 37.80 | 29.19 |
mixtral-8x7b-instruct-v0.1-hf | 58.52 | 66.17 | 56.82 | 57.30 | 66.32 | 62.04 | 48.11 | 66.28 | 41.67 | 37.10 | 46.41 | 35.91 |
model | professional_tour_guide | legal_professional | high_school_chinese | high_school_history | middle_school_history | civil_servant | sports_science | plant_protection | basic_medicine | clinical_medicine | urban_and_rural_planner | accountant |
---|---|---|---|---|---|---|---|---|---|---|---|---|
qwen1.5-0.5b-chat-hf | 36.47 | 39.07 | 27.53 | 41.76 | 45.89 | 39.63 | 35.56 | 31.66 | 37.71 | 34.00 | 32.78 | 37.25 |
qwen1.5-1.8b-chat-hf | 56.02 | 45.58 | 39.33 | 67.03 | 84.54 | 49.42 | 48.89 | 51.76 | 47.43 | 50.50 | 45.69 | 52.14 |
qwen1.5-4b-chat-hf | 61.28 | 52.56 | 42.70 | 73.08 | 85.99 | 55.48 | 59.44 | 55.28 | 60.57 | 57.00 | 50.00 | 58.01 |
qwen1.5-7b-chat-hf | 73.31 | 56.28 | 58.99 | 82.97 | 88.41 | 64.57 | 66.67 | 63.82 | 77.14 | 75.50 | 57.42 | 69.07 |
qwen1.5-14b-chat-hf | 80.83 | 65.12 | 70.79 | 89.56 | 93.24 | 67.60 | 72.78 | 68.34 | 80.57 | 80.00 | 61.72 | 75.62 |
qwen1.5-32b-chat-hf | 87.59 | 72.56 | 76.40 | 90.66 | 95.65 | 74.36 | 80.00 | 80.40 | 86.86 | 84.00 | 74.88 | 85.33 |
qwen1.5-72b-chat-hf | 90.98 | 76.28 | 75.84 | 90.66 | 95.65 | 75.52 | 84.44 | 82.91 | 91.43 | 89.00 | 73.92 | 85.10 |
qwen1.5-110b-chat-hf | 95.11 | 88.37 | 82.58 | 91.76 | 96.62 | 87.65 | 91.67 | 90.95 | 93.71 | 95.00 | 87.08 | 91.87 |
internlm2-chat-1.8b-hf | 54.14 | 40.00 | 27.53 | 62.09 | 70.53 | 44.99 | 41.67 | 51.76 | 45.71 | 39.00 | 40.67 | 39.28 |
internlm2-chat-1.8b-sft-hf | 54.14 | 42.33 | 26.97 | 61.54 | 71.98 | 45.45 | 41.67 | 50.25 | 45.14 | 37.50 | 41.39 | 40.63 |
internlm2-chat-7b-hf | 70.68 | 44.19 | 34.83 | 73.63 | 84.06 | 51.98 | 57.22 | 68.34 | 66.86 | 57.50 | 54.55 | 50.11 |
internlm2-chat-7b-sft-hf | 71.80 | 44.65 | 37.64 | 73.63 | 84.06 | 51.98 | 57.78 | 67.84 | 65.71 | 60.50 | 54.55 | 50.11 |
internlm2-chat-20b-hf | 75.56 | 54.42 | 42.13 | 74.73 | 85.51 | 57.34 | 65.56 | 67.84 | 73.71 | 64.00 | 57.89 | 55.98 |
internlm2-chat-20b-sft-hf | 76.32 | 55.35 | 41.01 | 75.27 | 85.51 | 58.28 | 65.56 | 67.34 | 72.57 | 65.00 | 58.37 | 56.43 |
llama-3-8b-instruct-hf | 53.01 | 44.65 | 33.15 | 46.70 | 66.18 | 45.22 | 58.89 | 61.81 | 62.86 | 57.50 | 48.33 | 49.89 |
llama-3-70b-instruct-hf | 71.43 | 50.70 | 30.90 | 71.43 | 82.13 | 59.67 | 73.33 | 73.37 | 82.86 | 82.00 | 59.09 | 62.08 |
llama-3-8b-instruct-lmdeploy | 51.13 | 45.12 | 29.78 | 43.96 | 62.32 | 47.09 | 56.11 | 54.77 | 56.00 | 56.00 | 49.04 | 47.40 |
llama-3-70b-instruct-lmdeploy | 68.80 | 48.84 | 30.90 | 70.88 | 81.64 | 58.28 | 72.22 | 70.85 | 80.00 | 81.00 | 57.66 | 62.53 |
mistral-7b-instruct-v0.1-hf | 30.45 | 35.81 | 24.72 | 40.11 | 34.78 | 30.77 | 43.89 | 38.69 | 36.57 | 32.50 | 44.74 | 34.09 |
mistral-7b-instruct-v0.2-hf | 36.09 | 38.14 | 23.03 | 43.41 | 45.41 | 35.90 | 50.00 | 41.71 | 42.86 | 36.00 | 45.22 | 42.21 |
mixtral-8x7b-instruct-v0.1-hf | 47.37 | 44.65 | 30.34 | 51.65 | 60.87 | 42.19 | 53.89 | 58.29 | 52.00 | 47.00 | 48.56 | 44.02 |
model | fire_engineer | environmental_impact_assessment_engineer | tax_accountant | physician |
---|---|---|---|---|
qwen1.5-0.5b-chat-hf | 27.66 | 38.43 | 32.28 | 35.44 |
qwen1.5-1.8b-chat-hf | 38.65 | 46.62 | 46.73 | 59.14 |
qwen1.5-4b-chat-hf | 49.29 | 54.80 | 51.02 | 70.20 |
qwen1.5-7b-chat-hf | 53.90 | 62.28 | 57.79 | 76.52 |
qwen1.5-14b-chat-hf | 58.87 | 65.12 | 67.27 | 86.68 |
qwen1.5-32b-chat-hf | 74.11 | 70.82 | 74.94 | 88.04 |
qwen1.5-72b-chat-hf | 74.82 | 75.09 | 78.56 | 89.39 |
qwen1.5-110b-chat-hf | 88.30 | 88.97 | 94.13 | 95.49 |
internlm2-chat-1.8b-hf | 30.14 | 41.99 | 34.54 | 46.73 |
internlm2-chat-1.8b-sft-hf | 30.14 | 43.06 | 34.31 | 47.86 |
internlm2-chat-7b-hf | 42.20 | 52.31 | 47.63 | 66.82 |
internlm2-chat-7b-sft-hf | 43.26 | 52.67 | 47.86 | 66.59 |
internlm2-chat-20b-hf | 45.74 | 54.80 | 51.02 | 69.07 |
internlm2-chat-20b-sft-hf | 45.74 | 55.16 | 51.02 | 68.62 |
llama-3-8b-instruct-hf | 37.59 | 50.53 | 42.44 | 68.40 |
llama-3-70b-instruct-hf | 50.71 | 64.06 | 55.53 | 84.42 |
llama-3-8b-instruct-lmdeploy | 37.94 | 50.53 | 41.53 | 66.14 |
llama-3-70b-instruct-lmdeploy | 48.94 | 63.70 | 53.95 | 81.72 |
mistral-7b-instruct-v0.1-hf | 27.66 | 39.15 | 29.35 | 39.95 |
mistral-7b-instruct-v0.2-hf | 32.27 | 37.01 | 32.96 | 42.89 |
mixtral-8x7b-instruct-v0.1-hf | 36.88 | 48.75 | 41.76 | 53.05 |