[Sync] bump version (#1204)

Fengzhe Zhou 2024-05-28 23:09:59 +08:00 committed by GitHub
parent ba620c4afe
commit 2954913d9b
92 changed files with 3228 additions and 795 deletions

View File

@@ -35,6 +35,7 @@ repos:
exclude: |
(?x)^(
.*\.jsonl|
.*\.md.template|
configs/
)
- repo: https://github.com/pre-commit/pre-commit-hooks

View File

@@ -1,4 +0,0 @@
from mmengine.config import read_base
with read_base():
    from .charm_rea_gen_f8fca2 import charm_rea_datasets # noqa: F401, F403

View File

@@ -0,0 +1,50 @@
import os
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CharmDataset, charm_reason_postprocess, CharmReasonEvaluator
with read_base():
from .charm_reason_settings import charm_tasks, settings
settings = [s for s in settings if s[0] in ['ZH-CoT', 'EN-CoT']]
charm_reason_datasets = []
for _cot, _cot_prefix, dataset_path, fewshot_example_path, prompt_template in settings:
for _task in charm_tasks:
_fewshot_example_file = os.path.join(fewshot_example_path, f'{_task}_{_cot}.txt')
with open(_fewshot_example_file, 'r') as f:
_hint = f.read()
charm_reason_reader_cfg = dict(input_columns=['input'], output_column='target')
charm_reason_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[dict(role='HUMAN', prompt=prompt_template.format(_hint=_hint) + _cot_prefix)]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
charm_reason_eval_cfg = dict(
evaluator=dict(type=CharmReasonEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=charm_reason_postprocess),
dataset_postprocessor=dict(type=charm_reason_postprocess),
)
charm_reason_datasets.append(
dict(
type=CharmDataset,
path=dataset_path,
name=_task,
abbr='charm-reason-' + _task + '_' + _cot,
reader_cfg=charm_reason_reader_cfg,
infer_cfg=charm_reason_infer_cfg.copy(),
eval_cfg=charm_reason_eval_cfg.copy(),
)
)
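A note on the template formatting in the loop above: the CHARM templates write the few-shot hint with single braces ({_hint}) and the dataset field with double braces ({{input}}), so prompt_template.format(_hint=_hint) fills the hint now, collapses {{input}} to {input}, and leaves that placeholder for OpenCompass to substitute from the dataset's 'input' column at inference time. A minimal standalone sketch (the _hint text below is a hypothetical stand-in for the contents of a few-shot example file):

# Sketch only: brace escaping as used by the CHARM reasoning templates.
prompt_template = '请按照给定的例子回答问题。\n{_hint}\n\nQ{{input}}\nA'  # Other_template from charm_reason_settings
_cot_prefix = '让我们一步一步来思考。'  # ZH-CoT prefix from the settings tuples
_hint = '例1 ...\n例2 ...'  # hypothetical few-shot text read from f'{_task}_{_cot}.txt'

prompt = prompt_template.format(_hint=_hint) + _cot_prefix
assert '{input}' in prompt       # still a placeholder, filled later from the dataset
assert '{_hint}' not in prompt   # the few-shot hint is already inlined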

View File

@@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
    from .charm_reason_gen_f8fca2 import charm_reason_datasets # noqa: F401, F403

View File

@@ -0,0 +1,49 @@
import os
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CharmDataset, charm_reason_postprocess, CharmReasonEvaluator
with read_base():
from .charm_reason_settings import charm_tasks, settings
charm_reason_datasets = []
for _cot, _cot_prefix, dataset_path, fewshot_example_path, prompt_template in settings:
for _task in charm_tasks:
_fewshot_example_file = os.path.join(fewshot_example_path, f'{_task}_{_cot}.txt')
with open(_fewshot_example_file, 'r') as f:
_hint = f.read()
charm_reason_reader_cfg = dict(input_columns=['input'], output_column='target')
charm_reason_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[dict(role='HUMAN', prompt=prompt_template.format(_hint=_hint) + _cot_prefix)]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
charm_reason_eval_cfg = dict(
evaluator=dict(type=CharmReasonEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=charm_reason_postprocess),
dataset_postprocessor=dict(type=charm_reason_postprocess),
)
charm_reason_datasets.append(
dict(
type=CharmDataset,
path=dataset_path,
name=_task,
abbr='charm-reason-' + _task + '_' + _cot,
reader_cfg=charm_reason_reader_cfg,
infer_cfg=charm_reason_infer_cfg.copy(),
eval_cfg=charm_reason_eval_cfg.copy(),
)
)

View File

@@ -1,8 +1,4 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CharmDataset, charm_rea_postprocess, CharmReaEvaluator
charm_tasks = [
'Chinese_Anachronisms_Judgment',
@@ -21,16 +17,16 @@ charm_tasks = [
'Global_Time_Understanding',
]
XLT_template = 'Follow the given examples and answer the question.\n{_hint}\n\n I want you to act as an commonsense reasoning expert for Chinese. \n Request: {{input}}\n'
Translate_EN_template = 'Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: '
Other_template = '请按照给定的例子回答问题。\n{_hint}\n\nQ{{input}}\nA'
data_dir = 'data/CHARM'
dataset_path_ZH = f'{data_dir}/reasoning'
dataset_path_TransEn = f'{data_dir}/reasoning_Translate-EN'
fewshot_example_path_ZH = os.path.join(os.path.dirname(__file__), 'few-shot-examples')
fewshot_example_path_TransEn = os.path.join(os.path.dirname(__file__), 'few-shot-examples_Translate-EN')
XLT_template = 'Follow the given examples and answer the question.\n{_hint}\n\n I want you to act as an commonsense reasoning expert for Chinese. \n Request: {{input}}\n'
Translate_EN_template = 'Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: '
Other_template = '请按照给定的例子回答问题。\n{_hint}\n\nQ{{input}}\nA'
settings = [
('Direct', '', dataset_path_ZH, fewshot_example_path_ZH, Other_template),
('ZH-CoT', '让我们一步一步来思考。', dataset_path_ZH, fewshot_example_path_ZH, Other_template),
@@ -38,41 +34,3 @@ settings = [
('XLT', """You should retell the request in English.\nYou should do the answer step by step to choose the right answer.\nYou should step-by-step answer the request.\nYou should tell me the answer in this format 'So the answer is'.""", dataset_path_ZH, fewshot_example_path_ZH, XLT_template),
('Translate-EN', "Let's think step by step.", dataset_path_TransEn, fewshot_example_path_TransEn, Translate_EN_template),
]
charm_rea_datasets = []
for _cot, _cot_prefix, dataset_path, fewshot_example_path, prompt_template in settings:
for _task in charm_tasks:
_fewshot_example_file = os.path.join(fewshot_example_path, f'{_task}_{_cot}.txt')
with open(_fewshot_example_file, 'r') as f:
_hint = f.read()
charm_rea_reader_cfg = dict(input_columns=['input'], output_column='target')
charm_rea_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[dict(role='HUMAN', prompt=prompt_template.format(_hint=_hint) + _cot_prefix)]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
charm_rea_eval_cfg = dict(
evaluator=dict(type=CharmReaEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=charm_rea_postprocess),
dataset_postprocessor=dict(type=charm_rea_postprocess),
)
charm_rea_datasets.append(
dict(
type=CharmDataset,
path=dataset_path,
name=_task,
abbr='charm-rea-' + _task + '_' + _cot,
reader_cfg=charm_rea_reader_cfg,
infer_cfg=charm_rea_infer_cfg.copy(),
eval_cfg=charm_rea_eval_cfg.copy(),
)
)

View File

@@ -0,0 +1,186 @@
# GaokaoBench
## Base Models
| model | GaokaoBench |
|:------------------------:|--------------:|
| llama-7b-turbomind | 14.55 |
| llama-13b-turbomind | 16.20 |
| llama-30b-turbomind | 16.14 |
| llama-65b-turbomind | 13.31 |
| llama-2-7b-turbomind | 15.02 |
| llama-2-13b-turbomind | 14.86 |
| llama-2-70b-turbomind | 16.36 |
| llama-3-8b-turbomind | 20.88 |
| llama-3-70b-turbomind | 19.98 |
| internlm2-1.8b-turbomind | 23.78 |
| internlm2-7b-turbomind | 41.41 |
| internlm2-20b-turbomind | 58.99 |
| qwen-1.8b-turbomind | 22.11 |
| qwen-7b-turbomind | 35.32 |
| qwen-14b-turbomind | 54.07 |
| qwen-72b-turbomind | 77.56 |
| qwen1.5-0.5b-hf | 30.67 |
| qwen1.5-1.8b-hf | 35.66 |
| qwen1.5-4b-hf | 54.31 |
| qwen1.5-7b-hf | 65.99 |
| qwen1.5-14b-hf | 66.60 |
| qwen1.5-32b-hf | 79.01 |
| qwen1.5-72b-hf | 80.26 |
| qwen1.5-moe-a2-7b-hf | 52.79 |
| mistral-7b-v0.1-hf | 14.35 |
| mistral-7b-v0.2-hf | 11.10 |
| mixtral-8x7b-v0.1-hf | 8.40 |
| mixtral-8x22b-v0.1-hf | 16.23 |
| yi-6b-hf | 31.70 |
| yi-34b-hf | 30.51 |
| deepseek-7b-base-hf | 17.02 |
| deepseek-67b-base-hf | 10.14 |
### Details
| model | 2010-2022_Math_II_MCQs | 2010-2022_Math_I_MCQs | 2010-2022_History_MCQs | 2010-2022_Biology_MCQs | 2010-2022_Political_Science_MCQs | 2010-2022_Physics_MCQs | 2010-2022_Chemistry_MCQs |
|:------------------------:|-------------------------:|------------------------:|-------------------------:|-------------------------:|-----------------------------------:|-------------------------:|---------------------------:|
| llama-7b-turbomind | 14.22 | 13.55 | 12.54 | 18.67 | 19.06 | 2.34 | 17.74 |
| llama-13b-turbomind | 18.81 | 15.89 | 21.25 | 22.67 | 15.62 | 1.56 | 25.81 |
| llama-30b-turbomind | 20.64 | 19.16 | 27.18 | 16.67 | 16.56 | 2.34 | 12.10 |
| llama-65b-turbomind | 21.10 | 15.89 | 11.50 | 20.00 | 5.94 | 1.56 | 21.77 |
| llama-2-7b-turbomind | 16.97 | 16.36 | 20.91 | 22.00 | 18.75 | 2.34 | 11.29 |
| llama-2-13b-turbomind | 14.68 | 11.68 | 26.13 | 16.00 | 17.81 | 2.34 | 20.97 |
| llama-2-70b-turbomind | 18.81 | 12.15 | 26.13 | 16.00 | 20.31 | 4.69 | 16.13 |
| llama-3-8b-turbomind | 4.13 | 7.94 | 37.63 | 24.67 | 26.25 | 5.47 | 21.77 |
| llama-3-70b-turbomind | 4.59 | 3.12 | 20.83 | 10.94 | 18.00 | 6.25 | 15.62 |
| internlm2-1.8b-turbomind | 20.64 | 22.90 | 39.72 | 30.00 | 25.94 | 10.94 | 31.45 |
| internlm2-7b-turbomind | 33.94 | 35.51 | 38.33 | 59.33 | 61.56 | 2.34 | 11.29 |
| internlm2-20b-turbomind | 59.17 | 51.40 | 65.16 | 74.00 | 82.19 | 28.91 | 54.03 |
| qwen-1.8b-turbomind | 29.36 | 30.84 | 19.51 | 26.00 | 22.19 | 5.47 | 27.42 |
| qwen-7b-turbomind | 22.48 | 28.04 | 45.64 | 43.33 | 62.19 | 3.91 | 33.87 |
| qwen-14b-turbomind | 54.13 | 56.25 | 82.93 | 72.00 | 85.00 | 4.69 | 65.62 |
| qwen-72b-turbomind | 73.12 | 64.49 | 91.67 | 90.62 | 58.75 | 44.53 | 79.03 |
| qwen1.5-0.5b-hf | 26.61 | 32.71 | 32.40 | 34.67 | 53.44 | 10.94 | 28.23 |
| qwen1.5-1.8b-hf | 36.24 | 33.18 | 56.45 | 36.00 | 49.38 | 6.25 | 33.06 |
| qwen1.5-4b-hf | 45.41 | 37.85 | 68.29 | 62.00 | 87.81 | 5.47 | 47.58 |
| qwen1.5-7b-hf | 56.42 | 53.74 | 85.02 | 69.33 | 86.88 | 28.12 | 70.16 |
| qwen1.5-14b-hf | 69.27 | 63.08 | 54.01 | 79.33 | 76.56 | 40.62 | 79.84 |
| qwen1.5-32b-hf | 71.10 | 61.68 | 92.68 | 93.33 | 95.94 | 45.31 | 83.06 |
| qwen1.5-72b-hf | 71.15 | 68.22 | 94.44 | 96.67 | 95.00 | 38.28 | 75.00 |
| qwen1.5-moe-a2-7b-hf | 35.32 | 29.44 | 68.64 | 44.67 | 75.00 | 17.97 | 59.68 |
| mistral-7b-v0.1-hf | 13.76 | 12.15 | 9.76 | 8.00 | 5.94 | 0.00 | 17.74 |
| mistral-7b-v0.2-hf | 6.88 | 5.61 | 10.45 | 12.00 | 4.06 | 0.78 | 14.52 |
| mixtral-8x7b-v0.1-hf | 3.67 | 1.87 | 0.35 | 0.00 | 0.00 | 0.78 | 0.81 |
| mixtral-8x22b-v0.1-hf | 16.51 | 15.89 | 1.39 | 3.33 | 9.69 | 0.00 | 13.71 |
| yi-6b-hf | 6.25 | 3.12 | 40.74 | 43.75 | 35.94 | 8.59 | 31.25 |
| yi-34b-hf | 12.50 | 4.17 | 31.11 | 5.00 | 20.62 | 2.34 | 0.89 |
| deepseek-7b-base-hf | 14.22 | 13.08 | 25.78 | 20.67 | 20.31 | 5.47 | 18.55 |
| deepseek-67b-base-hf | 3.67 | 4.21 | 8.36 | 7.33 | 4.69 | 1.56 | 4.84 |

| model | 2010-2013_English_MCQs | 2010-2022_Chinese_Modern_Lit | 2010-2022_English_Fill_in_Blanks | 2012-2022_English_Cloze_Test | 2010-2022_Geography_MCQs | 2010-2022_English_Reading_Comp | 2010-2022_Chinese_Lang_and_Usage_MCQs |
|:------------------------:|-------------------------:|-------------------------------:|-----------------------------------:|-------------------------------:|---------------------------:|---------------------------------:|----------------------------------------:|
| llama-7b-turbomind | 19.05 | 0.00 | 15.00 | 16.15 | 22.11 | 10.43 | 15.00 |
| llama-13b-turbomind | 22.86 | 0.00 | 8.50 | 8.46 | 24.21 | 9.36 | 20.00 |
| llama-30b-turbomind | 28.57 | 0.00 | 6.33 | 13.85 | 23.16 | 12.98 | 12.50 |
| llama-65b-turbomind | 21.90 | 0.00 | 8.00 | 13.85 | 16.84 | 12.34 | 10.00 |
| llama-2-7b-turbomind | 20.95 | 0.00 | 6.17 | 12.31 | 22.11 | 11.28 | 11.25 |
| llama-2-13b-turbomind | 16.19 | 0.00 | 9.83 | 13.08 | 22.11 | 7.66 | 10.00 |
| llama-2-70b-turbomind | 31.43 | 0.00 | 4.17 | 13.08 | 25.26 | 20.43 | 7.50 |
| llama-3-8b-turbomind | 1.90 | 1.15 | 42.00 | 7.69 | 29.47 | 17.66 | 17.50 |
| llama-3-70b-turbomind | 18.75 | 3.45 | 53.67 | 76.15 | 18.60 | 36.76 | 8.75 |
| internlm2-1.8b-turbomind | 33.33 | 3.45 | 15.67 | 13.85 | 32.63 | 10.43 | 25.00 |
| internlm2-7b-turbomind | 61.90 | 20.69 | 57.33 | 20.77 | 61.05 | 40.21 | 47.50 |
| internlm2-20b-turbomind | 72.38 | 37.93 | 62.33 | 19.23 | 74.74 | 38.51 | 48.75 |
| qwen-1.8b-turbomind | 47.62 | 9.20 | 13.50 | 12.31 | 25.26 | 16.38 | 21.25 |
| qwen-7b-turbomind | 42.86 | 12.64 | 35.83 | 26.15 | 51.58 | 17.87 | 30.00 |
| qwen-14b-turbomind | 89.58 | 3.45 | 5.00 | 23.85 | 93.02 | 21.10 | 40.62 |
| qwen-72b-turbomind | 71.43 | 81.25 | 88.17 | 96.25 | 95.79 | 79.57 | 90.00 |
| qwen1.5-0.5b-hf | 40.95 | 22.99 | 21.67 | 21.54 | 38.95 | 17.02 | 22.50 |
| qwen1.5-1.8b-hf | 85.71 | 29.89 | 22.17 | 30.00 | 34.74 | 20.43 | 27.50 |
| qwen1.5-4b-hf | 88.57 | 35.63 | 41.00 | 67.69 | 64.21 | 41.28 | 68.75 |
| qwen1.5-7b-hf | 93.33 | 14.94 | 59.33 | 70.00 | 61.05 | 67.87 | 61.25 |
| qwen1.5-14b-hf | 94.29 | 16.09 | 59.67 | 76.92 | 90.53 | 59.57 | 77.50 |
| qwen1.5-32b-hf | 94.29 | 43.68 | 82.83 | 38.46 | 97.89 | 75.96 | 67.50 |
| qwen1.5-72b-hf | 99.05 | 28.74 | 85.62 | 77.69 | 94.74 | 72.77 | 87.50 |
| qwen1.5-moe-a2-7b-hf | 65.71 | 36.78 | 51.67 | 75.38 | 72.63 | 61.28 | 33.75 |
| mistral-7b-v0.1-hf | 17.14 | 8.05 | 28.33 | 6.92 | 24.21 | 30.43 | 12.50 |
| mistral-7b-v0.2-hf | 7.62 | 9.20 | 23.17 | 6.15 | 25.26 | 19.15 | 7.50 |
| mixtral-8x7b-v0.1-hf | 0.00 | 4.60 | 33.83 | 10.77 | 37.89 | 25.96 | 3.75 |
| mixtral-8x22b-v0.1-hf | 7.62 | 4.17 | 51.33 | 14.62 | 53.68 | 21.91 | 10.00 |
| yi-6b-hf | 17.14 | 52.87 | 50.83 | 36.25 | 36.84 | 48.09 | 36.25 |
| yi-34b-hf | 0.00 | 59.77 | 76.67 | 86.92 | 67.44 | 61.06 | 81.25 |
| deepseek-7b-base-hf | 20.95 | 2.30 | 17.83 | 12.31 | 25.26 | 12.55 | 8.75 |
| deepseek-67b-base-hf | 1.90 | 9.20 | 27.33 | 30.00 | 40.00 | 13.19 | 3.75 |
## Chat Models
| model | GaokaoBench |
|:-----------------------------:|--------------:|
| qwen1.5-0.5b-chat-hf | 21.51 |
| qwen1.5-1.8b-chat-hf | 46.19 |
| qwen1.5-4b-chat-hf | 59.11 |
| qwen1.5-7b-chat-hf | 70.55 |
| qwen1.5-14b-chat-hf | 80.39 |
| qwen1.5-32b-chat-hf | 86.15 |
| qwen1.5-72b-chat-hf | 88.58 |
| qwen1.5-110b-chat-hf | 89.59 |
| internlm2-chat-1.8b-hf | 29.73 |
| internlm2-chat-1.8b-sft-hf | 28.79 |
| internlm2-chat-7b-hf | 54.54 |
| internlm2-chat-7b-sft-hf | 55.39 |
| internlm2-chat-20b-hf | 57.95 |
| internlm2-chat-20b-sft-hf | 57.62 |
| llama-3-8b-instruct-hf | 45.48 |
| llama-3-70b-instruct-hf | 65.91 |
| llama-3-8b-instruct-lmdeploy | 44.48 |
| llama-3-70b-instruct-lmdeploy | 67.06 |
| mistral-7b-instruct-v0.1-hf | 26.21 |
| mistral-7b-instruct-v0.2-hf | 32.17 |
| mixtral-8x7b-instruct-v0.1-hf | 42.46 |
### Details
| model | 2010-2022_Math_II_MCQs | 2010-2022_Math_I_MCQs | 2010-2022_History_MCQs | 2010-2022_Biology_MCQs | 2010-2022_Political_Science_MCQs | 2010-2022_Physics_MCQs | 2010-2022_Chemistry_MCQs |
|:-----------------------------:|-------------------------:|------------------------:|-------------------------:|-------------------------:|-----------------------------------:|-------------------------:|---------------------------:|
| qwen1.5-0.5b-chat-hf | 25.23 | 25.70 | 39.02 | 24.67 | 25.00 | 0.78 | 25.00 |
| qwen1.5-1.8b-chat-hf | 30.28 | 26.64 | 61.32 | 55.33 | 77.81 | 11.72 | 40.32 |
| qwen1.5-4b-chat-hf | 38.53 | 35.05 | 70.73 | 70.00 | 83.44 | 25.00 | 41.13 |
| qwen1.5-7b-chat-hf | 49.54 | 39.72 | 81.88 | 82.67 | 90.62 | 46.88 | 61.29 |
| qwen1.5-14b-chat-hf | 64.68 | 54.21 | 87.80 | 90.67 | 94.69 | 44.53 | 69.35 |
| qwen1.5-32b-chat-hf | 70.92 | 66.14 | 98.02 | 97.74 | 96.07 | 57.81 | 72.92 |
| qwen1.5-72b-chat-hf | 76.61 | 68.22 | 95.47 | 96.00 | 97.19 | 64.06 | 86.29 |
| qwen1.5-110b-chat-hf | 80.36 | 66.67 | 100.00 | 100.00 | 96.25 | 65.62 | 75.00 |
| internlm2-chat-1.8b-hf | 28.44 | 28.50 | 46.69 | 39.33 | 44.38 | 10.16 | 26.61 |
| internlm2-chat-1.8b-sft-hf | 23.85 | 20.09 | 55.75 | 40.67 | 53.12 | 14.84 | 30.65 |
| internlm2-chat-7b-hf | 45.87 | 42.52 | 77.70 | 75.33 | 76.56 | 16.41 | 38.71 |
| internlm2-chat-7b-sft-hf | 49.08 | 39.72 | 80.84 | 68.67 | 81.25 | 29.69 | 42.74 |
| internlm2-chat-20b-hf | 53.21 | 46.73 | 80.49 | 74.00 | 85.00 | 31.25 | 37.10 |
| internlm2-chat-20b-sft-hf | 51.83 | 47.20 | 86.06 | 78.00 | 88.12 | 35.16 | 45.16 |
| llama-3-8b-instruct-hf | 37.16 | 31.31 | 60.98 | 48.67 | 51.25 | 11.72 | 39.52 |
| llama-3-70b-instruct-hf | 58.26 | 52.34 | 63.76 | 75.33 | 75.31 | 36.72 | 53.23 |
| llama-3-8b-instruct-lmdeploy | 37.61 | 35.51 | 55.05 | 53.33 | 52.19 | 7.81 | 34.68 |
| llama-3-70b-instruct-lmdeploy | 75.00 | 55.56 | 61.11 | 73.68 | 70.00 | 40.62 | 43.75 |
| mistral-7b-instruct-v0.1-hf | 23.39 | 21.03 | 35.19 | 18.00 | 26.56 | 5.47 | 30.65 |
| mistral-7b-instruct-v0.2-hf | 31.19 | 19.63 | 38.33 | 40.00 | 35.94 | 20.31 | 34.68 |
| mixtral-8x7b-instruct-v0.1-hf | 41.28 | 37.85 | 52.26 | 47.33 | 50.00 | 25.78 | 43.55 |

| model | 2010-2013_English_MCQs | 2010-2022_Chinese_Modern_Lit | 2010-2022_English_Fill_in_Blanks | 2012-2022_English_Cloze_Test | 2010-2022_Geography_MCQs | 2010-2022_English_Reading_Comp | 2010-2022_Chinese_Lang_and_Usage_MCQs |
|:-----------------------------:|-------------------------:|-------------------------------:|-----------------------------------:|-------------------------------:|---------------------------:|---------------------------------:|----------------------------------------:|
| qwen1.5-0.5b-chat-hf | 32.38 | 10.34 | 0.00 | 2.31 | 27.37 | 15.11 | 18.75 |
| qwen1.5-1.8b-chat-hf | 69.52 | 42.53 | 56.33 | 2.31 | 61.05 | 32.98 | 35.00 |
| qwen1.5-4b-chat-hf | 70.48 | 58.62 | 82.33 | 16.15 | 68.42 | 68.51 | 47.50 |
| qwen1.5-7b-chat-hf | 83.81 | 71.26 | 85.17 | 57.69 | 81.05 | 78.94 | 66.25 |
| qwen1.5-14b-chat-hf | 93.33 | 78.16 | 97.17 | 71.54 | 91.58 | 94.26 | 81.25 |
| qwen1.5-32b-chat-hf | 100.00 | 81.61 | 95.83 | 90.00 | 97.89 | 92.43 | 92.86 |
| qwen1.5-72b-chat-hf | 98.10 | 83.91 | 98.00 | 90.77 | 94.74 | 96.38 | 96.25 |
| qwen1.5-110b-chat-hf | 100.00 | 91.95 | 98.50 | 97.69 | 95.35 | 98.44 | 100.00 |
| internlm2-chat-1.8b-hf | 38.10 | 6.90 | 0.67 | 1.54 | 56.84 | 23.19 | 30.00 |
| internlm2-chat-1.8b-sft-hf | 50.48 | 0.00 | 0.00 | 0.00 | 27.37 | 11.91 | 32.50 |
| internlm2-chat-7b-hf | 60.95 | 67.82 | 7.00 | 7.69 | 70.53 | 79.79 | 38.75 |
| internlm2-chat-7b-sft-hf | 60.00 | 71.26 | 6.50 | 0.77 | 68.42 | 77.02 | 42.50 |
| internlm2-chat-20b-hf | 60.95 | 43.68 | 34.83 | 4.62 | 71.58 | 62.55 | 43.75 |
| internlm2-chat-20b-sft-hf | 75.24 | 47.13 | 1.00 | 2.31 | 80.00 | 65.96 | 37.50 |
| llama-3-8b-instruct-hf | 50.48 | 36.78 | 30.83 | 21.54 | 57.89 | 81.70 | 28.75 |
| llama-3-70b-instruct-hf | 73.33 | 59.77 | 82.83 | 24.62 | 73.68 | 91.28 | 45.00 |
| llama-3-8b-instruct-lmdeploy | 52.38 | 42.53 | 21.33 | 18.46 | 58.95 | 81.28 | 26.25 |
| llama-3-70b-instruct-lmdeploy | 87.50 | 62.07 | 84.38 | 26.92 | 72.63 | 91.20 | 56.25 |
| mistral-7b-instruct-v0.1-hf | 38.10 | 18.39 | 30.50 | 6.15 | 31.58 | 38.72 | 18.75 |
| mistral-7b-instruct-v0.2-hf | 41.90 | 31.03 | 28.00 | 20.77 | 29.47 | 42.13 | 15.00 |
| mixtral-8x7b-instruct-v0.1-hf | 49.52 | 39.08 | 41.33 | 9.23 | 44.21 | 43.19 | 21.25 |

View File

@@ -1,114 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
single_choice_prompts = {
'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题请你一步一步推理并在最后用“所以答案为选项X”给出答案其中“X”为选项ABCD中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:',
'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from ABCD. Here is the question you need to answer:\n{question}\nLet's think step by step:",
'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:',
}
cloze_prompts = {
'cloze_cn': [
dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后将有21棵树。林务工人员今天种植了多少棵树'),
dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
dict(role='HUMAN', prompt='Q: 如果停车场有3辆车又有2辆车进来停车场里有多少辆车'),
dict(role='BOT', prompt='A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n'),
dict(role='HUMAN', prompt='Q: 黎恩有32块巧克力她的妹妹有42块。如果他们吃了35块他们总共剩下多少块'),
dict(role='BOT', prompt='A: 黎恩有32块巧克力Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n'),
dict(role='HUMAN', prompt='Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖'),
dict(role='BOT', prompt='A: 杰森有20个棒棒糖。因为他现在只剩下12个所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n'),
dict(role='HUMAN', prompt='Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'),
dict(role='BOT', prompt='A: 她有5个玩具。他从妈妈那里得到了2个所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个所以总共他有7 + 2 = 9个玩具。答案是 9\n'),
dict(role='HUMAN', prompt='Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
dict(role='BOT', prompt='A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑所以现在有9 + 20 = 29台电脑。答案是 29\n'),
dict(role='HUMAN', prompt='Q: 迈克尔有58个高尔夫球。星期二他丢失了23个高尔夫球。星期三他又丢失了2个。星期三结束时他还剩下多少个高尔夫球'),
dict(role='BOT', prompt='A: 迈克尔一开始有58个球。星期二他丢失了23个所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个所以现在他还剩下35 - 2 = 33个球。答案是 33\n'),
dict(role='HUMAN', prompt='Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱'),
dict(role='BOT', prompt='A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元所以现在她还剩下23 - 15 = 8美元。答案是 8\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}'),
],
'cloze_en': [
dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
dict(role='BOT', prompt='A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n'),
dict(role='HUMAN', prompt='Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?'),
dict(role='BOT', prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"),
dict(role='HUMAN', prompt='Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?'),
dict(role='BOT', prompt='A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\n'),
dict(role='HUMAN', prompt='Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
dict(role='BOT', prompt='A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\n'),
dict(role='HUMAN', prompt='Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
dict(role='BOT', prompt='A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n'),
dict(role='HUMAN', prompt='Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
dict(role='BOT', prompt='A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n'),
dict(role='HUMAN', prompt='Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}\n'),
]}
mathbench_sets = {
# Practice Part
'college': ['single_choice_cn', 'single_choice_en'],
'high': ['single_choice_cn', 'single_choice_en'],
'middle': ['single_choice_cn', 'single_choice_en'],
'primary': ['cloze_cn', 'cloze_en'],
'arithmetic': ['cloze_en'],
# Theory part
'college_knowledge': ['single_choice_cn','single_choice_en'],
'high_knowledge': ['single_choice_cn','single_choice_en'],
'middle_knowledge': ['single_choice_cn','single_choice_en'],
'primary_knowledge': ['single_choice_cn','single_choice_en'],
}
# Generate reasoning path or not, only for single choice
with_reasoning = True
# Use circular evaluation or not
with_circular_eval = True
mathbench_datasets = []
for _split in list(mathbench_sets.keys()):
for _name in mathbench_sets[_split]:
mathbench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=single_choice_prompts[_name + '_with_reasoning'] if with_reasoning else single_choice_prompts[_name],
),
dict(role='BOT', prompt='{answer}')] if 'choice' in _name else cloze_prompts[_name],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
mathbench_eval_cfg = dict(
evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
mathbench_datasets.append(
dict(
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f'./data/mathbench_v1/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=dict(
input_columns=['question'],
output_column='answer'
),
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
))

View File

@@ -0,0 +1,81 @@
from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
with read_base():
from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
# Max for this dataset is 4
num_shot = 4
# Generate reasoning path or not, only for single choice
with_reasoning = False
# Use circular evaluation or not
with_circular_eval = True
# Use PPL mode in single choice test or not
use_ppl_single_choice = False
assert 0 <= num_shot <= 4
if num_shot == 0:
prompts = zero_shot_prompts
else:
prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
mathbench_datasets = []
for _split in mathbench_sets:
for _name in mathbench_sets[_split]:
if 'single_choice' in _name:
if with_reasoning:
template_round = prompts[_name + '_with_reasoning']
else:
template_round = prompts[_name]
else:
template_round = prompts[_name]
if 'single_choice' in _name:
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
else:
pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
if 'single_choice' in _name and with_circular_eval:
evaluator = dict(type=CircularEvaluator)
else:
evaluator = dict(type=AccEvaluator)
# assemble the final config
mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
template = {}
for answer in ['A', 'B', 'C', 'D']:
one_template_round = deepcopy(template_round)
one_template_round[-1]['prompt'] = one_template_round[-1]['prompt'].format(answer=answer)
template[answer] = dict(round=one_template_round)
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=template),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
else:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
mathbench_datasets.append(
dict(
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f'data/mathbench_v1/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=mathbench_reader_cfg,
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
)
)
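A note on the few-shot slicing above: prompts = {name: p[- 2 * num_shot - 2:] ...} relies on every few_shot_prompts entry being a flat list of alternating HUMAN/BOT turns that ends with the {question}/{answer} template pair, so the slice keeps the last num_shot worked examples plus that final pair. A small sketch with hypothetical placeholder turns:

# Sketch only: how the slice selects exemplars (placeholder prompts, not the real data).
few_shot = [
    dict(role='HUMAN', prompt='Q1'), dict(role='BOT', prompt='A1'),
    dict(role='HUMAN', prompt='Q2'), dict(role='BOT', prompt='A2'),
    dict(role='HUMAN', prompt='Q3'), dict(role='BOT', prompt='A3'),
    dict(role='HUMAN', prompt='Q4'), dict(role='BOT', prompt='A4'),
    dict(role='HUMAN', prompt='{question}'), dict(role='BOT', prompt='{answer}'),
]
num_shot = 2
selected = few_shot[- 2 * num_shot - 2:]
assert len(selected) == 2 * num_shot + 2   # Q3/A3, Q4/A4, then the {question}/{answer} pair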

View File

@@ -0,0 +1,81 @@
from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
with read_base():
from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
# Max for this dataset is 4
num_shot = 4
# Generate reasoning path or not, only for single choice
with_reasoning = False
# Use circular evaluation or not
with_circular_eval = False
# Use PPL mode in single choice test or not
use_ppl_single_choice = False
assert 0 <= num_shot <= 4
if num_shot == 0:
prompts = zero_shot_prompts
else:
prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
mathbench_datasets = []
for _split in mathbench_sets:
for _name in mathbench_sets[_split]:
if 'single_choice' in _name:
if with_reasoning:
template_round = prompts[_name + '_with_reasoning']
else:
template_round = prompts[_name]
else:
template_round = prompts[_name]
if 'single_choice' in _name:
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
else:
pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
if 'single_choice' in _name and with_circular_eval:
evaluator = dict(type=CircularEvaluator)
else:
evaluator = dict(type=AccEvaluator)
# assemble the final config
mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
template = {}
for answer in ['A', 'B', 'C', 'D']:
one_template_round = deepcopy(template_round)
one_template_round[-1]['prompt'] = one_template_round[-1]['prompt'].format(answer=answer)
template[answer] = dict(round=one_template_round)
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=template),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
else:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
mathbench_datasets.append(
dict(
abbr='mathbench-wocircular-' + _split + '-' + _name,
type=MathBenchDataset,
path=f'data/mathbench_v1/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=mathbench_reader_cfg,
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
)
)

View File

@@ -1,124 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
single_choice_prompts = {
'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题请你一步一步推理并在最后用“所以答案为选项X”给出答案其中“X”为选项ABCD中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:',
'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from ABCD. Here is the question you need to answer:\n{question}\nLet's think step by step:",
'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:',
}
cloze_prompts = {
'cloze_cn': [
dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后将有21棵树。林务工人员今天种植了多少棵树'),
dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
dict(role='HUMAN', prompt='Q: 如果停车场有3辆车又有2辆车进来停车场里有多少辆车'),
dict(role='BOT', prompt='A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n'),
dict(role='HUMAN', prompt='Q: 黎恩有32块巧克力她的妹妹有42块。如果他们吃了35块他们总共剩下多少块'),
dict(role='BOT', prompt='A: 黎恩有32块巧克力Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n'),
dict(role='HUMAN', prompt='Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖'),
dict(role='BOT', prompt='A: 杰森有20个棒棒糖。因为他现在只剩下12个所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n'),
dict(role='HUMAN', prompt='Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'),
dict(role='BOT', prompt='A: 她有5个玩具。他从妈妈那里得到了2个所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个所以总共他有7 + 2 = 9个玩具。答案是 9\n'),
dict(role='HUMAN', prompt='Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
dict(role='BOT', prompt='A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑所以现在有9 + 20 = 29台电脑。答案是 29\n'),
dict(role='HUMAN', prompt='Q: 迈克尔有58个高尔夫球。星期二他丢失了23个高尔夫球。星期三他又丢失了2个。星期三结束时他还剩下多少个高尔夫球'),
dict(role='BOT', prompt='A: 迈克尔一开始有58个球。星期二他丢失了23个所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个所以现在他还剩下35 - 2 = 33个球。答案是 33\n'),
dict(role='HUMAN', prompt='Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱'),
dict(role='BOT', prompt='A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元所以现在她还剩下23 - 15 = 8美元。答案是 8\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}'),
],
'cloze_en': [
dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
dict(role='BOT', prompt='A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n'),
dict(role='HUMAN', prompt='Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?'),
dict(role='BOT', prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"),
dict(role='HUMAN', prompt='Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?'),
dict(role='BOT', prompt='A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\n'),
dict(role='HUMAN', prompt='Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
dict(role='BOT', prompt='A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\n'),
dict(role='HUMAN', prompt='Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
dict(role='BOT', prompt='A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n'),
dict(role='HUMAN', prompt='Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
dict(role='BOT', prompt='A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n'),
dict(role='HUMAN', prompt='Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}\n'),
],
}
mathbench_sets = {
# Practice Part
'college': ['single_choice_cn', 'single_choice_en'],
'high': ['single_choice_cn', 'single_choice_en'],
'middle': ['single_choice_cn', 'single_choice_en'],
'primary': ['cloze_cn', 'cloze_en'],
'calculate': ['cloze_en'],
# Theory part
'college_knowledge': ['single_choice_cn', 'single_choice_en'],
'high_knowledge': ['single_choice_cn', 'single_choice_en'],
'middle_knowledge': ['single_choice_cn', 'single_choice_en'],
'primary_knowledge': ['single_choice_cn', 'single_choice_en'],
}
# Generate reasoning path or not, only for single choice
with_reasoning = True
# Use circular evaluation or not
with_circular_eval = False
mathbench_datasets = []
for _split in list(mathbench_sets.keys()):
for _name in mathbench_sets[_split]:
mathbench_reader_cfg = dict(
input_columns=['question'],
output_column='answer',
)
if 'single_choice' in _name:
template_round = [
dict(role='HUMAN', prompt=(single_choice_prompts[_name + '_with_reasoning'] if with_reasoning else single_choice_prompts[_name])),
dict(role='BOT', prompt='{answer}')
]
else:
template_round = cloze_prompts[_name]
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
if 'single_choice' in _name:
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
else:
pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
if 'single_choice' in _name and with_circular_eval:
evaluator = dict(type=CircularEvaluator)
else:
evaluator = dict(type=AccEvaluator)
mathbench_eval_cfg = dict(
evaluator=evaluator,
pred_postprocessor=pred_postprocessor,
)
mathbench_datasets.append(
dict(
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f'./data/mathbench_v1_ori/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=mathbench_reader_cfg,
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
)
)

View File

@@ -0,0 +1,81 @@
from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
with read_base():
from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
# Max for this dataset is 4
num_shot = 4
# Generate reasoning path or not, only for single choice
with_reasoning = False
# Use circular evaluation or not
with_circular_eval = False
# Use PPL mode in single choice test or not
use_ppl_single_choice = True
assert 0 <= num_shot <= 4
if num_shot == 0:
prompts = zero_shot_prompts
else:
prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
mathbench_datasets = []
for _split in mathbench_sets:
for _name in mathbench_sets[_split]:
if 'single_choice' in _name:
if with_reasoning:
template_round = prompts[_name + '_with_reasoning']
else:
template_round = prompts[_name]
else:
template_round = prompts[_name]
if 'single_choice' in _name:
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
else:
pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
if 'single_choice' in _name and with_circular_eval:
evaluator = dict(type=CircularEvaluator)
else:
evaluator = dict(type=AccEvaluator)
# assemble the final config
mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
template = {}
for answer in ['A', 'B', 'C', 'D']:
one_template_round = deepcopy(template_round)
one_template_round[-1]['prompt'] = one_template_round[-1]['prompt'].format(answer=answer)
template[answer] = dict(round=one_template_round)
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=template),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
else:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
mathbench_datasets.append(
dict(
abbr='mathbench-wocircular-' + _split + '-' + _name,
type=MathBenchDataset,
path=f'data/mathbench_v1/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=mathbench_reader_cfg,
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
)
)
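Because this file sets use_ppl_single_choice = True, the PPL branch above is the one that actually runs: for each option letter it deep-copies the round, formats only the final BOT turn with that letter, and stores the result under the letter so PPLInferencer can score four complete candidate conversations. A minimal sketch of that expansion (the two-turn round is hypothetical, not the real MathBench prompt):

# Sketch only: building per-option templates for PPL scoring.
from copy import deepcopy

template_round = [
    dict(role='HUMAN', prompt='Question: {question}'),
    dict(role='BOT', prompt='Response: {answer}'),
]
template = {}
for answer in ['A', 'B', 'C', 'D']:
    one_round = deepcopy(template_round)
    one_round[-1]['prompt'] = one_round[-1]['prompt'].format(answer=answer)   # fill only the BOT turn
    template[answer] = dict(round=one_round)

assert template['B']['round'][-1]['prompt'] == 'Response: B'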

View File

@@ -1,114 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
single_choice_prompts = {
'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题请你一步一步推理并在最后用“所以答案为选项X”给出答案其中“X”为选项ABCD中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:\n',
'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from ABCD. Here is the question you need to answer:\n{question}\nLet's think step by step:\n",
'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:\n',
}
cloze_prompts = {
'cloze_cn': [
'Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后将有21棵树。林务工人员今天种植了多少棵树\nA: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以他们必须种植了21 - 15 = 6棵树。答案是 6',
'Q: 如果停车场有3辆车又有2辆车进来停车场里有多少辆车\nA: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5',
'Q: 黎恩有32块巧克力她的妹妹有42块。如果他们吃了35块他们总共剩下多少块\nA: 黎恩有32块巧克力Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39',
'Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖\nA: 杰森有20个棒棒糖。因为他现在只剩下12个所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8',
'Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?\nA: 她有5个玩具。他从妈妈那里得到了2个所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个所以总共他有7 + 2 = 9个玩具。答案是 9',
'Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?\nA: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑所以现在有9 + 20 = 29台电脑。答案是 29',
'Q: 迈克尔有58个高尔夫球。星期二他丢失了23个高尔夫球。星期三他又丢失了2个。星期三结束时他还剩下多少个高尔夫球\nA: 迈克尔一开始有58个球。星期二他丢失了23个所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个所以现在他还剩下35 - 2 = 33个球。答案是 33',
'Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱\nA: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元所以现在她还剩下23 - 15 = 8美元。答案是 8',
'Q: {question}\nA: {answer}',
],
'cloze_en': [
'Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\nA: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.',
'Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\nA: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.',
"Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\nA: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.",
'Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?\nA: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.',
'Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?\nA: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.',
'Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?\nA: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.',
'Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\nA: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.',
'Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\nA: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.',
'Q: {question}\nA: {answer}',
],
}
mathbench_sets = {
# Practice Part
'college': ['single_choice_cn', 'single_choice_en'],
'high': ['single_choice_cn', 'single_choice_en'],
'middle': ['single_choice_cn', 'single_choice_en'],
'primary': ['cloze_cn', 'cloze_en'],
'calculate': ['cloze_en'],
# Theory part
'college_knowledge': ['single_choice_cn', 'single_choice_en'],
'high_knowledge': ['single_choice_cn', 'single_choice_en'],
'middle_knowledge': ['single_choice_cn', 'single_choice_en'],
'primary_knowledge': ['single_choice_cn', 'single_choice_en'],
}
# Generate reasoning path or not, only for single choice
with_reasoning = False
# Use circular evaluation or not
with_circular_eval = False
mathbench_datasets = []
for _split in list(mathbench_sets.keys()):
for _name in mathbench_sets[_split]:
mathbench_reader_cfg = dict(
input_columns=['question'],
output_column='answer',
)
if 'single_choice' in _name:
if with_reasoning:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=single_choice_prompts[_name + '_with_reasoning']),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
else:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template={answer: f'{single_choice_prompts[_name]}{answer}' for answer in ['A', 'B', 'C', 'D']}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
else:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template='\n'.join(cloze_prompts[_name])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
if 'single_choice' in _name:
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
else:
pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
if 'single_choice' in _name and with_circular_eval:
evaluator = dict(type=CircularEvaluator)
else:
evaluator = dict(type=AccEvaluator)
mathbench_eval_cfg = dict(
evaluator=evaluator,
pred_postprocessor=pred_postprocessor,
)
mathbench_datasets.append(
dict(
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f'./data/mathbench_v1_ori/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=mathbench_reader_cfg,
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
)
)

View File

@@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
    from .mathbench_2024_gen_19e486 import mathbench_datasets # noqa: F401, F403
    from .mathbench_2024_gen_1dc21d import mathbench_datasets # noqa: F401, F403

View File

@@ -0,0 +1,103 @@
zero_shot_prompts = {
'single_choice_cn_with_reasoning': [
dict(role='HUMAN', prompt='问题: 以下是一道关于数学的单项选择题请你一步一步推理并在最后用“所以答案为选项X”给出答案其中“X”为选项ABCD中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:'),
],
'single_choice_cn': [
dict(role='HUMAN', prompt='问题: 以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n所以答案是:'),
],
'single_choice_en_with_reasoning': [
dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with "Therefore, the correct answer is option X", Where "X" is the correct option you think from ABCD. Here is the question you need to answer:\n{question}\nLet\'s think step by step:'),
],
'single_choice_en': [
dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nThe answer is:'),
],
}
few_shot_prompts = {
'single_choice_cn': [
dict(role='HUMAN', prompt='问题: 已知i是虚数单位z为复数$2+\\frac{1}{i}=z(3+i)$则在复平面内z对应的点位于____?\nA. 第一象限\nB. 第二象限\nC. 第三象限\nD. 第四象限'),
dict(role='BOT', prompt='回答: D'),
dict(role='HUMAN', prompt='问题: 将函数$y=\\tan(\\omega x-1)(\\omega>0)$的图像向左平移2个单位长度后,与函数$y=\\tan(\\omega x+3)$的图象重合,则的最小值等于____\nA. $2-\\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
dict(role='BOT', prompt='回答: D'),
dict(role='HUMAN', prompt='问题: 设$(1+2i)a+b=2i$其中a,b为实数 \nA. $a=1,b=-1$\nB. $a=1,b=1$\nC. $a=-1,b=1$\nD. $a=-1,b=-1$'),
dict(role='BOT', prompt='回答: A'),
dict(role='HUMAN', prompt='问题: 圆锥的底面半径为2高为4.一个圆柱的下底面在圆锥的底面上,上底面的圆周在圆锥的侧面上,当圆柱侧面积为$4 \\pi$时该圆柱的体积为____\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'),
dict(role='BOT', prompt='回答: B'),
dict(role='HUMAN', prompt='问题: {question}'),
dict(role='BOT', prompt='回答: {answer}'),
],
'single_choice_cn_with_reasoning' : [
dict(role='HUMAN', prompt='问题: 已知i是虚数单位z为复数$2+\\frac{1}{i}=z(3+i)$则在复平面内z对应的点位于____\nA. 第一象限\nB. 第二象限\nC. 第三象限\nD. 第四象限'),
dict(role='BOT', prompt='回答: 因为首先,我们将等式两边同时乘以$(3 + i)$的共轭复数$(3 - i)$,以便消去分母中的虚数部分:$z = \\frac{2 - i}{3 + i} \\cdot \\frac{3 - i}{3 - i}$,这样做的目的是利用复数乘法的性质,从而消去分母中的虚数部分。我们进行乘法运算:$z = \\frac{(2 - i)(3 - i)}{(3 + i)(3 - i)}$$z = \\frac{6 - 2i - 3i + i^2}{9 - 3i + 3i - i^2}$,由于$i^2 = -1$,我们可以进一步简化:$z = \\frac{6 - 5i - 1}{9 + 1}$ $z = \\frac{{1}{2} - \\frac{1}{2}i$,在复平面上,这个点位于第四象限,因为它的实部是正的($\\frac{1}{2}$),而虚部是负的($-\\frac{1}{2}$\n答案选项D'),
dict(role='HUMAN', prompt='问题: 将函数$y=\\tan(\\omega x-1)(\\omega>0)$的图像向左平移2个单位长度后,与函数$y=\\tan(\\omega x+3)$的图象重合,则的最小值等于____\nA. $2-\\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
dict(role='BOT', prompt='回答: 将函数$y=\\tan(\\omega x-1)$(其中$\\omega>0$的图像向左平移2个单位长度后得到的新函数为$y=\\tan(\\omega(x+2)-1)$。为了使这个新函数的图像与$y=\\tan(\\omega x+3)$的图像重合,我们需要满足以下条件:\n$\\tan(\\omega(x+2)-1) = \\tan(\\omega x+3)$\n由于正切函数的周期性,我们知道$\\tan(\\theta) = \\tan(\\theta + k\\pi)$,其中$k$是任意整数。因此,我们可以写出:\n$\\omega(x+2)-1 = \\omega x+3 + k\\pi$\n解这个方程,我们得到:$\\omega x + 2\\omega - 1 = \\omega x + 3 + k\\pi$\n$2\\omega - 1 = 3 + k\\pi$\n$\\omega = 2 + \\frac{k\\pi}{2}$\n由于$\\omega>0$,我们需要找到满足条件的最小正整数$k$。当$k=0$时,我们得到$\\omega=2$这是满足条件的最小正值。因此最小值等于2。答案选项D'),
dict(role='HUMAN', prompt='问题: 设$(1+2i)a+b=2i$其中a,b为实数 \nA. $a=1,b=-1$\nB. $a=1,b=1$\nC. $a=-1,b=1$\nD. $a=-1,b=-1$'),
dict(role='BOT', prompt='回答: 我们可以将这个方程写成:$a + 2ai + b = 2i$,然后将实部和虚部分开:\n$a + b = 0$\n$2a = 2$\n由于$a + b = 0$,我们可以得到$a = -b$。将这个值代入$2a = 2$,我们得到$2(-b) = 2$,因此$b = -1$。因此,$a = 1$。答案选项A'),
dict(role='HUMAN', prompt='问题: 一直角三角形的两边长分别为3和4则第三边的长为____\nA. 5\nB. $\\sqrt 7$\nC. $\\sqrt 5$\nD. 5或者$\\sqrt 7$'),
dict(role='BOT', prompt='回答: 我们可以使用勾股定理来解决这个问题。勾股定理告诉我们,如果一个三角形的两条边长分别为$a$和$b$,那么第三条边的长度为$c = \\sqrt{a^2 + b^2}$。在这个问题中,$a = 3$$b = 4$,因此$c = \\sqrt{3^2 + 4^2} = \\sqrt{9 + 16} = \\sqrt{25} = 5$。答案选项A'),
dict(role='HUMAN', prompt='问题: {question}'),
dict(role='BOT', prompt='回答: {answer}'),
],
'single_choice_en': [
dict(role='HUMAN', prompt='Question: Given that $i$ is the imaginary unit and $z$ is a complex number, $2 + \\frac{1}{i} = z(3 + i)$, the point corresponding to $z$ in the complex plane is located in the ______\nA. First quadrant\nB. Second quadrant\nC. Third quadrant\nD. Fourth quadrant'),
dict(role='BOT', prompt='Response: D'),
dict(role='HUMAN', prompt='Question: The graph of the function $y = \\tan(\\omega x - 1)$ is shifted 2 units to the left and coincides with the graph of the function $y = \\tan(\\omega x + 3)$. The minimum value of $\\omega$ is ______\nA. $2 - \\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
dict(role='BOT', prompt='Response: D'),
dict(role='HUMAN', prompt='Question: If $(1 + 2i)a + b = 2i$, where $a$ and $b$ are real numbers, then ______\nA. $a = 1, b = -1$\nB. $a = 1, b = 1$\nC. $a = -1, b = 1$\nD. $a = -1, b = -1$'),
dict(role='BOT', prompt='Response: A'),
dict(role='HUMAN', prompt='Question: The radius of a cone is 2 and its height is 4. A cylinder has its lower base on the base of the cone and its upper base on the lateral surface of the cone. When the lateral surface area of the cylinder is $4\\pi$, the volume of the cylinder is ______\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'),
dict(role='BOT', prompt='Response: B'),
dict(role='HUMAN', prompt='Question: {question}'),
dict(role='BOT', prompt='Response: {answer}'),
],
'single_choice_en_with_reasoning': [
dict(role='HUMAN', prompt='Question: Given that $i$ is the imaginary unit and $z$ is a complex number, $2 + \\frac{1}{i} = z(3 + i)$, the point corresponding to $z$ in the complex plane is located in the ______\nA. First quadrant\nB. Second quadrant\nC. Third quadrant\nD. Fourth quadrant'),
dict(role='BOT', prompt='Response: First, we multiply both sides of the equation by the conjugate of $(3 + i)$: $z = \\frac{2 - i}{3 + i} \\cdot \\frac{3 - i}{3 - i}$. We perform the multiplication: $z = \\frac{(2 - i)(3 - i)}{(3 + i)(3 - i)}$, $z = \\frac{6 - 2i - 3i + i^2}{9 - 3i + 3i - i^2}$. Since $i^2 = -1$, we can simplify this further: $z = \\frac{6 - 5i - 1}{9 + 1}$ $z = \\frac{1}{2} - \\frac{1}{2}i$. In the complex plane, this point is located in the fourth quadrant, because its real part is positive ($\\frac{1}{2}$) and its imaginary part is negative ($-\\frac{1}{2}$)\nAnswer option: D'),
dict(role='HUMAN', prompt='Question: The graph of the function $y = \\tan(\\omega x - 1)$ is shifted 2 units to the left and coincides with the graph of the function $y = \\tan(\\omega x + 3)$. The minimum value of $\\omega$ is ______\nA. $2 - \\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
dict(role='BOT', prompt='Response: In order for the graph of this new function to coincide with the graph of $y = \\tan(\\omega x + 3)$, we need to satisfy the following condition: $\\tan(\\omega(x + 2) - 1) = \\tan(\\omega x + 3)$. Therefore, we can write: $\\omega(x + 2) - 1 = \\omega x + 3 + k\\pi$. Solving this equation, we get: $\\omega x + 2\\omega - 1 = \\omega x + 3 + k\\pi$. $2\\omega - 1 = 3 + k\\pi$. $\\omega = 2 + \\frac{k\\pi}{2}$. Since $\\omega > 0$, we need to find the smallest positive integer $k$ that satisfies the condition. When $k = 0$, we get $\\omega = 2$, which is the smallest positive value that satisfies the condition. Therefore, the minimum value is 2. Answer option: D'),
dict(role='HUMAN', prompt='Question: If $(1 + 2i)a + b = 2i$, where $a$ and $b$ are real numbers, then ______\nA. $a = 1, b = -1$\nB. $a = 1, b = 1$\nC. $a = -1, b = 1$\nD. $a = -1, b = -1$'),
dict(role='BOT', prompt='Response: We can write this equation as: $a + 2ai + b = 2i$, and then separate the real and imaginary parts: $a + b = 0$. $2a = 2$. Since $a + b = 0$, we can get $a = -b$. Substituting this value into $2a = 2$, we get $2(-b) = 2$, so $b = -1$. Therefore, $a = 1$. Answer option: A'),
dict(role='HUMAN', prompt='Question: The radius of a cone is 2 and its height is 4. A cylinder has its lower base on the base of the cone and its upper base on the lateral surface of the cone. When the lateral surface area of the cylinder is $4\\pi$, the volume of the cylinder is ______\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'),
dict(role='BOT', prompt='Response: We can use the Pythagorean theorem to solve this problem. The Pythagorean theorem tells us that if the two sides of a triangle are $a$ and $b$, then the length of the third side is $c = \\sqrt{a^2 + b^2}$. In this problem, $a = 3$ and $b = 4$, so $c = \\sqrt{3^2 + 4^2} = \\sqrt{9 + 16} = \\sqrt{25} = 5$. Answer option: A'),
dict(role='HUMAN', prompt='Question: {question}'),
dict(role='BOT', prompt='Response: {answer}'),
],
'cloze_cn': [
dict(role='HUMAN', prompt='问题: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'),
dict(role='BOT', prompt='回答: 她有5个玩具。他从妈妈那里得到了2个所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个所以总共他有7 + 2 = 9个玩具。答案是 9'),
dict(role='HUMAN', prompt='问题: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
dict(role='BOT', prompt='回答: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑所以现在有9 + 20 = 29台电脑。答案是 29'),
dict(role='HUMAN', prompt='问题: 迈克尔有58个高尔夫球。星期二他丢失了23个高尔夫球。星期三他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'),
dict(role='BOT', prompt='回答: 迈克尔一开始有58个球。星期二他丢失了23个所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个所以现在他还剩下35 - 2 = 33个球。答案是 33'),
dict(role='HUMAN', prompt='问题: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'),
dict(role='BOT', prompt='回答: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元所以现在她还剩下23 - 15 = 8美元。答案是 8'),
dict(role='HUMAN', prompt='问题: {question}'),
dict(role='BOT', prompt='回答: {answer}'),
],
'cloze_en': [
dict(role='HUMAN', prompt='Question: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
dict(role='BOT', prompt='Response: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.'),
dict(role='HUMAN', prompt='Question: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
dict(role='BOT', prompt='Response: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.'),
dict(role='HUMAN', prompt='Question: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
dict(role='BOT', prompt='Response: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.'),
dict(role='HUMAN', prompt='Question: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
dict(role='BOT', prompt='Response: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.'),
dict(role='HUMAN', prompt='Question: {question}'),
dict(role='BOT', prompt='Response: {answer}'),
],
}
mathbench_sets = {
# Practice Part
'college': ['single_choice_cn', 'single_choice_en'],
'high': ['single_choice_cn', 'single_choice_en'],
'middle': ['single_choice_cn', 'single_choice_en'],
'primary': ['cloze_cn', 'cloze_en'],
'arithmetic': ['cloze_en'],
# Theory Part
'college_knowledge': ['single_choice_cn', 'single_choice_en'],
'high_knowledge': ['single_choice_cn', 'single_choice_en'],
'middle_knowledge': ['single_choice_cn', 'single_choice_en'],
'primary_knowledge': ['single_choice_cn', 'single_choice_en'],
}
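# --------------------------------------------------------------------------
# Illustrative sketch only (an editorial assumption, not this file's actual
# assembly loop, which lies outside this excerpt): how mathbench_sets could be
# expanded into per-subset entries, optionally switching single-choice subsets
# to the *_with_reasoning prompt templates defined above. The abbr naming
# scheme and the `with_reasoning` toggle are assumptions for illustration.
with_reasoning = True  # assumed toggle between direct-answer and CoT prompts

mathbench_example_entries = []
for _split, _names in mathbench_sets.items():
    for _name in _names:
        _template_key = (
            _name + '_with_reasoning'
            if with_reasoning and _name.startswith('single_choice')
            else _name
        )
        mathbench_example_entries.append(
            dict(
                abbr='mathbench-' + _split + '-' + _name,  # e.g. 'mathbench-college-single_choice_cn'
                template_key=_template_key,                # key into the prompt-template dict above
            )
        )
# --------------------------------------------------------------------------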

View File

@ -0,0 +1,64 @@
# TheoremQA
## Base Models
| model | TheoremQA |
|:------------------------:|------------:|
| llama-7b-turbomind | 10.25 |
| llama-13b-turbomind | 11.25 |
| llama-30b-turbomind | 14.25 |
| llama-65b-turbomind | 15.62 |
| llama-2-7b-turbomind | 12.62 |
| llama-2-13b-turbomind | 11.88 |
| llama-2-70b-turbomind | 15.62 |
| llama-3-8b-turbomind | 20.25 |
| llama-3-70b-turbomind | 33.62 |
| internlm2-1.8b-turbomind | 10.50 |
| internlm2-7b-turbomind | 21.88 |
| internlm2-20b-turbomind | 26.00 |
| qwen-1.8b-turbomind | 9.38 |
| qwen-7b-turbomind | 15.00 |
| qwen-14b-turbomind | 21.62 |
| qwen-72b-turbomind | 27.12 |
| qwen1.5-0.5b-hf | 5.88 |
| qwen1.5-1.8b-hf | 12.00 |
| qwen1.5-4b-hf | 13.75 |
| qwen1.5-7b-hf | 4.25 |
| qwen1.5-14b-hf | 12.62 |
| qwen1.5-32b-hf | 26.62 |
| qwen1.5-72b-hf | 26.62 |
| qwen1.5-moe-a2-7b-hf | 7.50 |
| mistral-7b-v0.1-hf | 17.00 |
| mistral-7b-v0.2-hf | 16.25 |
| mixtral-8x7b-v0.1-hf | 24.12 |
| mixtral-8x22b-v0.1-hf | 36.75 |
| yi-6b-hf | 13.88 |
| yi-34b-hf | 24.75 |
| deepseek-7b-base-hf | 12.38 |
| deepseek-67b-base-hf | 21.25 |
## Chat Models
| model | TheoremQA |
|:-----------------------------:|------------:|
| qwen1.5-0.5b-chat-hf | 9.00 |
| qwen1.5-1.8b-chat-hf | 9.25 |
| qwen1.5-4b-chat-hf | 13.88 |
| qwen1.5-7b-chat-hf | 12.25 |
| qwen1.5-14b-chat-hf | 13.63 |
| qwen1.5-32b-chat-hf | 19.25 |
| qwen1.5-72b-chat-hf | 22.75 |
| qwen1.5-110b-chat-hf | 17.50 |
| internlm2-chat-1.8b-hf | 13.63 |
| internlm2-chat-1.8b-sft-hf | 12.88 |
| internlm2-chat-7b-hf | 18.50 |
| internlm2-chat-7b-sft-hf | 18.75 |
| internlm2-chat-20b-hf | 23.00 |
| internlm2-chat-20b-sft-hf | 25.12 |
| llama-3-8b-instruct-hf | 19.38 |
| llama-3-70b-instruct-hf | 36.25 |
| llama-3-8b-instruct-lmdeploy | 19.62 |
| llama-3-70b-instruct-lmdeploy | 34.50 |
| mistral-7b-instruct-v0.1-hf | 12.62 |
| mistral-7b-instruct-v0.2-hf | 11.38 |
| mixtral-8x7b-instruct-v0.1-hf | 26.00 |

View File

@ -0,0 +1,245 @@
# BBH
## Base Models
| model | bbh |
|:------------------------:|------:|
| llama-7b-turbomind | 33.34 |
| llama-13b-turbomind | 37.99 |
| llama-30b-turbomind | 49.86 |
| llama-65b-turbomind | 58.26 |
| llama-2-7b-turbomind | 38.27 |
| llama-2-13b-turbomind | 45.68 |
| llama-2-70b-turbomind | 64.78 |
| llama-3-8b-turbomind | 59.69 |
| llama-3-70b-turbomind | 79.16 |
| internlm2-1.8b-turbomind | 36.03 |
| internlm2-7b-turbomind | 63.56 |
| internlm2-20b-turbomind | 71.29 |
| qwen-1.8b-turbomind | 22.53 |
| qwen-7b-turbomind | 45.89 |
| qwen-14b-turbomind | 56.75 |
| qwen-72b-turbomind | 63.35 |
| qwen1.5-0.5b-hf | 20.54 |
| qwen1.5-1.8b-hf | 27.01 |
| qwen1.5-4b-hf | 34.81 |
| qwen1.5-7b-hf | 39.87 |
| qwen1.5-14b-hf | 50.38 |
| qwen1.5-32b-hf | 67.47 |
| qwen1.5-72b-hf | 58.81 |
| qwen1.5-moe-a2-7b-hf | 39.46 |
| mistral-7b-v0.1-hf | 56.71 |
| mistral-7b-v0.2-hf | 57.32 |
| mixtral-8x7b-v0.1-hf | 68.46 |
| mixtral-8x22b-v0.1-hf | 79.48 |
| yi-6b-hf | 44.82 |
| yi-34b-hf | 66.37 |
| deepseek-7b-base-hf | 42.88 |
| deepseek-67b-base-hf | 71.86 |
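The `bbh` column above aggregates the 27 subtasks broken out in the Details tables below. The exact summarizer used for these runs is not shown in this file; the sketch below assumes a plain unweighted mean over subtask accuracies, which is how such a summary column is conventionally produced. The function name and the three example subtasks are illustrative only.

```python
# Minimal sketch, assuming the BBH summary score is an unweighted mean over subtasks.
def aggregate_bbh(subtask_scores: dict) -> float:
    """Average per-subtask accuracies (0-100) into a single BBH score."""
    return sum(subtask_scores.values()) / len(subtask_scores)

# Example with three of the llama-3-70b-turbomind subtask scores from the Details table:
print(round(aggregate_bbh({
    'temporal_sequences': 100.00,
    'disambiguation_qa': 82.80,
    'date_understanding': 91.60,
}), 2))  # 91.47 over these three subtasks only, not the full 27-task average
```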
### Details
| model | temporal_sequences | disambiguation_qa | date_understanding | tracking_shuffled_objects_three_objects | penguins_in_a_table | geometric_shapes | snarks | ruin_names | tracking_shuffled_objects_seven_objects |
|:------------------------:|---------------------:|--------------------:|---------------------:|------------------------------------------:|----------------------:|-------------------:|---------:|-------------:|------------------------------------------:|
| llama-7b-turbomind | 23.60 | 46.00 | 44.80 | 36.40 | 30.14 | 0.00 | 46.07 | 21.60 | 15.20 |
| llama-13b-turbomind | 16.80 | 50.00 | 56.80 | 36.40 | 43.15 | 0.00 | 60.67 | 29.20 | 15.20 |
| llama-30b-turbomind | 33.60 | 60.00 | 76.40 | 29.20 | 57.53 | 0.00 | 59.55 | 62.40 | 17.20 |
| llama-65b-turbomind | 84.00 | 76.00 | 84.40 | 50.00 | 65.75 | 0.00 | 62.92 | 69.60 | 31.60 |
| llama-2-7b-turbomind | 12.00 | 46.80 | 60.00 | 34.00 | 32.19 | 0.00 | 49.44 | 32.80 | 18.40 |
| llama-2-13b-turbomind | 24.00 | 40.80 | 73.20 | 36.00 | 45.89 | 0.00 | 55.06 | 37.60 | 22.40 |
| llama-2-70b-turbomind | 75.60 | 66.80 | 88.80 | 73.60 | 69.86 | 0.00 | 73.60 | 60.80 | 57.60 |
| llama-3-8b-turbomind | 65.60 | 42.00 | 78.80 | 56.80 | 69.86 | 0.00 | 56.18 | 66.00 | 30.80 |
| llama-3-70b-turbomind | 100.00 | 82.80 | 91.60 | 100.00 | 86.30 | 0.00 | 81.46 | 77.20 | 94.40 |
| internlm2-1.8b-turbomind | 31.20 | 44.00 | 60.00 | 36.00 | 35.62 | 0.00 | 44.94 | 27.20 | 12.80 |
| internlm2-7b-turbomind | 94.80 | 75.60 | 86.40 | 53.60 | 69.18 | 0.00 | 59.55 | 68.00 | 46.00 |
| internlm2-20b-turbomind | 98.40 | 83.60 | 84.00 | 72.00 | 71.92 | 0.00 | 81.46 | 78.40 | 74.40 |
| qwen-1.8b-turbomind | 26.40 | 39.60 | 33.20 | 28.40 | 28.08 | 0.00 | 44.94 | 21.60 | 12.40 |
| qwen-7b-turbomind | 38.80 | 42.80 | 64.40 | 30.80 | 45.89 | 0.00 | 55.62 | 44.00 | 14.40 |
| qwen-14b-turbomind | 57.60 | 59.20 | 67.20 | 46.40 | 67.12 | 0.00 | 51.12 | 63.60 | 30.40 |
| qwen-72b-turbomind | 72.00 | 66.80 | 77.60 | 81.20 | 84.93 | 0.00 | 78.09 | 67.20 | 63.60 |
| qwen1.5-0.5b-hf | 15.20 | 37.20 | 20.40 | 30.40 | 18.49 | 8.40 | 44.94 | 11.20 | 14.00 |
| qwen1.5-1.8b-hf | 27.60 | 40.80 | 36.00 | 24.40 | 32.19 | 0.00 | 50.56 | 20.80 | 11.20 |
| qwen1.5-4b-hf | 10.40 | 44.40 | 47.20 | 36.80 | 44.52 | 24.80 | 46.63 | 20.80 | 14.80 |
| qwen1.5-7b-hf | 37.20 | 42.40 | 52.00 | 52.40 | 56.85 | 6.80 | 48.31 | 23.60 | 18.40 |
| qwen1.5-14b-hf | 38.80 | 62.80 | 73.60 | 24.80 | 69.86 | 26.80 | 66.29 | 52.80 | 2.00 |
| qwen1.5-32b-hf | 93.60 | 77.20 | 68.40 | 70.00 | 82.88 | 36.80 | 47.75 | 70.40 | 71.20 |
| qwen1.5-72b-hf | 75.60 | 66.00 | 78.80 | 72.80 | 80.82 | 0.00 | 75.84 | 64.80 | 44.40 |
| qwen1.5-moe-a2-7b-hf | 23.20 | 59.60 | 43.20 | 27.60 | 46.58 | 25.20 | 48.88 | 16.80 | 13.20 |
| mistral-7b-v0.1-hf | 73.60 | 53.60 | 76.40 | 45.20 | 56.85 | 28.00 | 64.04 | 66.00 | 21.60 |
| mistral-7b-v0.2-hf | 76.80 | 42.00 | 73.20 | 47.20 | 60.27 | 26.00 | 66.85 | 60.80 | 26.40 |
| mixtral-8x7b-v0.1-hf | 89.60 | 70.80 | 84.80 | 81.20 | 70.55 | 25.60 | 66.29 | 71.20 | 58.80 |
| mixtral-8x22b-v0.1-hf | 98.80 | 77.60 | 92.00 | 98.80 | 83.56 | 35.60 | 80.34 | 79.20 | 82.00 |
| yi-6b-hf | 32.80 | 46.40 | 64.40 | 34.40 | 47.26 | 28.80 | 60.11 | 45.60 | 14.00 |
| yi-34b-hf | 86.00 | 76.00 | 84.80 | 54.80 | 67.81 | 24.80 | 73.60 | 66.00 | 65.60 |
| deepseek-7b-base-hf | 27.60 | 42.00 | 64.40 | 31.20 | 40.41 | 33.60 | 52.25 | 46.00 | 13.20 |
| deepseek-67b-base-hf | 95.60 | 75.60 | 86.40 | 86.40 | 76.71 | 39.20 | 76.40 | 77.20 | 82.00 |
| model | tracking_shuffled_objects_five_objects | logical_deduction_three_objects | hyperbaton | logical_deduction_five_objects | logical_deduction_seven_objects | movie_recommendation | salient_translation_error_detection | reasoning_about_colored_objects | multistep_arithmetic_two |
|:------------------------:|-----------------------------------------:|----------------------------------:|-------------:|---------------------------------:|----------------------------------:|-----------------------:|--------------------------------------:|----------------------------------:|---------------------------:|
| llama-7b-turbomind | 18.40 | 42.80 | 58.00 | 23.20 | 13.20 | 40.00 | 16.40 | 30.40 | 0.00 |
| llama-13b-turbomind | 16.00 | 48.80 | 53.60 | 30.40 | 16.40 | 61.60 | 11.20 | 44.80 | 0.80 |
| llama-30b-turbomind | 22.40 | 66.40 | 73.20 | 43.60 | 31.60 | 84.40 | 43.60 | 57.60 | 2.80 |
| llama-65b-turbomind | 41.60 | 79.20 | 74.40 | 48.40 | 39.20 | 91.20 | 40.40 | 67.20 | 20.00 |
| llama-2-7b-turbomind | 17.20 | 54.80 | 51.60 | 32.80 | 23.60 | 74.40 | 19.60 | 45.60 | 1.20 |
| llama-2-13b-turbomind | 23.20 | 63.60 | 52.40 | 46.00 | 42.00 | 68.00 | 21.60 | 62.00 | 2.00 |
| llama-2-70b-turbomind | 72.40 | 86.40 | 84.40 | 55.20 | 43.20 | 95.60 | 50.80 | 76.80 | 20.80 |
| llama-3-8b-turbomind | 40.80 | 76.40 | 93.20 | 45.20 | 36.80 | 88.80 | 53.60 | 72.80 | 30.80 |
| llama-3-70b-turbomind | 99.20 | 94.00 | 98.00 | 58.40 | 42.80 | 93.60 | 63.60 | 88.40 | 79.20 |
| internlm2-1.8b-turbomind | 16.80 | 47.60 | 63.60 | 21.60 | 12.00 | 69.20 | 16.80 | 45.20 | 5.60 |
| internlm2-7b-turbomind | 51.20 | 78.80 | 90.40 | 52.00 | 41.20 | 95.60 | 58.80 | 74.40 | 44.40 |
| internlm2-20b-turbomind | 81.20 | 95.60 | 83.60 | 62.40 | 48.00 | 94.80 | 57.60 | 75.60 | 72.80 |
| qwen-1.8b-turbomind | 14.80 | 35.60 | 51.20 | 22.40 | 15.20 | 31.20 | 12.40 | 22.00 | 3.20 |
| qwen-7b-turbomind | 20.80 | 54.80 | 76.00 | 37.60 | 27.60 | 74.80 | 41.20 | 57.60 | 23.60 |
| qwen-14b-turbomind | 35.60 | 81.20 | 78.40 | 45.20 | 40.80 | 80.00 | 44.80 | 70.40 | 65.60 |
| qwen-72b-turbomind | 66.40 | 89.20 | 90.40 | 60.00 | 50.80 | 81.60 | 56.40 | 88.00 | 70.40 |
| qwen1.5-0.5b-hf | 20.00 | 34.80 | 46.80 | 18.80 | 15.60 | 24.40 | 15.20 | 16.00 | 1.20 |
| qwen1.5-1.8b-hf | 18.00 | 32.80 | 66.00 | 18.80 | 11.20 | 24.80 | 13.60 | 27.60 | 4.80 |
| qwen1.5-4b-hf | 18.40 | 56.40 | 56.80 | 30.00 | 20.80 | 40.80 | 46.80 | 44.80 | 41.20 |
| qwen1.5-7b-hf | 32.40 | 58.40 | 67.20 | 36.00 | 28.00 | 62.80 | 49.20 | 60.40 | 48.00 |
| qwen1.5-14b-hf | 7.20 | 78.40 | 75.20 | 41.20 | 27.60 | 74.40 | 46.00 | 81.60 | 8.00 |
| qwen1.5-32b-hf | 71.60 | 88.40 | 97.60 | 58.80 | 46.40 | 68.00 | 51.60 | 88.40 | 66.80 |
| qwen1.5-72b-hf | 61.20 | 88.40 | 96.00 | 60.40 | 49.20 | 86.40 | 34.80 | 86.80 | 53.60 |
| qwen1.5-moe-a2-7b-hf | 22.80 | 49.20 | 68.00 | 28.40 | 22.40 | 58.40 | 40.80 | 42.00 | 33.60 |
| mistral-7b-v0.1-hf | 30.40 | 79.60 | 70.80 | 54.40 | 42.80 | 77.60 | 47.20 | 70.00 | 30.40 |
| mistral-7b-v0.2-hf | 32.80 | 74.00 | 77.60 | 48.00 | 40.40 | 84.00 | 49.20 | 76.00 | 35.20 |
| mixtral-8x7b-v0.1-hf | 66.80 | 86.00 | 94.80 | 50.40 | 40.40 | 86.40 | 53.20 | 82.80 | 60.80 |
| mixtral-8x22b-v0.1-hf | 87.60 | 95.20 | 99.60 | 70.00 | 54.00 | 95.20 | 58.40 | 95.20 | 82.00 |
| yi-6b-hf | 17.20 | 49.20 | 72.40 | 34.40 | 28.00 | 76.80 | 32.40 | 56.80 | 9.20 |
| yi-34b-hf | 67.20 | 85.60 | 79.60 | 49.20 | 39.60 | 86.80 | 56.00 | 81.20 | 33.20 |
| deepseek-7b-base-hf | 17.60 | 51.20 | 72.40 | 28.80 | 20.00 | 78.40 | 28.80 | 46.80 | 1.60 |
| deepseek-67b-base-hf | 82.40 | 90.00 | 78.80 | 60.40 | 44.80 | 88.80 | 56.80 | 86.40 | 38.00 |
| model | navigate | dyck_languages | word_sorting | sports_understanding | boolean_expressions | object_counting | formal_fallacies | causal_judgement | web_of_lies |
|:------------------------:|-----------:|-----------------:|---------------:|-----------------------:|----------------------:|------------------:|-------------------:|-------------------:|--------------:|
| llama-7b-turbomind | 45.20 | 1.60 | 8.40 | 81.60 | 66.00 | 47.20 | 46.00 | 40.64 | 57.20 |
| llama-13b-turbomind | 59.20 | 0.80 | 14.40 | 76.40 | 69.20 | 46.40 | 47.20 | 53.48 | 66.80 |
| llama-30b-turbomind | 64.80 | 2.40 | 17.20 | 93.60 | 78.40 | 71.20 | 43.20 | 55.61 | 98.40 |
| llama-65b-turbomind | 72.40 | 6.80 | 21.60 | 98.80 | 81.60 | 70.00 | 40.80 | 55.61 | 99.60 |
| llama-2-7b-turbomind | 54.40 | 1.20 | 10.80 | 88.80 | 68.40 | 49.20 | 48.40 | 52.41 | 53.20 |
| llama-2-13b-turbomind | 74.40 | 2.80 | 18.80 | 97.60 | 74.40 | 52.80 | 46.40 | 54.55 | 96.00 |
| llama-2-70b-turbomind | 82.40 | 13.60 | 30.40 | 98.40 | 81.60 | 83.20 | 43.60 | 63.64 | 100.00 |
| llama-3-8b-turbomind | 90.00 | 9.20 | 38.80 | 95.20 | 87.60 | 84.80 | 51.20 | 50.27 | 100.00 |
| llama-3-70b-turbomind | 96.80 | 48.40 | 48.80 | 99.60 | 92.40 | 99.60 | 62.40 | 58.29 | 100.00 |
| internlm2-1.8b-turbomind | 64.40 | 0.40 | 3.20 | 66.40 | 54.00 | 50.00 | 49.20 | 48.13 | 46.80 |
| internlm2-7b-turbomind | 78.80 | 2.40 | 35.20 | 95.60 | 85.60 | 75.60 | 48.00 | 63.10 | 92.00 |
| internlm2-20b-turbomind | 88.80 | 15.60 | 36.00 | 96.80 | 88.80 | 76.00 | 50.40 | 56.68 | 100.00 |
| qwen-1.8b-turbomind | 50.00 | 0.00 | 0.80 | 62.80 | 29.20 | 2.40 | 6.00 | 12.83 | 1.60 |
| qwen-7b-turbomind | 62.80 | 1.60 | 18.00 | 81.60 | 75.20 | 68.80 | 50.00 | 63.64 | 66.80 |
| qwen-14b-turbomind | 75.60 | 1.20 | 26.80 | 88.80 | 80.40 | 74.40 | 50.00 | 53.48 | 96.80 |
| qwen-72b-turbomind | 56.00 | 14.40 | 35.20 | 87.60 | 91.60 | 81.60 | 5.60 | 31.55 | 62.40 |
| qwen1.5-0.5b-hf | 25.60 | 0.00 | 0.40 | 41.60 | 51.60 | 16.80 | 4.40 | 1.07 | 20.00 |
| qwen1.5-1.8b-hf | 55.60 | 0.00 | 1.60 | 63.60 | 55.20 | 47.60 | 4.40 | 28.88 | 11.20 |
| qwen1.5-4b-hf | 61.60 | 0.40 | 8.80 | 0.80 | 76.00 | 54.40 | 0.80 | 28.34 | 62.40 |
| qwen1.5-7b-hf | 63.60 | 2.40 | 20.80 | 72.40 | 69.60 | 26.80 | 0.00 | 40.64 | 0.00 |
| qwen1.5-14b-hf | 82.40 | 1.20 | 27.60 | 78.40 | 87.20 | 48.00 | 54.00 | 24.06 | 100.00 |
| qwen1.5-32b-hf | 86.80 | 5.60 | 36.80 | 90.00 | 86.40 | 66.40 | 35.60 | 62.57 | 95.60 |
| qwen1.5-72b-hf | 48.40 | 13.20 | 34.40 | 87.60 | 8.00 | 67.60 | 13.60 | 39.57 | 99.60 |
| qwen1.5-moe-a2-7b-hf | 56.80 | 2.00 | 8.80 | 79.60 | 73.60 | 66.80 | 4.00 | 53.48 | 50.40 |
| mistral-7b-v0.1-hf | 73.60 | 4.00 | 26.40 | 97.20 | 82.00 | 67.60 | 43.20 | 48.66 | 100.00 |
| mistral-7b-v0.2-hf | 72.80 | 4.00 | 30.40 | 97.20 | 81.20 | 66.80 | 46.00 | 52.41 | 100.00 |
| mixtral-8x7b-v0.1-hf | 85.60 | 18.80 | 33.60 | 98.00 | 90.80 | 85.20 | 49.60 | 55.61 | 90.80 |
| mixtral-8x22b-v0.1-hf | 92.80 | 51.60 | 40.00 | 98.40 | 91.60 | 95.60 | 54.80 | 56.15 | 100.00 |
| yi-6b-hf | 66.40 | 1.20 | 16.00 | 92.80 | 59.60 | 53.20 | 53.20 | 52.41 | 65.20 |
| yi-34b-hf | 81.20 | 18.80 | 36.40 | 97.60 | 85.60 | 84.00 | 51.20 | 59.89 | 99.60 |
| deepseek-7b-base-hf | 59.20 | 3.20 | 6.40 | 92.00 | 73.20 | 49.60 | 50.80 | 52.41 | 74.80 |
| deepseek-67b-base-hf | 85.20 | 30.00 | 33.20 | 99.60 | 84.80 | 82.40 | 46.80 | 56.68 | 99.60 |
## Chat Models
| model | bbh |
|:-----------------------------:|------:|
| qwen1.5-0.5b-chat-hf | 24.12 |
| qwen1.5-1.8b-chat-hf | 26.82 |
| qwen1.5-4b-chat-hf | 43.15 |
| qwen1.5-7b-chat-hf | 38.12 |
| qwen1.5-14b-chat-hf | 55.38 |
| qwen1.5-32b-chat-hf | 69.28 |
| qwen1.5-72b-chat-hf | 72.97 |
| qwen1.5-110b-chat-hf | 71.04 |
| internlm2-chat-1.8b-hf | 37.69 |
| internlm2-chat-1.8b-sft-hf | 37.12 |
| internlm2-chat-7b-hf | 57.83 |
| internlm2-chat-7b-sft-hf | 57.19 |
| internlm2-chat-20b-hf | 68.24 |
| internlm2-chat-20b-sft-hf | 69.38 |
| llama-3-8b-instruct-hf | 52.85 |
| llama-3-70b-instruct-hf | 82.42 |
| llama-3-8b-instruct-lmdeploy | 53.54 |
| llama-3-70b-instruct-lmdeploy | 82.58 |
| mistral-7b-instruct-v0.1-hf | 32.88 |
| mistral-7b-instruct-v0.2-hf | 48.84 |
| mixtral-8x7b-instruct-v0.1-hf | 59.64 |
### Details
| model | temporal_sequences | disambiguation_qa | date_understanding | tracking_shuffled_objects_three_objects | penguins_in_a_table | geometric_shapes | snarks | ruin_names | tracking_shuffled_objects_seven_objects |
|:-----------------------------:|---------------------:|--------------------:|---------------------:|------------------------------------------:|----------------------:|-------------------:|---------:|-------------:|------------------------------------------:|
| qwen1.5-0.5b-chat-hf | 25.60 | 42.00 | 20.00 | 31.20 | 15.07 | 14.40 | 46.07 | 24.80 | 13.20 |
| qwen1.5-1.8b-chat-hf | 28.80 | 36.00 | 30.40 | 35.20 | 19.18 | 7.60 | 46.63 | 24.00 | 9.60 |
| qwen1.5-4b-chat-hf | 8.00 | 56.00 | 64.80 | 28.40 | 48.63 | 19.60 | 60.67 | 34.00 | 14.40 |
| qwen1.5-7b-chat-hf | 39.60 | 37.60 | 62.40 | 36.80 | 60.96 | 30.80 | 54.49 | 38.00 | 20.00 |
| qwen1.5-14b-chat-hf | 61.60 | 63.60 | 70.00 | 54.00 | 74.66 | 33.60 | 67.42 | 61.20 | 35.60 |
| qwen1.5-32b-chat-hf | 94.40 | 77.60 | 78.00 | 66.00 | 93.84 | 46.00 | 82.58 | 73.60 | 61.60 |
| qwen1.5-72b-chat-hf | 70.40 | 72.40 | 84.40 | 67.20 | 89.73 | 52.00 | 79.21 | 86.40 | 68.80 |
| qwen1.5-110b-chat-hf | 74.80 | 71.20 | 82.80 | 74.80 | 89.04 | 48.00 | 90.45 | 87.60 | 73.60 |
| internlm2-chat-1.8b-hf | 35.60 | 52.40 | 48.80 | 29.60 | 39.73 | 24.40 | 51.69 | 27.20 | 13.20 |
| internlm2-chat-1.8b-sft-hf | 37.20 | 53.60 | 44.00 | 30.00 | 34.93 | 22.40 | 56.74 | 28.00 | 12.00 |
| internlm2-chat-7b-hf | 72.00 | 66.40 | 73.60 | 65.20 | 60.27 | 50.00 | 62.92 | 52.40 | 44.40 |
| internlm2-chat-7b-sft-hf | 67.20 | 66.80 | 58.00 | 63.20 | 48.63 | 45.60 | 64.04 | 59.60 | 42.80 |
| internlm2-chat-20b-hf | 80.40 | 76.00 | 77.60 | 88.80 | 78.08 | 36.40 | 71.91 | 71.60 | 77.20 |
| internlm2-chat-20b-sft-hf | 80.00 | 70.80 | 78.00 | 87.60 | 82.88 | 41.20 | 76.40 | 72.80 | 71.60 |
| llama-3-8b-instruct-hf | 70.40 | 42.80 | 28.40 | 81.20 | 13.01 | 49.20 | 44.94 | 73.20 | 42.40 |
| llama-3-70b-instruct-hf | 100.00 | 84.00 | 91.60 | 95.60 | 78.08 | 52.40 | 87.08 | 89.60 | 97.60 |
| llama-3-8b-instruct-lmdeploy | 73.20 | 45.60 | 34.00 | 79.60 | 31.51 | 48.40 | 47.75 | 76.80 | 47.60 |
| llama-3-70b-instruct-lmdeploy | 100.00 | 84.00 | 90.00 | 96.80 | 83.56 | 56.00 | 87.08 | 89.20 | 97.20 |
| mistral-7b-instruct-v0.1-hf | 32.00 | 22.40 | 52.40 | 35.20 | 30.82 | 23.20 | 38.76 | 46.00 | 18.40 |
| mistral-7b-instruct-v0.2-hf | 66.00 | 58.40 | 50.40 | 48.40 | 48.63 | 37.20 | 65.73 | 40.40 | 29.20 |
| mixtral-8x7b-instruct-v0.1-hf | 63.20 | 68.40 | 65.20 | 60.00 | 78.08 | 40.40 | 74.16 | 64.00 | 46.00 |
| model | tracking_shuffled_objects_five_objects | logical_deduction_three_objects | hyperbaton | logical_deduction_five_objects | logical_deduction_seven_objects | movie_recommendation | salient_translation_error_detection | reasoning_about_colored_objects | multistep_arithmetic_two |
|:-----------------------------:|-----------------------------------------:|----------------------------------:|-------------:|---------------------------------:|----------------------------------:|-----------------------:|--------------------------------------:|----------------------------------:|---------------------------:|
| qwen1.5-0.5b-chat-hf | 20.40 | 34.40 | 51.60 | 21.20 | 13.20 | 26.00 | 20.80 | 17.20 | 1.20 |
| qwen1.5-1.8b-chat-hf | 18.00 | 34.80 | 48.40 | 21.20 | 16.40 | 34.80 | 24.00 | 28.80 | 4.40 |
| qwen1.5-4b-chat-hf | 19.20 | 56.80 | 65.20 | 36.40 | 35.60 | 51.60 | 40.40 | 55.20 | 29.20 |
| qwen1.5-7b-chat-hf | 31.60 | 58.80 | 53.20 | 35.60 | 27.20 | 56.00 | 44.80 | 62.00 | 50.00 |
| qwen1.5-14b-chat-hf | 43.20 | 75.20 | 52.80 | 52.40 | 50.80 | 76.40 | 48.80 | 83.60 | 65.20 |
| qwen1.5-32b-chat-hf | 68.40 | 84.00 | 81.20 | 57.20 | 46.00 | 78.80 | 54.40 | 86.00 | 86.00 |
| qwen1.5-72b-chat-hf | 76.80 | 94.40 | 85.20 | 62.80 | 54.00 | 78.40 | 63.60 | 86.40 | 82.80 |
| qwen1.5-110b-chat-hf | 79.20 | 91.60 | 88.80 | 61.20 | 50.00 | 82.40 | 59.60 | 88.80 | 78.00 |
| internlm2-chat-1.8b-hf | 20.00 | 48.40 | 56.00 | 24.40 | 26.80 | 65.20 | 18.00 | 39.60 | 7.60 |
| internlm2-chat-1.8b-sft-hf | 18.40 | 48.00 | 51.20 | 20.40 | 25.20 | 63.20 | 22.00 | 38.80 | 6.00 |
| internlm2-chat-7b-hf | 48.40 | 75.20 | 84.80 | 42.00 | 36.80 | 79.60 | 53.20 | 65.60 | 26.40 |
| internlm2-chat-7b-sft-hf | 44.00 | 72.40 | 85.60 | 41.60 | 37.20 | 82.40 | 55.60 | 52.80 | 32.00 |
| internlm2-chat-20b-hf | 88.00 | 88.80 | 88.80 | 52.80 | 50.40 | 85.20 | 56.80 | 79.60 | 40.00 |
| internlm2-chat-20b-sft-hf | 83.20 | 90.00 | 90.40 | 55.60 | 48.80 | 84.40 | 57.60 | 79.20 | 38.40 |
| llama-3-8b-instruct-hf | 49.60 | 85.60 | 76.00 | 54.00 | 29.20 | 57.60 | 46.00 | 44.80 | 52.00 |
| llama-3-70b-instruct-hf | 99.20 | 96.80 | 95.20 | 77.20 | 65.20 | 80.00 | 69.60 | 94.80 | 84.00 |
| llama-3-8b-instruct-lmdeploy | 57.20 | 78.00 | 75.60 | 36.00 | 13.20 | 59.20 | 53.60 | 54.80 | 52.80 |
| llama-3-70b-instruct-lmdeploy | 98.80 | 96.40 | 96.80 | 75.20 | 68.80 | 79.60 | 67.60 | 94.00 | 84.80 |
| mistral-7b-instruct-v0.1-hf | 26.00 | 46.00 | 60.00 | 38.00 | 24.00 | 59.20 | 1.20 | 6.00 | 12.40 |
| mistral-7b-instruct-v0.2-hf | 39.60 | 63.60 | 64.00 | 44.00 | 33.20 | 56.00 | 42.40 | 68.40 | 14.00 |
| mixtral-8x7b-instruct-v0.1-hf | 46.40 | 71.60 | 88.80 | 48.00 | 36.80 | 60.00 | 50.00 | 81.20 | 59.20 |
| model | navigate | dyck_languages | word_sorting | sports_understanding | boolean_expressions | object_counting | formal_fallacies | causal_judgement | web_of_lies |
|:-----------------------------:|-----------:|-----------------:|---------------:|-----------------------:|----------------------:|------------------:|-------------------:|-------------------:|--------------:|
| qwen1.5-0.5b-chat-hf | 45.60 | 0.00 | 1.20 | 17.20 | 50.40 | 16.40 | 11.60 | 42.78 | 27.60 |
| qwen1.5-1.8b-chat-hf | 58.40 | 0.00 | 2.00 | 34.00 | 44.80 | 30.40 | 11.60 | 24.60 | 50.00 |
| qwen1.5-4b-chat-hf | 64.00 | 3.20 | 6.80 | 80.40 | 77.60 | 48.80 | 41.20 | 55.61 | 63.20 |
| qwen1.5-7b-chat-hf | 54.40 | 0.40 | 8.00 | 55.60 | 47.60 | 31.20 | 0.00 | 2.14 | 30.00 |
| qwen1.5-14b-chat-hf | 74.40 | 6.40 | 26.40 | 72.40 | 76.40 | 61.60 | 0.80 | 25.67 | 81.20 |
| qwen1.5-32b-chat-hf | 90.00 | 10.40 | 28.40 | 82.40 | 92.80 | 76.80 | 32.40 | 41.71 | 100.00 |
| qwen1.5-72b-chat-hf | 81.20 | 18.40 | 37.60 | 95.20 | 92.80 | 76.00 | 50.40 | 63.64 | 100.00 |
| qwen1.5-110b-chat-hf | 91.60 | 18.00 | 39.60 | 82.80 | 80.80 | 75.20 | 22.40 | 35.83 | 100.00 |
| internlm2-chat-1.8b-hf | 63.20 | 0.00 | 6.00 | 58.00 | 56.80 | 48.80 | 54.80 | 52.94 | 48.40 |
| internlm2-chat-1.8b-sft-hf | 63.20 | 0.00 | 5.60 | 58.00 | 56.80 | 50.00 | 52.40 | 56.68 | 47.60 |
| internlm2-chat-7b-hf | 73.60 | 3.60 | 18.00 | 55.20 | 83.60 | 62.80 | 50.00 | 58.29 | 97.20 |
| internlm2-chat-7b-sft-hf | 71.60 | 4.40 | 20.00 | 82.00 | 84.00 | 60.00 | 51.60 | 52.94 | 98.00 |
| internlm2-chat-20b-hf | 82.40 | 8.00 | 36.00 | 55.60 | 84.40 | 78.00 | 50.40 | 59.36 | 100.00 |
| internlm2-chat-20b-sft-hf | 81.60 | 10.40 | 36.40 | 89.20 | 82.40 | 80.40 | 48.40 | 55.61 | 100.00 |
| llama-3-8b-instruct-hf | 82.80 | 8.80 | 37.20 | 94.40 | 78.80 | 89.60 | 45.20 | 24.06 | 25.60 |
| llama-3-70b-instruct-hf | 95.20 | 18.80 | 49.20 | 98.00 | 94.00 | 90.00 | 73.20 | 68.98 | 100.00 |
| llama-3-8b-instruct-lmdeploy | 83.60 | 10.00 | 40.40 | 96.00 | 77.20 | 89.20 | 43.60 | 37.43 | 3.20 |
| llama-3-70b-instruct-lmdeploy | 95.60 | 22.40 | 48.80 | 96.80 | 91.60 | 87.20 | 72.00 | 69.52 | 100.00 |
| mistral-7b-instruct-v0.1-hf | 70.80 | 0.80 | 5.20 | 68.80 | 69.60 | 51.60 | 3.20 | 12.30 | 33.60 |
| mistral-7b-instruct-v0.2-hf | 62.40 | 4.00 | 15.60 | 81.20 | 70.40 | 50.40 | 32.00 | 34.76 | 98.40 |
| mixtral-8x7b-instruct-v0.1-hf | 76.40 | 12.80 | 23.20 | 55.20 | 85.60 | 83.60 | 40.00 | 43.32 | 88.80 |

View File

@ -0,0 +1,367 @@
# C-Eval
## Base Models
| model | ceval-test | ceval-test-hard | ceval-test-stem | ceval-test-social-science | ceval-test-humanities | ceval-test-other | ceval-dev | ceval-dev-hard | ceval-dev-stem | ceval-dev-social-science | ceval-dev-humanities | ceval-dev-other |
|:------------------------:|-------------:|------------------:|------------------:|----------------------------:|------------------------:|-------------------:|------------:|-----------------:|-----------------:|---------------------------:|-----------------------:|------------------:|
| llama-7b-turbomind | 26.61 | 27.75 | 27.20 | 26.31 | 25.90 | 26.52 | 27.44 | 27.68 | 27.16 | 29.49 | 24.18 | 29.36 |
| llama-13b-turbomind | 29.18 | 25.59 | 27.66 | 33.86 | 28.29 | 28.58 | 31.75 | 30.32 | 31.39 | 35.22 | 30.16 | 30.82 |
| llama-30b-turbomind | 35.09 | 31.68 | 34.56 | 39.89 | 33.02 | 33.76 | 37.70 | 31.97 | 34.80 | 42.72 | 41.19 | 34.93 |
| llama-65b-turbomind | 37.98 | 29.47 | 36.03 | 45.03 | 36.51 | 36.56 | 40.46 | 33.76 | 36.37 | 46.47 | 42.26 | 40.63 |
| llama-2-7b-turbomind | 30.13 | 26.26 | 29.29 | 33.02 | 31.02 | 28.15 | 32.70 | 25.85 | 28.75 | 39.75 | 37.04 | 29.13 |
| llama-2-13b-turbomind | 37.38 | 30.81 | 35.85 | 43.98 | 36.81 | 34.75 | 40.43 | 31.34 | 35.67 | 45.75 | 45.32 | 39.36 |
| llama-2-70b-turbomind | 49.53 | 33.48 | 44.73 | 60.19 | 50.93 | 47.17 | 50.26 | 32.53 | 44.83 | 59.44 | 54.45 | 47.58 |
| llama-3-8b-turbomind | 48.83 | 34.47 | 46.02 | 56.48 | 49.15 | 46.69 | 50.45 | 33.76 | 45.94 | 58.08 | 50.93 | 51.25 |
| llama-3-70b-turbomind | 66.56 | 54.09 | 64.08 | 76.43 | 64.38 | 64.25 | 67.30 | 52.35 | 62.67 | 77.89 | 69.76 | 63.65 |
| internlm2-1.8b-turbomind | 44.79 | 33.93 | 41.19 | 54.26 | 47.15 | 40.35 | 46.64 | 33.00 | 38.62 | 57.28 | 51.30 | 46.89 |
| internlm2-7b-turbomind | 63.54 | 45.32 | 58.10 | 76.40 | 66.94 | 58.32 | 64.23 | 40.09 | 54.37 | 76.88 | 70.11 | 64.77 |
| internlm2-20b-turbomind | 67.28 | 50.15 | 62.33 | 79.59 | 70.55 | 61.82 | 66.73 | 42.50 | 59.25 | 79.98 | 73.43 | 61.56 |
| qwen-1.8b-turbomind | 54.24 | 38.60 | 50.02 | 68.18 | 55.33 | 48.13 | 53.78 | 33.38 | 46.36 | 68.40 | 57.57 | 50.17 |
| qwen-7b-turbomind | 62.06 | 42.73 | 56.21 | 77.12 | 65.28 | 55.76 | 63.23 | 36.99 | 54.74 | 78.55 | 68.94 | 59.02 |
| qwen-14b-turbomind | 70.33 | 53.61 | 65.25 | 83.19 | 72.85 | 65.37 | 72.05 | 55.03 | 66.07 | 85.59 | 74.91 | 67.78 |
| qwen-72b-turbomind | 83.25 | 66.78 | 78.44 | 91.75 | 83.86 | 83.63 | 83.60 | 63.68 | 78.05 | 90.25 | 87.13 | 84.13 |
| qwen1.5-0.5b-hf | 48.36 | 35.55 | 44.72 | 62.00 | 48.51 | 42.41 | 50.43 | 37.00 | 46.28 | 62.64 | 48.11 | 49.18 |
| qwen1.5-1.8b-hf | 58.67 | 40.98 | 53.91 | 74.52 | 58.51 | 53.06 | 59.38 | 43.02 | 53.45 | 75.88 | 60.06 | 54.47 |
| qwen1.5-4b-hf | 66.55 | 48.50 | 61.45 | 81.12 | 67.90 | 61.22 | 66.46 | 43.12 | 56.76 | 82.89 | 67.61 | 68.03 |
| qwen1.5-7b-hf | 72.49 | 52.90 | 66.77 | 85.50 | 74.37 | 69.19 | 73.57 | 49.16 | 66.32 | 84.23 | 77.30 | 73.34 |
| qwen1.5-14b-hf | 76.93 | 60.50 | 72.08 | 88.81 | 77.95 | 73.94 | 77.86 | 54.81 | 71.55 | 86.79 | 82.86 | 76.23 |
| qwen1.5-32b-hf | 82.50 | 66.67 | 77.97 | 90.93 | 83.66 | 81.88 | 82.79 | 71.06 | 80.01 | 89.02 | 83.36 | 81.62 |
| qwen1.5-72b-hf | 83.03 | 65.09 | 77.90 | 91.47 | 83.85 | 83.86 | 83.72 | 64.09 | 77.26 | 91.87 | 87.64 | 84.14 |
| qwen1.5-moe-a2-7b-hf | 76.67 | 51.37 | 68.89 | 88.33 | 77.15 | 79.73 | 77.90 | 51.25 | 67.27 | 89.28 | 83.16 | 81.60 |
| mistral-7b-v0.1-hf | 43.76 | 33.85 | 42.23 | 49.97 | 41.10 | 43.54 | 47.54 | 33.97 | 44.74 | 54.80 | 51.52 | 42.06 |
| mistral-7b-v0.2-hf | 42.81 | 32.84 | 41.00 | 50.19 | 39.45 | 42.77 | 46.44 | 31.67 | 42.89 | 54.50 | 48.75 | 43.23 |
| mixtral-8x7b-v0.1-hf | 51.15 | 41.46 | 50.93 | 59.19 | 46.69 | 48.72 | 55.31 | 42.04 | 52.78 | 62.00 | 56.44 | 52.71 |
| mixtral-8x22b-v0.1-hf | 58.13 | 48.31 | 58.01 | 66.94 | 53.60 | 54.86 | 60.50 | 45.67 | 57.44 | 71.27 | 61.31 | 55.47 |
| yi-6b-hf | 70.78 | 43.72 | 60.54 | 83.29 | 75.39 | 73.40 | 73.13 | 46.87 | 63.14 | 85.52 | 78.70 | 74.45 |
| yi-34b-hf | 80.93 | 58.51 | 73.48 | 89.24 | 83.65 | 84.18 | 81.62 | 56.95 | 71.64 | 89.73 | 87.49 | 86.53 |
| deepseek-7b-base-hf | 43.68 | 28.90 | 37.03 | 53.55 | 50.14 | 40.34 | 45.07 | 31.94 | 38.81 | 56.68 | 47.10 | 43.85 |
| deepseek-67b-base-hf | 66.66 | 44.25 | 57.89 | 79.02 | 72.36 | 65.66 | 66.65 | 38.62 | 56.65 | 79.56 | 73.72 | 66.01 |
### Details on Test Split
| model | computer_network | operating_system | computer_architecture | college_programming | college_physics | college_chemistry | advanced_mathematics | probability_and_statistics | discrete_mathematics | electrical_engineer | metrology_engineer | high_school_mathematics |
|:------------------------:|-------------------:|-------------------:|------------------------:|----------------------:|------------------:|--------------------:|-----------------------:|-----------------------------:|-----------------------:|----------------------:|---------------------:|--------------------------:|
| llama-7b-turbomind | 29.82 | 25.70 | 26.94 | 30.99 | 32.95 | 23.66 | 26.01 | 22.89 | 27.45 | 30.09 | 26.48 | 33.13 |
| llama-13b-turbomind | 33.33 | 37.99 | 31.09 | 29.82 | 22.16 | 27.23 | 31.79 | 27.11 | 24.84 | 28.02 | 33.33 | 30.72 |
| llama-30b-turbomind | 40.94 | 48.60 | 40.41 | 34.21 | 32.95 | 35.71 | 36.42 | 32.53 | 27.45 | 31.56 | 36.07 | 30.12 |
| llama-65b-turbomind | 41.52 | 50.84 | 44.04 | 40.94 | 27.84 | 29.46 | 28.32 | 30.72 | 29.41 | 35.10 | 42.47 | 30.12 |
| llama-2-7b-turbomind | 33.92 | 37.99 | 34.72 | 30.99 | 26.70 | 21.88 | 31.79 | 25.30 | 24.18 | 31.56 | 39.73 | 30.12 |
| llama-2-13b-turbomind | 40.94 | 46.93 | 37.82 | 36.26 | 30.68 | 29.46 | 35.84 | 30.72 | 24.84 | 32.74 | 42.92 | 34.94 |
| llama-2-70b-turbomind | 55.56 | 58.66 | 53.89 | 47.95 | 34.09 | 33.48 | 32.95 | 27.11 | 34.64 | 37.76 | 57.99 | 29.52 |
| llama-3-8b-turbomind | 55.56 | 58.66 | 55.96 | 51.17 | 27.27 | 35.27 | 36.42 | 31.33 | 34.64 | 40.12 | 50.68 | 30.72 |
| llama-3-70b-turbomind | 69.59 | 75.98 | 69.95 | 71.64 | 49.43 | 58.04 | 52.02 | 53.01 | 58.82 | 45.72 | 68.95 | 40.96 |
| internlm2-1.8b-turbomind | 40.35 | 40.78 | 39.38 | 32.16 | 34.66 | 34.38 | 31.21 | 31.33 | 35.95 | 35.10 | 51.60 | 27.71 |
| internlm2-7b-turbomind | 56.14 | 57.54 | 62.69 | 49.42 | 43.75 | 48.21 | 34.68 | 32.53 | 33.33 | 41.00 | 60.27 | 40.36 |
| internlm2-20b-turbomind | 62.57 | 65.36 | 66.84 | 58.77 | 43.18 | 51.79 | 39.31 | 40.36 | 35.95 | 42.77 | 66.67 | 47.59 |
| qwen-1.8b-turbomind | 46.20 | 41.90 | 46.63 | 36.84 | 40.34 | 36.61 | 27.75 | 28.92 | 32.68 | 36.58 | 57.08 | 30.12 |
| qwen-7b-turbomind | 52.63 | 54.75 | 54.40 | 46.20 | 35.80 | 44.20 | 36.99 | 27.71 | 26.80 | 38.35 | 57.99 | 33.13 |
| qwen-14b-turbomind | 58.48 | 64.80 | 59.07 | 54.68 | 45.45 | 57.59 | 45.09 | 33.73 | 39.22 | 49.26 | 67.58 | 45.78 |
| qwen-72b-turbomind | 83.04 | 73.74 | 79.27 | 76.61 | 75.00 | 64.29 | 49.13 | 44.58 | 46.41 | 66.37 | 85.84 | 68.07 |
| qwen1.5-0.5b-hf | 37.43 | 40.22 | 41.45 | 35.09 | 40.91 | 34.82 | 30.06 | 27.11 | 26.80 | 29.79 | 54.34 | 31.93 |
| qwen1.5-1.8b-hf | 47.37 | 50.84 | 47.67 | 38.30 | 43.18 | 35.27 | 29.48 | 30.12 | 33.99 | 39.53 | 58.90 | 28.92 |
| qwen1.5-4b-hf | 62.57 | 56.98 | 56.99 | 46.78 | 48.30 | 45.98 | 40.46 | 34.34 | 31.37 | 46.61 | 62.10 | 43.37 |
| qwen1.5-7b-hf | 66.08 | 62.57 | 66.32 | 55.56 | 54.55 | 47.77 | 41.62 | 31.93 | 35.95 | 49.85 | 74.43 | 49.40 |
| qwen1.5-14b-hf | 71.35 | 66.48 | 68.39 | 64.91 | 57.95 | 65.62 | 41.62 | 40.36 | 47.71 | 56.64 | 79.45 | 56.63 |
| qwen1.5-32b-hf | 84.80 | 73.18 | 74.61 | 70.18 | 71.59 | 61.61 | 49.13 | 45.78 | 49.02 | 61.95 | 87.67 | 72.89 |
| qwen1.5-72b-hf | 85.38 | 73.74 | 78.24 | 78.36 | 72.73 | 63.39 | 43.35 | 40.96 | 49.02 | 65.78 | 85.84 | 66.27 |
| qwen1.5-moe-a2-7b-hf | 77.78 | 73.74 | 68.91 | 64.91 | 66.48 | 49.11 | 33.53 | 36.75 | 35.95 | 61.06 | 91.32 | 40.96 |
| mistral-7b-v0.1-hf | 55.56 | 55.31 | 56.99 | 48.25 | 39.77 | 39.29 | 33.53 | 25.90 | 31.37 | 35.99 | 45.21 | 27.11 |
| mistral-7b-v0.2-hf | 56.14 | 53.63 | 55.44 | 47.66 | 36.36 | 34.38 | 32.37 | 25.30 | 33.33 | 31.86 | 45.21 | 29.52 |
| mixtral-8x7b-v0.1-hf | 62.57 | 64.80 | 60.10 | 60.53 | 38.64 | 42.41 | 40.46 | 37.35 | 45.75 | 35.99 | 60.27 | 34.94 |
| mixtral-8x22b-v0.1-hf | 65.50 | 74.86 | 63.73 | 65.79 | 46.59 | 52.68 | 52.02 | 45.78 | 52.94 | 42.77 | 62.56 | 39.16 |
| yi-6b-hf | 68.42 | 63.13 | 69.43 | 57.89 | 42.05 | 48.66 | 31.79 | 33.13 | 28.76 | 49.85 | 74.89 | 37.35 |
| yi-34b-hf | 83.63 | 80.45 | 74.09 | 68.42 | 62.50 | 60.27 | 45.09 | 38.55 | 50.33 | 65.19 | 88.58 | 49.40 |
| deepseek-7b-base-hf | 44.44 | 44.13 | 44.56 | 36.26 | 30.68 | 29.02 | 32.37 | 24.70 | 26.14 | 35.99 | 48.86 | 28.31 |
| deepseek-67b-base-hf | 63.16 | 70.39 | 65.80 | 59.36 | 42.61 | 45.54 | 35.84 | 38.55 | 42.48 | 44.54 | 68.95 | 33.73 |
| model | high_school_physics | high_school_chemistry | high_school_biology | middle_school_mathematics | middle_school_biology | middle_school_physics | middle_school_chemistry | veterinary_medicine | college_economics | business_administration | marxism | mao_zedong_thought |
|:------------------------:|----------------------:|------------------------:|----------------------:|----------------------------:|------------------------:|------------------------:|--------------------------:|----------------------:|--------------------:|--------------------------:|----------:|---------------------:|
| llama-7b-turbomind | 29.14 | 26.74 | 24.57 | 29.94 | 22.92 | 23.60 | 20.00 | 30.95 | 29.98 | 24.58 | 25.70 | 25.11 |
| llama-13b-turbomind | 22.29 | 18.60 | 28.00 | 26.55 | 26.56 | 25.28 | 19.46 | 29.05 | 28.77 | 28.57 | 39.66 | 43.38 |
| llama-30b-turbomind | 25.14 | 33.14 | 36.00 | 31.07 | 39.06 | 28.09 | 33.51 | 38.10 | 35.21 | 35.88 | 48.04 | 33.33 |
| llama-65b-turbomind | 33.71 | 26.16 | 38.29 | 33.90 | 44.27 | 36.52 | 38.92 | 38.10 | 37.42 | 42.19 | 59.22 | 48.40 |
| llama-2-7b-turbomind | 26.86 | 23.26 | 26.86 | 28.81 | 28.12 | 29.78 | 22.70 | 30.48 | 31.79 | 30.56 | 33.52 | 36.07 |
| llama-2-13b-turbomind | 28.00 | 31.98 | 36.57 | 36.72 | 38.54 | 36.52 | 37.84 | 46.67 | 37.02 | 36.54 | 57.54 | 41.10 |
| llama-2-70b-turbomind | 40.00 | 36.05 | 48.00 | 36.72 | 66.67 | 55.06 | 55.68 | 52.86 | 51.91 | 48.50 | 68.16 | 60.73 |
| llama-3-8b-turbomind | 41.71 | 38.37 | 50.86 | 36.16 | 61.98 | 63.48 | 63.78 | 56.19 | 41.65 | 49.17 | 69.27 | 54.34 |
| llama-3-70b-turbomind | 63.43 | 56.98 | 69.14 | 59.32 | 84.90 | 75.28 | 78.92 | 79.52 | 68.81 | 59.80 | 86.59 | 79.91 |
| internlm2-1.8b-turbomind | 30.29 | 45.93 | 46.29 | 33.33 | 63.02 | 60.11 | 62.70 | 47.62 | 35.61 | 37.87 | 69.27 | 61.64 |
| internlm2-7b-turbomind | 64.57 | 65.12 | 76.00 | 54.80 | 91.15 | 85.96 | 90.27 | 74.29 | 57.34 | 50.50 | 86.59 | 83.56 |
| internlm2-20b-turbomind | 68.57 | 74.42 | 78.86 | 58.76 | 91.67 | 90.45 | 90.27 | 72.38 | 57.95 | 55.81 | 88.83 | 88.58 |
| qwen-1.8b-turbomind | 55.43 | 56.98 | 61.14 | 54.80 | 85.42 | 84.83 | 85.41 | 54.76 | 43.06 | 44.19 | 83.80 | 79.91 |
| qwen-7b-turbomind | 68.00 | 69.19 | 82.86 | 57.63 | 93.75 | 87.64 | 92.43 | 63.81 | 47.28 | 57.48 | 86.59 | 82.65 |
| qwen-14b-turbomind | 78.86 | 83.14 | 92.57 | 67.23 | 96.88 | 95.51 | 96.76 | 73.33 | 56.94 | 64.45 | 91.62 | 86.76 |
| qwen-72b-turbomind | 93.14 | 93.60 | 95.43 | 88.70 | 98.44 | 97.75 | 99.46 | 90.00 | 75.45 | 80.73 | 96.09 | 99.54 |
| qwen1.5-0.5b-hf | 48.57 | 44.19 | 60.00 | 40.68 | 73.44 | 69.66 | 78.92 | 49.05 | 34.41 | 40.20 | 79.89 | 74.43 |
| qwen1.5-1.8b-hf | 58.86 | 68.02 | 76.00 | 59.32 | 91.15 | 90.45 | 87.03 | 63.81 | 44.87 | 48.50 | 86.03 | 90.41 |
| qwen1.5-4b-hf | 66.86 | 77.33 | 82.86 | 68.93 | 95.31 | 92.70 | 97.30 | 71.90 | 51.31 | 61.13 | 91.62 | 94.52 |
| qwen1.5-7b-hf | 79.43 | 82.56 | 91.43 | 77.40 | 96.88 | 95.51 | 96.22 | 80.00 | 62.37 | 69.77 | 93.30 | 97.26 |
| qwen1.5-14b-hf | 86.29 | 87.79 | 93.14 | 83.05 | 97.92 | 95.51 | 97.84 | 82.86 | 63.78 | 77.08 | 95.53 | 96.35 |
| qwen1.5-32b-hf | 88.00 | 95.35 | 94.86 | 91.53 | 97.92 | 99.44 | 100.00 | 90.00 | 73.44 | 78.74 | 94.97 | 98.63 |
| qwen1.5-72b-hf | 91.43 | 93.60 | 95.43 | 88.70 | 97.92 | 98.31 | 99.46 | 90.00 | 74.25 | 80.40 | 94.41 | 98.63 |
| qwen1.5-moe-a2-7b-hf | 70.86 | 77.33 | 82.86 | 68.36 | 97.92 | 93.26 | 97.30 | 89.52 | 70.22 | 74.75 | 96.09 | 98.17 |
| mistral-7b-v0.1-hf | 33.14 | 40.70 | 40.57 | 40.11 | 47.92 | 49.44 | 50.81 | 47.62 | 44.87 | 37.87 | 58.10 | 48.40 |
| mistral-7b-v0.2-hf | 34.86 | 36.63 | 45.71 | 36.72 | 46.35 | 46.07 | 48.65 | 43.81 | 43.46 | 39.53 | 57.54 | 48.86 |
| mixtral-8x7b-v0.1-hf | 49.71 | 42.44 | 53.71 | 47.46 | 62.50 | 61.24 | 60.00 | 57.62 | 52.52 | 44.52 | 68.72 | 57.99 |
| mixtral-8x22b-v0.1-hf | 54.29 | 43.02 | 58.29 | 55.93 | 76.04 | 66.29 | 75.68 | 66.19 | 60.97 | 51.83 | 74.30 | 70.78 |
| yi-6b-hf | 58.86 | 69.19 | 78.29 | 43.50 | 92.19 | 89.33 | 90.27 | 83.81 | 59.56 | 70.10 | 93.85 | 97.72 |
| yi-34b-hf | 80.00 | 81.98 | 93.14 | 65.54 | 97.40 | 95.51 | 96.76 | 92.86 | 74.04 | 76.08 | 94.97 | 97.26 |
| deepseek-7b-base-hf | 29.14 | 30.81 | 33.14 | 24.29 | 53.12 | 45.51 | 48.65 | 50.48 | 38.23 | 44.19 | 62.01 | 65.30 |
| deepseek-67b-base-hf | 60.00 | 55.23 | 64.00 | 46.33 | 84.90 | 79.78 | 83.24 | 73.33 | 57.75 | 63.79 | 89.94 | 88.58 |
| model | education_science | teacher_qualification | high_school_politics | high_school_geography | middle_school_politics | middle_school_geography | modern_chinese_history | ideological_and_moral_cultivation | logic | law | chinese_language_and_literature | art_studies |
|:------------------------:|--------------------:|------------------------:|-----------------------:|------------------------:|-------------------------:|--------------------------:|-------------------------:|------------------------------------:|--------:|------:|----------------------------------:|--------------:|
| llama-7b-turbomind | 22.96 | 31.58 | 25.57 | 29.78 | 22.80 | 25.00 | 21.70 | 21.51 | 25.00 | 26.24 | 22.49 | 25.84 |
| llama-13b-turbomind | 29.26 | 30.83 | 33.52 | 36.52 | 34.72 | 33.33 | 24.06 | 40.12 | 26.47 | 33.48 | 30.14 | 29.87 |
| llama-30b-turbomind | 37.41 | 46.37 | 32.95 | 38.20 | 50.78 | 40.74 | 28.77 | 45.93 | 33.33 | 32.13 | 39.23 | 22.82 |
| llama-65b-turbomind | 39.63 | 51.13 | 31.82 | 39.89 | 58.03 | 42.59 | 34.91 | 55.23 | 39.71 | 30.32 | 37.80 | 32.89 |
| llama-2-7b-turbomind | 27.78 | 34.34 | 31.82 | 34.83 | 35.23 | 34.26 | 28.77 | 38.95 | 32.35 | 33.94 | 27.27 | 30.87 |
| llama-2-13b-turbomind | 41.48 | 47.37 | 37.50 | 37.64 | 50.78 | 52.78 | 43.40 | 48.84 | 32.35 | 38.46 | 36.36 | 30.20 |
| llama-2-70b-turbomind | 57.78 | 69.17 | 50.57 | 58.43 | 69.95 | 66.67 | 50.94 | 72.09 | 50.98 | 42.53 | 44.98 | 52.01 |
| llama-3-8b-turbomind | 56.30 | 65.41 | 47.16 | 56.18 | 64.25 | 61.11 | 55.66 | 67.44 | 41.67 | 40.27 | 45.45 | 50.34 |
| llama-3-70b-turbomind | 72.22 | 85.46 | 75.00 | 74.72 | 84.97 | 76.85 | 75.00 | 76.16 | 59.31 | 52.94 | 62.68 | 68.46 |
| internlm2-1.8b-turbomind | 47.41 | 61.40 | 55.11 | 47.75 | 61.66 | 64.81 | 61.79 | 63.95 | 32.35 | 32.58 | 48.33 | 36.58 |
| internlm2-7b-turbomind | 66.67 | 85.96 | 78.98 | 74.72 | 91.71 | 87.96 | 80.66 | 80.23 | 42.16 | 50.23 | 64.11 | 70.13 |
| internlm2-20b-turbomind | 69.26 | 89.22 | 83.52 | 80.34 | 90.67 | 91.67 | 83.02 | 85.47 | 49.02 | 54.30 | 72.25 | 73.15 |
| qwen-1.8b-turbomind | 51.11 | 70.68 | 71.02 | 62.36 | 88.60 | 87.04 | 69.81 | 73.26 | 29.90 | 46.15 | 50.24 | 47.32 |
| qwen-7b-turbomind | 57.41 | 83.71 | 88.64 | 79.78 | 93.26 | 94.44 | 75.47 | 79.07 | 42.16 | 47.96 | 59.33 | 65.10 |
| qwen-14b-turbomind | 72.96 | 89.97 | 93.75 | 83.71 | 96.37 | 95.37 | 86.32 | 87.21 | 50.00 | 60.63 | 66.99 | 72.48 |
| qwen-72b-turbomind | 85.56 | 96.24 | 95.45 | 93.26 | 97.93 | 97.22 | 92.45 | 91.86 | 67.65 | 76.92 | 75.12 | 83.89 |
| qwen1.5-0.5b-hf | 43.33 | 63.16 | 65.91 | 56.18 | 82.90 | 79.63 | 68.87 | 70.35 | 28.43 | 37.56 | 39.23 | 32.21 |
| qwen1.5-1.8b-hf | 57.41 | 76.44 | 81.25 | 75.84 | 92.75 | 91.67 | 79.72 | 81.98 | 34.31 | 47.96 | 47.85 | 43.62 |
| qwen1.5-4b-hf | 65.93 | 87.47 | 86.93 | 82.58 | 94.30 | 95.37 | 84.91 | 84.30 | 40.20 | 62.90 | 58.85 | 58.72 |
| qwen1.5-7b-hf | 69.26 | 91.98 | 90.91 | 89.89 | 95.85 | 94.44 | 89.15 | 87.21 | 48.04 | 67.87 | 63.16 | 68.12 |
| qwen1.5-14b-hf | 78.89 | 94.99 | 94.89 | 91.57 | 96.89 | 98.15 | 91.04 | 88.37 | 57.84 | 69.68 | 66.99 | 73.83 |
| qwen1.5-32b-hf | 83.70 | 95.99 | 93.75 | 94.38 | 98.45 | 97.22 | 90.57 | 91.28 | 70.10 | 76.92 | 76.56 | 80.87 |
| qwen1.5-72b-hf | 84.44 | 96.49 | 96.59 | 93.82 | 98.45 | 97.22 | 92.92 | 91.28 | 66.67 | 76.92 | 74.16 | 85.23 |
| qwen1.5-moe-a2-7b-hf | 80.74 | 95.49 | 89.20 | 89.33 | 94.82 | 94.44 | 92.45 | 91.28 | 52.45 | 75.57 | 67.94 | 79.87 |
| mistral-7b-v0.1-hf | 45.19 | 59.15 | 43.75 | 49.44 | 56.48 | 56.48 | 45.28 | 58.14 | 37.75 | 38.91 | 40.67 | 34.56 |
| mistral-7b-v0.2-hf | 45.93 | 58.65 | 38.07 | 48.31 | 63.21 | 58.33 | 41.98 | 54.07 | 35.78 | 40.27 | 38.28 | 32.21 |
| mixtral-8x7b-v0.1-hf | 57.04 | 67.92 | 53.41 | 55.06 | 69.95 | 64.81 | 47.64 | 70.93 | 42.16 | 38.01 | 46.41 | 36.58 |
| mixtral-8x22b-v0.1-hf | 60.37 | 72.68 | 64.77 | 65.17 | 77.20 | 71.30 | 57.08 | 75.00 | 49.51 | 43.44 | 52.63 | 49.33 |
| yi-6b-hf | 79.26 | 92.48 | 77.27 | 76.40 | 92.75 | 93.52 | 89.15 | 90.12 | 60.78 | 74.66 | 61.24 | 74.16 |
| yi-34b-hf | 84.81 | 96.24 | 88.07 | 88.20 | 96.37 | 96.30 | 91.98 | 91.28 | 75.00 | 78.73 | 80.38 | 82.89 |
| deepseek-7b-base-hf | 52.22 | 70.18 | 47.16 | 51.12 | 60.62 | 44.44 | 58.49 | 66.86 | 31.86 | 37.56 | 53.11 | 61.07 |
| deepseek-67b-base-hf | 76.67 | 89.22 | 77.27 | 78.65 | 89.64 | 78.70 | 85.85 | 84.30 | 50.00 | 64.25 | 69.38 | 84.23 |
| model | professional_tour_guide | legal_professional | high_school_chinese | high_school_history | middle_school_history | civil_servant | sports_science | plant_protection | basic_medicine | clinical_medicine | urban_and_rural_planner | accountant |
|:------------------------:|--------------------------:|---------------------:|----------------------:|----------------------:|------------------------:|----------------:|-----------------:|-------------------:|-----------------:|--------------------:|--------------------------:|-------------:|
| llama-7b-turbomind | 29.70 | 23.72 | 27.53 | 30.22 | 30.92 | 27.04 | 22.78 | 28.64 | 28.00 | 25.00 | 26.32 | 29.80 |
| llama-13b-turbomind | 25.94 | 20.93 | 25.84 | 29.67 | 24.64 | 29.60 | 26.67 | 29.15 | 33.71 | 25.50 | 28.47 | 28.44 |
| llama-30b-turbomind | 29.32 | 27.91 | 30.34 | 36.26 | 37.20 | 36.13 | 36.11 | 38.69 | 34.29 | 29.50 | 38.52 | 29.35 |
| llama-65b-turbomind | 28.95 | 30.70 | 30.90 | 44.51 | 35.75 | 36.60 | 45.56 | 39.20 | 37.71 | 30.00 | 39.47 | 37.02 |
| llama-2-7b-turbomind | 29.70 | 30.23 | 24.72 | 29.67 | 34.78 | 30.07 | 31.11 | 31.16 | 30.29 | 25.50 | 31.34 | 27.31 |
| llama-2-13b-turbomind | 30.83 | 32.56 | 24.16 | 42.31 | 45.41 | 32.87 | 36.67 | 45.23 | 38.29 | 33.50 | 35.17 | 34.31 |
| llama-2-70b-turbomind | 53.76 | 38.14 | 30.34 | 58.79 | 65.70 | 43.82 | 51.11 | 58.29 | 49.71 | 42.00 | 49.76 | 46.28 |
| llama-3-8b-turbomind | 52.63 | 42.33 | 27.53 | 51.65 | 65.70 | 44.52 | 54.44 | 51.26 | 46.86 | 43.00 | 46.41 | 45.15 |
| llama-3-70b-turbomind | 72.93 | 52.56 | 32.58 | 71.98 | 83.57 | 56.88 | 69.44 | 78.89 | 76.00 | 67.50 | 57.89 | 59.14 |
| internlm2-1.8b-turbomind | 51.50 | 38.14 | 25.84 | 56.04 | 71.50 | 47.32 | 35.00 | 43.72 | 42.29 | 39.00 | 41.15 | 36.57 |
| internlm2-7b-turbomind | 72.56 | 53.49 | 52.25 | 79.67 | 90.82 | 62.00 | 62.78 | 64.32 | 66.86 | 59.50 | 55.74 | 53.50 |
| internlm2-20b-turbomind | 74.06 | 54.42 | 56.18 | 81.87 | 92.27 | 61.77 | 68.33 | 69.85 | 68.00 | 63.50 | 60.77 | 58.92 |
| qwen-1.8b-turbomind | 54.14 | 43.72 | 39.89 | 69.23 | 85.02 | 49.88 | 45.56 | 48.74 | 48.57 | 51.50 | 46.89 | 45.82 |
| qwen-7b-turbomind | 71.05 | 48.37 | 53.93 | 81.87 | 93.72 | 59.67 | 54.44 | 62.31 | 58.29 | 57.50 | 50.24 | 56.66 |
| qwen-14b-turbomind | 79.70 | 53.02 | 63.48 | 87.36 | 94.20 | 71.33 | 63.33 | 71.36 | 73.14 | 68.00 | 59.09 | 67.95 |
| qwen-72b-turbomind | 90.23 | 77.21 | 79.21 | 91.76 | 96.14 | 77.86 | 86.11 | 85.43 | 91.43 | 90.50 | 76.08 | 86.68 |
| qwen1.5-0.5b-hf | 44.36 | 36.74 | 39.33 | 58.24 | 78.26 | 43.36 | 40.00 | 45.23 | 41.71 | 42.50 | 43.54 | 43.12 |
| qwen1.5-1.8b-hf | 59.40 | 47.91 | 37.08 | 72.53 | 91.30 | 53.61 | 53.33 | 51.26 | 49.71 | 58.00 | 51.20 | 56.21 |
| qwen1.5-4b-hf | 65.04 | 58.60 | 55.62 | 83.52 | 94.20 | 62.00 | 63.89 | 65.33 | 65.71 | 64.00 | 55.26 | 61.40 |
| qwen1.5-7b-hf | 78.57 | 66.51 | 66.85 | 87.91 | 94.69 | 68.07 | 65.00 | 64.82 | 77.14 | 77.50 | 60.77 | 74.49 |
| qwen1.5-14b-hf | 83.08 | 72.09 | 70.22 | 90.11 | 94.20 | 69.46 | 73.89 | 70.35 | 82.29 | 83.00 | 65.31 | 78.33 |
| qwen1.5-32b-hf | 87.59 | 78.14 | 79.78 | 92.86 | 95.65 | 78.32 | 80.56 | 79.90 | 90.29 | 89.00 | 77.27 | 86.68 |
| qwen1.5-72b-hf | 91.35 | 76.74 | 79.21 | 91.76 | 96.14 | 79.25 | 85.56 | 86.93 | 92.00 | 90.00 | 75.84 | 86.91 |
| qwen1.5-moe-a2-7b-hf | 88.35 | 75.81 | 51.12 | 79.12 | 94.69 | 67.37 | 80.56 | 73.37 | 87.43 | 84.00 | 78.23 | 82.39 |
| mistral-7b-v0.1-hf | 40.23 | 39.07 | 24.16 | 41.21 | 52.17 | 41.49 | 45.00 | 52.26 | 45.14 | 42.00 | 42.58 | 44.02 |
| mistral-7b-v0.2-hf | 36.84 | 34.88 | 23.03 | 43.96 | 52.66 | 40.79 | 50.00 | 50.75 | 45.14 | 40.50 | 42.58 | 40.86 |
| mixtral-8x7b-v0.1-hf | 47.74 | 40.00 | 28.09 | 57.14 | 58.94 | 44.29 | 58.33 | 53.77 | 48.57 | 46.00 | 51.20 | 46.50 |
| mixtral-8x22b-v0.1-hf | 59.02 | 41.86 | 29.78 | 60.99 | 71.01 | 50.82 | 57.78 | 67.34 | 62.29 | 52.00 | 53.35 | 55.98 |
| yi-6b-hf | 85.34 | 67.91 | 53.93 | 80.22 | 91.79 | 65.97 | 72.22 | 72.36 | 82.29 | 84.50 | 69.86 | 71.56 |
| yi-34b-hf | 94.36 | 76.74 | 65.73 | 87.91 | 95.17 | 79.25 | 85.56 | 90.95 | 90.86 | 92.00 | 76.79 | 82.39 |
| deepseek-7b-base-hf | 65.79 | 29.30 | 32.58 | 47.80 | 67.15 | 37.76 | 44.44 | 52.26 | 43.43 | 36.50 | 41.15 | 37.02 |
| deepseek-67b-base-hf | 83.83 | 58.60 | 45.51 | 79.67 | 90.34 | 62.47 | 70.56 | 70.85 | 81.14 | 71.50 | 61.72 | 60.05 |
| model | fire_engineer | environmental_impact_assessment_engineer | tax_accountant | physician |
|:------------------------:|----------------:|-------------------------------------------:|-----------------:|------------:|
| llama-7b-turbomind | 22.34 | 24.91 | 29.12 | 27.77 |
| llama-13b-turbomind | 24.11 | 30.25 | 27.77 | 30.70 |
| llama-30b-turbomind | 28.72 | 31.67 | 31.83 | 36.57 |
| llama-65b-turbomind | 28.37 | 39.15 | 33.63 | 35.44 |
| llama-2-7b-turbomind | 22.70 | 24.91 | 25.51 | 29.80 |
| llama-2-13b-turbomind | 25.53 | 35.94 | 29.35 | 35.44 |
| llama-2-70b-turbomind | 36.52 | 52.67 | 36.12 | 52.60 |
| llama-3-8b-turbomind | 35.46 | 49.82 | 41.31 | 55.30 |
| llama-3-70b-turbomind | 48.58 | 64.41 | 52.60 | 75.40 |
| internlm2-1.8b-turbomind | 32.27 | 42.35 | 39.05 | 45.15 |
| internlm2-7b-turbomind | 46.81 | 55.16 | 47.63 | 67.27 |
| internlm2-20b-turbomind | 45.04 | 62.63 | 51.47 | 69.75 |
| qwen-1.8b-turbomind | 41.84 | 47.69 | 45.60 | 57.34 |
| qwen-7b-turbomind | 41.84 | 54.80 | 48.08 | 69.53 |
| qwen-14b-turbomind | 45.74 | 64.77 | 56.43 | 77.88 |
| qwen-72b-turbomind | 80.50 | 74.73 | 81.04 | 89.62 |
| qwen1.5-0.5b-hf | 39.36 | 41.28 | 38.37 | 48.08 |
| qwen1.5-1.8b-hf | 45.74 | 49.47 | 51.69 | 63.43 |
| qwen1.5-4b-hf | 50.35 | 51.60 | 58.69 | 75.17 |
| qwen1.5-7b-hf | 58.51 | 65.84 | 67.04 | 81.94 |
| qwen1.5-14b-hf | 63.83 | 67.26 | 72.23 | 87.36 |
| qwen1.5-32b-hf | 74.47 | 73.31 | 80.14 | 90.74 |
| qwen1.5-72b-hf | 79.79 | 75.09 | 81.04 | 90.07 |
| qwen1.5-moe-a2-7b-hf | 74.82 | 77.58 | 79.68 | 91.65 |
| mistral-7b-v0.1-hf | 32.27 | 45.91 | 37.70 | 50.56 |
| mistral-7b-v0.2-hf | 32.62 | 44.13 | 36.79 | 46.28 |
| mixtral-8x7b-v0.1-hf | 35.11 | 53.02 | 46.73 | 52.37 |
| mixtral-8x22b-v0.1-hf | 38.65 | 56.23 | 49.21 | 59.82 |
| yi-6b-hf | 67.38 | 68.68 | 69.53 | 83.07 |
| yi-34b-hf | 77.66 | 83.27 | 77.43 | 89.84 |
| deepseek-7b-base-hf | 30.50 | 38.79 | 35.67 | 46.28 |
| deepseek-67b-base-hf | 46.81 | 65.12 | 54.40 | 77.65 |
### Details on Dev Split
## Chat Models
| model | ceval-test | ceval-test-hard | ceval-test-stem | ceval-test-social-science | ceval-test-humanities | ceval-test-other | ceval-dev | ceval-dev-hard | ceval-dev-stem | ceval-dev-social-science | ceval-dev-humanities | ceval-dev-other |
|:-----------------------------:|-------------:|------------------:|------------------:|----------------------------:|------------------------:|-------------------:|------------:|-----------------:|-----------------:|---------------------------:|-----------------------:|------------------:|
| qwen1.5-0.5b-chat-hf | 36.88 | 28.83 | 34.49 | 43.46 | 37.35 | 34.76 | 38.58 | 33.90 | 33.63 | 43.81 | 41.79 | 39.59 |
| qwen1.5-1.8b-chat-hf | 55.17 | 38.21 | 50.63 | 70.26 | 56.04 | 48.82 | 55.93 | 37.60 | 50.31 | 67.59 | 60.90 | 50.59 |
| qwen1.5-4b-chat-hf | 61.54 | 44.79 | 56.86 | 75.84 | 62.13 | 56.46 | 62.76 | 38.32 | 55.39 | 79.53 | 65.67 | 58.00 |
| qwen1.5-7b-chat-hf | 68.71 | 51.77 | 64.27 | 81.23 | 68.22 | 65.88 | 71.10 | 50.13 | 65.42 | 83.99 | 73.77 | 67.02 |
| qwen1.5-14b-chat-hf | 74.80 | 56.54 | 69.46 | 87.47 | 76.46 | 71.32 | 76.35 | 52.08 | 69.68 | 86.70 | 80.56 | 74.87 |
| qwen1.5-32b-chat-hf | 80.47 | 63.17 | 75.66 | 89.58 | 81.98 | 79.43 | 81.27 | 63.51 | 76.64 | 89.39 | 82.97 | 80.59 |
| qwen1.5-72b-chat-hf | 81.53 | 63.62 | 75.86 | 90.74 | 83.18 | 81.84 | 82.88 | 62.44 | 77.54 | 89.80 | 86.11 | 83.07 |
| qwen1.5-110b-chat-hf | 87.33 | 67.27 | 80.70 | 93.58 | 89.67 | 91.35 | 87.59 | 73.64 | 81.94 | 91.47 | 92.12 | 89.80 |
| internlm2-chat-1.8b-hf | 47.04 | 34.81 | 43.28 | 59.34 | 48.24 | 41.50 | 48.51 | 36.75 | 42.23 | 57.79 | 54.83 | 45.15 |
| internlm2-chat-1.8b-sft-hf | 47.19 | 35.34 | 43.49 | 59.56 | 48.30 | 41.58 | 48.75 | 35.83 | 42.04 | 59.80 | 54.84 | 44.83 |
| internlm2-chat-7b-hf | 58.75 | 39.61 | 52.38 | 71.46 | 61.57 | 55.96 | 61.04 | 36.56 | 51.81 | 74.01 | 69.13 | 57.92 |
| internlm2-chat-7b-sft-hf | 58.96 | 40.09 | 52.40 | 71.49 | 62.20 | 56.26 | 61.02 | 37.29 | 52.60 | 74.01 | 68.27 | 57.27 |
| internlm2-chat-20b-hf | 63.12 | 42.65 | 56.21 | 75.64 | 67.15 | 60.27 | 63.45 | 34.96 | 52.84 | 79.27 | 71.50 | 60.32 |
| internlm2-chat-20b-sft-hf | 63.16 | 42.70 | 56.19 | 75.74 | 67.20 | 60.37 | 63.54 | 34.96 | 52.57 | 80.33 | 71.42 | 60.34 |
| llama-3-8b-instruct-hf | 50.90 | 34.54 | 46.73 | 58.73 | 49.24 | 53.04 | 52.55 | 36.37 | 48.47 | 58.03 | 53.26 | 54.26 |
| llama-3-70b-instruct-hf | 67.38 | 54.02 | 65.16 | 76.83 | 62.29 | 67.92 | 67.92 | 54.50 | 66.85 | 76.80 | 65.98 | 63.72 |
| llama-3-8b-instruct-lmdeploy | 49.92 | 34.75 | 46.19 | 58.49 | 47.68 | 51.14 | 50.27 | 33.32 | 46.25 | 56.93 | 49.02 | 52.76 |
| llama-3-70b-instruct-lmdeploy | 66.41 | 52.76 | 64.72 | 75.31 | 61.36 | 66.44 | 68.21 | 52.28 | 65.86 | 75.06 | 68.37 | 66.09 |
| mistral-7b-instruct-v0.1-hf | 36.76 | 27.76 | 35.55 | 42.41 | 34.45 | 36.12 | 40.04 | 30.21 | 35.77 | 45.15 | 40.99 | 42.22 |
| mistral-7b-instruct-v0.2-hf | 40.38 | 30.26 | 38.82 | 47.66 | 37.08 | 39.91 | 43.00 | 25.97 | 38.60 | 47.44 | 48.15 | 41.82 |
| mixtral-8x7b-instruct-v0.1-hf | 49.61 | 37.78 | 47.86 | 58.56 | 46.40 | 47.85 | 51.68 | 37.41 | 49.14 | 59.79 | 52.97 | 47.65 |
### Details on Test Split
| model | computer_network | operating_system | computer_architecture | college_programming | college_physics | college_chemistry | advanced_mathematics | probability_and_statistics | discrete_mathematics | electrical_engineer | metrology_engineer | high_school_mathematics |
|:-----------------------------:|-------------------:|-------------------:|------------------------:|----------------------:|------------------:|--------------------:|-----------------------:|-----------------------------:|-----------------------:|----------------------:|---------------------:|--------------------------:|
| qwen1.5-0.5b-chat-hf | 35.67 | 36.87 | 33.68 | 33.92 | 35.23 | 28.12 | 27.17 | 26.51 | 24.84 | 28.91 | 40.18 | 25.90 |
| qwen1.5-1.8b-chat-hf | 46.78 | 47.49 | 50.78 | 39.18 | 41.48 | 31.25 | 32.95 | 27.71 | 28.10 | 34.81 | 55.71 | 27.11 |
| qwen1.5-4b-chat-hf | 54.39 | 54.75 | 54.92 | 44.74 | 46.02 | 43.30 | 39.31 | 31.33 | 28.10 | 45.13 | 58.90 | 43.98 |
| qwen1.5-7b-chat-hf | 60.82 | 60.34 | 63.21 | 55.85 | 48.86 | 45.09 | 46.24 | 36.14 | 39.22 | 47.49 | 70.32 | 45.78 |
| qwen1.5-14b-chat-hf | 69.59 | 62.57 | 64.77 | 64.91 | 55.68 | 57.14 | 49.13 | 32.53 | 43.14 | 55.16 | 76.71 | 46.99 |
| qwen1.5-32b-chat-hf | 81.87 | 74.30 | 73.58 | 71.35 | 63.07 | 60.71 | 50.87 | 46.99 | 47.06 | 59.29 | 83.11 | 60.84 |
| qwen1.5-72b-chat-hf | 77.78 | 75.42 | 76.17 | 73.39 | 63.64 | 62.50 | 45.09 | 45.78 | 48.37 | 59.00 | 81.74 | 60.84 |
| qwen1.5-110b-chat-hf | 83.63 | 86.03 | 81.87 | 77.49 | 76.70 | 67.86 | 49.13 | 47.59 | 55.56 | 79.94 | 95.89 | 62.05 |
| internlm2-chat-1.8b-hf | 42.11 | 43.58 | 44.56 | 35.38 | 32.95 | 34.82 | 32.95 | 28.92 | 32.68 | 34.22 | 53.42 | 31.93 |
| internlm2-chat-1.8b-sft-hf | 42.11 | 44.13 | 43.01 | 35.09 | 34.09 | 36.16 | 32.95 | 27.11 | 33.33 | 35.10 | 51.14 | 33.13 |
| internlm2-chat-7b-hf | 59.65 | 60.89 | 58.03 | 51.46 | 36.93 | 43.75 | 36.99 | 29.52 | 36.60 | 39.82 | 63.47 | 38.55 |
| internlm2-chat-7b-sft-hf | 59.06 | 61.45 | 56.48 | 52.63 | 39.77 | 41.52 | 36.99 | 27.71 | 39.22 | 40.12 | 62.10 | 40.36 |
| internlm2-chat-20b-hf | 61.99 | 70.39 | 63.73 | 54.97 | 33.52 | 47.77 | 43.93 | 40.96 | 44.44 | 44.25 | 61.64 | 34.34 |
| internlm2-chat-20b-sft-hf | 61.40 | 70.39 | 63.21 | 54.97 | 32.95 | 47.77 | 42.20 | 42.17 | 43.14 | 44.25 | 61.64 | 32.53 |
| llama-3-8b-instruct-hf | 57.31 | 58.10 | 57.51 | 51.17 | 28.41 | 35.27 | 39.31 | 32.53 | 35.29 | 38.05 | 55.25 | 27.11 |
| llama-3-70b-instruct-hf | 71.93 | 74.86 | 70.98 | 67.54 | 50.57 | 57.14 | 52.60 | 53.01 | 56.21 | 47.79 | 68.95 | 43.98 |
| llama-3-8b-instruct-lmdeploy | 55.56 | 57.54 | 55.44 | 48.25 | 30.11 | 33.04 | 35.84 | 31.33 | 33.33 | 38.94 | 53.88 | 31.93 |
| llama-3-70b-instruct-lmdeploy | 70.76 | 77.09 | 69.95 | 67.84 | 49.43 | 54.02 | 50.87 | 54.22 | 56.21 | 47.20 | 69.86 | 42.17 |
| mistral-7b-instruct-v0.1-hf | 49.12 | 47.49 | 43.52 | 39.18 | 32.39 | 28.57 | 29.48 | 24.10 | 28.10 | 37.46 | 44.29 | 23.49 |
| mistral-7b-instruct-v0.2-hf | 47.95 | 53.07 | 52.85 | 42.69 | 28.41 | 26.79 | 40.46 | 30.12 | 29.41 | 33.33 | 42.92 | 24.10 |
| mixtral-8x7b-instruct-v0.1-hf | 58.48 | 62.57 | 58.03 | 56.43 | 38.64 | 36.16 | 39.31 | 34.94 | 37.91 | 34.81 | 55.71 | 28.31 |
| model | high_school_physics | high_school_chemistry | high_school_biology | middle_school_mathematics | middle_school_biology | middle_school_physics | middle_school_chemistry | veterinary_medicine | college_economics | business_administration | marxism | mao_zedong_thought |
|:-----------------------------:|----------------------:|------------------------:|----------------------:|----------------------------:|------------------------:|------------------------:|--------------------------:|----------------------:|--------------------:|--------------------------:|----------:|---------------------:|
| qwen1.5-0.5b-chat-hf | 30.86 | 31.98 | 44.00 | 27.68 | 47.40 | 40.45 | 55.14 | 35.24 | 32.80 | 30.56 | 58.66 | 57.53 |
| qwen1.5-1.8b-chat-hf | 54.86 | 62.21 | 69.14 | 53.67 | 82.81 | 83.15 | 85.41 | 58.10 | 44.06 | 49.83 | 82.12 | 82.65 |
| qwen1.5-4b-chat-hf | 58.86 | 67.44 | 80.00 | 55.93 | 89.58 | 88.20 | 88.11 | 64.29 | 47.08 | 57.48 | 86.59 | 84.93 |
| qwen1.5-7b-chat-hf | 72.00 | 80.81 | 84.00 | 70.06 | 95.31 | 94.94 | 95.14 | 73.81 | 56.94 | 66.11 | 91.62 | 89.04 |
| qwen1.5-14b-chat-hf | 84.00 | 83.72 | 90.29 | 80.23 | 97.92 | 94.94 | 98.38 | 81.43 | 63.18 | 74.75 | 93.30 | 96.80 |
| qwen1.5-32b-chat-hf | 85.71 | 90.12 | 93.71 | 85.31 | 97.92 | 98.31 | 100.00 | 89.05 | 69.82 | 75.75 | 93.85 | 97.72 |
| qwen1.5-72b-chat-hf | 88.57 | 94.19 | 94.86 | 85.31 | 97.92 | 97.75 | 98.38 | 90.48 | 71.63 | 79.73 | 93.85 | 97.72 |
| qwen1.5-110b-chat-hf | 86.86 | 92.44 | 94.29 | 85.31 | 98.44 | 98.88 | 98.92 | 95.24 | 78.87 | 86.38 | 95.53 | 99.54 |
| internlm2-chat-1.8b-hf | 35.43 | 48.84 | 52.00 | 35.03 | 70.31 | 67.98 | 67.03 | 41.43 | 37.83 | 36.88 | 70.95 | 60.73 |
| internlm2-chat-1.8b-sft-hf | 37.71 | 48.26 | 53.14 | 34.46 | 71.35 | 67.98 | 67.57 | 41.90 | 38.63 | 37.54 | 72.63 | 60.27 |
| internlm2-chat-7b-hf | 46.29 | 48.26 | 60.57 | 46.89 | 78.65 | 71.91 | 71.35 | 68.10 | 50.30 | 50.83 | 77.09 | 76.26 |
| internlm2-chat-7b-sft-hf | 46.86 | 48.26 | 61.14 | 45.76 | 77.60 | 71.91 | 71.35 | 67.62 | 50.10 | 50.50 | 77.09 | 75.80 |
| internlm2-chat-20b-hf | 49.71 | 46.51 | 63.43 | 55.37 | 80.73 | 74.72 | 79.46 | 72.38 | 55.73 | 59.80 | 85.47 | 76.26 |
| internlm2-chat-20b-sft-hf | 53.71 | 47.09 | 64.00 | 55.37 | 80.73 | 73.60 | 78.92 | 73.81 | 55.53 | 60.13 | 85.47 | 75.80 |
| llama-3-8b-instruct-hf | 38.86 | 39.53 | 50.29 | 40.11 | 65.10 | 60.11 | 63.78 | 61.43 | 47.89 | 45.85 | 69.27 | 56.16 |
| llama-3-70b-instruct-hf | 63.43 | 55.23 | 69.71 | 68.36 | 85.42 | 80.90 | 78.38 | 86.19 | 69.01 | 65.12 | 83.24 | 82.65 |
| llama-3-8b-instruct-lmdeploy | 41.71 | 40.70 | 52.00 | 41.24 | 61.46 | 58.43 | 65.41 | 57.62 | 45.27 | 46.18 | 69.27 | 55.71 |
| llama-3-70b-instruct-lmdeploy | 61.71 | 53.49 | 70.86 | 64.97 | 88.02 | 83.71 | 77.30 | 84.76 | 68.21 | 60.80 | 80.45 | 79.91 |
| mistral-7b-instruct-v0.1-hf | 27.43 | 28.49 | 36.00 | 28.25 | 40.10 | 42.70 | 43.78 | 37.14 | 32.80 | 37.87 | 41.90 | 48.86 |
| mistral-7b-instruct-v0.2-hf | 33.14 | 29.65 | 44.00 | 31.07 | 47.92 | 44.94 | 49.19 | 44.29 | 37.02 | 40.86 | 53.63 | 48.40 |
| mixtral-8x7b-instruct-v0.1-hf | 46.29 | 40.70 | 54.86 | 42.37 | 58.85 | 60.67 | 57.84 | 54.29 | 50.10 | 46.51 | 69.27 | 52.51 |
| model | education_science | teacher_qualification | high_school_politics | high_school_geography | middle_school_politics | middle_school_geography | modern_chinese_history | ideological_and_moral_cultivation | logic | law | chinese_language_and_literature | art_studies |
|:-----------------------------:|--------------------:|------------------------:|-----------------------:|------------------------:|-------------------------:|--------------------------:|-------------------------:|------------------------------------:|--------:|------:|----------------------------------:|--------------:|
| qwen1.5-0.5b-chat-hf | 33.33 | 46.12 | 37.50 | 37.08 | 57.51 | 43.52 | 42.45 | 51.74 | 32.84 | 31.22 | 37.32 | 24.50 |
| qwen1.5-1.8b-chat-hf | 54.07 | 72.43 | 74.43 | 66.85 | 89.12 | 87.04 | 77.36 | 76.16 | 38.24 | 44.34 | 46.89 | 40.94 |
| qwen1.5-4b-chat-hf | 60.00 | 84.71 | 82.39 | 69.66 | 94.82 | 90.74 | 79.72 | 78.49 | 41.67 | 57.47 | 54.07 | 56.38 |
| qwen1.5-7b-chat-hf | 66.30 | 90.73 | 84.66 | 80.90 | 94.30 | 91.67 | 82.55 | 84.88 | 38.73 | 60.18 | 60.77 | 63.42 |
| qwen1.5-14b-chat-hf | 74.81 | 93.73 | 90.91 | 92.13 | 96.89 | 98.15 | 89.62 | 88.37 | 54.41 | 70.14 | 69.86 | 69.13 |
| qwen1.5-32b-chat-hf | 80.37 | 94.49 | 93.75 | 94.94 | 97.93 | 97.22 | 90.09 | 90.70 | 68.63 | 78.73 | 73.21 | 77.52 |
| qwen1.5-72b-chat-hf | 84.07 | 96.74 | 95.45 | 94.94 | 97.93 | 95.37 | 92.92 | 91.28 | 63.73 | 80.09 | 73.68 | 83.89 |
| qwen1.5-110b-chat-hf | 90.37 | 96.99 | 96.02 | 95.51 | 98.45 | 98.15 | 93.87 | 94.19 | 81.37 | 86.88 | 84.69 | 90.94 |
| internlm2-chat-1.8b-hf | 48.15 | 65.41 | 69.32 | 54.49 | 79.27 | 70.37 | 60.85 | 64.53 | 32.35 | 32.58 | 45.45 | 40.60 |
| internlm2-chat-1.8b-sft-hf | 48.15 | 64.91 | 69.89 | 53.93 | 79.27 | 70.37 | 61.32 | 63.95 | 33.82 | 29.86 | 45.45 | 39.93 |
| internlm2-chat-7b-hf | 66.67 | 85.21 | 73.30 | 66.85 | 91.19 | 76.85 | 70.28 | 75.58 | 42.16 | 50.68 | 60.77 | 70.47 |
| internlm2-chat-7b-sft-hf | 67.04 | 85.21 | 73.86 | 66.85 | 90.67 | 77.78 | 71.70 | 75.00 | 42.16 | 51.13 | 60.29 | 72.15 |
| internlm2-chat-20b-hf | 74.07 | 85.96 | 75.57 | 77.53 | 89.12 | 76.85 | 72.64 | 83.72 | 51.96 | 56.11 | 68.42 | 73.49 |
| internlm2-chat-20b-sft-hf | 73.70 | 85.46 | 76.70 | 78.09 | 89.64 | 76.85 | 72.17 | 84.88 | 50.00 | 56.56 | 66.99 | 75.17 |
| llama-3-8b-instruct-hf | 55.93 | 67.42 | 55.68 | 55.06 | 72.02 | 62.04 | 54.25 | 66.86 | 44.12 | 40.72 | 47.37 | 44.63 |
| llama-3-70b-instruct-hf | 71.11 | 84.21 | 74.43 | 73.03 | 84.97 | 80.56 | 69.81 | 78.49 | 57.35 | 50.68 | 57.89 | 64.43 |
| llama-3-8b-instruct-lmdeploy | 54.81 | 67.17 | 58.52 | 53.37 | 72.54 | 62.04 | 57.08 | 63.95 | 44.12 | 37.56 | 46.89 | 42.62 |
| llama-3-70b-instruct-lmdeploy | 70.37 | 82.96 | 72.16 | 71.91 | 83.94 | 82.41 | 69.34 | 77.91 | 55.39 | 50.68 | 56.46 | 64.09 |
| mistral-7b-instruct-v0.1-hf | 39.63 | 46.62 | 33.52 | 41.01 | 56.48 | 45.37 | 36.32 | 43.60 | 29.90 | 31.67 | 39.71 | 31.88 |
| mistral-7b-instruct-v0.2-hf | 46.30 | 54.39 | 39.20 | 43.26 | 61.66 | 51.85 | 35.38 | 55.23 | 28.92 | 35.29 | 37.80 | 29.19 |
| mixtral-8x7b-instruct-v0.1-hf | 58.52 | 66.17 | 56.82 | 57.30 | 66.32 | 62.04 | 48.11 | 66.28 | 41.67 | 37.10 | 46.41 | 35.91 |
| model | professional_tour_guide | legal_professional | high_school_chinese | high_school_history | middle_school_history | civil_servant | sports_science | plant_protection | basic_medicine | clinical_medicine | urban_and_rural_planner | accountant |
|:-----------------------------:|--------------------------:|---------------------:|----------------------:|----------------------:|------------------------:|----------------:|-----------------:|-------------------:|-----------------:|--------------------:|--------------------------:|-------------:|
| qwen1.5-0.5b-chat-hf | 36.47 | 39.07 | 27.53 | 41.76 | 45.89 | 39.63 | 35.56 | 31.66 | 37.71 | 34.00 | 32.78 | 37.25 |
| qwen1.5-1.8b-chat-hf | 56.02 | 45.58 | 39.33 | 67.03 | 84.54 | 49.42 | 48.89 | 51.76 | 47.43 | 50.50 | 45.69 | 52.14 |
| qwen1.5-4b-chat-hf | 61.28 | 52.56 | 42.70 | 73.08 | 85.99 | 55.48 | 59.44 | 55.28 | 60.57 | 57.00 | 50.00 | 58.01 |
| qwen1.5-7b-chat-hf | 73.31 | 56.28 | 58.99 | 82.97 | 88.41 | 64.57 | 66.67 | 63.82 | 77.14 | 75.50 | 57.42 | 69.07 |
| qwen1.5-14b-chat-hf | 80.83 | 65.12 | 70.79 | 89.56 | 93.24 | 67.60 | 72.78 | 68.34 | 80.57 | 80.00 | 61.72 | 75.62 |
| qwen1.5-32b-chat-hf | 87.59 | 72.56 | 76.40 | 90.66 | 95.65 | 74.36 | 80.00 | 80.40 | 86.86 | 84.00 | 74.88 | 85.33 |
| qwen1.5-72b-chat-hf | 90.98 | 76.28 | 75.84 | 90.66 | 95.65 | 75.52 | 84.44 | 82.91 | 91.43 | 89.00 | 73.92 | 85.10 |
| qwen1.5-110b-chat-hf | 95.11 | 88.37 | 82.58 | 91.76 | 96.62 | 87.65 | 91.67 | 90.95 | 93.71 | 95.00 | 87.08 | 91.87 |
| internlm2-chat-1.8b-hf | 54.14 | 40.00 | 27.53 | 62.09 | 70.53 | 44.99 | 41.67 | 51.76 | 45.71 | 39.00 | 40.67 | 39.28 |
| internlm2-chat-1.8b-sft-hf | 54.14 | 42.33 | 26.97 | 61.54 | 71.98 | 45.45 | 41.67 | 50.25 | 45.14 | 37.50 | 41.39 | 40.63 |
| internlm2-chat-7b-hf | 70.68 | 44.19 | 34.83 | 73.63 | 84.06 | 51.98 | 57.22 | 68.34 | 66.86 | 57.50 | 54.55 | 50.11 |
| internlm2-chat-7b-sft-hf | 71.80 | 44.65 | 37.64 | 73.63 | 84.06 | 51.98 | 57.78 | 67.84 | 65.71 | 60.50 | 54.55 | 50.11 |
| internlm2-chat-20b-hf | 75.56 | 54.42 | 42.13 | 74.73 | 85.51 | 57.34 | 65.56 | 67.84 | 73.71 | 64.00 | 57.89 | 55.98 |
| internlm2-chat-20b-sft-hf | 76.32 | 55.35 | 41.01 | 75.27 | 85.51 | 58.28 | 65.56 | 67.34 | 72.57 | 65.00 | 58.37 | 56.43 |
| llama-3-8b-instruct-hf | 53.01 | 44.65 | 33.15 | 46.70 | 66.18 | 45.22 | 58.89 | 61.81 | 62.86 | 57.50 | 48.33 | 49.89 |
| llama-3-70b-instruct-hf | 71.43 | 50.70 | 30.90 | 71.43 | 82.13 | 59.67 | 73.33 | 73.37 | 82.86 | 82.00 | 59.09 | 62.08 |
| llama-3-8b-instruct-lmdeploy | 51.13 | 45.12 | 29.78 | 43.96 | 62.32 | 47.09 | 56.11 | 54.77 | 56.00 | 56.00 | 49.04 | 47.40 |
| llama-3-70b-instruct-lmdeploy | 68.80 | 48.84 | 30.90 | 70.88 | 81.64 | 58.28 | 72.22 | 70.85 | 80.00 | 81.00 | 57.66 | 62.53 |
| mistral-7b-instruct-v0.1-hf | 30.45 | 35.81 | 24.72 | 40.11 | 34.78 | 30.77 | 43.89 | 38.69 | 36.57 | 32.50 | 44.74 | 34.09 |
| mistral-7b-instruct-v0.2-hf | 36.09 | 38.14 | 23.03 | 43.41 | 45.41 | 35.90 | 50.00 | 41.71 | 42.86 | 36.00 | 45.22 | 42.21 |
| mixtral-8x7b-instruct-v0.1-hf | 47.37 | 44.65 | 30.34 | 51.65 | 60.87 | 42.19 | 53.89 | 58.29 | 52.00 | 47.00 | 48.56 | 44.02 |
| model | fire_engineer | environmental_impact_assessment_engineer | tax_accountant | physician |
|:-----------------------------:|----------------:|-------------------------------------------:|-----------------:|------------:|
| qwen1.5-0.5b-chat-hf | 27.66 | 38.43 | 32.28 | 35.44 |
| qwen1.5-1.8b-chat-hf | 38.65 | 46.62 | 46.73 | 59.14 |
| qwen1.5-4b-chat-hf | 49.29 | 54.80 | 51.02 | 70.20 |
| qwen1.5-7b-chat-hf | 53.90 | 62.28 | 57.79 | 76.52 |
| qwen1.5-14b-chat-hf | 58.87 | 65.12 | 67.27 | 86.68 |
| qwen1.5-32b-chat-hf | 74.11 | 70.82 | 74.94 | 88.04 |
| qwen1.5-72b-chat-hf | 74.82 | 75.09 | 78.56 | 89.39 |
| qwen1.5-110b-chat-hf | 88.30 | 88.97 | 94.13 | 95.49 |
| internlm2-chat-1.8b-hf | 30.14 | 41.99 | 34.54 | 46.73 |
| internlm2-chat-1.8b-sft-hf | 30.14 | 43.06 | 34.31 | 47.86 |
| internlm2-chat-7b-hf | 42.20 | 52.31 | 47.63 | 66.82 |
| internlm2-chat-7b-sft-hf | 43.26 | 52.67 | 47.86 | 66.59 |
| internlm2-chat-20b-hf | 45.74 | 54.80 | 51.02 | 69.07 |
| internlm2-chat-20b-sft-hf | 45.74 | 55.16 | 51.02 | 68.62 |
| llama-3-8b-instruct-hf | 37.59 | 50.53 | 42.44 | 68.40 |
| llama-3-70b-instruct-hf | 50.71 | 64.06 | 55.53 | 84.42 |
| llama-3-8b-instruct-lmdeploy | 37.94 | 50.53 | 41.53 | 66.14 |
| llama-3-70b-instruct-lmdeploy | 48.94 | 63.70 | 53.95 | 81.72 |
| mistral-7b-instruct-v0.1-hf | 27.66 | 39.15 | 29.35 | 39.95 |
| mistral-7b-instruct-v0.2-hf | 32.27 | 37.01 | 32.96 | 42.89 |
| mixtral-8x7b-instruct-v0.1-hf | 36.88 | 48.75 | 41.76 | 53.05 |
### Details on Dev Split

View File

@ -0,0 +1,64 @@
# GPQA
## Base Models
| model | GPQA_diamond |
|:------------------------:|---------------:|
| llama-7b-turbomind | 24.24 |
| llama-13b-turbomind | 25.25 |
| llama-30b-turbomind | 22.73 |
| llama-65b-turbomind | 21.72 |
| llama-2-7b-turbomind | 25.25 |
| llama-2-13b-turbomind | 23.74 |
| llama-2-70b-turbomind | 28.28 |
| llama-3-8b-turbomind | 31.82 |
| llama-3-70b-turbomind | 40.91 |
| internlm2-1.8b-turbomind | 24.24 |
| internlm2-7b-turbomind | 28.28 |
| internlm2-20b-turbomind | 31.31 |
| qwen-1.8b-turbomind | 28.79 |
| qwen-7b-turbomind | 24.75 |
| qwen-14b-turbomind | 27.78 |
| qwen-72b-turbomind | 31.31 |
| qwen1.5-0.5b-hf | 23.74 |
| qwen1.5-1.8b-hf | 28.79 |
| qwen1.5-4b-hf | 23.23 |
| qwen1.5-7b-hf | 20.71 |
| qwen1.5-14b-hf | 32.32 |
| qwen1.5-32b-hf | 30.81 |
| qwen1.5-72b-hf | 31.82 |
| qwen1.5-moe-a2-7b-hf | 28.79 |
| mistral-7b-v0.1-hf | 24.75 |
| mistral-7b-v0.2-hf | 23.74 |
| mixtral-8x7b-v0.1-hf | 28.79 |
| mixtral-8x22b-v0.1-hf | 36.36 |
| yi-6b-hf | 28.28 |
| yi-34b-hf | 35.86 |
| deepseek-7b-base-hf | 20.71 |
| deepseek-67b-base-hf | 25.25 |
## Chat Models
| model | GPQA_diamond |
|:-----------------------------:|---------------:|
| qwen1.5-0.5b-chat-hf | 19.70 |
| qwen1.5-1.8b-chat-hf | 29.80 |
| qwen1.5-4b-chat-hf | 25.25 |
| qwen1.5-7b-chat-hf | 31.82 |
| qwen1.5-14b-chat-hf | 30.30 |
| qwen1.5-32b-chat-hf | 31.31 |
| qwen1.5-72b-chat-hf | 32.83 |
| qwen1.5-110b-chat-hf | 35.86 |
| internlm2-chat-1.8b-hf | 25.76 |
| internlm2-chat-1.8b-sft-hf | 26.26 |
| internlm2-chat-7b-hf | 28.28 |
| internlm2-chat-7b-sft-hf | 27.27 |
| internlm2-chat-20b-hf | 30.30 |
| internlm2-chat-20b-sft-hf | 29.29 |
| llama-3-8b-instruct-hf | 25.76 |
| llama-3-70b-instruct-hf | 37.88 |
| llama-3-8b-instruct-lmdeploy | 25.76 |
| llama-3-70b-instruct-lmdeploy | 37.88 |
| mistral-7b-instruct-v0.1-hf | 30.30 |
| mistral-7b-instruct-v0.2-hf | 25.25 |
| mixtral-8x7b-instruct-v0.1-hf | 30.30 |
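
For scale when reading these accuracies: GPQA-diamond contains 198 questions (an external fact about the public split, not stated in this report), so each question moves the score by roughly 0.5 points. A small illustrative sketch for converting a reported percentage back into a question count (`correct_count` is a hypothetical helper, not part of OpenCompass):

```python
# Assumes the 198-question GPQA-diamond split; not an OpenCompass API.
def correct_count(accuracy_pct: float, n_questions: int = 198) -> int:
    """Convert a reported accuracy percentage back to a question count."""
    return round(accuracy_pct / 100 * n_questions)

print(correct_count(40.91))  # llama-3-70b-turbomind -> 81 of 198
print(correct_count(24.24))  # llama-7b-turbomind  -> 48 of 198
```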

View File

@ -0,0 +1,64 @@
# GSM8K
## Base Models
| model | gsm8k |
|:------------------------:|--------:|
| llama-7b-turbomind | 10.31 |
| llama-13b-turbomind | 20.55 |
| llama-30b-turbomind | 42.08 |
| llama-65b-turbomind | 54.81 |
| llama-2-7b-turbomind | 16.76 |
| llama-2-13b-turbomind | 29.87 |
| llama-2-70b-turbomind | 63.53 |
| llama-3-8b-turbomind | 54.28 |
| llama-3-70b-turbomind | 69.98 |
| internlm2-1.8b-turbomind | 30.40 |
| internlm2-7b-turbomind | 69.98 |
| internlm2-20b-turbomind | 76.80 |
| qwen-1.8b-turbomind | 23.73 |
| qwen-7b-turbomind | 54.36 |
| qwen-14b-turbomind | 61.64 |
| qwen-72b-turbomind | 79.68 |
| qwen1.5-0.5b-hf | 13.27 |
| qwen1.5-1.8b-hf | 34.87 |
| qwen1.5-4b-hf | 47.61 |
| qwen1.5-7b-hf | 54.36 |
| qwen1.5-14b-hf | 63.53 |
| qwen1.5-32b-hf | 72.71 |
| qwen1.5-72b-hf | 79.53 |
| qwen1.5-moe-a2-7b-hf | 61.26 |
| mistral-7b-v0.1-hf | 47.61 |
| mistral-7b-v0.2-hf | 45.19 |
| mixtral-8x7b-v0.1-hf | 66.26 |
| mixtral-8x22b-v0.1-hf | 82.87 |
| yi-6b-hf | 39.58 |
| yi-34b-hf | 67.70 |
| deepseek-7b-base-hf | 20.17 |
| deepseek-67b-base-hf | 68.16 |
## Chat Models
| model | gsm8k |
|:-----------------------------:|--------:|
| qwen1.5-0.5b-chat-hf | 8.79 |
| qwen1.5-1.8b-chat-hf | 27.60 |
| qwen1.5-4b-chat-hf | 47.61 |
| qwen1.5-7b-chat-hf | 56.25 |
| qwen1.5-14b-chat-hf | 64.90 |
| qwen1.5-32b-chat-hf | 79.91 |
| qwen1.5-72b-chat-hf | 77.03 |
| qwen1.5-110b-chat-hf | 79.53 |
| internlm2-chat-1.8b-hf | 39.73 |
| internlm2-chat-1.8b-sft-hf | 36.85 |
| internlm2-chat-7b-hf | 69.90 |
| internlm2-chat-7b-sft-hf | 69.83 |
| internlm2-chat-20b-hf | 75.21 |
| internlm2-chat-20b-sft-hf | 76.95 |
| llama-3-8b-instruct-hf | 79.53 |
| llama-3-70b-instruct-hf | 89.76 |
| llama-3-8b-instruct-lmdeploy | 78.77 |
| llama-3-70b-instruct-lmdeploy | 89.31 |
| mistral-7b-instruct-v0.1-hf | 42.23 |
| mistral-7b-instruct-v0.2-hf | 45.56 |
| mixtral-8x7b-instruct-v0.1-hf | 65.13 |

View File

@ -0,0 +1,64 @@
# HellaSwag
## Base Models
| model | hellaswag |
|:------------------------:|------------:|
| llama-7b-turbomind | 26.99 |
| llama-13b-turbomind | 34.21 |
| llama-30b-turbomind | 35.65 |
| llama-65b-turbomind | 44.63 |
| llama-2-7b-turbomind | 29.29 |
| llama-2-13b-turbomind | 45.06 |
| llama-2-70b-turbomind | 55.91 |
| llama-3-8b-turbomind | 50.86 |
| llama-3-70b-turbomind | 80.60 |
| internlm2-1.8b-turbomind | 44.86 |
| internlm2-7b-turbomind | 89.52 |
| internlm2-20b-turbomind | 91.41 |
| qwen-1.8b-turbomind | 38.04 |
| qwen-7b-turbomind | 64.62 |
| qwen-14b-turbomind | 85.88 |
| qwen-72b-turbomind | 90.40 |
| qwen1.5-0.5b-hf | 29.19 |
| qwen1.5-1.8b-hf | 42.32 |
| qwen1.5-4b-hf | 55.89 |
| qwen1.5-7b-hf | 68.51 |
| qwen1.5-14b-hf | 83.86 |
| qwen1.5-32b-hf | 87.28 |
| qwen1.5-72b-hf | 90.41 |
| qwen1.5-moe-a2-7b-hf | 72.42 |
| mistral-7b-v0.1-hf | 42.04 |
| mistral-7b-v0.2-hf | 46.24 |
| mixtral-8x7b-v0.1-hf | 66.22 |
| mixtral-8x22b-v0.1-hf | 79.66 |
| yi-6b-hf | 66.83 |
| yi-34b-hf | 83.83 |
| deepseek-7b-base-hf | 30.42 |
| deepseek-67b-base-hf | 70.75 |
## Chat Models
| model | hellaswag |
|:-----------------------------:|------------:|
| qwen1.5-0.5b-chat-hf | 29.60 |
| qwen1.5-1.8b-chat-hf | 41.71 |
| qwen1.5-4b-chat-hf | 60.45 |
| qwen1.5-7b-chat-hf | 71.58 |
| qwen1.5-14b-chat-hf | 79.70 |
| qwen1.5-32b-chat-hf | 88.56 |
| qwen1.5-72b-chat-hf | 89.37 |
| qwen1.5-110b-chat-hf | 91.11 |
| internlm2-chat-1.8b-hf | 60.47 |
| internlm2-chat-1.8b-sft-hf | 61.58 |
| internlm2-chat-7b-hf | 84.80 |
| internlm2-chat-7b-sft-hf | 85.21 |
| internlm2-chat-20b-hf | 88.48 |
| internlm2-chat-20b-sft-hf | 88.95 |
| llama-3-8b-instruct-hf | 74.39 |
| llama-3-70b-instruct-hf | 89.07 |
| llama-3-8b-instruct-lmdeploy | 73.31 |
| llama-3-70b-instruct-lmdeploy | 87.28 |
| mistral-7b-instruct-v0.1-hf | 53.00 |
| mistral-7b-instruct-v0.2-hf | 65.72 |
| mixtral-8x7b-instruct-v0.1-hf | 76.16 |

View File

@ -0,0 +1,64 @@
# HumanEval
## Base Models
| model | pass@1 |
|:------------------------:|---------:|
| llama-7b-turbomind | 12.80 |
| llama-13b-turbomind | 15.24 |
| llama-30b-turbomind | 9.15 |
| llama-65b-turbomind | 7.32 |
| llama-2-7b-turbomind | 14.02 |
| llama-2-13b-turbomind | 15.24 |
| llama-2-70b-turbomind | 15.24 |
| llama-3-8b-turbomind | 28.05 |
| llama-3-70b-turbomind | 28.05 |
| internlm2-1.8b-turbomind | 30.49 |
| internlm2-7b-turbomind | 48.17 |
| internlm2-20b-turbomind | 51.83 |
| qwen-1.8b-turbomind | 16.46 |
| qwen-7b-turbomind | 23.78 |
| qwen-14b-turbomind | 23.78 |
| qwen-72b-turbomind | 66.46 |
| qwen1.5-0.5b-hf | 8.54 |
| qwen1.5-1.8b-hf | 23.17 |
| qwen1.5-4b-hf | 41.46 |
| qwen1.5-7b-hf | 53.05 |
| qwen1.5-14b-hf | 57.32 |
| qwen1.5-32b-hf | 70.12 |
| qwen1.5-72b-hf | 65.85 |
| qwen1.5-moe-a2-7b-hf | 45.73 |
| mistral-7b-v0.1-hf | 14.02 |
| mistral-7b-v0.2-hf | 9.15 |
| mixtral-8x7b-v0.1-hf | 24.39 |
| mixtral-8x22b-v0.1-hf | 16.46 |
| yi-6b-hf | 14.63 |
| yi-34b-hf | 17.07 |
| deepseek-7b-base-hf | 18.29 |
| deepseek-67b-base-hf | 23.17 |
## Chat Models
| model | pass@1 |
|:-----------------------------:|---------:|
| qwen1.5-0.5b-chat-hf | 9.15 |
| qwen1.5-1.8b-chat-hf | 15.85 |
| qwen1.5-4b-chat-hf | 30.49 |
| qwen1.5-7b-chat-hf | 40.85 |
| qwen1.5-14b-chat-hf | 50.00 |
| qwen1.5-32b-chat-hf | 57.93 |
| qwen1.5-72b-chat-hf | 60.37 |
| qwen1.5-110b-chat-hf | 65.24 |
| internlm2-chat-1.8b-hf | 33.54 |
| internlm2-chat-1.8b-sft-hf | 34.15 |
| internlm2-chat-7b-hf | 56.71 |
| internlm2-chat-7b-sft-hf | 61.59 |
| internlm2-chat-20b-hf | 67.68 |
| internlm2-chat-20b-sft-hf | 67.68 |
| llama-3-8b-instruct-hf | 55.49 |
| llama-3-70b-instruct-hf | 70.73 |
| llama-3-8b-instruct-lmdeploy | 57.93 |
| llama-3-70b-instruct-lmdeploy | 70.73 |
| mistral-7b-instruct-v0.1-hf | 32.32 |
| mistral-7b-instruct-v0.2-hf | 29.27 |
| mixtral-8x7b-instruct-v0.1-hf | 34.15 |

View File

@ -0,0 +1,64 @@
# MATH
## Base Models
| model | math |
|:------------------------:|-------:|
| llama-7b-turbomind | 2.94 |
| llama-13b-turbomind | 3.84 |
| llama-30b-turbomind | 6.54 |
| llama-65b-turbomind | 10.66 |
| llama-2-7b-turbomind | 3.58 |
| llama-2-13b-turbomind | 5.30 |
| llama-2-70b-turbomind | 13.26 |
| llama-3-8b-turbomind | 16.42 |
| llama-3-70b-turbomind | 39.64 |
| internlm2-1.8b-turbomind | 9.42 |
| internlm2-7b-turbomind | 25.16 |
| internlm2-20b-turbomind | 32.24 |
| qwen-1.8b-turbomind | 6.30 |
| qwen-7b-turbomind | 15.56 |
| qwen-14b-turbomind | 30.38 |
| qwen-72b-turbomind | 44.18 |
| qwen1.5-0.5b-hf | 4.16 |
| qwen1.5-1.8b-hf | 11.32 |
| qwen1.5-4b-hf | 17.50 |
| qwen1.5-7b-hf | 17.34 |
| qwen1.5-14b-hf | 36.18 |
| qwen1.5-32b-hf | 45.74 |
| qwen1.5-72b-hf | 41.56 |
| qwen1.5-moe-a2-7b-hf | 27.96 |
| mistral-7b-v0.1-hf | 13.44 |
| mistral-7b-v0.2-hf | 12.74 |
| mixtral-8x7b-v0.1-hf | 29.46 |
| mixtral-8x22b-v0.1-hf | 41.82 |
| yi-6b-hf | 6.60 |
| yi-34b-hf | 18.80 |
| deepseek-7b-base-hf | 4.66 |
| deepseek-67b-base-hf | 18.76 |
## Chat Models
| model | math |
|:-----------------------------:|-------:|
| qwen1.5-0.5b-chat-hf | 0.56 |
| qwen1.5-1.8b-chat-hf | 4.94 |
| qwen1.5-4b-chat-hf | 7.34 |
| qwen1.5-7b-chat-hf | 22.14 |
| qwen1.5-14b-chat-hf | 32.22 |
| qwen1.5-32b-chat-hf | 41.80 |
| qwen1.5-72b-chat-hf | 45.22 |
| qwen1.5-110b-chat-hf | 54.38 |
| internlm2-chat-1.8b-hf | 14.06 |
| internlm2-chat-1.8b-sft-hf | 13.10 |
| internlm2-chat-7b-hf | 28.08 |
| internlm2-chat-7b-sft-hf | 27.60 |
| internlm2-chat-20b-hf | 34.68 |
| internlm2-chat-20b-sft-hf | 32.54 |
| llama-3-8b-instruct-hf | 27.50 |
| llama-3-70b-instruct-hf | 47.52 |
| llama-3-8b-instruct-lmdeploy | 27.42 |
| llama-3-70b-instruct-lmdeploy | 46.90 |
| mistral-7b-instruct-v0.1-hf | 8.48 |
| mistral-7b-instruct-v0.2-hf | 10.82 |
| mixtral-8x7b-instruct-v0.1-hf | 27.02 |

View File

@ -0,0 +1,64 @@
# MBPP
## Base Models
| model | mbpp/pass@1 | mbpp/pass | mbpp/timeout | mbpp/failed | mbpp/wrong_answer |
|:------------------------:|--------------:|------------:|---------------:|--------------:|--------------------:|
| llama-7b-turbomind | 25.29 | 65 | 8 | 62 | 122 |
| llama-13b-turbomind | 29.96 | 77 | 4 | 74 | 102 |
| llama-30b-turbomind | 37.35 | 96 | 17 | 39 | 105 |
| llama-65b-turbomind | 45.53 | 117 | 10 | 35 | 95 |
| llama-2-7b-turbomind | 26.46 | 68 | 18 | 49 | 122 |
| llama-2-13b-turbomind | 36.58 | 94 | 17 | 45 | 101 |
| llama-2-70b-turbomind | 49.42 | 127 | 12 | 32 | 86 |
| llama-3-8b-turbomind | 54.86 | 141 | 11 | 22 | 83 |
| llama-3-70b-turbomind | 77.82 | 200 | 0 | 10 | 47 |
| internlm2-1.8b-turbomind | 30.74 | 79 | 10 | 61 | 107 |
| internlm2-7b-turbomind | 54.47 | 140 | 11 | 28 | 78 |
| internlm2-20b-turbomind | 59.92 | 154 | 6 | 31 | 66 |
| qwen-1.8b-turbomind | 2.72 | 7 | 16 | 222 | 12 |
| qwen-7b-turbomind | 46.69 | 120 | 10 | 37 | 90 |
| qwen-14b-turbomind | 55.64 | 143 | 0 | 31 | 83 |
| qwen-72b-turbomind | 65.76 | 169 | 0 | 26 | 62 |
| qwen1.5-0.5b-hf | 5.06 | 13 | 13 | 190 | 41 |
| qwen1.5-1.8b-hf | 15.95 | 41 | 19 | 124 | 73 |
| qwen1.5-4b-hf | 45.91 | 118 | 8 | 27 | 104 |
| qwen1.5-7b-hf | 52.14 | 134 | 11 | 24 | 88 |
| qwen1.5-14b-hf | 52.14 | 134 | 16 | 33 | 74 |
| qwen1.5-32b-hf | 59.14 | 152 | 7 | 25 | 73 |
| qwen1.5-72b-hf | 61.09 | 157 | 1 | 21 | 78 |
| qwen1.5-moe-a2-7b-hf | 47.08 | 121 | 0 | 52 | 84 |
| mistral-7b-v0.1-hf | 47.47 | 122 | 9 | 33 | 93 |
| mistral-7b-v0.2-hf | 49.81 | 128 | 9 | 27 | 93 |
| mixtral-8x7b-v0.1-hf | 62.65 | 161 | 10 | 13 | 73 |
| mixtral-8x22b-v0.1-hf | 73.15 | 188 | 1 | 10 | 58 |
| yi-6b-hf | 30.35 | 78 | 8 | 40 | 131 |
| yi-34b-hf | 48.64 | 125 | 0 | 43 | 89 |
| deepseek-7b-base-hf | 43.97 | 113 | 11 | 34 | 99 |
| deepseek-67b-base-hf | 64.98 | 167 | 0 | 24 | 66 |
## Chat Models
| model | mbpp/pass@1 | mbpp/pass | mbpp/timeout | mbpp/failed | mbpp/wrong_answer |
|:-----------------------------:|--------------:|------------:|---------------:|--------------:|--------------------:|
| qwen1.5-0.5b-chat-hf | 11.28 | 29 | 1 | 129 | 98 |
| qwen1.5-1.8b-chat-hf | 22.57 | 58 | 2 | 70 | 127 |
| qwen1.5-4b-chat-hf | 43.58 | 112 | 1 | 33 | 111 |
| qwen1.5-7b-chat-hf | 50.58 | 130 | 0 | 35 | 92 |
| qwen1.5-14b-chat-hf | 56.03 | 144 | 0 | 24 | 89 |
| qwen1.5-32b-chat-hf | 65.37 | 168 | 2 | 13 | 74 |
| qwen1.5-72b-chat-hf | 66.93 | 172 | 0 | 17 | 68 |
| qwen1.5-110b-chat-hf | 68.48 | 176 | 0 | 16 | 65 |
| internlm2-chat-1.8b-hf | 39.69 | 102 | 0 | 48 | 107 |
| internlm2-chat-1.8b-sft-hf | 36.19 | 93 | 1 | 58 | 105 |
| internlm2-chat-7b-hf | 57.59 | 148 | 0 | 21 | 88 |
| internlm2-chat-7b-sft-hf | 55.64 | 143 | 2 | 22 | 90 |
| internlm2-chat-20b-hf | 68.87 | 177 | 0 | 16 | 64 |
| internlm2-chat-20b-sft-hf | 69.65 | 179 | 0 | 16 | 62 |
| llama-3-8b-instruct-hf | 68.87 | 177 | 0 | 8 | 72 |
| llama-3-70b-instruct-hf | 79.77 | 205 | 0 | 2 | 50 |
| llama-3-8b-instruct-lmdeploy | 66.93 | 172 | 0 | 7 | 78 |
| llama-3-70b-instruct-lmdeploy | 77.82 | 200 | 1 | 2 | 54 |
| mistral-7b-instruct-v0.1-hf | 47.86 | 123 | 0 | 29 | 105 |
| mistral-7b-instruct-v0.2-hf | 45.91 | 118 | 0 | 31 | 108 |
| mixtral-8x7b-instruct-v0.1-hf | 61.48 | 158 | 1 | 13 | 85 |
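
The count columns above fully determine `mbpp/pass@1`: each row sums to 257 evaluated problems, and pass@1 is simply the passed share expressed as a percentage. A minimal sketch of that relation (the helper `pass_at_1` is illustrative, not an OpenCompass API):

```python
# Illustrative only: recomputes mbpp/pass@1 from the per-outcome counts.
def pass_at_1(passed: int, timeout: int, failed: int, wrong_answer: int) -> float:
    """Return pass@1 as a percentage of all evaluated problems."""
    total = passed + timeout + failed + wrong_answer
    return 100.0 * passed / total

# Example: the llama-7b-turbomind row above -> 65 / 257 = 25.29
print(round(pass_at_1(65, 8, 62, 122), 2))  # 25.29
```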

View File

@ -0,0 +1,363 @@
# MMLU
## Base Models
| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other |
|:------------------------:|-------:|------------:|----------------------:|------------------:|-------------:|
| llama-7b-turbomind | 35.66 | 31.22 | 37.70 | 38.90 | 37.01 |
| llama-13b-turbomind | 47.76 | 37.68 | 55.36 | 52.43 | 50.83 |
| llama-30b-turbomind | 58.55 | 46.95 | 67.35 | 65.13 | 60.78 |
| llama-65b-turbomind | 63.78 | 52.35 | 73.68 | 70.84 | 64.29 |
| llama-2-7b-turbomind | 46.78 | 37.81 | 52.11 | 51.69 | 50.04 |
| llama-2-13b-turbomind | 55.76 | 44.61 | 63.86 | 62.97 | 57.35 |
| llama-2-70b-turbomind | 69.87 | 58.30 | 79.86 | 75.84 | 71.58 |
| llama-3-8b-turbomind | 66.43 | 55.95 | 76.11 | 70.29 | 68.96 |
| llama-3-70b-turbomind | 79.35 | 70.66 | 87.54 | 83.43 | 80.42 |
| internlm2-1.8b-turbomind | 45.99 | 39.63 | 51.02 | 48.65 | 47.96 |
| internlm2-7b-turbomind | 65.84 | 56.48 | 74.43 | 69.68 | 67.75 |
| internlm2-20b-turbomind | 67.58 | 59.01 | 76.04 | 71.20 | 68.69 |
| qwen-1.8b-turbomind | 46.61 | 38.91 | 51.35 | 49.57 | 50.51 |
| qwen-7b-turbomind | 59.75 | 50.16 | 67.98 | 63.48 | 62.44 |
| qwen-14b-turbomind | 67.85 | 59.13 | 76.18 | 71.62 | 69.12 |
| qwen-72b-turbomind | 77.36 | 68.70 | 85.28 | 80.60 | 79.45 |
| qwen1.5-0.5b-hf | 39.98 | 33.96 | 45.08 | 41.59 | 42.48 |
| qwen1.5-1.8b-hf | 47.14 | 39.47 | 52.70 | 49.01 | 51.33 |
| qwen1.5-4b-hf | 57.03 | 47.80 | 64.86 | 60.10 | 60.20 |
| qwen1.5-7b-hf | 62.15 | 53.22 | 70.25 | 65.62 | 64.26 |
| qwen1.5-14b-hf | 69.10 | 61.46 | 77.57 | 71.25 | 70.29 |
| qwen1.5-32b-hf | 73.88 | 65.60 | 81.41 | 77.10 | 75.79 |
| qwen1.5-72b-hf | 77.02 | 69.00 | 84.55 | 80.60 | 78.21 |
| qwen1.5-moe-a2-7b-hf | 62.09 | 53.27 | 70.74 | 63.80 | 65.28 |
| mistral-7b-v0.1-hf | 64.04 | 53.21 | 73.65 | 68.04 | 67.00 |
| mistral-7b-v0.2-hf | 63.85 | 53.21 | 72.17 | 68.40 | 67.15 |
| mixtral-8x7b-v0.1-hf | 71.80 | 61.70 | 81.03 | 75.51 | 74.35 |
| mixtral-8x22b-v0.1-hf | 77.67 | 68.94 | 86.81 | 81.23 | 78.43 |
| yi-6b-hf | 64.08 | 52.61 | 74.10 | 68.58 | 67.11 |
| yi-34b-hf | 76.26 | 66.73 | 83.74 | 81.78 | 77.77 |
| deepseek-7b-base-hf | 49.22 | 40.17 | 56.73 | 53.46 | 51.26 |
| deepseek-67b-base-hf | 71.95 | 60.57 | 81.69 | 77.11 | 74.42 |
### Details
| model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts |
|:------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:|
| llama-7b-turbomind | 37.50 | 30.00 | 30.00 | 33.00 | 23.53 | 23.45 | 34.87 | 37.78 | 25.00 | 27.68 | 34.34 | 31.00 |
| llama-13b-turbomind | 46.53 | 30.00 | 42.00 | 36.00 | 18.63 | 42.76 | 46.71 | 46.67 | 30.00 | 32.14 | 45.66 | 37.00 |
| llama-30b-turbomind | 59.03 | 45.00 | 47.00 | 35.00 | 26.47 | 53.10 | 61.18 | 51.85 | 37.00 | 41.07 | 57.36 | 38.00 |
| llama-65b-turbomind | 68.75 | 49.00 | 47.00 | 37.00 | 35.29 | 55.17 | 73.03 | 57.78 | 30.00 | 48.21 | 66.04 | 38.00 |
| llama-2-7b-turbomind | 46.53 | 34.00 | 33.00 | 34.00 | 22.55 | 47.59 | 40.13 | 47.41 | 29.00 | 38.39 | 46.42 | 32.00 |
| llama-2-13b-turbomind | 59.03 | 44.00 | 48.00 | 29.00 | 26.47 | 50.34 | 53.29 | 49.63 | 35.00 | 28.57 | 60.00 | 32.00 |
| llama-2-70b-turbomind | 84.72 | 51.00 | 60.00 | 39.00 | 37.25 | 65.52 | 81.58 | 63.70 | 32.00 | 52.68 | 72.08 | 46.00 |
| llama-3-8b-turbomind | 77.08 | 46.00 | 51.00 | 31.00 | 51.96 | 62.76 | 67.11 | 68.15 | 34.00 | 52.68 | 74.72 | 35.00 |
| llama-3-70b-turbomind | 93.75 | 62.00 | 72.00 | 52.00 | 50.98 | 74.48 | 92.11 | 79.26 | 48.00 | 63.39 | 86.42 | 49.00 |
| internlm2-1.8b-turbomind | 38.89 | 37.00 | 44.00 | 35.00 | 30.39 | 49.66 | 50.66 | 44.44 | 25.00 | 35.71 | 51.32 | 32.00 |
| internlm2-7b-turbomind | 77.08 | 48.00 | 64.00 | 33.00 | 47.06 | 63.45 | 73.68 | 57.78 | 37.00 | 45.54 | 69.81 | 35.00 |
| internlm2-20b-turbomind | 83.33 | 51.00 | 61.00 | 36.00 | 45.10 | 64.83 | 75.00 | 59.26 | 39.00 | 53.57 | 73.58 | 32.00 |
| qwen-1.8b-turbomind | 42.36 | 36.00 | 39.00 | 34.00 | 27.45 | 51.03 | 50.66 | 42.96 | 31.00 | 31.25 | 53.21 | 28.00 |
| qwen-7b-turbomind | 67.36 | 48.00 | 53.00 | 28.00 | 39.22 | 59.31 | 63.82 | 49.63 | 34.00 | 38.39 | 63.02 | 37.00 |
| qwen-14b-turbomind | 78.47 | 51.00 | 62.00 | 42.00 | 49.02 | 65.52 | 71.05 | 60.00 | 37.00 | 58.93 | 71.32 | 40.00 |
| qwen-72b-turbomind | 93.75 | 56.00 | 66.00 | 56.00 | 50.98 | 80.69 | 85.53 | 73.33 | 41.00 | 62.50 | 83.77 | 54.00 |
| qwen1.5-0.5b-hf | 38.89 | 25.00 | 38.00 | 32.00 | 25.49 | 45.52 | 44.74 | 33.33 | 30.00 | 39.29 | 38.11 | 39.00 |
| qwen1.5-1.8b-hf | 43.75 | 34.00 | 45.00 | 38.00 | 28.43 | 47.59 | 47.37 | 40.74 | 32.00 | 31.25 | 53.96 | 37.00 |
| qwen1.5-4b-hf | 50.00 | 46.00 | 41.00 | 45.00 | 31.37 | 53.10 | 61.18 | 51.85 | 35.00 | 44.64 | 60.38 | 37.00 |
| qwen1.5-7b-hf | 66.67 | 48.00 | 55.00 | 37.00 | 41.18 | 60.69 | 65.79 | 52.59 | 39.00 | 41.07 | 68.68 | 43.00 |
| qwen1.5-14b-hf | 75.69 | 49.00 | 58.00 | 49.00 | 49.02 | 71.72 | 73.03 | 65.93 | 39.00 | 52.68 | 73.96 | 49.00 |
| qwen1.5-32b-hf | 85.42 | 53.00 | 59.00 | 51.00 | 53.92 | 72.41 | 82.24 | 63.70 | 43.00 | 58.04 | 78.11 | 50.00 |
| qwen1.5-72b-hf | 90.97 | 54.00 | 65.00 | 57.00 | 52.94 | 80.00 | 87.50 | 73.33 | 43.00 | 64.29 | 81.89 | 50.00 |
| qwen1.5-moe-a2-7b-hf | 62.50 | 44.00 | 54.00 | 41.00 | 49.02 | 58.62 | 69.74 | 57.78 | 37.00 | 38.39 | 66.79 | 38.00 |
| mistral-7b-v0.1-hf | 72.92 | 50.00 | 51.00 | 40.00 | 39.22 | 57.93 | 65.79 | 62.96 | 29.00 | 49.11 | 69.43 | 36.00 |
| mistral-7b-v0.2-hf | 71.53 | 49.00 | 53.00 | 40.00 | 36.27 | 57.24 | 64.47 | 60.00 | 29.00 | 53.57 | 67.92 | 39.00 |
| mixtral-8x7b-v0.1-hf | 85.42 | 54.00 | 62.00 | 43.00 | 46.08 | 68.97 | 82.89 | 70.37 | 37.00 | 56.25 | 79.25 | 51.00 |
| mixtral-8x22b-v0.1-hf | 89.58 | 56.00 | 69.00 | 48.00 | 52.94 | 76.55 | 86.18 | 77.04 | 53.00 | 62.50 | 82.26 | 56.00 |
| yi-6b-hf | 66.67 | 43.00 | 51.00 | 39.00 | 35.29 | 64.83 | 65.79 | 60.00 | 29.00 | 41.96 | 66.79 | 46.00 |
| yi-34b-hf | 88.89 | 52.00 | 66.00 | 44.00 | 48.04 | 80.00 | 89.47 | 74.81 | 44.00 | 58.04 | 78.87 | 52.00 |
| deepseek-7b-base-hf | 52.08 | 29.00 | 44.00 | 40.00 | 31.37 | 44.83 | 51.97 | 40.74 | 27.00 | 32.14 | 53.58 | 31.00 |
| deepseek-67b-base-hf | 84.72 | 52.00 | 62.00 | 42.00 | 42.16 | 70.34 | 80.92 | 65.19 | 39.00 | 50.00 | 78.11 | 42.00 |
| model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology |
|:------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:|
| llama-7b-turbomind | 33.01 | 39.22 | 45.73 | 26.24 | 33.33 | 51.24 | 24.25 | 45.00 | 31.09 | 30.05 | 37.00 | 35.13 |
| llama-13b-turbomind | 66.02 | 51.63 | 71.79 | 34.75 | 55.05 | 64.46 | 30.06 | 63.00 | 47.48 | 37.22 | 53.00 | 48.53 |
| llama-30b-turbomind | 76.70 | 62.42 | 84.19 | 44.68 | 71.72 | 75.21 | 40.56 | 66.00 | 57.98 | 46.48 | 66.00 | 63.73 |
| llama-65b-turbomind | 82.52 | 68.95 | 87.18 | 48.94 | 79.29 | 81.82 | 47.82 | 79.00 | 68.49 | 50.07 | 68.00 | 66.67 |
| llama-2-7b-turbomind | 53.40 | 48.69 | 68.38 | 36.52 | 49.49 | 65.29 | 24.02 | 60.00 | 44.12 | 36.31 | 55.00 | 43.79 |
| llama-2-13b-turbomind | 72.82 | 61.76 | 79.49 | 39.72 | 69.19 | 74.38 | 43.80 | 70.00 | 58.40 | 42.50 | 54.00 | 54.90 |
| llama-2-70b-turbomind | 83.50 | 77.12 | 91.03 | 56.03 | 86.87 | 87.60 | 44.69 | 77.00 | 77.31 | 52.93 | 74.00 | 75.65 |
| llama-3-8b-turbomind | 87.38 | 75.82 | 89.74 | 48.94 | 80.81 | 84.30 | 40.89 | 81.00 | 73.95 | 46.22 | 77.00 | 71.90 |
| llama-3-70b-turbomind | 91.26 | 87.25 | 94.87 | 64.18 | 93.94 | 89.26 | 62.91 | 83.00 | 87.82 | 61.80 | 90.00 | 85.78 |
| internlm2-1.8b-turbomind | 60.19 | 58.17 | 63.25 | 31.21 | 56.57 | 56.20 | 24.47 | 52.00 | 50.42 | 36.11 | 53.00 | 41.83 |
| internlm2-7b-turbomind | 79.61 | 75.49 | 87.61 | 48.23 | 82.83 | 77.69 | 49.39 | 74.00 | 72.27 | 47.65 | 73.00 | 65.03 |
| internlm2-20b-turbomind | 79.61 | 75.49 | 91.88 | 50.00 | 87.88 | 85.95 | 35.08 | 81.00 | 70.59 | 49.48 | 78.00 | 70.10 |
| qwen-1.8b-turbomind | 66.02 | 60.46 | 73.50 | 38.30 | 56.57 | 66.94 | 23.91 | 56.00 | 42.02 | 33.96 | 51.00 | 39.54 |
| qwen-7b-turbomind | 78.64 | 67.32 | 83.33 | 41.49 | 76.77 | 76.03 | 29.72 | 73.00 | 58.40 | 41.72 | 69.00 | 59.64 |
| qwen-14b-turbomind | 78.64 | 73.86 | 88.89 | 48.58 | 83.84 | 84.30 | 45.47 | 77.00 | 73.95 | 50.85 | 74.00 | 69.61 |
| qwen-72b-turbomind | 90.29 | 84.97 | 94.87 | 65.96 | 92.93 | 88.43 | 65.70 | 79.00 | 84.87 | 61.21 | 86.00 | 82.19 |
| qwen1.5-0.5b-hf | 52.43 | 46.41 | 60.68 | 31.21 | 46.46 | 56.20 | 25.70 | 46.00 | 37.39 | 32.79 | 46.00 | 37.75 |
| qwen1.5-1.8b-hf | 66.02 | 58.50 | 75.64 | 33.69 | 56.06 | 72.73 | 24.69 | 57.00 | 39.50 | 36.11 | 53.00 | 42.81 |
| qwen1.5-4b-hf | 74.76 | 62.75 | 84.19 | 46.81 | 76.77 | 71.07 | 25.03 | 67.00 | 55.04 | 41.33 | 64.00 | 56.05 |
| qwen1.5-7b-hf | 78.64 | 70.92 | 86.32 | 44.68 | 81.82 | 77.69 | 32.74 | 76.00 | 64.29 | 45.37 | 68.00 | 61.27 |
| qwen1.5-14b-hf | 80.58 | 75.49 | 85.90 | 51.06 | 86.36 | 80.99 | 45.03 | 80.00 | 76.47 | 48.57 | 78.00 | 69.61 |
| qwen1.5-32b-hf | 86.41 | 81.37 | 95.30 | 56.38 | 91.41 | 88.43 | 44.02 | 76.00 | 82.77 | 57.89 | 83.00 | 75.33 |
| qwen1.5-72b-hf | 87.38 | 85.29 | 94.87 | 64.89 | 92.42 | 90.08 | 62.12 | 83.00 | 84.03 | 60.76 | 86.00 | 81.05 |
| qwen1.5-moe-a2-7b-hf | 78.64 | 70.92 | 86.32 | 46.81 | 81.82 | 77.69 | 25.59 | 71.00 | 65.97 | 45.37 | 65.00 | 61.44 |
| mistral-7b-v0.1-hf | 82.52 | 75.49 | 87.61 | 48.94 | 76.77 | 77.69 | 32.51 | 77.00 | 66.39 | 44.98 | 74.00 | 67.97 |
| mistral-7b-v0.2-hf | 81.55 | 74.18 | 88.46 | 51.06 | 76.77 | 80.99 | 38.77 | 75.00 | 64.71 | 45.37 | 72.00 | 66.34 |
| mixtral-8x7b-v0.1-hf | 87.38 | 81.70 | 91.88 | 51.77 | 85.86 | 85.95 | 40.11 | 80.00 | 79.41 | 53.32 | 77.00 | 77.94 |
| mixtral-8x22b-v0.1-hf | 89.32 | 85.95 | 91.88 | 62.06 | 91.41 | 90.08 | 64.58 | 83.00 | 87.82 | 60.82 | 84.00 | 83.17 |
| yi-6b-hf | 80.58 | 71.57 | 91.03 | 48.23 | 83.33 | 76.86 | 41.34 | 75.00 | 74.79 | 49.35 | 80.00 | 65.69 |
| yi-34b-hf | 91.26 | 85.62 | 92.31 | 65.25 | 89.39 | 91.74 | 64.69 | 82.00 | 85.29 | 59.97 | 87.00 | 82.19 |
| deepseek-7b-base-hf | 61.17 | 53.59 | 72.22 | 34.04 | 59.09 | 65.29 | 26.37 | 61.00 | 44.96 | 35.53 | 56.00 | 49.18 |
| deepseek-67b-base-hf | 88.35 | 79.74 | 91.88 | 57.09 | 89.39 | 85.12 | 46.15 | 76.00 | 82.35 | 55.93 | 72.00 | 79.58 |
| model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history |
|:------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:|
| llama-7b-turbomind | 41.67 | 49.12 | 40.84 | 34.94 | 29.56 | 40.00 | 34.10 | 35.11 | 26.46 | 27.81 | 34.00 | 41.82 |
| llama-13b-turbomind | 51.85 | 67.84 | 55.31 | 43.37 | 28.57 | 60.91 | 46.15 | 57.25 | 26.98 | 29.80 | 49.00 | 61.21 |
| llama-30b-turbomind | 71.30 | 79.53 | 66.24 | 49.40 | 40.39 | 70.00 | 56.67 | 64.89 | 37.30 | 35.10 | 60.00 | 70.91 |
| llama-65b-turbomind | 75.00 | 81.29 | 73.63 | 53.01 | 41.38 | 74.55 | 65.90 | 77.86 | 40.21 | 35.76 | 69.00 | 76.36 |
| llama-2-7b-turbomind | 53.70 | 69.01 | 60.13 | 41.57 | 36.95 | 54.55 | 45.90 | 55.73 | 27.25 | 31.13 | 40.00 | 59.39 |
| llama-2-13b-turbomind | 74.07 | 76.61 | 63.99 | 45.78 | 44.83 | 62.73 | 50.77 | 62.60 | 34.13 | 36.42 | 57.00 | 63.03 |
| llama-2-70b-turbomind | 83.33 | 85.96 | 78.46 | 53.61 | 52.22 | 69.09 | 74.87 | 87.02 | 43.39 | 43.71 | 78.00 | 84.24 |
| llama-3-8b-turbomind | 75.00 | 83.04 | 74.28 | 56.02 | 54.68 | 71.82 | 64.87 | 79.39 | 42.06 | 45.03 | 68.00 | 76.36 |
| llama-3-70b-turbomind | 86.11 | 91.23 | 86.50 | 57.83 | 71.92 | 74.55 | 82.56 | 88.55 | 62.70 | 56.95 | 86.00 | 86.67 |
| internlm2-1.8b-turbomind | 55.56 | 59.65 | 51.13 | 40.96 | 43.35 | 52.73 | 43.33 | 47.33 | 30.42 | 33.11 | 47.00 | 56.36 |
| internlm2-7b-turbomind | 79.63 | 82.46 | 73.63 | 51.20 | 55.17 | 70.00 | 66.92 | 70.99 | 46.03 | 42.38 | 70.00 | 78.79 |
| internlm2-20b-turbomind | 75.93 | 82.46 | 73.95 | 56.02 | 57.64 | 68.18 | 70.51 | 68.70 | 49.21 | 38.41 | 75.00 | 82.42 |
| qwen-1.8b-turbomind | 59.26 | 56.14 | 50.80 | 40.96 | 37.93 | 60.00 | 41.03 | 51.15 | 33.33 | 34.44 | 39.00 | 64.24 |
| qwen-7b-turbomind | 73.15 | 76.61 | 67.20 | 47.59 | 51.23 | 65.45 | 60.00 | 69.47 | 43.12 | 38.41 | 67.00 | 66.67 |
| qwen-14b-turbomind | 76.85 | 84.21 | 72.03 | 53.01 | 65.52 | 66.36 | 66.92 | 78.63 | 51.32 | 41.72 | 72.00 | 82.42 |
| qwen-72b-turbomind | 83.33 | 88.30 | 83.28 | 58.43 | 65.52 | 74.55 | 81.54 | 89.31 | 68.52 | 58.28 | 81.00 | 84.24 |
| qwen1.5-0.5b-hf | 40.74 | 40.94 | 41.48 | 40.96 | 28.57 | 50.91 | 36.92 | 41.98 | 28.84 | 22.52 | 37.00 | 52.73 |
| qwen1.5-1.8b-hf | 55.56 | 57.31 | 49.84 | 40.96 | 36.45 | 56.36 | 43.59 | 56.49 | 35.19 | 27.81 | 45.00 | 61.21 |
| qwen1.5-4b-hf | 70.37 | 70.76 | 61.74 | 44.58 | 45.32 | 65.45 | 54.62 | 64.89 | 47.88 | 32.45 | 62.00 | 70.30 |
| qwen1.5-7b-hf | 75.93 | 77.19 | 66.24 | 50.60 | 53.20 | 62.73 | 60.00 | 71.76 | 50.26 | 38.41 | 71.00 | 74.55 |
| qwen1.5-14b-hf | 74.07 | 83.63 | 70.74 | 46.39 | 58.62 | 64.55 | 73.59 | 76.34 | 59.26 | 49.01 | 75.00 | 83.64 |
| qwen1.5-32b-hf | 83.33 | 85.96 | 82.96 | 56.63 | 61.58 | 63.64 | 77.95 | 83.97 | 69.31 | 50.99 | 85.00 | 86.06 |
| qwen1.5-72b-hf | 84.26 | 88.89 | 82.32 | 57.23 | 66.01 | 72.73 | 82.05 | 87.02 | 69.31 | 56.95 | 84.00 | 84.24 |
| qwen1.5-moe-a2-7b-hf | 70.37 | 80.12 | 66.56 | 51.20 | 47.78 | 64.55 | 62.31 | 70.99 | 46.30 | 45.03 | 59.00 | 69.70 |
| mistral-7b-v0.1-hf | 77.78 | 83.04 | 69.45 | 54.82 | 53.20 | 67.27 | 66.15 | 78.63 | 38.10 | 31.79 | 68.00 | 78.79 |
| mistral-7b-v0.2-hf | 73.15 | 82.46 | 72.99 | 53.01 | 55.67 | 66.36 | 62.31 | 77.10 | 40.48 | 34.44 | 66.00 | 76.36 |
| mixtral-8x7b-v0.1-hf | 82.41 | 88.30 | 78.14 | 51.20 | 62.56 | 70.00 | 70.77 | 80.92 | 48.68 | 48.34 | 71.00 | 80.61 |
| mixtral-8x22b-v0.1-hf | 84.26 | 89.47 | 84.57 | 59.04 | 67.49 | 78.18 | 79.23 | 88.55 | 61.64 | 52.98 | 87.00 | 86.06 |
| yi-6b-hf | 78.70 | 81.87 | 69.77 | 46.39 | 52.71 | 73.64 | 65.13 | 74.81 | 46.30 | 38.41 | 66.00 | 71.52 |
| yi-34b-hf | 89.81 | 86.55 | 83.92 | 57.23 | 64.04 | 73.64 | 79.49 | 85.50 | 66.40 | 52.32 | 81.00 | 86.06 |
| deepseek-7b-base-hf | 55.56 | 73.10 | 56.59 | 46.99 | 34.98 | 62.73 | 48.21 | 58.78 | 28.57 | 29.14 | 50.00 | 61.82 |
| deepseek-67b-base-hf | 84.26 | 85.96 | 81.03 | 56.02 | 57.64 | 72.73 | 73.85 | 82.44 | 51.59 | 45.03 | 74.00 | 81.82 |
| model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | logical_fallacies | high_school_world_history | professional_medicine |
|:------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:|
| llama-7b-turbomind | 42.00 | 40.46 | 32.87 | 42.78 | 26.19 | 46.11 | 35.19 | 33.47 | 32.90 | 42.33 | 43.88 | 43.75 |
| llama-13b-turbomind | 46.00 | 50.00 | 30.56 | 64.88 | 31.75 | 66.84 | 51.85 | 52.65 | 51.94 | 52.76 | 67.51 | 51.10 |
| llama-30b-turbomind | 55.00 | 66.76 | 49.07 | 77.91 | 36.51 | 82.90 | 68.21 | 66.12 | 69.35 | 67.48 | 80.59 | 55.88 |
| llama-65b-turbomind | 59.00 | 73.70 | 61.57 | 81.35 | 43.65 | 88.60 | 73.46 | 71.84 | 74.19 | 77.30 | 83.97 | 62.13 |
| llama-2-7b-turbomind | 53.00 | 51.16 | 27.78 | 63.60 | 27.78 | 67.36 | 48.77 | 47.76 | 50.97 | 51.53 | 64.56 | 52.57 |
| llama-2-13b-turbomind | 54.00 | 64.45 | 45.37 | 74.46 | 36.51 | 80.83 | 64.81 | 62.86 | 67.42 | 66.87 | 72.15 | 54.41 |
| llama-2-70b-turbomind | 72.00 | 77.17 | 63.43 | 86.08 | 48.41 | 94.30 | 83.64 | 78.37 | 81.61 | 80.98 | 87.76 | 74.63 |
| llama-3-8b-turbomind | 62.00 | 73.70 | 54.17 | 82.76 | 48.41 | 90.16 | 72.53 | 75.51 | 77.74 | 73.01 | 82.70 | 72.06 |
| llama-3-70b-turbomind | 83.00 | 85.55 | 72.22 | 92.21 | 66.67 | 97.41 | 91.05 | 84.90 | 90.32 | 87.73 | 94.09 | 87.13 |
| internlm2-1.8b-turbomind | 44.00 | 45.95 | 38.89 | 59.39 | 32.54 | 60.62 | 50.31 | 54.29 | 52.58 | 45.40 | 62.87 | 37.87 |
| internlm2-7b-turbomind | 69.00 | 66.76 | 57.87 | 80.72 | 50.00 | 90.16 | 73.15 | 75.10 | 79.68 | 68.71 | 81.01 | 70.22 |
| internlm2-20b-turbomind | 74.00 | 74.57 | 60.19 | 81.48 | 44.44 | 91.71 | 75.31 | 81.63 | 82.58 | 75.46 | 87.76 | 63.60 |
| qwen-1.8b-turbomind | 52.00 | 52.31 | 34.72 | 57.98 | 29.37 | 59.07 | 47.22 | 48.57 | 52.26 | 44.17 | 61.18 | 43.38 |
| qwen-7b-turbomind | 68.00 | 64.74 | 45.37 | 77.39 | 43.65 | 83.94 | 68.21 | 70.20 | 72.26 | 65.64 | 75.95 | 58.46 |
| qwen-14b-turbomind | 75.00 | 74.86 | 57.87 | 84.04 | 51.59 | 91.71 | 70.99 | 77.14 | 83.55 | 73.01 | 83.12 | 67.65 |
| qwen-72b-turbomind | 80.00 | 84.97 | 68.98 | 91.44 | 54.76 | 98.96 | 87.04 | 81.63 | 89.03 | 84.05 | 90.30 | 84.93 |
| qwen1.5-0.5b-hf | 47.00 | 46.82 | 23.15 | 48.02 | 29.37 | 48.70 | 40.12 | 38.37 | 40.65 | 35.58 | 53.16 | 31.62 |
| qwen1.5-1.8b-hf | 54.00 | 54.91 | 28.70 | 61.69 | 23.81 | 58.03 | 48.15 | 51.84 | 55.48 | 45.40 | 59.92 | 39.71 |
| qwen1.5-4b-hf | 65.00 | 66.76 | 44.44 | 73.95 | 35.71 | 78.24 | 60.19 | 65.31 | 66.45 | 65.64 | 71.31 | 50.00 |
| qwen1.5-7b-hf | 68.00 | 70.81 | 48.61 | 76.50 | 38.89 | 84.97 | 69.44 | 68.16 | 74.52 | 68.10 | 77.22 | 56.25 |
| qwen1.5-14b-hf | 77.00 | 73.70 | 62.96 | 83.40 | 53.17 | 90.67 | 71.60 | 80.82 | 84.52 | 76.69 | 83.54 | 71.69 |
| qwen1.5-32b-hf | 77.00 | 78.90 | 68.98 | 88.12 | 54.76 | 94.82 | 81.48 | 80.82 | 88.39 | 82.21 | 86.08 | 80.88 |
| qwen1.5-72b-hf | 80.00 | 84.39 | 68.98 | 91.44 | 55.56 | 98.96 | 86.73 | 81.63 | 88.71 | 85.89 | 89.87 | 82.72 |
| qwen1.5-moe-a2-7b-hf | 74.00 | 65.90 | 56.48 | 82.25 | 34.13 | 84.46 | 70.68 | 74.29 | 73.23 | 68.10 | 76.79 | 66.91 |
| mistral-7b-v0.1-hf | 57.00 | 71.10 | 57.41 | 81.61 | 40.48 | 86.53 | 73.46 | 72.65 | 76.77 | 79.14 | 77.22 | 68.75 |
| mistral-7b-v0.2-hf | 61.00 | 71.39 | 52.78 | 80.08 | 40.48 | 88.08 | 69.44 | 72.24 | 76.13 | 77.91 | 78.06 | 70.59 |
| mixtral-8x7b-v0.1-hf | 77.00 | 80.06 | 63.43 | 87.87 | 54.76 | 93.26 | 83.95 | 80.00 | 84.19 | 79.14 | 88.61 | 81.25 |
| mixtral-8x22b-v0.1-hf | 72.00 | 84.10 | 68.52 | 90.68 | 57.14 | 96.37 | 86.73 | 86.53 | 90.32 | 87.73 | 90.30 | 87.87 |
| yi-6b-hf | 67.00 | 69.36 | 52.78 | 80.46 | 44.44 | 89.64 | 70.99 | 74.69 | 77.10 | 78.53 | 78.90 | 65.81 |
| yi-34b-hf | 79.00 | 83.82 | 66.67 | 90.29 | 57.14 | 97.93 | 87.65 | 84.90 | 88.39 | 87.73 | 92.83 | 81.99 |
| deepseek-7b-base-hf | 49.00 | 52.31 | 41.20 | 66.28 | 30.95 | 63.73 | 55.86 | 51.84 | 52.90 | 58.90 | 62.45 | 45.22 |
| deepseek-67b-base-hf | 81.00 | 77.17 | 63.89 | 90.04 | 53.17 | 97.93 | 85.49 | 73.88 | 82.26 | 84.05 | 91.56 | 78.31 |
| model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy |
|:------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:|
| llama-7b-turbomind | 24.81 | 32.95 | 38.73 | 45.77 | 27.19 | 48.07 | 38.12 | 43.00 |
| llama-13b-turbomind | 26.30 | 42.20 | 59.80 | 61.19 | 28.95 | 61.28 | 53.36 | 78.00 |
| llama-30b-turbomind | 27.41 | 54.91 | 76.96 | 79.10 | 35.96 | 76.15 | 67.71 | 83.00 |
| llama-65b-turbomind | 34.44 | 54.34 | 82.84 | 81.09 | 39.47 | 82.39 | 66.37 | 88.00 |
| llama-2-7b-turbomind | 29.63 | 43.35 | 60.29 | 62.69 | 27.19 | 62.75 | 56.05 | 64.00 |
| llama-2-13b-turbomind | 27.04 | 52.60 | 75.49 | 73.13 | 32.46 | 76.51 | 64.57 | 82.00 |
| llama-2-70b-turbomind | 34.07 | 64.16 | 90.69 | 90.55 | 44.74 | 87.52 | 80.27 | 92.00 |
| llama-3-8b-turbomind | 38.15 | 64.16 | 83.33 | 86.57 | 47.37 | 84.04 | 70.85 | 87.00 |
| llama-3-70b-turbomind | 48.89 | 79.77 | 95.10 | 94.03 | 72.81 | 94.13 | 82.51 | 94.00 |
| internlm2-1.8b-turbomind | 30.37 | 41.04 | 55.88 | 51.74 | 28.95 | 61.47 | 51.12 | 63.00 |
| internlm2-7b-turbomind | 39.63 | 68.21 | 76.96 | 84.58 | 44.74 | 84.59 | 72.65 | 86.00 |
| internlm2-20b-turbomind | 39.63 | 66.47 | 82.84 | 85.07 | 47.37 | 86.79 | 70.85 | 84.00 |
| qwen-1.8b-turbomind | 28.52 | 43.35 | 54.90 | 60.70 | 36.84 | 60.73 | 48.43 | 60.00 |
| qwen-7b-turbomind | 30.00 | 57.23 | 75.98 | 79.10 | 32.46 | 79.27 | 63.23 | 81.00 |
| qwen-14b-turbomind | 37.41 | 70.52 | 81.37 | 85.07 | 50.00 | 84.95 | 73.09 | 86.00 |
| qwen-72b-turbomind | 50.00 | 75.72 | 92.16 | 90.05 | 59.65 | 92.66 | 82.51 | 95.00 |
| qwen1.5-0.5b-hf | 29.63 | 33.53 | 45.10 | 59.70 | 28.95 | 44.77 | 37.22 | 69.00 |
| qwen1.5-1.8b-hf | 34.07 | 39.31 | 47.55 | 63.18 | 32.46 | 59.08 | 53.81 | 73.00 |
| qwen1.5-4b-hf | 35.93 | 55.49 | 71.08 | 73.13 | 37.72 | 72.11 | 63.68 | 79.00 |
| qwen1.5-7b-hf | 34.81 | 61.85 | 78.92 | 82.09 | 41.23 | 80.73 | 61.88 | 84.00 |
| qwen1.5-14b-hf | 45.93 | 68.21 | 80.88 | 83.08 | 55.26 | 86.06 | 73.09 | 88.00 |
| qwen1.5-32b-hf | 47.04 | 76.30 | 90.20 | 86.07 | 57.89 | 90.28 | 75.78 | 92.00 |
| qwen1.5-72b-hf | 47.78 | 75.14 | 92.65 | 88.56 | 59.65 | 92.48 | 79.82 | 94.00 |
| qwen1.5-moe-a2-7b-hf | 46.30 | 54.91 | 78.43 | 79.10 | 38.60 | 82.39 | 66.82 | 83.00 |
| mistral-7b-v0.1-hf | 33.70 | 65.32 | 78.92 | 83.08 | 50.00 | 82.39 | 69.51 | 86.00 |
| mistral-7b-v0.2-hf | 38.15 | 64.16 | 81.86 | 82.09 | 43.86 | 80.18 | 69.96 | 86.00 |
| mixtral-8x7b-v0.1-hf | 40.37 | 69.94 | 86.27 | 88.56 | 65.79 | 88.81 | 79.37 | 91.00 |
| mixtral-8x22b-v0.1-hf | 45.93 | 79.19 | 90.20 | 93.03 | 70.18 | 92.29 | 79.37 | 95.00 |
| yi-6b-hf | 32.59 | 61.27 | 79.90 | 82.59 | 35.96 | 82.94 | 67.26 | 86.00 |
| yi-34b-hf | 45.19 | 71.68 | 91.18 | 88.56 | 55.26 | 91.74 | 78.48 | 91.00 |
| deepseek-7b-base-hf | 28.89 | 41.62 | 60.29 | 70.15 | 26.32 | 69.72 | 55.61 | 76.00 |
| deepseek-67b-base-hf | 38.89 | 72.25 | 90.69 | 90.05 | 52.63 | 90.46 | 80.72 | 95.00 |
## Chat Models
| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other |
|:-----------------------------:|-------:|------------:|----------------------:|------------------:|-------------:|
| qwen1.5-0.5b-chat-hf | 35.32 | 30.90 | 37.59 | 37.29 | 37.73 |
| qwen1.5-1.8b-chat-hf | 45.62 | 39.20 | 49.21 | 47.67 | 49.63 |
| qwen1.5-4b-chat-hf | 55.90 | 48.07 | 62.67 | 59.70 | 57.31 |
| qwen1.5-7b-chat-hf | 61.79 | 52.68 | 69.41 | 66.41 | 63.45 |
| qwen1.5-14b-chat-hf | 67.96 | 59.79 | 75.46 | 71.23 | 69.72 |
| qwen1.5-32b-chat-hf | 75.36 | 67.04 | 82.11 | 80.44 | 76.23 |
| qwen1.5-72b-chat-hf | 77.24 | 69.59 | 83.95 | 81.58 | 77.87 |
| qwen1.5-110b-chat-hf | 77.95 | 71.56 | 83.77 | 81.44 | 78.41 |
| internlm2-chat-1.8b-hf | 47.58 | 40.88 | 53.33 | 49.92 | 49.74 |
| internlm2-chat-1.8b-sft-hf | 47.44 | 40.55 | 53.31 | 49.67 | 49.89 |
| internlm2-chat-7b-hf | 63.05 | 53.42 | 71.47 | 67.27 | 65.13 |
| internlm2-chat-7b-sft-hf | 63.33 | 53.95 | 71.74 | 67.62 | 65.00 |
| internlm2-chat-20b-hf | 67.37 | 57.39 | 75.75 | 71.63 | 69.95 |
| internlm2-chat-20b-sft-hf | 67.34 | 57.49 | 75.67 | 70.99 | 70.40 |
| llama-3-8b-instruct-hf | 68.37 | 58.01 | 77.82 | 71.22 | 71.94 |
| llama-3-70b-instruct-hf | 80.93 | 73.86 | 87.71 | 83.90 | 82.01 |
| llama-3-8b-instruct-lmdeploy | 67.35 | 56.66 | 75.96 | 70.90 | 71.49 |
| llama-3-70b-instruct-lmdeploy | 80.85 | 74.07 | 87.26 | 83.73 | 81.96 |
| mistral-7b-instruct-v0.1-hf | 54.36 | 43.74 | 62.96 | 58.87 | 57.46 |
| mistral-7b-instruct-v0.2-hf | 59.98 | 49.56 | 69.22 | 64.41 | 62.24 |
| mixtral-8x7b-instruct-v0.1-hf | 70.11 | 60.29 | 79.01 | 74.08 | 72.28 |
### Details
| model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts |
|:-----------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:|
| qwen1.5-0.5b-chat-hf | 31.25 | 32.00 | 33.00 | 29.00 | 33.33 | 38.62 | 33.55 | 28.89 | 20.00 | 27.68 | 40.38 | 33.00 |
| qwen1.5-1.8b-chat-hf | 42.36 | 28.00 | 45.00 | 33.00 | 27.45 | 44.83 | 51.97 | 42.22 | 32.00 | 38.39 | 48.30 | 30.00 |
| qwen1.5-4b-chat-hf | 56.25 | 47.00 | 49.00 | 39.00 | 36.27 | 54.48 | 57.89 | 49.63 | 38.00 | 33.04 | 59.62 | 23.00 |
| qwen1.5-7b-chat-hf | 64.58 | 51.00 | 59.00 | 37.00 | 41.18 | 53.79 | 66.45 | 53.33 | 43.00 | 41.07 | 67.92 | 36.00 |
| qwen1.5-14b-chat-hf | 77.08 | 51.00 | 64.00 | 42.00 | 45.10 | 64.83 | 77.63 | 65.93 | 39.00 | 46.43 | 73.21 | 45.00 |
| qwen1.5-32b-chat-hf | 84.72 | 53.00 | 57.00 | 48.00 | 52.94 | 74.48 | 82.24 | 67.41 | 52.00 | 61.61 | 78.11 | 48.00 |
| qwen1.5-72b-chat-hf | 90.97 | 57.00 | 66.00 | 55.00 | 55.88 | 80.00 | 88.16 | 72.59 | 56.00 | 59.82 | 80.00 | 51.00 |
| qwen1.5-110b-chat-hf | 88.89 | 62.00 | 66.00 | 64.00 | 58.82 | 75.86 | 89.47 | 68.15 | 59.00 | 63.39 | 79.62 | 59.00 |
| internlm2-chat-1.8b-hf | 49.31 | 36.00 | 47.00 | 33.00 | 36.27 | 42.76 | 48.03 | 49.63 | 30.00 | 33.93 | 53.58 | 28.00 |
| internlm2-chat-1.8b-sft-hf | 51.39 | 37.00 | 50.00 | 33.00 | 33.33 | 42.76 | 46.05 | 49.63 | 31.00 | 32.14 | 53.21 | 29.00 |
| internlm2-chat-7b-hf | 68.75 | 47.00 | 62.00 | 32.00 | 38.24 | 57.24 | 69.74 | 58.52 | 29.00 | 53.57 | 70.19 | 41.00 |
| internlm2-chat-7b-sft-hf | 71.53 | 47.00 | 63.00 | 34.00 | 37.25 | 57.24 | 69.74 | 57.78 | 29.00 | 52.68 | 69.43 | 34.00 |
| internlm2-chat-20b-hf | 76.39 | 51.00 | 61.00 | 37.00 | 40.20 | 62.76 | 78.95 | 67.41 | 33.00 | 46.43 | 75.09 | 42.00 |
| internlm2-chat-20b-sft-hf | 77.08 | 49.00 | 60.00 | 39.00 | 39.22 | 64.14 | 79.61 | 68.15 | 35.00 | 46.43 | 75.09 | 42.00 |
| llama-3-8b-instruct-hf | 81.94 | 48.00 | 58.00 | 43.00 | 48.04 | 60.69 | 76.32 | 71.11 | 33.00 | 54.46 | 73.58 | 46.00 |
| llama-3-70b-instruct-hf | 93.06 | 56.00 | 70.00 | 60.00 | 60.78 | 77.24 | 93.42 | 79.26 | 53.00 | 71.43 | 86.42 | 66.00 |
| llama-3-8b-instruct-lmdeploy | 79.17 | 47.00 | 53.00 | 36.00 | 49.02 | 60.00 | 73.68 | 68.89 | 36.00 | 55.36 | 73.96 | 42.00 |
| llama-3-70b-instruct-lmdeploy | 93.75 | 57.00 | 66.00 | 61.00 | 65.69 | 77.93 | 92.11 | 78.52 | 55.00 | 70.54 | 86.42 | 64.00 |
| mistral-7b-instruct-v0.1-hf | 57.64 | 35.00 | 50.00 | 31.00 | 24.51 | 51.72 | 58.55 | 45.93 | 35.00 | 41.07 | 56.98 | 32.00 |
| mistral-7b-instruct-v0.2-hf | 70.14 | 42.00 | 49.00 | 35.00 | 43.14 | 54.48 | 65.79 | 56.30 | 29.00 | 42.86 | 65.28 | 37.00 |
| mixtral-8x7b-instruct-v0.1-hf | 81.25 | 57.00 | 57.00 | 40.00 | 50.00 | 60.69 | 80.92 | 65.93 | 45.00 | 50.89 | 76.60 | 41.00 |
| model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology |
|:-----------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:|
| qwen1.5-0.5b-chat-hf | 41.75 | 38.89 | 49.15 | 26.60 | 48.48 | 50.41 | 24.69 | 42.00 | 32.35 | 31.75 | 31.00 | 32.35 |
| qwen1.5-1.8b-chat-hf | 62.14 | 55.56 | 76.92 | 34.40 | 58.08 | 61.16 | 21.90 | 56.00 | 42.44 | 35.14 | 50.00 | 44.93 |
| qwen1.5-4b-chat-hf | 73.79 | 58.50 | 82.05 | 47.16 | 74.24 | 71.90 | 32.29 | 69.00 | 58.40 | 40.74 | 58.00 | 53.76 |
| qwen1.5-7b-chat-hf | 79.61 | 69.28 | 85.47 | 41.49 | 78.79 | 76.86 | 35.75 | 74.00 | 65.13 | 44.78 | 68.00 | 57.68 |
| qwen1.5-14b-chat-hf | 82.52 | 70.26 | 87.18 | 51.77 | 85.86 | 82.64 | 53.74 | 81.00 | 76.05 | 47.98 | 76.00 | 67.48 |
| qwen1.5-32b-chat-hf | 84.47 | 77.78 | 94.44 | 60.99 | 90.91 | 87.60 | 72.96 | 79.00 | 83.61 | 58.28 | 83.00 | 77.94 |
| qwen1.5-72b-chat-hf | 89.32 | 85.95 | 93.59 | 61.35 | 90.91 | 86.78 | 75.98 | 83.00 | 84.87 | 60.30 | 83.00 | 81.05 |
| qwen1.5-110b-chat-hf | 86.41 | 80.72 | 92.74 | 69.15 | 93.94 | 84.30 | 77.88 | 83.00 | 88.66 | 61.73 | 84.00 | 82.19 |
| internlm2-chat-1.8b-hf | 72.82 | 50.65 | 69.23 | 35.46 | 56.06 | 56.20 | 27.82 | 60.00 | 49.16 | 33.83 | 54.00 | 43.79 |
| internlm2-chat-1.8b-sft-hf | 71.84 | 52.61 | 68.80 | 34.75 | 55.56 | 53.72 | 27.04 | 58.00 | 48.74 | 34.09 | 54.00 | 44.61 |
| internlm2-chat-7b-hf | 78.64 | 66.67 | 85.90 | 46.81 | 79.29 | 70.25 | 35.31 | 79.00 | 68.07 | 46.41 | 68.00 | 64.87 |
| internlm2-chat-7b-sft-hf | 79.61 | 67.97 | 86.75 | 47.52 | 80.30 | 70.25 | 35.98 | 80.00 | 69.33 | 45.83 | 70.00 | 65.36 |
| internlm2-chat-20b-hf | 80.58 | 75.16 | 90.17 | 52.13 | 83.84 | 80.99 | 39.33 | 80.00 | 70.59 | 49.67 | 75.00 | 70.26 |
| internlm2-chat-20b-sft-hf | 80.58 | 76.14 | 91.03 | 53.19 | 84.34 | 80.99 | 36.31 | 77.00 | 71.85 | 49.61 | 77.00 | 70.59 |
| llama-3-8b-instruct-hf | 82.52 | 79.41 | 91.45 | 52.48 | 80.30 | 79.34 | 46.26 | 75.00 | 76.89 | 49.61 | 85.00 | 72.22 |
| llama-3-70b-instruct-hf | 89.32 | 87.58 | 93.16 | 66.67 | 92.42 | 90.08 | 76.20 | 83.00 | 89.50 | 64.67 | 92.00 | 87.09 |
| llama-3-8b-instruct-lmdeploy | 87.38 | 79.41 | 90.17 | 52.48 | 79.80 | 78.51 | 44.25 | 75.00 | 74.37 | 48.76 | 84.00 | 69.61 |
| llama-3-70b-instruct-lmdeploy | 90.29 | 88.56 | 93.59 | 65.96 | 92.93 | 89.26 | 75.75 | 83.00 | 89.92 | 63.95 | 92.00 | 86.60 |
| mistral-7b-instruct-v0.1-hf | 69.90 | 59.80 | 85.47 | 38.65 | 69.70 | 65.29 | 37.54 | 69.00 | 51.26 | 37.81 | 65.00 | 52.45 |
| mistral-7b-instruct-v0.2-hf | 74.76 | 66.99 | 88.89 | 43.97 | 75.25 | 76.86 | 42.01 | 73.00 | 62.61 | 42.24 | 67.00 | 62.25 |
| mixtral-8x7b-instruct-v0.1-hf | 85.44 | 80.39 | 92.74 | 55.32 | 85.35 | 82.64 | 48.38 | 78.00 | 75.21 | 53.52 | 75.00 | 74.02 |
| model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history |
|:-----------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:|
| qwen1.5-0.5b-chat-hf | 42.59 | 24.56 | 39.87 | 39.76 | 29.06 | 38.18 | 35.64 | 38.93 | 27.78 | 29.80 | 34.00 | 48.48 |
| qwen1.5-1.8b-chat-hf | 50.93 | 56.73 | 44.37 | 42.77 | 35.96 | 51.82 | 38.46 | 49.62 | 35.45 | 27.15 | 47.00 | 63.03 |
| qwen1.5-4b-chat-hf | 71.30 | 65.50 | 58.20 | 50.00 | 44.33 | 57.27 | 54.10 | 61.83 | 43.65 | 41.06 | 60.00 | 72.12 |
| qwen1.5-7b-chat-hf | 76.85 | 76.61 | 68.49 | 48.80 | 51.72 | 64.55 | 59.23 | 68.70 | 48.94 | 37.09 | 69.00 | 79.39 |
| qwen1.5-14b-chat-hf | 75.93 | 80.70 | 69.13 | 51.20 | 55.67 | 64.55 | 67.69 | 74.05 | 57.14 | 47.02 | 74.00 | 82.42 |
| qwen1.5-32b-chat-hf | 83.33 | 89.47 | 82.64 | 60.84 | 62.56 | 70.00 | 76.67 | 83.21 | 67.46 | 59.60 | 85.00 | 84.85 |
| qwen1.5-72b-chat-hf | 86.11 | 89.47 | 80.71 | 59.04 | 68.47 | 72.73 | 80.00 | 87.79 | 67.72 | 52.32 | 79.00 | 85.45 |
| qwen1.5-110b-chat-hf | 83.33 | 87.13 | 81.03 | 54.22 | 69.95 | 73.64 | 78.21 | 87.02 | 75.93 | 57.62 | 84.00 | 88.48 |
| internlm2-chat-1.8b-hf | 52.78 | 60.82 | 49.20 | 42.77 | 42.36 | 50.00 | 47.18 | 53.44 | 32.54 | 31.79 | 39.00 | 60.00 |
| internlm2-chat-1.8b-sft-hf | 53.70 | 61.40 | 50.16 | 42.17 | 40.89 | 50.00 | 47.69 | 51.15 | 32.54 | 29.14 | 40.00 | 59.39 |
| internlm2-chat-7b-hf | 73.15 | 81.87 | 67.85 | 47.59 | 49.75 | 62.73 | 61.79 | 66.41 | 44.97 | 33.77 | 71.00 | 81.82 |
| internlm2-chat-7b-sft-hf | 73.15 | 81.87 | 66.88 | 48.19 | 48.77 | 63.64 | 62.31 | 65.65 | 45.77 | 33.77 | 72.00 | 81.82 |
| internlm2-chat-20b-hf | 80.56 | 81.87 | 72.99 | 55.42 | 54.19 | 70.00 | 67.95 | 71.76 | 48.15 | 39.74 | 75.00 | 80.00 |
| internlm2-chat-20b-sft-hf | 81.48 | 79.53 | 72.99 | 54.82 | 54.19 | 69.09 | 67.95 | 71.76 | 48.94 | 41.06 | 75.00 | 80.00 |
| llama-3-8b-instruct-hf | 76.85 | 79.53 | 72.35 | 53.61 | 54.19 | 70.91 | 66.41 | 80.92 | 49.47 | 46.36 | 71.00 | 75.15 |
| llama-3-70b-instruct-hf | 87.04 | 88.30 | 82.64 | 56.02 | 67.49 | 74.55 | 86.41 | 88.55 | 74.34 | 65.56 | 91.00 | 86.06 |
| llama-3-8b-instruct-lmdeploy | 77.78 | 79.53 | 70.74 | 52.41 | 53.20 | 68.18 | 65.38 | 79.39 | 50.79 | 37.75 | 72.00 | 76.97 |
| llama-3-70b-instruct-lmdeploy | 87.96 | 90.64 | 83.28 | 54.82 | 69.46 | 73.64 | 86.92 | 87.02 | 74.87 | 66.23 | 92.00 | 85.45 |
| mistral-7b-instruct-v0.1-hf | 64.81 | 70.18 | 63.67 | 41.57 | 38.92 | 68.18 | 49.49 | 61.83 | 33.33 | 32.45 | 55.00 | 66.67 |
| mistral-7b-instruct-v0.2-hf | 70.37 | 80.12 | 64.95 | 50.60 | 50.74 | 68.18 | 54.36 | 71.76 | 40.74 | 35.10 | 60.00 | 73.33 |
| mixtral-8x7b-instruct-v0.1-hf | 79.63 | 87.72 | 73.63 | 54.82 | 61.58 | 67.27 | 69.49 | 83.21 | 52.91 | 47.02 | 74.00 | 80.61 |
| model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | logical_fallacies | high_school_world_history | professional_medicine |
|:-----------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:|
| qwen1.5-0.5b-chat-hf | 45.00 | 41.04 | 30.09 | 39.21 | 24.60 | 35.23 | 33.95 | 25.31 | 36.13 | 31.29 | 49.37 | 38.24 |
| qwen1.5-1.8b-chat-hf | 54.00 | 50.29 | 34.26 | 58.49 | 24.60 | 55.96 | 47.53 | 39.18 | 47.74 | 44.17 | 64.98 | 40.81 |
| qwen1.5-4b-chat-hf | 61.00 | 64.16 | 46.30 | 71.01 | 39.68 | 72.02 | 54.01 | 65.31 | 63.55 | 63.80 | 71.31 | 51.10 |
| qwen1.5-7b-chat-hf | 69.00 | 67.05 | 50.93 | 76.25 | 53.17 | 82.38 | 62.96 | 71.02 | 73.23 | 68.10 | 76.79 | 60.29 |
| qwen1.5-14b-chat-hf | 74.00 | 75.14 | 58.33 | 82.89 | 51.59 | 88.60 | 69.44 | 77.96 | 84.19 | 73.62 | 82.70 | 71.32 |
| qwen1.5-32b-chat-hf | 80.00 | 80.64 | 70.83 | 89.40 | 60.32 | 94.82 | 81.79 | 79.59 | 90.00 | 86.50 | 88.61 | 80.15 |
| qwen1.5-72b-chat-hf | 80.00 | 82.95 | 68.98 | 91.83 | 57.14 | 98.45 | 86.73 | 78.78 | 89.03 | 87.12 | 91.14 | 83.82 |
| qwen1.5-110b-chat-hf | 79.00 | 78.03 | 67.13 | 92.98 | 62.70 | 97.93 | 87.04 | 74.29 | 88.71 | 82.82 | 91.14 | 84.93 |
| internlm2-chat-1.8b-hf | 48.00 | 49.13 | 44.91 | 57.60 | 26.98 | 61.14 | 50.62 | 51.02 | 52.58 | 57.67 | 67.51 | 37.50 |
| internlm2-chat-1.8b-sft-hf | 50.00 | 49.13 | 44.91 | 57.73 | 28.57 | 61.66 | 49.69 | 51.02 | 49.68 | 57.67 | 66.67 | 38.60 |
| internlm2-chat-7b-hf | 65.00 | 65.61 | 49.54 | 80.84 | 43.65 | 88.08 | 70.99 | 68.98 | 78.39 | 75.46 | 82.28 | 61.76 |
| internlm2-chat-7b-sft-hf | 64.00 | 66.18 | 52.31 | 81.35 | 46.03 | 88.08 | 71.60 | 67.76 | 78.39 | 77.30 | 82.28 | 63.60 |
| internlm2-chat-20b-hf | 74.00 | 73.70 | 59.72 | 81.86 | 46.83 | 89.12 | 74.69 | 75.92 | 80.65 | 79.14 | 82.70 | 70.59 |
| internlm2-chat-20b-sft-hf | 76.00 | 73.12 | 60.19 | 81.99 | 43.65 | 88.60 | 74.38 | 73.88 | 80.32 | 80.37 | 82.70 | 70.59 |
| llama-3-8b-instruct-hf | 72.00 | 73.12 | 55.09 | 84.55 | 50.00 | 90.67 | 77.16 | 77.55 | 81.61 | 77.91 | 84.81 | 75.00 |
| llama-3-70b-instruct-hf | 85.00 | 85.26 | 75.00 | 92.72 | 69.05 | 97.41 | 90.43 | 82.04 | 91.61 | 87.12 | 94.09 | 89.71 |
| llama-3-8b-instruct-lmdeploy | 72.00 | 72.83 | 52.78 | 82.12 | 51.59 | 89.64 | 76.85 | 76.73 | 80.97 | 76.69 | 84.39 | 74.63 |
| llama-3-70b-instruct-lmdeploy | 85.00 | 84.39 | 73.61 | 92.72 | 67.46 | 97.93 | 89.81 | 81.63 | 90.65 | 87.12 | 93.25 | 89.34 |
| mistral-7b-instruct-v0.1-hf | 55.00 | 57.51 | 39.81 | 74.07 | 39.68 | 75.65 | 57.72 | 62.04 | 59.35 | 69.33 | 67.93 | 55.88 |
| mistral-7b-instruct-v0.2-hf | 61.00 | 66.76 | 46.76 | 78.67 | 36.51 | 84.97 | 68.83 | 70.20 | 68.39 | 69.33 | 73.00 | 58.09 |
| mixtral-8x7b-instruct-v0.1-hf | 66.00 | 76.59 | 57.87 | 86.59 | 50.00 | 93.78 | 83.02 | 79.18 | 82.58 | 75.46 | 86.50 | 77.94 |
| model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy |
|:-----------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:|
| qwen1.5-0.5b-chat-hf | 24.44 | 35.26 | 42.16 | 47.26 | 29.82 | 40.55 | 32.29 | 47.00 |
| qwen1.5-1.8b-chat-hf | 32.22 | 43.35 | 54.90 | 48.26 | 28.95 | 61.83 | 48.43 | 71.00 |
| qwen1.5-4b-chat-hf | 36.30 | 51.45 | 71.08 | 76.62 | 34.21 | 72.29 | 58.30 | 72.00 |
| qwen1.5-7b-chat-hf | 31.11 | 61.27 | 76.47 | 79.10 | 42.11 | 81.28 | 61.43 | 83.00 |
| qwen1.5-14b-chat-hf | 41.48 | 68.79 | 80.88 | 82.59 | 48.25 | 84.40 | 72.20 | 88.00 |
| qwen1.5-32b-chat-hf | 48.52 | 75.72 | 88.73 | 86.07 | 57.02 | 90.46 | 78.03 | 95.00 |
| qwen1.5-72b-chat-hf | 51.48 | 73.99 | 90.69 | 87.06 | 59.65 | 92.11 | 79.37 | 94.00 |
| qwen1.5-110b-chat-hf | 52.22 | 76.30 | 93.14 | 87.56 | 62.28 | 91.56 | 80.27 | 88.00 |
| internlm2-chat-1.8b-hf | 31.48 | 46.82 | 56.37 | 65.17 | 28.07 | 65.87 | 50.22 | 69.00 |
| internlm2-chat-1.8b-sft-hf | 30.74 | 47.40 | 54.41 | 64.18 | 29.82 | 66.24 | 48.43 | 69.00 |
| internlm2-chat-7b-hf | 33.70 | 67.05 | 79.90 | 81.09 | 48.25 | 84.04 | 67.26 | 84.00 |
| internlm2-chat-7b-sft-hf | 35.19 | 67.05 | 79.90 | 80.60 | 48.25 | 84.59 | 65.47 | 85.00 |
| internlm2-chat-20b-hf | 36.30 | 66.47 | 88.73 | 85.07 | 51.75 | 85.69 | 70.85 | 87.00 |
| internlm2-chat-20b-sft-hf | 35.93 | 65.90 | 87.75 | 85.57 | 52.63 | 84.77 | 70.85 | 87.00 |
| llama-3-8b-instruct-hf | 36.67 | 68.79 | 83.82 | 86.57 | 61.40 | 84.95 | 70.85 | 85.00 |
| llama-3-70b-instruct-hf | 57.41 | 78.61 | 89.71 | 91.54 | 74.56 | 94.50 | 82.96 | 94.00 |
| llama-3-8b-instruct-lmdeploy | 38.52 | 68.79 | 82.84 | 85.57 | 54.39 | 85.50 | 69.96 | 83.00 |
| llama-3-70b-instruct-lmdeploy | 54.81 | 79.77 | 90.20 | 92.04 | 71.05 | 94.50 | 82.96 | 93.00 |
| mistral-7b-instruct-v0.1-hf | 28.89 | 50.29 | 67.16 | 76.12 | 39.47 | 72.29 | 62.33 | 77.00 |
| mistral-7b-instruct-v0.2-hf | 30.74 | 53.18 | 73.04 | 77.11 | 42.11 | 79.82 | 63.68 | 82.00 |
| mixtral-8x7b-instruct-v0.1-hf | 35.56 | 73.41 | 85.29 | 87.06 | 60.53 | 86.97 | 74.44 | 86.00 |

View File

@ -0,0 +1,64 @@
# NQ
## Base Models
| model | nq |
|:------------------------:|------:|
| llama-7b-turbomind | 15.10 |
| llama-13b-turbomind | 16.43 |
| llama-30b-turbomind | 22.11 |
| llama-65b-turbomind | 26.09 |
| llama-2-7b-turbomind | 14.35 |
| llama-2-13b-turbomind | 21.69 |
| llama-2-70b-turbomind | 23.27 |
| llama-3-8b-turbomind | 18.78 |
| llama-3-70b-turbomind | 31.88 |
| internlm2-1.8b-turbomind | 20.66 |
| internlm2-7b-turbomind | 41.05 |
| internlm2-20b-turbomind | 43.55 |
| qwen-1.8b-turbomind | 5.68 |
| qwen-7b-turbomind | 17.87 |
| qwen-14b-turbomind | 13.77 |
| qwen-72b-turbomind | 18.20 |
| qwen1.5-0.5b-hf | 6.01 |
| qwen1.5-1.8b-hf | 10.28 |
| qwen1.5-4b-hf | 15.73 |
| qwen1.5-7b-hf | 18.61 |
| qwen1.5-14b-hf | 16.07 |
| qwen1.5-32b-hf | 21.75 |
| qwen1.5-72b-hf | 20.53 |
| qwen1.5-moe-a2-7b-hf | 16.62 |
| mistral-7b-v0.1-hf | 20.66 |
| mistral-7b-v0.2-hf | 20.78 |
| mixtral-8x7b-v0.1-hf | 24.85 |
| mixtral-8x22b-v0.1-hf | 34.43 |
| yi-6b-hf | 10.08 |
| yi-34b-hf | 13.96 |
| deepseek-7b-base-hf | 8.45 |
| deepseek-67b-base-hf | 17.59 |
## Chat Models
| model | nq |
|:-----------------------------:|------:|
| qwen1.5-0.5b-chat-hf | 7.42 |
| qwen1.5-1.8b-chat-hf | 10.22 |
| qwen1.5-4b-chat-hf | 19.31 |
| qwen1.5-7b-chat-hf | 16.87 |
| qwen1.5-14b-chat-hf | 20.53 |
| qwen1.5-32b-chat-hf | 25.26 |
| qwen1.5-72b-chat-hf | 35.21 |
| qwen1.5-110b-chat-hf | 36.98 |
| internlm2-chat-1.8b-hf | 19.09 |
| internlm2-chat-1.8b-sft-hf | 18.14 |
| internlm2-chat-7b-hf | 28.73 |
| internlm2-chat-7b-sft-hf | 30.78 |
| internlm2-chat-20b-hf | 28.75 |
| internlm2-chat-20b-sft-hf | 34.10 |
| llama-3-8b-instruct-hf | 30.17 |
| llama-3-70b-instruct-hf | 40.25 |
| llama-3-8b-instruct-lmdeploy | 28.28 |
| llama-3-70b-instruct-lmdeploy | 39.14 |
| mistral-7b-instruct-v0.1-hf | 22.47 |
| mistral-7b-instruct-v0.2-hf | 25.18 |
| mixtral-8x7b-instruct-v0.1-hf | 32.08 |

View File

@ -0,0 +1,64 @@
# RACE
## Base Models
| model | race-high | race-middle |
|:------------------------:|------------:|--------------:|
| llama-7b-turbomind | 31.30 | 29.53 |
| llama-13b-turbomind | 35.56 | 40.25 |
| llama-30b-turbomind | 57.35 | 55.78 |
| llama-65b-turbomind | 70.21 | 75.35 |
| llama-2-7b-turbomind | 39.74 | 46.73 |
| llama-2-13b-turbomind | 57.06 | 60.52 |
| llama-2-70b-turbomind | 79.02 | 82.17 |
| llama-3-8b-turbomind | 67.75 | 73.61 |
| llama-3-70b-turbomind | 85.79 | 90.25 |
| internlm2-1.8b-turbomind | 64.72 | 70.40 |
| internlm2-7b-turbomind | 72.56 | 74.16 |
| internlm2-20b-turbomind | 72.90 | 74.03 |
| qwen-1.8b-turbomind | 63.09 | 69.29 |
| qwen-7b-turbomind | 80.30 | 85.38 |
| qwen-14b-turbomind | 88.11 | 92.06 |
| qwen-72b-turbomind | 90.62 | 93.59 |
| qwen1.5-0.5b-hf | 54.66 | 60.38 |
| qwen1.5-1.8b-hf | 67.27 | 73.33 |
| qwen1.5-4b-hf | 78.50 | 83.29 |
| qwen1.5-7b-hf | 82.73 | 86.70 |
| qwen1.5-14b-hf | 87.99 | 91.85 |
| qwen1.5-32b-hf | 90.57 | 93.25 |
| qwen1.5-72b-hf | 90.45 | 93.87 |
| qwen1.5-moe-a2-7b-hf | 79.56 | 83.57 |
| mistral-7b-v0.1-hf | 73.58 | 76.25 |
| mistral-7b-v0.2-hf | 73.67 | 77.09 |
| mixtral-8x7b-v0.1-hf | 80.13 | 84.61 |
| mixtral-8x22b-v0.1-hf | 86.56 | 89.62 |
| yi-6b-hf | 82.93 | 85.72 |
| yi-34b-hf | 90.94 | 92.76 |
| deepseek-7b-base-hf | 50.91 | 56.82 |
| deepseek-67b-base-hf | 83.53 | 88.23 |
## Chat Models
| model | race-high | race-middle |
|:-----------------------------:|------------:|--------------:|
| qwen1.5-0.5b-chat-hf | 49.03 | 52.79 |
| qwen1.5-1.8b-chat-hf | 66.24 | 72.91 |
| qwen1.5-4b-chat-hf | 73.53 | 80.29 |
| qwen1.5-7b-chat-hf | 83.28 | 88.09 |
| qwen1.5-14b-chat-hf | 87.51 | 91.36 |
| qwen1.5-32b-chat-hf | 91.22 | 93.52 |
| qwen1.5-72b-chat-hf | 91.11 | 93.38 |
| qwen1.5-110b-chat-hf | 92.31 | 93.66 |
| internlm2-chat-1.8b-hf | 73.87 | 81.13 |
| internlm2-chat-1.8b-sft-hf | 73.81 | 81.69 |
| internlm2-chat-7b-hf | 84.51 | 88.72 |
| internlm2-chat-7b-sft-hf | 84.88 | 89.90 |
| internlm2-chat-20b-hf | 88.02 | 91.43 |
| internlm2-chat-20b-sft-hf | 88.11 | 91.57 |
| llama-3-8b-instruct-hf | 81.22 | 86.63 |
| llama-3-70b-instruct-hf | 89.57 | 93.45 |
| llama-3-8b-instruct-lmdeploy | 81.02 | 86.14 |
| llama-3-70b-instruct-lmdeploy | 89.34 | 93.25 |
| mistral-7b-instruct-v0.1-hf | 69.75 | 74.72 |
| mistral-7b-instruct-v0.2-hf | 73.30 | 77.58 |
| mixtral-8x7b-instruct-v0.1-hf | 81.88 | 87.26 |

View File

@ -0,0 +1,64 @@
# TriviaQA
## Base Models
| model | triviaqa |
|:------------------------:|-----------:|
| llama-7b-turbomind | 40.39 |
| llama-13b-turbomind | 66.41 |
| llama-30b-turbomind | 75.90 |
| llama-65b-turbomind | 82.26 |
| llama-2-7b-turbomind | 43.21 |
| llama-2-13b-turbomind | 71.32 |
| llama-2-70b-turbomind | 67.45 |
| llama-3-8b-turbomind | 71.24 |
| llama-3-70b-turbomind | 88.16 |
| internlm2-1.8b-turbomind | 38.42 |
| internlm2-7b-turbomind | 69.15 |
| internlm2-20b-turbomind | 74.03 |
| qwen-1.8b-turbomind | 22.76 |
| qwen-7b-turbomind | 53.61 |
| qwen-14b-turbomind | 49.72 |
| qwen-72b-turbomind | 79.13 |
| qwen1.5-0.5b-hf | 21.24 |
| qwen1.5-1.8b-hf | 34.32 |
| qwen1.5-4b-hf | 44.59 |
| qwen1.5-7b-hf | 56.60 |
| qwen1.5-14b-hf | 59.96 |
| qwen1.5-32b-hf | 56.20 |
| qwen1.5-72b-hf | 77.81 |
| qwen1.5-moe-a2-7b-hf | 65.49 |
| mistral-7b-v0.1-hf | 72.93 |
| mistral-7b-v0.2-hf | 70.91 |
| mixtral-8x7b-v0.1-hf | 85.05 |
| mixtral-8x22b-v0.1-hf | 89.47 |
| yi-6b-hf | 23.76 |
| yi-34b-hf | 14.73 |
| deepseek-7b-base-hf | 59.48 |
| deepseek-67b-base-hf | 72.15 |
## Chat Models
| model | triviaqa |
|:-----------------------------:|-----------:|
| qwen1.5-0.5b-chat-hf | 19.84 |
| qwen1.5-1.8b-chat-hf | 35.81 |
| qwen1.5-4b-chat-hf | 48.93 |
| qwen1.5-7b-chat-hf | 53.65 |
| qwen1.5-14b-chat-hf | 62.58 |
| qwen1.5-32b-chat-hf | 74.72 |
| qwen1.5-72b-chat-hf | 83.25 |
| qwen1.5-110b-chat-hf | 86.20 |
| internlm2-chat-1.8b-hf | 46.69 |
| internlm2-chat-1.8b-sft-hf | 46.50 |
| internlm2-chat-7b-hf | 69.54 |
| internlm2-chat-7b-sft-hf | 70.75 |
| internlm2-chat-20b-hf | 75.53 |
| internlm2-chat-20b-sft-hf | 75.90 |
| llama-3-8b-instruct-hf | 78.99 |
| llama-3-70b-instruct-hf | 89.79 |
| llama-3-8b-instruct-lmdeploy | 76.77 |
| llama-3-70b-instruct-lmdeploy | 89.62 |
| mistral-7b-instruct-v0.1-hf | 62.94 |
| mistral-7b-instruct-v0.2-hf | 67.72 |
| mixtral-8x7b-instruct-v0.1-hf | 79.57 |

View File

@ -0,0 +1,64 @@
# WinoGrande
## Base Models
| model | winogrande |
|:------------------------:|-------------:|
| llama-7b-turbomind | 71.19 |
| llama-13b-turbomind | 76.16 |
| llama-30b-turbomind | 80.66 |
| llama-65b-turbomind | 82.16 |
| llama-2-7b-turbomind | 74.03 |
| llama-2-13b-turbomind | 76.48 |
| llama-2-70b-turbomind | 83.98 |
| llama-3-8b-turbomind | 77.82 |
| llama-3-70b-turbomind | 83.43 |
| internlm2-1.8b-turbomind | 66.77 |
| internlm2-7b-turbomind | 83.50 |
| internlm2-20b-turbomind | 84.69 |
| qwen-1.8b-turbomind | 61.25 |
| qwen-7b-turbomind | 72.06 |
| qwen-14b-turbomind | 72.45 |
| qwen-72b-turbomind | 82.56 |
| qwen1.5-0.5b-hf | 57.38 |
| qwen1.5-1.8b-hf | 60.46 |
| qwen1.5-4b-hf | 65.90 |
| qwen1.5-7b-hf | 70.01 |
| qwen1.5-14b-hf | 72.93 |
| qwen1.5-32b-hf | 78.69 |
| qwen1.5-72b-hf | 80.74 |
| qwen1.5-moe-a2-7b-hf | 71.43 |
| mistral-7b-v0.1-hf | 78.30 |
| mistral-7b-v0.2-hf | 77.51 |
| mixtral-8x7b-v0.1-hf | 81.53 |
| mixtral-8x22b-v0.1-hf | 86.50 |
| yi-6b-hf | 74.35 |
| yi-34b-hf | 79.01 |
| deepseek-7b-base-hf | 74.11 |
| deepseek-67b-base-hf | 79.32 |
## Chat Models
| model | winogrande |
|:-----------------------------:|-------------:|
| qwen1.5-0.5b-chat-hf | 50.51 |
| qwen1.5-1.8b-chat-hf | 51.07 |
| qwen1.5-4b-chat-hf | 57.54 |
| qwen1.5-7b-chat-hf | 65.27 |
| qwen1.5-14b-chat-hf | 70.09 |
| qwen1.5-32b-chat-hf | 77.90 |
| qwen1.5-72b-chat-hf | 80.82 |
| qwen1.5-110b-chat-hf | 82.32 |
| internlm2-chat-1.8b-hf | 57.62 |
| internlm2-chat-1.8b-sft-hf | 57.93 |
| internlm2-chat-7b-hf | 73.56 |
| internlm2-chat-7b-sft-hf | 73.80 |
| internlm2-chat-20b-hf | 81.06 |
| internlm2-chat-20b-sft-hf | 81.37 |
| llama-3-8b-instruct-hf | 66.22 |
| llama-3-70b-instruct-hf | 81.29 |
| llama-3-8b-instruct-lmdeploy | 66.93 |
| llama-3-70b-instruct-lmdeploy | 81.22 |
| mistral-7b-instruct-v0.1-hf | 58.56 |
| mistral-7b-instruct-v0.2-hf | 59.43 |
| mixtral-8x7b-instruct-v0.1-hf | 65.75 |

View File

@ -1,7 +1,7 @@
from mmengine.config import read_base
with read_base():
from .datasets.CHARM.charm_rea_gen_f8fca2 import charm_rea_datasets as datasets
from .datasets.CHARM.charm_reason_gen_f8fca2 import charm_reason_datasets as datasets
from .models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_7b_chat_model
# from models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model
@ -34,19 +34,19 @@ work_dir = './outputs/CHARM/chat/'
# dataset version metric mode internlm2-chat-7b-turbomind
# ------------------------------------------------------------- --------- ------------- ------ -----------------------------
# charm-rea-Direct - naive_average gen 49.51
# charm-rea-ZH-CoT - naive_average gen 61.33
# charm-rea-EN-CoT - naive_average gen 54.55
# charm-rea-XLT - naive_average gen 58.46
# charm-rea-Translate-EN - naive_average gen 56.15
# charm-reason-Direct - naive_average gen 49.51
# charm-reason-ZH-CoT - naive_average gen 61.33
# charm-reason-EN-CoT - naive_average gen 54.55
# charm-reason-XLT - naive_average gen 58.46
# charm-reason-Translate-EN - naive_average gen 56.15
# - - - -
# charm-rea-Chinese_Direct - naive_average gen 47.14
# charm-rea-Chinese_ZH-CoT - naive_average gen 58.40
# charm-rea-Chinese_EN-CoT - naive_average gen 48.31
# charm-rea-Chinese_XLT - naive_average gen 53.57
# charm-rea-Chinese_Translate-EN - naive_average gen 48.21
# charm-rea-Global_Direct - naive_average gen 51.88
# charm-rea-Global_ZH-CoT - naive_average gen 64.26
# charm-rea-Global_EN-CoT - naive_average gen 60.79
# charm-rea-Global_XLT - naive_average gen 63.36
# charm-rea-Global_Translate-EN - naive_average gen 64.10
# charm-reason-Chinese_Direct - naive_average gen 47.14
# charm-reason-Chinese_ZH-CoT - naive_average gen 58.40
# charm-reason-Chinese_EN-CoT - naive_average gen 48.31
# charm-reason-Chinese_XLT - naive_average gen 53.57
# charm-reason-Chinese_Translate-EN - naive_average gen 48.21
# charm-reason-Global_Direct - naive_average gen 51.88
# charm-reason-Global_ZH-CoT - naive_average gen 64.26
# charm-reason-Global_EN-CoT - naive_average gen 60.79
# charm-reason-Global_XLT - naive_average gen 63.36
# charm-reason-Global_Translate-EN - naive_average gen 64.10

View File

@ -4,7 +4,6 @@ with read_base():
from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.models.openai_api import OpenAIAllesAPIN
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner

View File

@ -5,7 +5,7 @@ with read_base():
from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
from opencompass.models.openai_api import OpenAI, OpenAIAllesAPIN
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner

View File

@ -4,7 +4,7 @@ with read_base():
from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
from opencompass.models.openai_api import OpenAI, OpenAIAllesAPIN
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner

View File

@ -6,7 +6,6 @@ with read_base():
from .datasets.subjective.compassarena.compassarena_compare import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.models.openai_api import OpenAIAllesAPIN
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner

View File

@ -4,7 +4,6 @@ with read_base():
from .datasets.subjective.creationbench.creationbench_judgeby_gpt4_withref import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.models.openai_api import OpenAIAllesAPIN
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner

View File

@ -3,7 +3,7 @@ from mmengine.config import read_base
with read_base():
from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAIAllesAPIN, HuggingFaceChatGLM3
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner

View File

@ -4,7 +4,6 @@ with read_base():
from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.models.openai_api import OpenAIAllesAPIN
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner

View File

@ -0,0 +1,124 @@
# InternLM Model Details
## InternLM
InternLM is pre-trained on a large corpus of 1.6T tokens through a multi-phase progressive process, and then fine-tuned to align with human preferences. We also developed a training system called Uniscale-LLM for efficient large language model training. Evaluation on a number of benchmarks shows that InternLM achieves state-of-the-art performance in multiple aspects, including knowledge understanding, reading comprehension, mathematics, and coding. With such well-rounded capabilities, InternLM achieves outstanding performance on comprehensive exams, including MMLU, AGIEval, C-Eval and GAOKAO-Bench, without resorting to external tools. On these benchmarks, InternLM not only significantly outperforms open-source models, but also obtains superior performance compared to ChatGPT. InternLM also demonstrates an excellent capability for understanding the Chinese language and Chinese culture, which makes it a suitable foundation model for Chinese-oriented language applications.
## InternLM2
The evolution of Large Language Models (LLMs) like ChatGPT and GPT-4 has sparked discussions on the advent of Artificial General Intelligence (AGI). However, replicating such advancements in open-source models has been challenging. This paper introduces InternLM2, an open-source LLM that outperforms its predecessors in comprehensive evaluations across 6 dimensions and 30 benchmarks, long-context modeling, and open-ended subjective evaluations through innovative pre-training and optimization techniques. The pre-training process of InternLM2 is meticulously detailed, highlighting the preparation of diverse data types including text, code, and long-context data. InternLM2 efficiently captures long-term dependencies, initially trained on 4k tokens before advancing to 32k tokens in pre-training and fine-tuning stages, exhibiting remarkable performance on the 200k "Needle-in-a-Haystack" test. InternLM2 is further aligned using Supervised Fine-Tuning (SFT) and a novel Conditional Online Reinforcement Learning from Human Feedback (COOL RLHF) strategy that addresses conflicting human preferences and reward hacking. By releasing InternLM2 models in different training stages and model sizes, we provide the community with insights into the model's evolution.
# Evaluation Command
## Base Models
```bash
python3 run.py --models hf_internlm2_7b --datasets mmlu_ppl_ac766d --debug
python3 run.py --models hf_internlm2_7b --datasets cmmlu_ppl_041cbf --debug
python3 run.py --models hf_internlm2_7b --datasets ceval_internal_ppl_93e5ce --debug
python3 run.py --models hf_internlm2_7b --datasets GaokaoBench_no_subjective_gen_d21e37 --debug
python3 run.py --models hf_internlm2_7b --datasets triviaqa_wiki_1shot_gen_20a989 --debug
python3 run.py --models hf_internlm2_7b --datasets nq_open_1shot_gen_20a989 --debug
python3 run.py --models hf_internlm2_7b --datasets race_ppl_abed12 --debug
python3 run.py --models hf_internlm2_7b --datasets winogrande_5shot_ll_252f01 --debug
python3 run.py --models hf_internlm2_7b --datasets hellaswag_10shot_ppl_59c85e --debug
python3 run.py --models hf_internlm2_7b --datasets bbh_gen_98fba6 --debug
python3 run.py --models hf_internlm2_7b --datasets gsm8k_gen_17d0dc --debug
python3 run.py --models hf_internlm2_7b --datasets math_4shot_base_gen_db136b --debug
python3 run.py --models hf_internlm2_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug
python3 run.py --models hf_internlm2_7b --datasets humaneval_gen_d2537e --debug
python3 run.py --models hf_internlm2_7b --datasets sanitized_mbpp_gen_742f0c --debug
python3 run.py --models hf_internlm2_7b --datasets lcbench_gen_5ff288 --debug
python3 run.py --models hf_internlm2_7b --datasets gpqa_ppl_6bf57a --debug
```
## Chat Models
```bash
python3 run.py --models hf_internlm2_chat_7b --datasets mmlu_gen_4d595a --debug
python3 run.py --models hf_internlm2_chat_7b --datasets cmmlu_gen_c13365 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets ceval_internal_gen_2daf24 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets GaokaoBench_no_subjective_gen_4c31db --debug
python3 run.py --models hf_internlm2_chat_7b --datasets triviaqa_wiki_1shot_gen_eaf81e --debug
python3 run.py --models hf_internlm2_chat_7b --datasets nq_open_1shot_gen_01cf41 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets race_gen_69ee4f --debug
python3 run.py --models hf_internlm2_chat_7b --datasets winogrande_5shot_gen_b36770 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets hellaswag_10shot_gen_e42710 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets bbh_gen_5b92b0 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets gsm8k_gen_1d7fe4 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets math_0shot_gen_393424 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug
python3 run.py --models hf_internlm2_chat_7b --datasets sanitized_mbpp_mdblock_gen_a447ff --debug
python3 run.py --models hf_internlm2_chat_7b --datasets lcbench_gen_5ff288 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets gpqa_gen_4baadb --debug
python3 run.py --models hf_internlm2_chat_7b --datasets IFEval_gen_3321a3 --debug
```
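The commands above evaluate one dataset per invocation. Alternatively, several datasets can be grouped into a single OpenCompass config script and evaluated in one run. The sketch below follows the usual config layout, but the file name, module paths, and exported variable names are assumptions that should be verified against your checkout.
```python
# Hypothetical grouped config, e.g. configs/eval_internlm2_7b.py (the name is illustrative).
from mmengine.config import read_base

with read_base():
    # dataset/model module paths follow the usual OpenCompass layout (assumed here)
    from .datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets
    from .datasets.gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets
    from .models.hf_internlm.hf_internlm2_7b import models

# evaluate both benchmarks for the imported model in a single job
datasets = [*mmlu_datasets, *gsm8k_datasets]
```
Such a script would then be launched with `python3 run.py configs/eval_internlm2_7b.py --debug`.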
# Benchmarks
We provide reference results for the classical models; you can reproduce these results by following the aforementioned commands.
## Base Models
| dataset | internlm2-1.8b-turbomind | internlm2-7b-turbomind | internlm2-20b-turbomind |
|:------------:|---------------------------:|-------------------------:|--------------------------:|
| mmlu | 45.99 | 65.84 | 67.58 |
| cmmlu | 45.27 | 66.17 | 68.28 |
| ceval-test | 44.79 | 63.54 | 67.28 |
| GaokaoBench | 23.78 | 41.41 | 58.99 |
| triviaqa | 38.42 | 69.15 | 74.03 |
| nq | 20.66 | 41.05 | 43.55 |
| race-high | 64.72 | 72.56 | 72.90 |
| winogrande | 66.77 | 83.50 | 84.69 |
| hellaswag | 44.86 | 89.52 | 91.41 |
| bbh | 36.03 | 63.56 | 71.29 |
| gsm8k | 30.40 | 69.98 | 76.80 |
| math | 9.42 | 25.16 | 32.24 |
| TheoremQA | 10.50 | 21.88 | 26.00 |
| humaneval | 30.49 | 48.17 | 51.83 |
| mbpp | 30.74 | 54.47 | 59.92 |
| lcbench | 4.34 | 12.16 | 18.46 |
| GPQA_diamond | 24.24 | 28.28 | 31.31 |
## Chat Models
| dataset | internlm2-chat-1.8b-hf | internlm2-chat-1.8b-sft-hf | internlm2-chat-7b-hf | internlm2-chat-7b-sft-hf | internlm2-chat-20b-hf | internlm2-chat-20b-sft-hf |
|:------------:|-------------------------:|-----------------------------:|-----------------------:|---------------------------:|------------------------:|----------------------------:|
| mmlu | 47.58 | 47.44 | 63.05 | 63.33 | 67.37 | 67.34 |
| cmmlu | 46.11 | 46.27 | 62.10 | 62.38 | 66.26 | 66.39 |
| ceval-test | 47.04 | 47.19 | 58.75 | 58.96 | 63.12 | 63.16 |
| GaokaoBench | 29.73 | 28.79 | 54.54 | 55.39 | 57.95 | 57.62 |
| triviaqa | 46.69 | 46.50 | 69.54 | 70.75 | 75.53 | 75.90 |
| nq | 19.09 | 18.14 | 28.73 | 30.78 | 28.75 | 34.10 |
| race-high | 73.87 | 73.81 | 84.51 | 84.88 | 88.02 | 88.11 |
| winogrande | 57.62 | 57.93 | 73.56 | 73.80 | 81.06 | 81.37 |
| hellaswag | 60.47 | 61.58 | 84.80 | 85.21 | 88.48 | 88.95 |
| bbh | 37.69 | 37.12 | 57.83 | 57.19 | 68.24 | 69.38 |
| gsm8k | 39.73 | 36.85 | 69.90 | 69.83 | 75.21 | 76.95 |
| math | 14.06 | 13.10 | 28.08 | 27.60 | 34.68 | 32.54 |
| TheoremQA | 13.63 | 12.88 | 18.50 | 18.75 | 23.00 | 25.12 |
| humaneval | 33.54 | 34.15 | 56.71 | 61.59 | 67.68 | 67.68 |
| mbpp | 39.69 | 36.19 | 57.59 | 55.64 | 68.87 | 69.65 |
| lcbench | 4.52 | 3.56 | 14.60 | 14.34 | 19.64 | 20.55 |
| GPQA_diamond | 25.76 | 26.26 | 28.28 | 27.27 | 30.30 | 29.29 |
| IFEval | 18.30 | 18.67 | 34.75 | 39.19 | 36.41 | 44.55 |
# Citation
```BibTeX
@misc{2023internlm,
title={InternLM: A Multilingual Language Model with Progressively Enhanced Capabilities},
author={InternLM Team},
howpublished = {\url{https://github.com/InternLM/InternLM-techreport}},
year={2023}
}
@misc{cai2024internlm2,
title={InternLM2 Technical Report},
author={Zheng Cai and Maosong Cao and Haojiong Chen and Kai Chen and Keyu Chen and Xin Chen and Xun Chen and Zehui Chen and Zhi Chen and Pei Chu and Xiaoyi Dong and Haodong Duan and Qi Fan and Zhaoye Fei and Yang Gao and Jiaye Ge and Chenya Gu and Yuzhe Gu and Tao Gui and Aijia Guo and Qipeng Guo and Conghui He and Yingfan Hu and Ting Huang and Tao Jiang and Penglong Jiao and Zhenjiang Jin and Zhikai Lei and Jiaxing Li and Jingwen Li and Linyang Li and Shuaibin Li and Wei Li and Yining Li and Hongwei Liu and Jiangning Liu and Jiawei Hong and Kaiwen Liu and Kuikun Liu and Xiaoran Liu and Chengqi Lv and Haijun Lv and Kai Lv and Li Ma and Runyuan Ma and Zerun Ma and Wenchang Ning and Linke Ouyang and Jiantao Qiu and Yuan Qu and Fukai Shang and Yunfan Shao and Demin Song and Zifan Song and Zhihao Sui and Peng Sun and Yu Sun and Huanze Tang and Bin Wang and Guoteng Wang and Jiaqi Wang and Jiayu Wang and Rui Wang and Yudong Wang and Ziyi Wang and Xingjian Wei and Qizhen Weng and Fan Wu and Yingtong Xiong and Chao Xu and Ruiliang Xu and Hang Yan and Yirong Yan and Xiaogui Yang and Haochen Ye and Huaiyuan Ying and Jia Yu and Jing Yu and Yuhang Zang and Chuyu Zhang and Li Zhang and Pan Zhang and Peng Zhang and Ruijie Zhang and Shuo Zhang and Songyang Zhang and Wenjian Zhang and Wenwei Zhang and Xingcheng Zhang and Xinyue Zhang and Hui Zhao and Qian Zhao and Xiaomeng Zhao and Fengzhe Zhou and Zaida Zhou and Jingming Zhuo and Yicheng Zou and Xipeng Qiu and Yu Qiao and Dahua Lin},
year={2024},
eprint={2403.17297},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```

View File

@ -0,0 +1,23 @@
from opencompass.models import TurboMindModelwithChatTemplate
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='internlm2-chat-1.8b-turbomind',
path='internlm/internlm2-chat-1_8b',
engine_config=dict(
max_batch_size=16,
tp=1,
),
gen_config=dict(
top_k=1,
temperature=1e-6,
top_p=0.9,
),
max_seq_len=2048,
max_out_len=1024,
batch_size=32768,
run_cfg=dict(num_gpus=1),
stop_words=['</s>', '<|im_end|>'],
)
]

View File

@ -3,7 +3,9 @@ from opencompass.models import TurboMindModel
settings = [
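    # each entry: (model abbreviation, HuggingFace path, GPU count; assumed to be the tensor-parallel size)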
('internlm2-1.8b-turbomind', 'internlm/internlm2-1_8b', 1),
('internlm2-7b-turbomind', 'internlm/internlm2-7b', 1),
('internlm2-base-7b-turbomind', 'internlm/internlm2-base-7b', 1),
('internlm2-20b-turbomind', 'internlm/internlm2-20b', 2),
('internlm2-base-20b-turbomind', 'internlm/internlm2-base-20b', 2),
]
models = []

View File

@ -7,6 +7,6 @@ models = [
path='meta-llama/Llama-2-13b-hf',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
run_cfg=dict(num_gpus=2),
)
]

View File

@ -7,6 +7,6 @@ models = [
path='meta-llama/Llama-2-13b-chat-hf',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
run_cfg=dict(num_gpus=2),
)
]

View File

@ -7,6 +7,6 @@ models = [
path='meta-llama/Llama-2-70b-hf',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=4),
run_cfg=dict(num_gpus=8),
)
]

View File

@ -7,6 +7,6 @@ models = [
path='meta-llama/Meta-Llama-3-70B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
run_cfg=dict(num_gpus=8),
)
]

View File

@ -7,6 +7,6 @@ models = [
path='huggyllama/llama-13b',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
run_cfg=dict(num_gpus=2),
)
]

View File

@ -7,6 +7,6 @@ models = [
path='huggyllama/llama-30b',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=2),
run_cfg=dict(num_gpus=4),
)
]

View File

@ -7,6 +7,6 @@ models = [
path='huggyllama/llama-65b',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=4),
run_cfg=dict(num_gpus=8),
)
]

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFacewithChatTemplate
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='mistral-7b-instruct-v0.3-hf',
path='mistralai/Mistral-7B-Instruct-v0.3',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,13 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='mistral-7b-v0.3-hf',
path='mistralai/Mistral-7B-v0.3',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,142 @@
# Qwen Model Details
## Qwen
Large language models (LLMs) have revolutionized the field of artificial intelligence, enabling natural language processing tasks that were previously thought to be exclusive to humans. In this work, we introduce Qwen, the first installment of our large language model series. Qwen is a comprehensive language model series that encompasses distinct models with varying parameter counts. It includes Qwen, the base pretrained language models, and Qwen-Chat, the chat models finetuned with human alignment techniques. The base language models consistently demonstrate superior performance across a multitude of downstream tasks, and the chat models, particularly those trained using Reinforcement Learning from Human Feedback (RLHF), are highly competitive. The chat models possess advanced tool-use and planning capabilities for creating agent applications, showcasing impressive performance even when compared to bigger models on complex tasks like utilizing a code interpreter. Furthermore, we have developed coding-specialized models, Code-Qwen and Code-Qwen-Chat, as well as mathematics-focused models, Math-Qwen-Chat, which are built upon base language models. These models demonstrate significantly improved performance in comparison with open-source models, and slightly fall behind the proprietary models.
## Qwen1.5
Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model pretrained on a large amount of data. In comparison with the previous released Qwen, the improvements include:
- 8 model sizes, including 0.5B, 1.8B, 4B, 7B, 14B, 32B and 72B dense models, and an MoE model of 14B with 2.7B activated;
- Significant performance improvement in human preference for chat models;
- Multilingual support of both base and chat models;
- Stable support of 32K context length for models of all sizes;
- No need for trust_remote_code (see the loading sketch below).
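To illustrate the last point, a Qwen1.5 checkpoint loads with the stock transformers classes and no trust_remote_code flag. This is only a minimal sketch: the checkpoint name is an example, and it assumes transformers (plus accelerate for device_map) is installed.
```python
# Sketch: load a Qwen1.5 checkpoint without trust_remote_code.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen1.5-7B"  # example checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```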
# Evaluation Command
## Base Models
```bash
python3 run.py --models hf_qwen1_5_7b --datasets mmlu_ppl_ac766d --debug
python3 run.py --models hf_qwen1_5_7b --datasets cmmlu_ppl_041cbf --debug
python3 run.py --models hf_qwen1_5_7b --datasets ceval_internal_ppl_93e5ce --debug
python3 run.py --models hf_qwen1_5_7b --datasets GaokaoBench_no_subjective_gen_d21e37 --debug
python3 run.py --models hf_qwen1_5_7b --datasets triviaqa_wiki_1shot_gen_20a989 --debug
python3 run.py --models hf_qwen1_5_7b --datasets nq_open_1shot_gen_20a989 --debug
python3 run.py --models hf_qwen1_5_7b --datasets race_ppl_abed12 --debug
python3 run.py --models hf_qwen1_5_7b --datasets winogrande_5shot_ll_252f01 --debug
python3 run.py --models hf_qwen1_5_7b --datasets hellaswag_10shot_ppl_59c85e --debug
python3 run.py --models hf_qwen1_5_7b --datasets bbh_gen_98fba6 --debug
python3 run.py --models hf_qwen1_5_7b --datasets gsm8k_gen_17d0dc --debug
python3 run.py --models hf_qwen1_5_7b --datasets math_4shot_base_gen_db136b --debug
python3 run.py --models hf_qwen1_5_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug
python3 run.py --models hf_qwen1_5_7b --datasets humaneval_gen_d2537e --debug
python3 run.py --models hf_qwen1_5_7b --datasets sanitized_mbpp_gen_742f0c --debug
python3 run.py --models hf_qwen1_5_7b --datasets lcbench_gen_5ff288 --debug
python3 run.py --models hf_qwen1_5_7b --datasets gpqa_ppl_6bf57a --debug
```
## Chat Models
```bash
python3 run.py --models hf_qwen1_5_7b_chat --datasets mmlu_gen_4d595a --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets cmmlu_gen_c13365 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets ceval_internal_gen_2daf24 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets GaokaoBench_no_subjective_gen_4c31db --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets triviaqa_wiki_1shot_gen_eaf81e --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets nq_open_1shot_gen_01cf41 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets race_gen_69ee4f --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets winogrande_5shot_gen_b36770 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets hellaswag_10shot_gen_e42710 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets bbh_gen_5b92b0 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets gsm8k_gen_1d7fe4 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets math_0shot_gen_393424 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets TheoremQA_5shot_gen_6f0af8 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets humaneval_gen_8e312c --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets sanitized_mbpp_mdblock_gen_a447ff --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets lcbench_gen_5ff288 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets gpqa_gen_4baadb --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets IFEval_gen_3321a3 --debug
```
# Benchmarks
We provide reference results for the classical models; you can reproduce these results by following the aforementioned commands.
## Base Models
| dataset | qwen-1.8b-turbomind | qwen-7b-turbomind | qwen-14b-turbomind | qwen-72b-turbomind |
|:------------:|----------------------:|--------------------:|---------------------:|---------------------:|
| mmlu | 46.61 | 59.75 | 67.85 | 77.36 |
| cmmlu | 51.98 | 62.10 | 70.05 | 83.32 |
| ceval-test | 54.24 | 62.06 | 70.33 | 83.25 |
| GaokaoBench | 22.11 | 35.32 | 54.07 | 77.56 |
| triviaqa | 22.76 | 53.61 | 49.72 | 79.13 |
| nq | 5.68 | 17.87 | 13.77 | 18.20 |
| race-high | 63.09 | 80.30 | 88.11 | 90.62 |
| winogrande | 61.25 | 72.06 | 72.45 | 82.56 |
| hellaswag | 38.04 | 64.62 | 85.88 | 90.40 |
| bbh | 22.53 | 45.89 | 56.75 | 63.35 |
| gsm8k | 23.73 | 54.36 | 61.64 | 79.68 |
| math | 6.30 | 15.56 | 30.38 | 44.18 |
| TheoremQA | 9.38 | 15.00 | 21.62 | 27.12 |
| humaneval | 16.46 | 23.78 | 23.78 | 66.46 |
| mbpp | 2.72 | 46.69 | 55.64 | 65.76 |
| lcbench | 1.82 | 4.95 | 8.86 | 16.86 |
| GPQA_diamond | 28.79 | 24.75 | 27.78 | 31.31 |
| dataset | qwen1.5-0.5b-hf | qwen1.5-1.8b-hf | qwen1.5-4b-hf | qwen1.5-7b-hf | qwen1.5-14b-hf | qwen1.5-32b-hf | qwen1.5-72b-hf |
|:------------:|------------------:|------------------:|----------------:|----------------:|-----------------:|-----------------:|-----------------:|
| mmlu | 39.98 | 47.14 | 57.03 | 62.15 | 69.10 | 73.88 | 77.02 |
| cmmlu | 46.05 | 57.45 | 66.38 | 71.86 | 76.95 | 81.58 | 83.00 |
| ceval-test | 48.36 | 58.67 | 66.55 | 72.49 | 76.93 | 82.50 | 83.03 |
| GaokaoBench | 30.67 | 35.66 | 54.31 | 65.99 | 66.60 | 79.01 | 80.26 |
| triviaqa | 21.24 | 34.32 | 44.59 | 56.60 | 59.96 | 56.20 | 77.81 |
| nq | 6.01 | 10.28 | 15.73 | 18.61 | 16.07 | 21.75 | 20.53 |
| race-high | 54.66 | 67.27 | 78.50 | 82.73 | 87.99 | 90.57 | 90.45 |
| winogrande | 57.38 | 60.46 | 65.90 | 70.01 | 72.93 | 78.69 | 80.74 |
| hellaswag | 29.19 | 42.32 | 55.89 | 68.51 | 83.86 | 87.28 | 90.41 |
| bbh | 20.54 | 27.01 | 34.81 | 39.87 | 50.38 | 67.47 | 58.81 |
| gsm8k | 13.27 | 34.87 | 47.61 | 54.36 | 63.53 | 72.71 | 79.53 |
| math | 4.16 | 11.32 | 17.50 | 17.34 | 36.18 | 45.74 | 41.56 |
| TheoremQA | 5.88 | 12.00 | 13.75 | 4.25 | 12.62 | 26.62 | 26.62 |
| humaneval | 8.54 | 23.17 | 41.46 | 53.05 | 57.32 | 70.12 | 65.85 |
| mbpp | 5.06 | 15.95 | 45.91 | 52.14 | 52.14 | 59.14 | 61.09 |
| lcbench | 0.87 | 2.00 | 5.65 | 6.69 | 12.69 | 14.34 | 15.29 |
| GPQA_diamond | 23.74 | 28.79 | 23.23 | 20.71 | 32.32 | 30.81 | 31.82 |
## Chat Models
| dataset | qwen1.5-0.5b-chat-hf | qwen1.5-1.8b-chat-hf | qwen1.5-4b-chat-hf | qwen1.5-7b-chat-hf | qwen1.5-14b-chat-hf | qwen1.5-32b-chat-hf | qwen1.5-72b-chat-hf | qwen1.5-110b-chat-hf |
|:------------:|-----------------------:|-----------------------:|---------------------:|---------------------:|----------------------:|----------------------:|----------------------:|-----------------------:|
| mmlu | 35.32 | 45.62 | 55.90 | 61.79 | 67.96 | 75.36 | 77.24 | 77.95 |
| cmmlu | 31.55 | 48.93 | 58.53 | 68.78 | 75.07 | 80.39 | 82.48 | 86.46 |
| ceval-test | 36.88 | 55.17 | 61.54 | 68.71 | 74.80 | 80.47 | 81.53 | 87.33 |
| GaokaoBench | 21.51 | 46.19 | 59.11 | 70.55 | 80.39 | 86.15 | 88.58 | 89.59 |
| triviaqa | 19.84 | 35.81 | 48.93 | 53.65 | 62.58 | 74.72 | 83.25 | 86.20 |
| nq | 7.42 | 10.22 | 19.31 | 16.87 | 20.53 | 25.26 | 35.21 | 36.98 |
| race-high | 49.03 | 66.24 | 73.53 | 83.28 | 87.51 | 91.22 | 91.11 | 92.31 |
| winogrande | 50.51 | 51.07 | 57.54 | 65.27 | 70.09 | 77.90 | 80.82 | 82.32 |
| hellaswag | 29.60 | 41.71 | 60.45 | 71.58 | 79.70 | 88.56 | 89.37 | 91.11 |
| bbh | 24.12 | 26.82 | 43.15 | 38.12 | 55.38 | 69.28 | 72.97 | 71.04 |
| gsm8k | 8.79 | 27.60 | 47.61 | 56.25 | 64.90 | 79.91 | 77.03 | 79.53 |
| math | 0.56 | 4.94 | 7.34 | 22.14 | 32.22 | 41.80 | 45.22 | 54.38 |
| TheoremQA | 9.00 | 9.25 | 13.88 | 12.25 | 13.63 | 19.25 | 22.75 | 17.50 |
| humaneval | 9.15 | 15.85 | 30.49 | 40.85 | 50.00 | 57.93 | 60.37 | 65.24 |
| mbpp | 11.28 | 22.57 | 43.58 | 50.58 | 56.03 | 65.37 | 66.93 | 68.48 |
| lcbench | 0.00 | 1.65 | 5.56 | 8.78 | 14.42 | 10.78 | 18.77 | 34.58 |
| GPQA_diamond | 19.70 | 29.80 | 25.25 | 31.82 | 30.30 | 31.31 | 32.83 | 35.86 |
| IFEval | 13.12 | 16.08 | 25.51 | 38.82 | 42.51 | 49.54 | 51.02 | 55.08 |
# Citation
```BibTeX
@article{qwen,
title={Qwen Technical Report},
author={Jinze Bai and Shuai Bai and Yunfei Chu and Zeyu Cui and Kai Dang and Xiaodong Deng and Yang Fan and Wenbin Ge and Yu Han and Fei Huang and Binyuan Hui and Luo Ji and Mei Li and Junyang Lin and Runji Lin and Dayiheng Liu and Gao Liu and Chengqiang Lu and Keming Lu and Jianxin Ma and Rui Men and Xingzhang Ren and Xuancheng Ren and Chuanqi Tan and Sinan Tan and Jianhong Tu and Peng Wang and Shijie Wang and Wei Wang and Shengguang Wu and Benfeng Xu and Jin Xu and An Yang and Hao Yang and Jian Yang and Shusheng Yang and Yang Yao and Bowen Yu and Hongyi Yuan and Zheng Yuan and Jianwei Zhang and Xingxuan Zhang and Yichang Zhang and Zhenru Zhang and Chang Zhou and Jingren Zhou and Xiaohuan Zhou and Tianhang Zhu},
journal={arXiv preprint arXiv:2309.16609},
year={2023}
}
```

View File

@ -1,25 +1,12 @@
from opencompass.models import HuggingFaceCausalLM
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceCausalLM,
abbr='qwen1.5-moe-a2-7b-hf',
type=HuggingFaceBaseModel,
abbr='qwen1.5-moe-a2.7b-hf',
path='Qwen/Qwen1.5-MoE-A2.7B',
tokenizer_path='Qwen/Qwen1.5-MoE-A2.7B',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,
),
pad_token_id=151645,
max_out_len=100,
max_seq_len=2048,
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
run_cfg=dict(num_gpus=1),
)
]

View File

@ -1,33 +1,12 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
],
)
from opencompass.models import HuggingFacewithChatTemplate
models = [
dict(
type=HuggingFaceCausalLM,
abbr='qwen1.5-moe-a2-7b-chat-hf',
type=HuggingFacewithChatTemplate,
abbr='qwen1.5-moe-a2.7b-chat-hf',
path='Qwen/Qwen1.5-MoE-A2.7B-Chat',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=4, num_procs=1),
end_str='<|im_end|>',
batch_padding=True,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='qwen2-0.5b-hf',
path='Qwen/Qwen2-0.5B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='qwen2-1.5b-hf',
path='Qwen/Qwen2-1.5B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='qwen2-72b-hf',
path='Qwen/Qwen2-72B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=8),
)
]

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='qwen2-7b-hf',
path='Qwen/Qwen2-7B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='qwen2-moe-57b-a14b-hf',
path='Qwen/Qwen2-MoE-57B-A14B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=4),
)
]

View File

@ -7,6 +7,6 @@ models = [
path='Qwen/Qwen-14B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
run_cfg=dict(num_gpus=2),
)
]

View File

@ -1,12 +1,31 @@
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
],
)
models = [
dict(
type=HuggingFacewithChatTemplate,
type=HuggingFaceCausalLM,
abbr='qwen-14b-chat-hf',
path='Qwen/Qwen-14B-Chat',
model_kwargs=dict(device_map='auto', trust_remote_code=True),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,
),
pad_token_id=151643,
max_out_len=1024,
max_seq_len=8192,
batch_size=8,
run_cfg=dict(num_gpus=1),
batch_padding=True,
meta_template=_meta_template,
run_cfg=dict(num_gpus=2),
end_str='<|im_end|>',
)
]

View File

@ -1,12 +1,31 @@
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
],
)
models = [
dict(
type=HuggingFacewithChatTemplate,
type=HuggingFaceCausalLM,
abbr='qwen-1.8b-chat-hf',
path='Qwen/Qwen-1_8B-Chat',
model_kwargs=dict(device_map='auto', trust_remote_code=True),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,
),
pad_token_id=151643,
max_out_len=1024,
max_seq_len=8192,
batch_size=8,
batch_padding=True,
meta_template=_meta_template,
run_cfg=dict(num_gpus=1),
end_str='<|im_end|>',
)
]

View File

@ -7,6 +7,6 @@ models = [
path='Qwen/Qwen-72B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=4),
run_cfg=dict(num_gpus=8),
)
]

View File

@ -1,12 +1,31 @@
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
],
)
models = [
dict(
type=HuggingFacewithChatTemplate,
type=HuggingFaceCausalLM,
abbr='qwen-72b-chat-hf',
path='Qwen/Qwen-72B-Chat',
model_kwargs=dict(device_map='auto', trust_remote_code=True),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,
),
pad_token_id=151643,
max_out_len=1024,
max_seq_len=8192,
batch_size=8,
run_cfg=dict(num_gpus=4),
batch_padding=True,
meta_template=_meta_template,
run_cfg=dict(num_gpus=8),
end_str='<|im_end|>',
)
]

View File

@ -1,12 +1,31 @@
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
],
)
models = [
dict(
type=HuggingFacewithChatTemplate,
type=HuggingFaceCausalLM,
abbr='qwen-7b-chat-hf',
path='Qwen/Qwen-7B-Chat',
model_kwargs=dict(device_map='auto', trust_remote_code=True),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,
),
pad_token_id=151643,
max_out_len=1024,
max_seq_len=8192,
batch_size=8,
batch_padding=True,
meta_template=_meta_template,
run_cfg=dict(num_gpus=1),
end_str='<|im_end|>',
)
]

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='yi-1.5-34b-hf',
path='01-ai/Yi-1.5-34B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=2),
)
]

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='yi-1.5-6b-hf',
path='01-ai/Yi-1.5-6B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='yi-1.5-9b-hf',
path='01-ai/Yi-1.5-9B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -1,5 +1,5 @@
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
from opencompass.models.openai_api import OpenAIAllesAPIN
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
@ -44,7 +44,7 @@ models = [
judge_model = dict(
abbr='GPT4-Turbo',
type=OpenAIAllesAPIN, path='gpt-4-1106-preview',
type=OpenAI, path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
url='',
meta_template=api_meta_template,
@ -70,7 +70,7 @@ runner=dict(type=LocalRunner, max_num_workers=12, task=dict(type=SubjectiveEvalT
gpt4 = dict(
abbr='gpt4-turbo',
type=OpenAIAllesAPIN,
type=OpenAI,
path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,

View File

@ -1,98 +0,0 @@
from mmengine.config import read_base
with read_base():
from .groups.charm_rea import charm_rea_summary_groups
summarizer = dict(
dataset_abbrs=[
'charm-rea-Direct',
'charm-rea-ZH-CoT',
'charm-rea-EN-CoT',
'charm-rea-XLT',
'charm-rea-Translate-EN',
'',
'charm-rea-Chinese_Direct',
'charm-rea-Chinese_ZH-CoT',
'charm-rea-Chinese_EN-CoT',
'charm-rea-Chinese_XLT',
'charm-rea-Chinese_Translate-EN',
'charm-rea-Global_Direct',
'charm-rea-Global_ZH-CoT',
'charm-rea-Global_EN-CoT',
'charm-rea-Global_XLT',
'charm-rea-Global_Translate-EN',
'',
'charm-rea-Chinese_Anachronisms_Judgment_Direct',
'charm-rea-Chinese_Movie_and_Music_Recommendation_Direct',
'charm-rea-Chinese_Natural_Language_Inference_Direct',
'charm-rea-Chinese_Reading_Comprehension_Direct',
'charm-rea-Chinese_Sequence_Understanding_Direct',
'charm-rea-Chinese_Sport_Understanding_Direct',
'charm-rea-Chinese_Time_Understanding_Direct',
'charm-rea-Global_Anachronisms_Judgment_Direct',
'charm-rea-Global_Movie_and_Music_Recommendation_Direct',
'charm-rea-Global_Natural_Language_Inference_Direct',
'charm-rea-Global_Reading_Comprehension_Direct',
'charm-rea-Global_Sequence_Understanding_Direct',
'charm-rea-Global_Sport_Understanding_Direct',
'charm-rea-Global_Time_Understanding_Direct',
'charm-rea-Chinese_Anachronisms_Judgment_ZH-CoT',
'charm-rea-Chinese_Movie_and_Music_Recommendation_ZH-CoT',
'charm-rea-Chinese_Natural_Language_Inference_ZH-CoT',
'charm-rea-Chinese_Reading_Comprehension_ZH-CoT',
'charm-rea-Chinese_Sequence_Understanding_ZH-CoT',
'charm-rea-Chinese_Sport_Understanding_ZH-CoT',
'charm-rea-Chinese_Time_Understanding_ZH-CoT',
'charm-rea-Global_Anachronisms_Judgment_ZH-CoT',
'charm-rea-Global_Movie_and_Music_Recommendation_ZH-CoT',
'charm-rea-Global_Natural_Language_Inference_ZH-CoT',
'charm-rea-Global_Reading_Comprehension_ZH-CoT',
'charm-rea-Global_Sequence_Understanding_ZH-CoT',
'charm-rea-Global_Sport_Understanding_ZH-CoT',
'charm-rea-Global_Time_Understanding_ZH-CoT',
'charm-rea-Chinese_Anachronisms_Judgment_EN-CoT',
'charm-rea-Chinese_Movie_and_Music_Recommendation_EN-CoT',
'charm-rea-Chinese_Natural_Language_Inference_EN-CoT',
'charm-rea-Chinese_Reading_Comprehension_EN-CoT',
'charm-rea-Chinese_Sequence_Understanding_EN-CoT',
'charm-rea-Chinese_Sport_Understanding_EN-CoT',
'charm-rea-Chinese_Time_Understanding_EN-CoT',
'charm-rea-Global_Anachronisms_Judgment_EN-CoT',
'charm-rea-Global_Movie_and_Music_Recommendation_EN-CoT',
'charm-rea-Global_Natural_Language_Inference_EN-CoT',
'charm-rea-Global_Reading_Comprehension_EN-CoT',
'charm-rea-Global_Sequence_Understanding_EN-CoT',
'charm-rea-Global_Sport_Understanding_EN-CoT',
'charm-rea-Global_Time_Understanding_EN-CoT',
'charm-rea-Chinese_Anachronisms_Judgment_XLT',
'charm-rea-Chinese_Movie_and_Music_Recommendation_XLT',
'charm-rea-Chinese_Natural_Language_Inference_XLT',
'charm-rea-Chinese_Reading_Comprehension_XLT',
'charm-rea-Chinese_Sequence_Understanding_XLT',
'charm-rea-Chinese_Sport_Understanding_XLT',
'charm-rea-Chinese_Time_Understanding_XLT',
'charm-rea-Global_Anachronisms_Judgment_XLT',
'charm-rea-Global_Movie_and_Music_Recommendation_XLT',
'charm-rea-Global_Natural_Language_Inference_XLT',
'charm-rea-Global_Reading_Comprehension_XLT',
'charm-rea-Global_Sequence_Understanding_XLT',
'charm-rea-Global_Sport_Understanding_XLT',
'charm-rea-Global_Time_Understanding_XLT',
'charm-rea-Chinese_Anachronisms_Judgment_Translate-EN',
'charm-rea-Chinese_Movie_and_Music_Recommendation_Translate-EN',
'charm-rea-Chinese_Natural_Language_Inference_Translate-EN',
'charm-rea-Chinese_Reading_Comprehension_Translate-EN',
'charm-rea-Chinese_Sequence_Understanding_Translate-EN',
'charm-rea-Chinese_Sport_Understanding_Translate-EN',
'charm-rea-Chinese_Time_Understanding_Translate-EN',
'charm-rea-Global_Anachronisms_Judgment_Translate-EN',
'charm-rea-Global_Movie_and_Music_Recommendation_Translate-EN',
'charm-rea-Global_Natural_Language_Inference_Translate-EN',
'charm-rea-Global_Reading_Comprehension_Translate-EN',
'charm-rea-Global_Sequence_Understanding_Translate-EN',
'charm-rea-Global_Sport_Understanding_Translate-EN',
'charm-rea-Global_Time_Understanding_Translate-EN',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], [])
)

View File

@ -0,0 +1,98 @@
from mmengine.config import read_base
with read_base():
from .groups.charm_reason import charm_reason_summary_groups
summarizer = dict(
dataset_abbrs=[
'charm-reason-Direct',
'charm-reason-ZH-CoT',
'charm-reason-EN-CoT',
'charm-reason-XLT',
'charm-reason-Translate-EN',
'',
'charm-reason-Chinese_Direct',
'charm-reason-Chinese_ZH-CoT',
'charm-reason-Chinese_EN-CoT',
'charm-reason-Chinese_XLT',
'charm-reason-Chinese_Translate-EN',
'charm-reason-Global_Direct',
'charm-reason-Global_ZH-CoT',
'charm-reason-Global_EN-CoT',
'charm-reason-Global_XLT',
'charm-reason-Global_Translate-EN',
'',
'charm-reason-Chinese_Anachronisms_Judgment_Direct',
'charm-reason-Chinese_Movie_and_Music_Recommendation_Direct',
'charm-reason-Chinese_Natural_Language_Inference_Direct',
'charm-reason-Chinese_Reading_Comprehension_Direct',
'charm-reason-Chinese_Sequence_Understanding_Direct',
'charm-reason-Chinese_Sport_Understanding_Direct',
'charm-reason-Chinese_Time_Understanding_Direct',
'charm-reason-Global_Anachronisms_Judgment_Direct',
'charm-reason-Global_Movie_and_Music_Recommendation_Direct',
'charm-reason-Global_Natural_Language_Inference_Direct',
'charm-reason-Global_Reading_Comprehension_Direct',
'charm-reason-Global_Sequence_Understanding_Direct',
'charm-reason-Global_Sport_Understanding_Direct',
'charm-reason-Global_Time_Understanding_Direct',
'charm-reason-Chinese_Anachronisms_Judgment_ZH-CoT',
'charm-reason-Chinese_Movie_and_Music_Recommendation_ZH-CoT',
'charm-reason-Chinese_Natural_Language_Inference_ZH-CoT',
'charm-reason-Chinese_Reading_Comprehension_ZH-CoT',
'charm-reason-Chinese_Sequence_Understanding_ZH-CoT',
'charm-reason-Chinese_Sport_Understanding_ZH-CoT',
'charm-reason-Chinese_Time_Understanding_ZH-CoT',
'charm-reason-Global_Anachronisms_Judgment_ZH-CoT',
'charm-reason-Global_Movie_and_Music_Recommendation_ZH-CoT',
'charm-reason-Global_Natural_Language_Inference_ZH-CoT',
'charm-reason-Global_Reading_Comprehension_ZH-CoT',
'charm-reason-Global_Sequence_Understanding_ZH-CoT',
'charm-reason-Global_Sport_Understanding_ZH-CoT',
'charm-reason-Global_Time_Understanding_ZH-CoT',
'charm-reason-Chinese_Anachronisms_Judgment_EN-CoT',
'charm-reason-Chinese_Movie_and_Music_Recommendation_EN-CoT',
'charm-reason-Chinese_Natural_Language_Inference_EN-CoT',
'charm-reason-Chinese_Reading_Comprehension_EN-CoT',
'charm-reason-Chinese_Sequence_Understanding_EN-CoT',
'charm-reason-Chinese_Sport_Understanding_EN-CoT',
'charm-reason-Chinese_Time_Understanding_EN-CoT',
'charm-reason-Global_Anachronisms_Judgment_EN-CoT',
'charm-reason-Global_Movie_and_Music_Recommendation_EN-CoT',
'charm-reason-Global_Natural_Language_Inference_EN-CoT',
'charm-reason-Global_Reading_Comprehension_EN-CoT',
'charm-reason-Global_Sequence_Understanding_EN-CoT',
'charm-reason-Global_Sport_Understanding_EN-CoT',
'charm-reason-Global_Time_Understanding_EN-CoT',
'charm-reason-Chinese_Anachronisms_Judgment_XLT',
'charm-reason-Chinese_Movie_and_Music_Recommendation_XLT',
'charm-reason-Chinese_Natural_Language_Inference_XLT',
'charm-reason-Chinese_Reading_Comprehension_XLT',
'charm-reason-Chinese_Sequence_Understanding_XLT',
'charm-reason-Chinese_Sport_Understanding_XLT',
'charm-reason-Chinese_Time_Understanding_XLT',
'charm-reason-Global_Anachronisms_Judgment_XLT',
'charm-reason-Global_Movie_and_Music_Recommendation_XLT',
'charm-reason-Global_Natural_Language_Inference_XLT',
'charm-reason-Global_Reading_Comprehension_XLT',
'charm-reason-Global_Sequence_Understanding_XLT',
'charm-reason-Global_Sport_Understanding_XLT',
'charm-reason-Global_Time_Understanding_XLT',
'charm-reason-Chinese_Anachronisms_Judgment_Translate-EN',
'charm-reason-Chinese_Movie_and_Music_Recommendation_Translate-EN',
'charm-reason-Chinese_Natural_Language_Inference_Translate-EN',
'charm-reason-Chinese_Reading_Comprehension_Translate-EN',
'charm-reason-Chinese_Sequence_Understanding_Translate-EN',
'charm-reason-Chinese_Sport_Understanding_Translate-EN',
'charm-reason-Chinese_Time_Understanding_Translate-EN',
'charm-reason-Global_Anachronisms_Judgment_Translate-EN',
'charm-reason-Global_Movie_and_Music_Recommendation_Translate-EN',
'charm-reason-Global_Natural_Language_Inference_Translate-EN',
'charm-reason-Global_Reading_Comprehension_Translate-EN',
'charm-reason-Global_Sequence_Understanding_Translate-EN',
'charm-reason-Global_Sport_Understanding_Translate-EN',
'charm-reason-Global_Time_Understanding_Translate-EN',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], [])
)

View File

@ -122,10 +122,10 @@ IFEval_dataset_abbrs = [
['IFEval', 'Inst-level-loose-accuracy'],
]
summarizer = dict(
type=MultiFacetedSummarizer,
dataset_abbrs_list=[
{'name': 'overall', 'dataset_abbrs': overall_dataset_abbrs},
{'name': 'mmlu', 'dataset_abbrs': mmlu_dataset_abbrs},
{'name': 'cmmlu', 'dataset_abbrs': cmmlu_dataset_abbrs},
{'name': 'ceval', 'dataset_abbrs': ceval_dataset_abbrs},
@ -143,7 +143,6 @@ summarizer = dict(
{'name': 'humaneval', 'dataset_abbrs': [['openai_humaneval', 'humaneval_pass@1']]},
{'name': 'GPQA', 'dataset_abbrs': [['GPQA_diamond', 'accuracy']]},
{'name': 'IFEval', 'dataset_abbrs': IFEval_dataset_abbrs},
{'name': 'overall', 'dataset_abbrs': overall_dataset_abbrs},
],
summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)

View File

@ -1,30 +0,0 @@
charm_tasks = [
'Anachronisms_Judgment',
'Movie_and_Music_Recommendation',
'Natural_Language_Inference',
'Reading_Comprehension',
'Sequence_Understanding',
'Sport_Understanding',
'Time_Understanding',
]
regions = [
'Chinese',
'Global',
]
prompts = [
'Direct',
'ZH-CoT',
'EN-CoT',
'XLT',
'Translate-EN',
]
charm_rea_summary_groups = []
for prompt in prompts:
for region in regions:
subsets = ['charm-rea-' + region + '_' + task + '_' + prompt for task in charm_tasks]
charm_rea_summary_groups.append({'name': 'charm-rea-' + region + '_' + prompt, 'subsets': subsets})
for prompt in prompts:
subsets = ['charm-rea-' + region + '_' + prompt for region in regions]
charm_rea_summary_groups.append({'name': 'charm-rea-' + prompt, 'subsets': subsets})

View File

@ -0,0 +1,35 @@
charm_tasks = [
'Anachronisms_Judgment',
'Movie_and_Music_Recommendation',
'Natural_Language_Inference',
'Reading_Comprehension',
'Sequence_Understanding',
'Sport_Understanding',
'Time_Understanding',
]
regions = [
'Chinese',
'Global',
]
prompts = [
'Direct',
'ZH-CoT',
'EN-CoT',
'XLT',
'Translate-EN',
]
charm_reason_summary_groups = []
for prompt in prompts:
for region in regions:
subsets = ['charm-reason-' + region + '_' + task + '_' + prompt for task in charm_tasks]
charm_reason_summary_groups.append({'name': 'charm-reason-' + region + '_' + prompt, 'subsets': subsets})
for prompt in prompts:
subsets = ['charm-reason-' + region + '_' + prompt for region in regions]
charm_reason_summary_groups.append({'name': 'charm-reason-' + prompt, 'subsets': subsets})
charm_reason_summary_groups.append(
{'name': 'charm-reason-CoT', 'subsets': ['charm-reason-ZH-CoT', 'charm-reason-EN-CoT']}
)
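As a quick sanity check, the two loops above should produce entries like the following (a sketch derived from the lists in this file, not output copied from an actual run):

# One of the per-region groups generated by the first loop; the name follows
# the 'charm-reason-<region>_<prompt>' scheme and its subsets enumerate the tasks.
expected = {
    'name': 'charm-reason-Chinese_ZH-CoT',
    'subsets': [
        'charm-reason-Chinese_' + task + '_ZH-CoT'
        for task in charm_tasks
    ],
}
assert expected in charm_reason_summary_groups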

View File

@ -0,0 +1,26 @@
mathbench_2024_wocircular_summary_groups = [
{'name': 'college', 'subsets': ['college-single_choice_cn', 'college-single_choice_en']},
{'name': 'high', 'subsets': ['high-single_choice_cn', 'high-single_choice_en']},
{'name': 'middle', 'subsets': ['middle-single_choice_cn', 'middle-single_choice_en']},
{'name': 'primary', 'subsets': ['primary-cloze_cn', 'primary-cloze_en']},
{'name': 'cn', 'subsets': ['college-single_choice_cn', 'high-single_choice_cn', 'middle-single_choice_cn', 'primary-cloze_cn']},
{'name': 'en', 'subsets': ['college-single_choice_en', 'high-single_choice_en', 'middle-single_choice_en', 'primary-cloze_en']},
{'name': 'a', 'subsets': ['college', 'high', 'middle', 'primary', 'arithmetic-cloze_en']},
{'name': 'college_knowledge', 'subsets': ['college_knowledge-single_choice_cn', 'college_knowledge-single_choice_en']},
{'name': 'high_knowledge', 'subsets': ['high_knowledge-single_choice_cn', 'high_knowledge-single_choice_en']},
{'name': 'middle_knowledge', 'subsets': ['middle_knowledge-single_choice_cn', 'middle_knowledge-single_choice_en']},
{'name': 'primary_knowledge', 'subsets': ['primary_knowledge-single_choice_cn', 'primary_knowledge-single_choice_en']},
{'name': 'knowledge-cn', 'subsets': ['college_knowledge-single_choice_cn', 'high_knowledge-single_choice_cn', 'middle_knowledge-single_choice_cn', 'primary_knowledge-single_choice_cn']},
{'name': 'knowledge-en', 'subsets': ['college_knowledge-single_choice_en', 'high_knowledge-single_choice_en', 'middle_knowledge-single_choice_en', 'primary_knowledge-single_choice_en']},
{'name': 't', 'subsets': ['college_knowledge', 'high_knowledge', 'middle_knowledge', 'primary_knowledge']},
{'name': 'overall', 'subsets': ['a', 't']},
]
for g in mathbench_2024_wocircular_summary_groups:
g['name'] = 'mathbench-wocircular-' + g['name']
g['subsets'] = ['mathbench-wocircular-' + s for s in g['subsets']]
mathbench_2024_summary_groups = mathbench_2024_wocircular_summary_groups
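For reference, the renaming loop above mutates the groups in place; a minimal sketch of the first entry after the loop (inferred from this file, not from a run):

college_group = mathbench_2024_summary_groups[0]
assert college_group['name'] == 'mathbench-wocircular-college'
assert college_group['subsets'] == [
    'mathbench-wocircular-college-single_choice_cn',
    'mathbench-wocircular-college-single_choice_en',
]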

View File

@ -1 +1 @@
__version__ = '0.2.4'
__version__ = '0.2.5'

View File

@ -11,8 +11,8 @@ from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
from .base import BaseDataset
@TEXT_POSTPROCESSORS.register_module('charm-rea')
def charm_rea_postprocess(text: str) -> str:
@TEXT_POSTPROCESSORS.register_module('charm-reason')
def charm_reason_postprocess(text: str) -> str:
ans = text
ans_line = ans.split('answer is ')
if len(ans_line) != 1:
@ -27,14 +27,11 @@ def charm_rea_postprocess(text: str) -> str:
@ICL_EVALUATORS.register_module()
class CharmReaEvaluator(BaseEvaluator):
class CharmReasonEvaluator(BaseEvaluator):
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
return {'error': 'preds and refrs have different length'}
details = []
cnt = 0
for pred, ref in zip(predictions, references):
@ -43,9 +40,7 @@ class CharmReaEvaluator(BaseEvaluator):
cnt += 1
detail['correct'] = True
details.append(detail)
score = cnt / len(predictions) * 100
return {'score': score, 'details': details}
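For context, a minimal usage sketch of the renamed evaluator (class and method names are from this diff; the per-sample matching rule inside the loop is elided above, so plain string equality is assumed here):

from opencompass.datasets import CharmReasonEvaluator

evaluator = CharmReasonEvaluator()
# Mismatched lengths short-circuit with an error dict instead of raising.
print(evaluator.score(predictions=['A'], references=['A', 'B']))
# Otherwise the result carries an accuracy-style 'score' in [0, 100]
# plus per-sample 'details'.
print(evaluator.score(predictions=['A', 'B'], references=['A', 'C']))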

View File

@ -208,7 +208,8 @@ class MBPPEvaluator(BaseEvaluator):
assert self.metric in ['MBPP', 'MBPPPlus']
def score(self, predictions, references):
assert len(predictions) == len(references)
if len(predictions) != len(references):
return {'error': 'preds and refrs have different length'}
if self.metric == 'MBPP':
result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
@ -286,6 +287,8 @@ class MBPPEvaluator(BaseEvaluator):
r'BEGIN\s*(.*)\s*DONE',
r'```python\s*(.*)\s*```',
r'```\s*(.*)\s*```',
r'```python\s*(.*)\s*$',
r'```\s*(.*)\s*$',
r'(.*)\s*```.*',
r"\[BEGIN\]\s*'(.*)",
r'\[BEGIN\](.*)',
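The two added patterns cover completions whose closing fence was truncated by the output-length limit; a standalone illustration of the difference (the surrounding extraction loop and its regex flags are not shown in this hunk, so re.DOTALL is assumed):

import re

truncated = ('Here is my solution:\n'
             '```python\n'
             'def similar_elements(a, b):\n'
             '    return tuple(set(a) & set(b))\n')
# The fenced pattern above needs a closing ``` and therefore finds nothing here,
assert re.search(r'```python\s*(.*)\s*```', truncated, re.DOTALL) is None
# while the newly added pattern anchors on the end of the string instead.
code = re.search(r'```python\s*(.*)\s*$', truncated, re.DOTALL).group(1)
print(code)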

View File

@ -18,7 +18,7 @@ from .huggingface_above_v4_33 import HuggingFacewithChatTemplate # noqa: F401
from .hunyuan_api import Hunyuan # noqa: F401
from .intern_model import InternLM # noqa: F401
from .krgpt_api import KrGPT # noqa: F401
from .lightllm_api import LightllmAPI # noqa: F401
from .lightllm_api import LightllmAPI, LightllmChatAPI # noqa: F401
from .llama2 import Llama2, Llama2Chat # noqa: F401
from .lmdeploy_pytorch import LmdeployPytorchModel # noqa: F401
from .lmdeploy_tis import LmdeployTisModel # noqa: F401

View File

@ -84,7 +84,6 @@ def _convert_chat_messages(inputs, merge_role=True):
messages = merged_messages
outputs.append(messages)
print(messages)
return outputs

View File

@ -1,16 +1,20 @@
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Union
import numpy as np
import requests
from opencompass.registry import MODELS
from opencompass.utils.logging import get_logger
from opencompass.utils.prompt import PromptList
from .base import BaseModel
from .base_api import TokenBucket
from .base_api import BaseAPIModel, TokenBucket
PromptType = Union[PromptList, str]
@MODELS.register_module()
@ -189,3 +193,158 @@ class LightllmAPI(BaseModel):
chinese_count = sum(len(part) for part in chinese_parts)
return english_count + chinese_count
class LightllmChatAPI(BaseAPIModel):
"""Model wrapper around YiAPI.
Documentation:
Args:
path (str): The name of YiAPI model.
e.g. `moonshot-v1-32k`
key (str): Authorization key.
query_per_second (int): The maximum queries allowed per second
between two consecutive calls of the API. Defaults to 1.
max_seq_len (int): Unused here.
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the requirement of injecting or
wrapping of any meta instructions.
retry (int): Number of retires if the API call fails. Defaults to 2.
"""
def __init__(
self,
path: str,
url: str,
query_per_second: int = 2,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
retry: int = 2,
):
super().__init__(path=path,
max_seq_len=max_seq_len,
query_per_second=query_per_second,
meta_template=meta_template,
retry=retry)
self.url = url
self.model = path
def generate(
self,
inputs: List[PromptType],
max_out_len: int = 512,
) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[PromptType]): A list of strings or PromptDicts.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
with ThreadPoolExecutor() as executor:
results = list(
executor.map(self._generate, inputs,
[max_out_len] * len(inputs)))
self.flush()
return results
def _generate(
self,
input: PromptType,
max_out_len: int = 512,
) -> str:
"""Generate results given an input.
Args:
inputs (PromptType): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
str: The generated string.
"""
assert isinstance(input, (str, PromptList))
if isinstance(input, str):
messages = [{'role': 'user', 'content': input}]
else:
messages = []
msg_buffer, last_role = [], None
for item in input:
item['role'] = 'assistant' if item['role'] == 'BOT' else 'user'
if item['role'] != last_role and last_role is not None:
messages.append({
'content': '\n'.join(msg_buffer),
'role': last_role
})
msg_buffer = []
msg_buffer.append(item['prompt'])
last_role = item['role']
messages.append({
'content': '\n'.join(msg_buffer),
'role': last_role
})
data = {'messages': messages}
max_num_retries = 0
while max_num_retries < self.retry:
self.acquire()
try:
raw_response = requests.request('POST',
url=self.url,
json=data)
except Exception as err:
print('Request Error:{}'.format(err))
time.sleep(2)
continue
try:
response = raw_response.json()
except Exception as err:
print('Response Error:{}'.format(err))
response = None
self.release()
if response is None:
print('Connection error, reconnect.')
# if a connection error occurs, frequent requests will cause
# continued network instability, so wait here
# to slow down the request rate
self.wait()
continue
if raw_response.status_code == 200:
# msg = json.load(response.text)
# response
msg = response['choices'][0]['message']['content']
self.logger.debug(f'Generated: {msg}')
return msg
if raw_response.status_code == 401:
print('Request rejected: invalid api_key')
continue
elif raw_response.status_code == 400:
print(messages, response)
print('Request failed, status code:', raw_response)
msg = 'The request was rejected because of high-risk content'
return msg
elif raw_response.status_code == 429:
print(messages, response)
print('Request failed, status code:', raw_response)
time.sleep(5)
continue
else:
print(messages, response)
print('Request failed, status code:', raw_response)
time.sleep(1)
max_num_retries += 1
raise RuntimeError(raw_response)
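A hedged usage sketch for the new chat wrapper; the endpoint URL and model name below are placeholders for a locally deployed Lightllm service, not values taken from this commit:

from opencompass.models import LightllmChatAPI

api = LightllmChatAPI(
    path='my-chat-model',                             # illustrative served-model name
    url='http://127.0.0.1:8080/v1/chat/completions',  # placeholder endpoint
    query_per_second=2,
    max_seq_len=2048,
    retry=2,
)
# generate() fans the prompts out over a thread pool and returns one string each.
print(api.generate(['Briefly introduce OpenCompass.'], max_out_len=64))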

View File

@ -50,6 +50,9 @@ class OpenAI(BaseAPIModel):
temperature (float, optional): What sampling temperature to use.
If not None, will override the temperature in the `generate()`
call. Defaults to None.
tokenizer_path (str, optional): The path to the tokenizer. Use path if
'tokenizer_path' is None, otherwise use the 'tokenizer_path'.
Defaults to None.
"""
is_api: bool = True
@ -67,7 +70,8 @@ class OpenAI(BaseAPIModel):
mode: str = 'none',
logprobs: Optional[bool] = False,
top_logprobs: Optional[int] = None,
temperature: Optional[float] = None):
temperature: Optional[float] = None,
tokenizer_path: Optional[str] = None):
super().__init__(path=path,
max_seq_len=max_seq_len,
@ -82,6 +86,7 @@ class OpenAI(BaseAPIModel):
self.mode = mode
self.logprobs = logprobs
self.top_logprobs = top_logprobs
self.tokenizer_path = tokenizer_path
if isinstance(key, str):
if key == 'ENV':
@ -287,7 +292,8 @@ class OpenAI(BaseAPIModel):
Returns:
int: Length of the input tokens
"""
enc = self.tiktoken.encoding_for_model(self.path)
enc = self.tiktoken.encoding_for_model(self.tokenizer_path
or self.path)
return len(enc.encode(prompt))
def bin_trim(self, prompt: str, num_token: int) -> str:
@ -333,140 +339,3 @@ class OpenAI(BaseAPIModel):
elif self.mode == 'rear':
prompt = sep.join(words[:l])
return prompt
class OpenAIAllesAPIN(OpenAI):
"""Model wrapper around OpenAI-AllesAPIN.
Args:
path (str): The name of OpenAI's model.
url (str): URL to AllesAPIN.
key (str): AllesAPIN key.
query_per_second (int): The maximum queries allowed per second
between two consecutive calls of the API. Defaults to 1.
max_seq_len (int): Unused here.
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the requirement of injecting or
wrapping of any meta instructions.
retry (int): Number of retries if the API call fails. Defaults to 2.
"""
is_api: bool = True
def __init__(self,
path: str,
url: str,
key: str,
temperature: float = 1.0,
query_per_second: int = 1,
rpm_verbose: bool = False,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
retry: int = 2):
super().__init__(path=path,
max_seq_len=max_seq_len,
query_per_second=query_per_second,
rpm_verbose=rpm_verbose,
meta_template=meta_template,
retry=retry)
self.url = url
self.temperature = temperature
self.headers = {
'alles-apin-token': key,
'content-type': 'application/json',
}
def _generate(self, input: PromptType, max_out_len: int,
temperature: float) -> str:
"""Generate results given an input.
Args:
inputs (PromptType): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
temperature (float): What sampling temperature to use,
between 0 and 2. Higher values like 0.8 will make the output
more random, while lower values like 0.2 will make it more
focused and deterministic.
Returns:
str: The generated string.
"""
assert isinstance(input, (str, PromptList))
if isinstance(input, str):
messages = [{'role': 'user', 'content': input}]
else:
messages = []
for item in input:
msg = {'content': item['prompt']}
if item['role'] == 'HUMAN':
msg['role'] = 'user'
elif item['role'] == 'BOT':
msg['role'] = 'assistant'
elif item['role'] == 'SYSTEM':
msg['role'] = 'system'
messages.append(msg)
# the model may need to respond to both user and system roles
# when an agent is involved.
assert msg['role'] in ['user', 'system']
data = {
'model': self.path,
'messages': messages,
'temperature': temperature
}
for _ in range(self.retry):
self.wait()
try:
raw_response = requests.post(self.url,
headers=self.headers,
data=json.dumps(data))
except requests.ConnectionError:
self.logger.error('Request error, got',
str(raw_response.content))
time.sleep(1)
continue
try:
response = raw_response.json()
except requests.JSONDecodeError:
self.logger.error('JsonDecode error, got',
str(raw_response.content))
time.sleep(1)
continue
if raw_response.status_code == 200 and response[
'msgCode'] == '10000':
data = response['data']
choices = data['choices']
if choices is None:
self.logger.error(data)
else:
return choices[0]['message']['content'].strip()
try:
match = re.match(r'Error code: \d+ - (.*)', response['data'])
err = eval(match.group(1))['error']
if err['code'] == 'content_filter' and err['status'] == 400:
return err['message']
except Exception:
pass
self.logger.error(response['msg'])
self.logger.error(response)
time.sleep(1)
raise RuntimeError('API call failed.')
def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized string. Only English and Chinese
characters are counted for now. Users are encouraged to override this
method if more accurate length is needed.
Args:
prompt (str): Input string.
Returns:
int: Length of the input tokens
"""
enc = self.tiktoken.encoding_for_model(self.path)
return len(enc.encode(prompt))
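A hedged sketch of the new tokenizer_path option added earlier in this file: when the deployment name is not one tiktoken recognizes, token counting can fall back to a known tokenizer (the path value below is a placeholder, not from this commit):

from opencompass.models import OpenAI

model = OpenAI(
    path='my-gpt4-proxy',      # illustrative deployment name unknown to tiktoken
    key='ENV',                 # resolve the key from the environment, as above
    tokenizer_path='gpt-4',    # used for get_token_len-based length estimates
    max_seq_len=4096,
)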

View File

@ -145,6 +145,8 @@ class PPLInferencerOutputHandler:
def save_prompt_and_ppl(self, label, input, prompt, ppl, idx):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
if 'origin_prompt' not in self.results_dict[str(idx)]:
self.results_dict[str(idx)]['origin_prompt'] = input
if 'label: ' + str(label) not in self.results_dict[str(idx)].keys():
self.results_dict[str(idx)]['label: ' + str(label)] = {}
self.results_dict[str(idx)]['label: ' +

View File

@ -161,6 +161,9 @@ class DLCRunner(BaseRunner):
shell_cmd += 'umask 0000; '
shell_cmd += '{task_cmd}'
# set priority to 1 as default
task_priority = self.aliyun_cfg.get('priority', 1)
tmpl = (
'dlc submit pytorchjob'
f" --command '{shell_cmd}'"
@ -168,6 +171,7 @@ class DLCRunner(BaseRunner):
f" --config {self.aliyun_cfg['dlc_config_path']}"
f" --workspace_id {self.aliyun_cfg['workspace_id']}"
f" --resource_id {self.aliyun_cfg['resource_id']}"
f' --priority {task_priority}'
' --workers 1'
f' --worker_cpu {max(num_gpus * 8, 12)}'
f' --worker_gpu {num_gpus}'
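For reference, a hedged fragment showing where the new priority knob lives in a DLC runner config; the ids and paths are placeholders, and only the 'priority' key is new in this commit:

aliyun_cfg = dict(
    dlc_config_path='/path/to/dlc_config',   # placeholder
    workspace_id='ws-xxxxxxxx',               # placeholder
    resource_id='rs-xxxxxxxx',                # placeholder
    priority=1,  # forwarded to 'dlc submit ... --priority'; 1 is also the fallback above
)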

View File

@ -226,7 +226,7 @@ class DefaultSummarizer:
return raw_results, parsed_results, dataset_metrics, dataset_eval_mode
def _format_table(self, parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=None):
def _format_table(self, parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=None, skip_all_slash=False):
dataset_abbrs = [dataset_abbr_from_cfg(dataset) for dataset in self.dataset_cfgs]
prompt_version = {dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6] for d in self.dataset_cfgs}
@ -257,14 +257,16 @@ class DefaultSummarizer:
table.append(header)
for dataset_abbr, metric in summarizer_dataset_abbrs:
if dataset_abbr not in dataset_metrics:
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
if not skip_all_slash:
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
continue
if metric is None:
metric = dataset_metrics[dataset_abbr][0]
elif metric in dataset_metrics[dataset_abbr]:
pass
else:
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
if not skip_all_slash:
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
continue
row = [dataset_abbr, prompt_version.get(dataset_abbr, '-'), metric, dataset_eval_mode.get(dataset_abbr, '-')]

View File

@ -1,11 +1,9 @@
# flake8: noqa
# yapf: disable
import functools
import getpass
import math
import json
import os
from datetime import datetime
from typing import Any, Dict, List, Optional
from typing import Dict, List, Optional
import tabulate
from mmengine import ConfigDict
@ -33,7 +31,9 @@ class MultiFacetedSummarizer(DefaultSummarizer):
profile_dataset_abbrs = dataset_abbrs_item['dataset_abbrs']
# format table
table = self._format_table(parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=profile_dataset_abbrs)
table = self._format_table(parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=profile_dataset_abbrs, skip_all_slash=True)
if len(table) == 1:
continue
# output to screen
print(tabulate.tabulate(table, headers='firstrow', floatfmt='.2f'))

View File

@ -214,6 +214,8 @@ class OpenICLEvalTask(BaseTask):
preds['references'] = (test_set[self.output_column]
if self.output_column else None)
preds['test_set'] = test_set
if 'origin_prompt' not in preds:
preds['origin_prompt'] = [None for _ in range(len(pred_strs))]
preds = {
k: preds[k]
for k in signature(icl_evaluator.score).parameters

View File

@ -258,7 +258,7 @@ def change_accelerator(models, accelerator):
path=model['path'],
model_kwargs=dict(tensor_parallel_size=model['run_cfg']['num_gpus']),
max_out_len=model['max_out_len'],
batch_size=32768,
batch_size=16,
run_cfg=model['run_cfg'],
stop_words=model.get('stop_words', []),
)
@ -272,7 +272,7 @@ def change_accelerator(models, accelerator):
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9),
max_seq_len=model.get('max_seq_len', 2048),
max_out_len=model['max_out_len'],
batch_size=32768,
batch_size=16,
run_cfg=model['run_cfg'],
stop_words=model.get('stop_words', []),
)

View File

@ -63,15 +63,15 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
# yapf: disable
# flake8: noqa: W605
patterns = [
f'答案是?\s?([{options}])',
f'答案是?\s?：([{options}])',
f'答案是?\s?:([{options}])',
f'答案应该?是\s?([{options}])',
f'答案应该?选\s?([{options}])',
f'答案为\s?([{options}])',
f'答案选\s?([{options}])',
f'选择?\s?([{options}])',
f'故选?\s?([{options}])'
f'答案是?\s*([{options}])',
f'答案是?\s*：\s*([{options}])',
f'答案是?\s*:\s*([{options}])',
f'答案应该?是\s*([{options}])',
f'答案应该?选\s*([{options}])',
f'答案为\s*([{options}])',
f'答案选\s*([{options}])',
f'选择?\s*([{options}])',
f'故选?\s*([{options}])',
f'只有选?项?\s?([{options}])\s?是?对',
f'只有选?项?\s?([{options}])\s?是?错',
f'只有选?项?\s?([{options}])\s?不?正确',
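A standalone check of why the quantifier change matters (this only exercises the regex, not the full postprocess pipeline):

import re

options = 'ABCD'
text = '综上所述,答案为  B'  # two spaces before the option letter
# The old '\s?' form tolerates at most one whitespace character and misses this,
assert re.search(f'答案为\s?([{options}])', text) is None
# while the new '\s*' form accepts any run of whitespace.
assert re.search(f'答案为\s*([{options}])', text).group(1) == 'B'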

View File

@ -22,6 +22,7 @@ OpenCC
opencv-python-headless
pandas<2.0.0
prettytable
protobuf
pyext
pypinyin
python-Levenshtein