Mirror of https://github.com/open-compass/opencompass.git, synced 2025-05-30 16:03:24 +08:00

[Sync] bump version (#1204)
parent: ba620c4afe
commit: 2954913d9b
@@ -35,6 +35,7 @@ repos:
        exclude: |
          (?x)^(
              .*\.jsonl|
              .*\.md.template|
              configs/
          )
  - repo: https://github.com/pre-commit/pre-commit-hooks
@@ -1,4 +0,0 @@
from mmengine.config import read_base

with read_base():
    from .charm_rea_gen_f8fca2 import charm_rea_datasets  # noqa: F401, F403
configs/datasets/CHARM/charm_reason_cot_only_gen_f7b7d3.py (new file, 50 lines)
@@ -0,0 +1,50 @@
import os
from mmengine.config import read_base

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CharmDataset, charm_reason_postprocess, CharmReasonEvaluator

with read_base():
    from .charm_reason_settings import charm_tasks, settings

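# keep only the two chain-of-thought settings for this CoT-only config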
settings = [s for s in settings if s[0] in ['ZH-CoT', 'EN-CoT']]

charm_reason_datasets = []

for _cot, _cot_prefix, dataset_path, fewshot_example_path, prompt_template in settings:
    for _task in charm_tasks:
        _fewshot_example_file = os.path.join(fewshot_example_path, f'{_task}_{_cot}.txt')
        with open(_fewshot_example_file, 'r') as f:
            _hint = f.read()

        charm_reason_reader_cfg = dict(input_columns=['input'], output_column='target')

        charm_reason_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[dict(role='HUMAN', prompt=prompt_template.format(_hint=_hint) + _cot_prefix)]),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=512),
        )

        charm_reason_eval_cfg = dict(
            evaluator=dict(type=CharmReasonEvaluator),
            pred_role='BOT',
            pred_postprocessor=dict(type=charm_reason_postprocess),
            dataset_postprocessor=dict(type=charm_reason_postprocess),
        )

        charm_reason_datasets.append(
            dict(
                type=CharmDataset,
                path=dataset_path,
                name=_task,
                abbr='charm-reason-' + _task + '_' + _cot,
                reader_cfg=charm_reason_reader_cfg,
                infer_cfg=charm_reason_infer_cfg.copy(),
                eval_cfg=charm_reason_eval_cfg.copy(),
            )
        )
configs/datasets/CHARM/charm_reason_gen.py (new file, 4 lines)
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .charm_reason_gen_f8fca2 import charm_reason_datasets  # noqa: F401, F403
configs/datasets/CHARM/charm_reason_gen_f8fca2.py (new file, 49 lines)
@@ -0,0 +1,49 @@
import os
from mmengine.config import read_base

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CharmDataset, charm_reason_postprocess, CharmReasonEvaluator

with read_base():
    from .charm_reason_settings import charm_tasks, settings


charm_reason_datasets = []

for _cot, _cot_prefix, dataset_path, fewshot_example_path, prompt_template in settings:
    for _task in charm_tasks:
        _fewshot_example_file = os.path.join(fewshot_example_path, f'{_task}_{_cot}.txt')
        with open(_fewshot_example_file, 'r') as f:
            _hint = f.read()

        charm_reason_reader_cfg = dict(input_columns=['input'], output_column='target')

        charm_reason_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[dict(role='HUMAN', prompt=prompt_template.format(_hint=_hint) + _cot_prefix)]),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=512),
        )

        charm_reason_eval_cfg = dict(
            evaluator=dict(type=CharmReasonEvaluator),
            pred_role='BOT',
            pred_postprocessor=dict(type=charm_reason_postprocess),
            dataset_postprocessor=dict(type=charm_reason_postprocess),
        )

        charm_reason_datasets.append(
            dict(
                type=CharmDataset,
                path=dataset_path,
                name=_task,
                abbr='charm-reason-' + _task + '_' + _cot,
                reader_cfg=charm_reason_reader_cfg,
                infer_cfg=charm_reason_infer_cfg.copy(),
                eval_cfg=charm_reason_eval_cfg.copy(),
            )
        )
@@ -1,8 +1,4 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CharmDataset, charm_rea_postprocess, CharmReaEvaluator

charm_tasks = [
    'Chinese_Anachronisms_Judgment',
@@ -21,16 +17,16 @@ charm_tasks = [
    'Global_Time_Understanding',
]

XLT_template = 'Follow the given examples and answer the question.\n{_hint}\n\n I want you to act as an commonsense reasoning expert for Chinese. \n Request: {{input}}\n'
Translate_EN_template = 'Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: '
Other_template = '请按照给定的例子回答问题。\n{_hint}\n\nQ:{{input}}\nA:'

data_dir = 'data/CHARM'
dataset_path_ZH = f'{data_dir}/reasoning'
dataset_path_TransEn = f'{data_dir}/reasoning_Translate-EN'
fewshot_example_path_ZH = os.path.join(os.path.dirname(__file__), 'few-shot-examples')
fewshot_example_path_TransEn = os.path.join(os.path.dirname(__file__), 'few-shot-examples_Translate-EN')

XLT_template = 'Follow the given examples and answer the question.\n{_hint}\n\n I want you to act as an commonsense reasoning expert for Chinese. \n Request: {{input}}\n'
Translate_EN_template = 'Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: '
Other_template = '请按照给定的例子回答问题。\n{_hint}\n\nQ:{{input}}\nA:'

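# each entry: (name, CoT prefix, dataset path, few-shot example dir, prompt template)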
settings = [
    ('Direct', '', dataset_path_ZH, fewshot_example_path_ZH, Other_template),
    ('ZH-CoT', '让我们一步一步来思考。', dataset_path_ZH, fewshot_example_path_ZH, Other_template),
@@ -38,41 +34,3 @@ settings = [
    ('XLT', """You should retell the request in English.\nYou should do the answer step by step to choose the right answer.\nYou should step-by-step answer the request.\nYou should tell me the answer in this format 'So the answer is'.""", dataset_path_ZH, fewshot_example_path_ZH, XLT_template),
    ('Translate-EN', "Let's think step by step.", dataset_path_TransEn, fewshot_example_path_TransEn, Translate_EN_template),
]

charm_rea_datasets = []

for _cot, _cot_prefix, dataset_path, fewshot_example_path, prompt_template in settings:
    for _task in charm_tasks:
        _fewshot_example_file = os.path.join(fewshot_example_path, f'{_task}_{_cot}.txt')
        with open(_fewshot_example_file, 'r') as f:
            _hint = f.read()

        charm_rea_reader_cfg = dict(input_columns=['input'], output_column='target')

        charm_rea_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[dict(role='HUMAN', prompt=prompt_template.format(_hint=_hint) + _cot_prefix)]),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=512),
        )

        charm_rea_eval_cfg = dict(
            evaluator=dict(type=CharmReaEvaluator),
            pred_role='BOT',
            pred_postprocessor=dict(type=charm_rea_postprocess),
            dataset_postprocessor=dict(type=charm_rea_postprocess),
        )

        charm_rea_datasets.append(
            dict(
                type=CharmDataset,
                path=dataset_path,
                name=_task,
                abbr='charm-rea-' + _task + '_' + _cot,
                reader_cfg=charm_rea_reader_cfg,
                infer_cfg=charm_rea_infer_cfg.copy(),
                eval_cfg=charm_rea_eval_cfg.copy(),
            )
        )
configs/datasets/GaokaoBench/README.md (new file, 186 lines)
@@ -0,0 +1,186 @@
# GaokaoBench

## Base Models

| model | GaokaoBench |
|:------------------------:|--------------:|
| llama-7b-turbomind | 14.55 |
| llama-13b-turbomind | 16.20 |
| llama-30b-turbomind | 16.14 |
| llama-65b-turbomind | 13.31 |
| llama-2-7b-turbomind | 15.02 |
| llama-2-13b-turbomind | 14.86 |
| llama-2-70b-turbomind | 16.36 |
| llama-3-8b-turbomind | 20.88 |
| llama-3-70b-turbomind | 19.98 |
| internlm2-1.8b-turbomind | 23.78 |
| internlm2-7b-turbomind | 41.41 |
| internlm2-20b-turbomind | 58.99 |
| qwen-1.8b-turbomind | 22.11 |
| qwen-7b-turbomind | 35.32 |
| qwen-14b-turbomind | 54.07 |
| qwen-72b-turbomind | 77.56 |
| qwen1.5-0.5b-hf | 30.67 |
| qwen1.5-1.8b-hf | 35.66 |
| qwen1.5-4b-hf | 54.31 |
| qwen1.5-7b-hf | 65.99 |
| qwen1.5-14b-hf | 66.60 |
| qwen1.5-32b-hf | 79.01 |
| qwen1.5-72b-hf | 80.26 |
| qwen1.5-moe-a2-7b-hf | 52.79 |
| mistral-7b-v0.1-hf | 14.35 |
| mistral-7b-v0.2-hf | 11.10 |
| mixtral-8x7b-v0.1-hf | 8.40 |
| mixtral-8x22b-v0.1-hf | 16.23 |
| yi-6b-hf | 31.70 |
| yi-34b-hf | 30.51 |
| deepseek-7b-base-hf | 17.02 |
| deepseek-67b-base-hf | 10.14 |

### Details

| model | 2010-2022_Math_II_MCQs | 2010-2022_Math_I_MCQs | 2010-2022_History_MCQs | 2010-2022_Biology_MCQs | 2010-2022_Political_Science_MCQs | 2010-2022_Physics_MCQs | 2010-2022_Chemistry_MCQs |
|:------------------------:|-------------------------:|------------------------:|-------------------------:|-------------------------:|-----------------------------------:|-------------------------:|---------------------------:|
| llama-7b-turbomind | 14.22 | 13.55 | 12.54 | 18.67 | 19.06 | 2.34 | 17.74 |
| llama-13b-turbomind | 18.81 | 15.89 | 21.25 | 22.67 | 15.62 | 1.56 | 25.81 |
| llama-30b-turbomind | 20.64 | 19.16 | 27.18 | 16.67 | 16.56 | 2.34 | 12.10 |
| llama-65b-turbomind | 21.10 | 15.89 | 11.50 | 20.00 | 5.94 | 1.56 | 21.77 |
| llama-2-7b-turbomind | 16.97 | 16.36 | 20.91 | 22.00 | 18.75 | 2.34 | 11.29 |
| llama-2-13b-turbomind | 14.68 | 11.68 | 26.13 | 16.00 | 17.81 | 2.34 | 20.97 |
| llama-2-70b-turbomind | 18.81 | 12.15 | 26.13 | 16.00 | 20.31 | 4.69 | 16.13 |
| llama-3-8b-turbomind | 4.13 | 7.94 | 37.63 | 24.67 | 26.25 | 5.47 | 21.77 |
| llama-3-70b-turbomind | 4.59 | 3.12 | 20.83 | 10.94 | 18.00 | 6.25 | 15.62 |
| internlm2-1.8b-turbomind | 20.64 | 22.90 | 39.72 | 30.00 | 25.94 | 10.94 | 31.45 |
| internlm2-7b-turbomind | 33.94 | 35.51 | 38.33 | 59.33 | 61.56 | 2.34 | 11.29 |
| internlm2-20b-turbomind | 59.17 | 51.40 | 65.16 | 74.00 | 82.19 | 28.91 | 54.03 |
| qwen-1.8b-turbomind | 29.36 | 30.84 | 19.51 | 26.00 | 22.19 | 5.47 | 27.42 |
| qwen-7b-turbomind | 22.48 | 28.04 | 45.64 | 43.33 | 62.19 | 3.91 | 33.87 |
| qwen-14b-turbomind | 54.13 | 56.25 | 82.93 | 72.00 | 85.00 | 4.69 | 65.62 |
| qwen-72b-turbomind | 73.12 | 64.49 | 91.67 | 90.62 | 58.75 | 44.53 | 79.03 |
| qwen1.5-0.5b-hf | 26.61 | 32.71 | 32.40 | 34.67 | 53.44 | 10.94 | 28.23 |
| qwen1.5-1.8b-hf | 36.24 | 33.18 | 56.45 | 36.00 | 49.38 | 6.25 | 33.06 |
| qwen1.5-4b-hf | 45.41 | 37.85 | 68.29 | 62.00 | 87.81 | 5.47 | 47.58 |
| qwen1.5-7b-hf | 56.42 | 53.74 | 85.02 | 69.33 | 86.88 | 28.12 | 70.16 |
| qwen1.5-14b-hf | 69.27 | 63.08 | 54.01 | 79.33 | 76.56 | 40.62 | 79.84 |
| qwen1.5-32b-hf | 71.10 | 61.68 | 92.68 | 93.33 | 95.94 | 45.31 | 83.06 |
| qwen1.5-72b-hf | 71.15 | 68.22 | 94.44 | 96.67 | 95.00 | 38.28 | 75.00 |
| qwen1.5-moe-a2-7b-hf | 35.32 | 29.44 | 68.64 | 44.67 | 75.00 | 17.97 | 59.68 |
| mistral-7b-v0.1-hf | 13.76 | 12.15 | 9.76 | 8.00 | 5.94 | 0.00 | 17.74 |
| mistral-7b-v0.2-hf | 6.88 | 5.61 | 10.45 | 12.00 | 4.06 | 0.78 | 14.52 |
| mixtral-8x7b-v0.1-hf | 3.67 | 1.87 | 0.35 | 0.00 | 0.00 | 0.78 | 0.81 |
| mixtral-8x22b-v0.1-hf | 16.51 | 15.89 | 1.39 | 3.33 | 9.69 | 0.00 | 13.71 |
| yi-6b-hf | 6.25 | 3.12 | 40.74 | 43.75 | 35.94 | 8.59 | 31.25 |
| yi-34b-hf | 12.50 | 4.17 | 31.11 | 5.00 | 20.62 | 2.34 | 0.89 |
| deepseek-7b-base-hf | 14.22 | 13.08 | 25.78 | 20.67 | 20.31 | 5.47 | 18.55 |
| deepseek-67b-base-hf | 3.67 | 4.21 | 8.36 | 7.33 | 4.69 | 1.56 | 4.84 |

| model | 2010-2013_English_MCQs | 2010-2022_Chinese_Modern_Lit | 2010-2022_English_Fill_in_Blanks | 2012-2022_English_Cloze_Test | 2010-2022_Geography_MCQs | 2010-2022_English_Reading_Comp | 2010-2022_Chinese_Lang_and_Usage_MCQs |
|:------------------------:|-------------------------:|-------------------------------:|-----------------------------------:|-------------------------------:|---------------------------:|---------------------------------:|----------------------------------------:|
| llama-7b-turbomind | 19.05 | 0.00 | 15.00 | 16.15 | 22.11 | 10.43 | 15.00 |
| llama-13b-turbomind | 22.86 | 0.00 | 8.50 | 8.46 | 24.21 | 9.36 | 20.00 |
| llama-30b-turbomind | 28.57 | 0.00 | 6.33 | 13.85 | 23.16 | 12.98 | 12.50 |
| llama-65b-turbomind | 21.90 | 0.00 | 8.00 | 13.85 | 16.84 | 12.34 | 10.00 |
| llama-2-7b-turbomind | 20.95 | 0.00 | 6.17 | 12.31 | 22.11 | 11.28 | 11.25 |
| llama-2-13b-turbomind | 16.19 | 0.00 | 9.83 | 13.08 | 22.11 | 7.66 | 10.00 |
| llama-2-70b-turbomind | 31.43 | 0.00 | 4.17 | 13.08 | 25.26 | 20.43 | 7.50 |
| llama-3-8b-turbomind | 1.90 | 1.15 | 42.00 | 7.69 | 29.47 | 17.66 | 17.50 |
| llama-3-70b-turbomind | 18.75 | 3.45 | 53.67 | 76.15 | 18.60 | 36.76 | 8.75 |
| internlm2-1.8b-turbomind | 33.33 | 3.45 | 15.67 | 13.85 | 32.63 | 10.43 | 25.00 |
| internlm2-7b-turbomind | 61.90 | 20.69 | 57.33 | 20.77 | 61.05 | 40.21 | 47.50 |
| internlm2-20b-turbomind | 72.38 | 37.93 | 62.33 | 19.23 | 74.74 | 38.51 | 48.75 |
| qwen-1.8b-turbomind | 47.62 | 9.20 | 13.50 | 12.31 | 25.26 | 16.38 | 21.25 |
| qwen-7b-turbomind | 42.86 | 12.64 | 35.83 | 26.15 | 51.58 | 17.87 | 30.00 |
| qwen-14b-turbomind | 89.58 | 3.45 | 5.00 | 23.85 | 93.02 | 21.10 | 40.62 |
| qwen-72b-turbomind | 71.43 | 81.25 | 88.17 | 96.25 | 95.79 | 79.57 | 90.00 |
| qwen1.5-0.5b-hf | 40.95 | 22.99 | 21.67 | 21.54 | 38.95 | 17.02 | 22.50 |
| qwen1.5-1.8b-hf | 85.71 | 29.89 | 22.17 | 30.00 | 34.74 | 20.43 | 27.50 |
| qwen1.5-4b-hf | 88.57 | 35.63 | 41.00 | 67.69 | 64.21 | 41.28 | 68.75 |
| qwen1.5-7b-hf | 93.33 | 14.94 | 59.33 | 70.00 | 61.05 | 67.87 | 61.25 |
| qwen1.5-14b-hf | 94.29 | 16.09 | 59.67 | 76.92 | 90.53 | 59.57 | 77.50 |
| qwen1.5-32b-hf | 94.29 | 43.68 | 82.83 | 38.46 | 97.89 | 75.96 | 67.50 |
| qwen1.5-72b-hf | 99.05 | 28.74 | 85.62 | 77.69 | 94.74 | 72.77 | 87.50 |
| qwen1.5-moe-a2-7b-hf | 65.71 | 36.78 | 51.67 | 75.38 | 72.63 | 61.28 | 33.75 |
| mistral-7b-v0.1-hf | 17.14 | 8.05 | 28.33 | 6.92 | 24.21 | 30.43 | 12.50 |
| mistral-7b-v0.2-hf | 7.62 | 9.20 | 23.17 | 6.15 | 25.26 | 19.15 | 7.50 |
| mixtral-8x7b-v0.1-hf | 0.00 | 4.60 | 33.83 | 10.77 | 37.89 | 25.96 | 3.75 |
| mixtral-8x22b-v0.1-hf | 7.62 | 4.17 | 51.33 | 14.62 | 53.68 | 21.91 | 10.00 |
| yi-6b-hf | 17.14 | 52.87 | 50.83 | 36.25 | 36.84 | 48.09 | 36.25 |
| yi-34b-hf | 0.00 | 59.77 | 76.67 | 86.92 | 67.44 | 61.06 | 81.25 |
| deepseek-7b-base-hf | 20.95 | 2.30 | 17.83 | 12.31 | 25.26 | 12.55 | 8.75 |
| deepseek-67b-base-hf | 1.90 | 9.20 | 27.33 | 30.00 | 40.00 | 13.19 | 3.75 |

## Chat Models

| model | GaokaoBench |
|:-----------------------------:|--------------:|
| qwen1.5-0.5b-chat-hf | 21.51 |
| qwen1.5-1.8b-chat-hf | 46.19 |
| qwen1.5-4b-chat-hf | 59.11 |
| qwen1.5-7b-chat-hf | 70.55 |
| qwen1.5-14b-chat-hf | 80.39 |
| qwen1.5-32b-chat-hf | 86.15 |
| qwen1.5-72b-chat-hf | 88.58 |
| qwen1.5-110b-chat-hf | 89.59 |
| internlm2-chat-1.8b-hf | 29.73 |
| internlm2-chat-1.8b-sft-hf | 28.79 |
| internlm2-chat-7b-hf | 54.54 |
| internlm2-chat-7b-sft-hf | 55.39 |
| internlm2-chat-20b-hf | 57.95 |
| internlm2-chat-20b-sft-hf | 57.62 |
| llama-3-8b-instruct-hf | 45.48 |
| llama-3-70b-instruct-hf | 65.91 |
| llama-3-8b-instruct-lmdeploy | 44.48 |
| llama-3-70b-instruct-lmdeploy | 67.06 |
| mistral-7b-instruct-v0.1-hf | 26.21 |
| mistral-7b-instruct-v0.2-hf | 32.17 |
| mixtral-8x7b-instruct-v0.1-hf | 42.46 |

### Details

| model | 2010-2022_Math_II_MCQs | 2010-2022_Math_I_MCQs | 2010-2022_History_MCQs | 2010-2022_Biology_MCQs | 2010-2022_Political_Science_MCQs | 2010-2022_Physics_MCQs | 2010-2022_Chemistry_MCQs |
|:-----------------------------:|-------------------------:|------------------------:|-------------------------:|-------------------------:|-----------------------------------:|-------------------------:|---------------------------:|
| qwen1.5-0.5b-chat-hf | 25.23 | 25.70 | 39.02 | 24.67 | 25.00 | 0.78 | 25.00 |
| qwen1.5-1.8b-chat-hf | 30.28 | 26.64 | 61.32 | 55.33 | 77.81 | 11.72 | 40.32 |
| qwen1.5-4b-chat-hf | 38.53 | 35.05 | 70.73 | 70.00 | 83.44 | 25.00 | 41.13 |
| qwen1.5-7b-chat-hf | 49.54 | 39.72 | 81.88 | 82.67 | 90.62 | 46.88 | 61.29 |
| qwen1.5-14b-chat-hf | 64.68 | 54.21 | 87.80 | 90.67 | 94.69 | 44.53 | 69.35 |
| qwen1.5-32b-chat-hf | 70.92 | 66.14 | 98.02 | 97.74 | 96.07 | 57.81 | 72.92 |
| qwen1.5-72b-chat-hf | 76.61 | 68.22 | 95.47 | 96.00 | 97.19 | 64.06 | 86.29 |
| qwen1.5-110b-chat-hf | 80.36 | 66.67 | 100.00 | 100.00 | 96.25 | 65.62 | 75.00 |
| internlm2-chat-1.8b-hf | 28.44 | 28.50 | 46.69 | 39.33 | 44.38 | 10.16 | 26.61 |
| internlm2-chat-1.8b-sft-hf | 23.85 | 20.09 | 55.75 | 40.67 | 53.12 | 14.84 | 30.65 |
| internlm2-chat-7b-hf | 45.87 | 42.52 | 77.70 | 75.33 | 76.56 | 16.41 | 38.71 |
| internlm2-chat-7b-sft-hf | 49.08 | 39.72 | 80.84 | 68.67 | 81.25 | 29.69 | 42.74 |
| internlm2-chat-20b-hf | 53.21 | 46.73 | 80.49 | 74.00 | 85.00 | 31.25 | 37.10 |
| internlm2-chat-20b-sft-hf | 51.83 | 47.20 | 86.06 | 78.00 | 88.12 | 35.16 | 45.16 |
| llama-3-8b-instruct-hf | 37.16 | 31.31 | 60.98 | 48.67 | 51.25 | 11.72 | 39.52 |
| llama-3-70b-instruct-hf | 58.26 | 52.34 | 63.76 | 75.33 | 75.31 | 36.72 | 53.23 |
| llama-3-8b-instruct-lmdeploy | 37.61 | 35.51 | 55.05 | 53.33 | 52.19 | 7.81 | 34.68 |
| llama-3-70b-instruct-lmdeploy | 75.00 | 55.56 | 61.11 | 73.68 | 70.00 | 40.62 | 43.75 |
| mistral-7b-instruct-v0.1-hf | 23.39 | 21.03 | 35.19 | 18.00 | 26.56 | 5.47 | 30.65 |
| mistral-7b-instruct-v0.2-hf | 31.19 | 19.63 | 38.33 | 40.00 | 35.94 | 20.31 | 34.68 |
| mixtral-8x7b-instruct-v0.1-hf | 41.28 | 37.85 | 52.26 | 47.33 | 50.00 | 25.78 | 43.55 |

| model | 2010-2013_English_MCQs | 2010-2022_Chinese_Modern_Lit | 2010-2022_English_Fill_in_Blanks | 2012-2022_English_Cloze_Test | 2010-2022_Geography_MCQs | 2010-2022_English_Reading_Comp | 2010-2022_Chinese_Lang_and_Usage_MCQs |
|:-----------------------------:|-------------------------:|-------------------------------:|-----------------------------------:|-------------------------------:|---------------------------:|---------------------------------:|----------------------------------------:|
| qwen1.5-0.5b-chat-hf | 32.38 | 10.34 | 0.00 | 2.31 | 27.37 | 15.11 | 18.75 |
| qwen1.5-1.8b-chat-hf | 69.52 | 42.53 | 56.33 | 2.31 | 61.05 | 32.98 | 35.00 |
| qwen1.5-4b-chat-hf | 70.48 | 58.62 | 82.33 | 16.15 | 68.42 | 68.51 | 47.50 |
| qwen1.5-7b-chat-hf | 83.81 | 71.26 | 85.17 | 57.69 | 81.05 | 78.94 | 66.25 |
| qwen1.5-14b-chat-hf | 93.33 | 78.16 | 97.17 | 71.54 | 91.58 | 94.26 | 81.25 |
| qwen1.5-32b-chat-hf | 100.00 | 81.61 | 95.83 | 90.00 | 97.89 | 92.43 | 92.86 |
| qwen1.5-72b-chat-hf | 98.10 | 83.91 | 98.00 | 90.77 | 94.74 | 96.38 | 96.25 |
| qwen1.5-110b-chat-hf | 100.00 | 91.95 | 98.50 | 97.69 | 95.35 | 98.44 | 100.00 |
| internlm2-chat-1.8b-hf | 38.10 | 6.90 | 0.67 | 1.54 | 56.84 | 23.19 | 30.00 |
| internlm2-chat-1.8b-sft-hf | 50.48 | 0.00 | 0.00 | 0.00 | 27.37 | 11.91 | 32.50 |
| internlm2-chat-7b-hf | 60.95 | 67.82 | 7.00 | 7.69 | 70.53 | 79.79 | 38.75 |
| internlm2-chat-7b-sft-hf | 60.00 | 71.26 | 6.50 | 0.77 | 68.42 | 77.02 | 42.50 |
| internlm2-chat-20b-hf | 60.95 | 43.68 | 34.83 | 4.62 | 71.58 | 62.55 | 43.75 |
| internlm2-chat-20b-sft-hf | 75.24 | 47.13 | 1.00 | 2.31 | 80.00 | 65.96 | 37.50 |
| llama-3-8b-instruct-hf | 50.48 | 36.78 | 30.83 | 21.54 | 57.89 | 81.70 | 28.75 |
| llama-3-70b-instruct-hf | 73.33 | 59.77 | 82.83 | 24.62 | 73.68 | 91.28 | 45.00 |
| llama-3-8b-instruct-lmdeploy | 52.38 | 42.53 | 21.33 | 18.46 | 58.95 | 81.28 | 26.25 |
| llama-3-70b-instruct-lmdeploy | 87.50 | 62.07 | 84.38 | 26.92 | 72.63 | 91.20 | 56.25 |
| mistral-7b-instruct-v0.1-hf | 38.10 | 18.39 | 30.50 | 6.15 | 31.58 | 38.72 | 18.75 |
| mistral-7b-instruct-v0.2-hf | 41.90 | 31.03 | 28.00 | 20.77 | 29.47 | 42.13 | 15.00 |
| mixtral-8x7b-instruct-v0.1-hf | 49.52 | 39.08 | 41.33 | 9.23 | 44.21 | 43.19 | 21.25 |
@@ -1,114 +0,0 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
|
||||
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
|
||||
from opencompass.utils.text_postprocessors import first_option_postprocess
|
||||
|
||||
single_choice_prompts = {
|
||||
'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
|
||||
'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:',
|
||||
'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:",
|
||||
'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:',
|
||||
}
|
||||
|
||||
cloze_prompts = {
|
||||
'cloze_cn': [
|
||||
dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?'),
|
||||
dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
|
||||
dict(role='HUMAN', prompt='Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?'),
|
||||
dict(role='BOT', prompt='A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n'),
|
||||
dict(role='HUMAN', prompt='Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?'),
|
||||
dict(role='BOT', prompt='A: 黎恩有32块巧克力,Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n'),
|
||||
dict(role='HUMAN', prompt='Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?'),
|
||||
dict(role='BOT', prompt='A: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n'),
|
||||
dict(role='HUMAN', prompt='Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'),
|
||||
dict(role='BOT', prompt='A: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9\n'),
|
||||
dict(role='HUMAN', prompt='Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
|
||||
dict(role='BOT', prompt='A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29\n'),
|
||||
dict(role='HUMAN', prompt='Q: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'),
|
||||
dict(role='BOT', prompt='A: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33\n'),
|
||||
dict(role='HUMAN', prompt='Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'),
|
||||
dict(role='BOT', prompt='A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8\n'),
|
||||
dict(role='HUMAN', prompt='Q: {question}'),
|
||||
dict(role='BOT', prompt='A: {answer}'),
|
||||
],
|
||||
'cloze_en': [
|
||||
dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
|
||||
dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
|
||||
dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
|
||||
dict(role='BOT', prompt='A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n'),
|
||||
dict(role='HUMAN', prompt='Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?'),
|
||||
dict(role='BOT', prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"),
|
||||
dict(role='HUMAN', prompt='Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?'),
|
||||
dict(role='BOT', prompt='A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\n'),
|
||||
dict(role='HUMAN', prompt='Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
|
||||
dict(role='BOT', prompt='A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\n'),
|
||||
dict(role='HUMAN', prompt='Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
|
||||
dict(role='BOT', prompt='A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n'),
|
||||
dict(role='HUMAN', prompt='Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
|
||||
dict(role='BOT', prompt='A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n'),
|
||||
dict(role='HUMAN', prompt='Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
|
||||
dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
|
||||
dict(role='HUMAN', prompt='Q: {question}'),
|
||||
dict(role='BOT', prompt='A: {answer}\n'),
|
||||
]}
|
||||
|
||||
mathbench_sets = {
|
||||
# Practice Part
|
||||
'college': ['single_choice_cn', 'single_choice_en'],
|
||||
'high': ['single_choice_cn', 'single_choice_en'],
|
||||
'middle': ['single_choice_cn', 'single_choice_en'],
|
||||
'primary': ['cloze_cn', 'cloze_en'],
|
||||
'arithmetic': ['cloze_en'],
|
||||
# Theory part
|
||||
'college_knowledge': ['single_choice_cn','single_choice_en'],
|
||||
'high_knowledge': ['single_choice_cn','single_choice_en'],
|
||||
'middle_knowledge': ['single_choice_cn','single_choice_en'],
|
||||
'primary_knowledge': ['single_choice_cn','single_choice_en'],
|
||||
}
|
||||
|
||||
# Generate reasoning path or not, only for single choice
|
||||
with_reasoning = True
|
||||
|
||||
# Use circular evaluation or not
|
||||
with_circular_eval = True
|
||||
|
||||
mathbench_datasets = []
|
||||
|
||||
for _split in list(mathbench_sets.keys()):
|
||||
for _name in mathbench_sets[_split]:
|
||||
mathbench_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=single_choice_prompts[_name + '_with_reasoning'] if with_reasoning else single_choice_prompts[_name],
|
||||
),
|
||||
dict(role='BOT', prompt='{answer}')] if 'choice' in _name else cloze_prompts[_name],
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=2048),
|
||||
)
|
||||
|
||||
mathbench_eval_cfg = dict(
|
||||
evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
|
||||
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
|
||||
|
||||
mathbench_datasets.append(
|
||||
dict(
|
||||
abbr='mathbench-' + _split + '-' + _name,
|
||||
type=MathBenchDataset,
|
||||
path=f'./data/mathbench_v1/{_split}',
|
||||
name=_name,
|
||||
with_circular=with_circular_eval,
|
||||
reader_cfg=dict(
|
||||
input_columns=['question'],
|
||||
output_column='answer'
|
||||
),
|
||||
infer_cfg=mathbench_infer_cfg,
|
||||
eval_cfg=mathbench_eval_cfg,
|
||||
))
|
configs/datasets/MathBench/mathbench_2024_gen_1dc21d.py (new file, 81 lines)
@@ -0,0 +1,81 @@
from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess

with read_base():
    from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets

# Max for this dataset is 4
num_shot = 4
# Generate reasoning path or not, only for single choice
with_reasoning = False
# Use circular evaluation or not
with_circular_eval = True
# Use PPL mode in single choice test or not
use_ppl_single_choice = False

assert 0 <= num_shot <= 4
if num_shot == 0:
    prompts = zero_shot_prompts
else:
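    # keep the last num_shot (HUMAN, BOT) example pairs plus the trailing
    # '{question}'/'{answer}' template pair from each few-shot prompt list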
    prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}

mathbench_datasets = []
for _split in mathbench_sets:
    for _name in mathbench_sets[_split]:
        if 'single_choice' in _name:
            if with_reasoning:
                template_round = prompts[_name + '_with_reasoning']
            else:
                template_round = prompts[_name]
        else:
            template_round = prompts[_name]

        if 'single_choice' in _name:
            pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
        else:
            pred_postprocessor = dict(type=mathbench_postprocess, name=_name)

        if 'single_choice' in _name and with_circular_eval:
            evaluator = dict(type=CircularEvaluator)
        else:
            evaluator = dict(type=AccEvaluator)

        # assemble the final config
        mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
        if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
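            # PPL mode: build one prompt per candidate option; PPLInferencer scores each
            # and the lowest-perplexity option is taken as the prediction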
            template = {}
            for answer in ['A', 'B', 'C', 'D']:
                one_template_round = deepcopy(template_round)
                one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer)
                template[answer] = dict(round=one_template_round)
            mathbench_infer_cfg = dict(
                prompt_template=dict(type=PromptTemplate, template=template),
                retriever=dict(type=ZeroRetriever),
                inferencer=dict(type=PPLInferencer),
            )
        else:
            mathbench_infer_cfg = dict(
                prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
                retriever=dict(type=ZeroRetriever),
                inferencer=dict(type=GenInferencer, max_out_len=2048),
            )
        mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)

        mathbench_datasets.append(
            dict(
                abbr='mathbench-' + _split + '-' + _name,
                type=MathBenchDataset,
                path=f'data/mathbench_v1/{_split}',
                name=_name,
                with_circular=with_circular_eval,
                reader_cfg=mathbench_reader_cfg,
                infer_cfg=mathbench_infer_cfg,
                eval_cfg=mathbench_eval_cfg,
            )
        )
@@ -0,0 +1,81 @@
from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess

with read_base():
    from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets

# Max for this dataset is 4
num_shot = 4
# Generate reasoning path or not, only for single choice
with_reasoning = False
# Use circular evaluation or not
with_circular_eval = False
# Use PPL mode in single choice test or not
use_ppl_single_choice = False

assert 0 <= num_shot <= 4
if num_shot == 0:
    prompts = zero_shot_prompts
else:
    prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}

mathbench_datasets = []
for _split in mathbench_sets:
    for _name in mathbench_sets[_split]:
        if 'single_choice' in _name:
            if with_reasoning:
                template_round = prompts[_name + '_with_reasoning']
            else:
                template_round = prompts[_name]
        else:
            template_round = prompts[_name]

        if 'single_choice' in _name:
            pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
        else:
            pred_postprocessor = dict(type=mathbench_postprocess, name=_name)

        if 'single_choice' in _name and with_circular_eval:
            evaluator = dict(type=CircularEvaluator)
        else:
            evaluator = dict(type=AccEvaluator)

        # assemble the final config
        mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
        if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
            template = {}
            for answer in ['A', 'B', 'C', 'D']:
                one_template_round = deepcopy(template_round)
                one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer)
                template[answer] = dict(round=one_template_round)
            mathbench_infer_cfg = dict(
                prompt_template=dict(type=PromptTemplate, template=template),
                retriever=dict(type=ZeroRetriever),
                inferencer=dict(type=PPLInferencer),
            )
        else:
            mathbench_infer_cfg = dict(
                prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
                retriever=dict(type=ZeroRetriever),
                inferencer=dict(type=GenInferencer, max_out_len=2048),
            )
        mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)

        mathbench_datasets.append(
            dict(
                abbr='mathbench-wocircular-' + _split + '-' + _name,
                type=MathBenchDataset,
                path=f'data/mathbench_v1/{_split}',
                name=_name,
                with_circular=with_circular_eval,
                reader_cfg=mathbench_reader_cfg,
                infer_cfg=mathbench_infer_cfg,
                eval_cfg=mathbench_eval_cfg,
            )
        )
@@ -1,124 +0,0 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
|
||||
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
|
||||
from opencompass.utils.text_postprocessors import first_option_postprocess
|
||||
|
||||
single_choice_prompts = {
|
||||
'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
|
||||
'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:',
|
||||
'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:",
|
||||
'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:',
|
||||
}
|
||||
|
||||
cloze_prompts = {
|
||||
'cloze_cn': [
|
||||
dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?'),
|
||||
dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
|
||||
dict(role='HUMAN', prompt='Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?'),
|
||||
dict(role='BOT', prompt='A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n'),
|
||||
dict(role='HUMAN', prompt='Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?'),
|
||||
dict(role='BOT', prompt='A: 黎恩有32块巧克力,Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n'),
|
||||
dict(role='HUMAN', prompt='Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?'),
|
||||
dict(role='BOT', prompt='A: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n'),
|
||||
dict(role='HUMAN', prompt='Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'),
|
||||
dict(role='BOT', prompt='A: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9\n'),
|
||||
dict(role='HUMAN', prompt='Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
|
||||
dict(role='BOT', prompt='A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29\n'),
|
||||
dict(role='HUMAN', prompt='Q: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'),
|
||||
dict(role='BOT', prompt='A: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33\n'),
|
||||
dict(role='HUMAN', prompt='Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'),
|
||||
dict(role='BOT', prompt='A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8\n'),
|
||||
dict(role='HUMAN', prompt='Q: {question}'),
|
||||
dict(role='BOT', prompt='A: {answer}'),
|
||||
],
|
||||
'cloze_en': [
|
||||
dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
|
||||
dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
|
||||
dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
|
||||
dict(role='BOT', prompt='A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n'),
|
||||
dict(role='HUMAN', prompt='Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?'),
|
||||
dict(role='BOT', prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"),
|
||||
dict(role='HUMAN', prompt='Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?'),
|
||||
dict(role='BOT', prompt='A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\n'),
|
||||
dict(role='HUMAN', prompt='Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
|
||||
dict(role='BOT', prompt='A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\n'),
|
||||
dict(role='HUMAN', prompt='Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
|
||||
dict(role='BOT', prompt='A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n'),
|
||||
dict(role='HUMAN', prompt='Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
|
||||
dict(role='BOT', prompt='A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n'),
|
||||
dict(role='HUMAN', prompt='Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
|
||||
dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
|
||||
dict(role='HUMAN', prompt='Q: {question}'),
|
||||
dict(role='BOT', prompt='A: {answer}\n'),
|
||||
],
|
||||
}
|
||||
|
||||
mathbench_sets = {
|
||||
# Practice Part
|
||||
'college': ['single_choice_cn', 'single_choice_en'],
|
||||
'high': ['single_choice_cn', 'single_choice_en'],
|
||||
'middle': ['single_choice_cn', 'single_choice_en'],
|
||||
'primary': ['cloze_cn', 'cloze_en'],
|
||||
'calculate': ['cloze_en'],
|
||||
# Theory part
|
||||
'college_knowledge': ['single_choice_cn', 'single_choice_en'],
|
||||
'high_knowledge': ['single_choice_cn', 'single_choice_en'],
|
||||
'middle_knowledge': ['single_choice_cn', 'single_choice_en'],
|
||||
'primary_knowledge': ['single_choice_cn', 'single_choice_en'],
|
||||
}
|
||||
|
||||
# Generate reasoning path or not, only for single choice
|
||||
with_reasoning = True
|
||||
|
||||
# Use circular evaluation or not
|
||||
with_circular_eval = False
|
||||
|
||||
mathbench_datasets = []
|
||||
|
||||
for _split in list(mathbench_sets.keys()):
|
||||
for _name in mathbench_sets[_split]:
|
||||
mathbench_reader_cfg = dict(
|
||||
input_columns=['question'],
|
||||
output_column='answer',
|
||||
)
|
||||
|
||||
if 'single_choice' in _name:
|
||||
template_round = [
|
||||
dict(role='HUMAN', prompt=(single_choice_prompts[_name + '_with_reasoning'] if with_reasoning else single_choice_prompts[_name])),
|
||||
dict(role='BOT', prompt='{answer}')
|
||||
]
|
||||
else:
|
||||
template_round = cloze_prompts[_name]
|
||||
mathbench_infer_cfg = dict(
|
||||
prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=512),
|
||||
)
|
||||
|
||||
if 'single_choice' in _name:
|
||||
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
|
||||
else:
|
||||
pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
|
||||
if 'single_choice' in _name and with_circular_eval:
|
||||
evaluator = dict(type=CircularEvaluator)
|
||||
else:
|
||||
evaluator = dict(type=AccEvaluator)
|
||||
mathbench_eval_cfg = dict(
|
||||
evaluator=evaluator,
|
||||
pred_postprocessor=pred_postprocessor,
|
||||
)
|
||||
|
||||
mathbench_datasets.append(
|
||||
dict(
|
||||
abbr='mathbench-' + _split + '-' + _name,
|
||||
type=MathBenchDataset,
|
||||
path=f'./data/mathbench_v1_ori/{_split}',
|
||||
name=_name,
|
||||
with_circular=with_circular_eval,
|
||||
reader_cfg=mathbench_reader_cfg,
|
||||
infer_cfg=mathbench_infer_cfg,
|
||||
eval_cfg=mathbench_eval_cfg,
|
||||
)
|
||||
)
|
@@ -0,0 +1,81 @@
from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess

with read_base():
    from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets

# Max for this dataset is 4
num_shot = 4
# Generate reasoning path or not, only for single choice
with_reasoning = False
# Use circular evaluation or not
with_circular_eval = False
# Use PPL mode in single choice test or not
use_ppl_single_choice = True

assert 0 <= num_shot <= 4
if num_shot == 0:
    prompts = zero_shot_prompts
else:
    prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}

mathbench_datasets = []
for _split in mathbench_sets:
    for _name in mathbench_sets[_split]:
        if 'single_choice' in _name:
            if with_reasoning:
                template_round = prompts[_name + '_with_reasoning']
            else:
                template_round = prompts[_name]
        else:
            template_round = prompts[_name]

        if 'single_choice' in _name:
            pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
        else:
            pred_postprocessor = dict(type=mathbench_postprocess, name=_name)

        if 'single_choice' in _name and with_circular_eval:
            evaluator = dict(type=CircularEvaluator)
        else:
            evaluator = dict(type=AccEvaluator)

        # assemble the final config
        mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
        if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
            template = {}
            for answer in ['A', 'B', 'C', 'D']:
                one_template_round = deepcopy(template_round)
                one_template_round[-1]['prompt'] = one_template_round[-1]['prompt'].format(answer=answer)
                template[answer] = dict(round=one_template_round)
            mathbench_infer_cfg = dict(
                prompt_template=dict(type=PromptTemplate, template=template),
                retriever=dict(type=ZeroRetriever),
                inferencer=dict(type=PPLInferencer),
            )
        else:
            mathbench_infer_cfg = dict(
                prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
                retriever=dict(type=ZeroRetriever),
                inferencer=dict(type=GenInferencer, max_out_len=2048),
            )
        mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)

        mathbench_datasets.append(
            dict(
                abbr='mathbench-wocircular-' + _split + '-' + _name,
                type=MathBenchDataset,
                path=f'data/mathbench_v1/{_split}',
                name=_name,
                with_circular=with_circular_eval,
                reader_cfg=mathbench_reader_cfg,
                infer_cfg=mathbench_infer_cfg,
                eval_cfg=mathbench_eval_cfg,
            )
        )
@@ -1,114 +0,0 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
|
||||
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
|
||||
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
|
||||
from opencompass.utils.text_postprocessors import first_option_postprocess
|
||||
|
||||
single_choice_prompts = {
|
||||
'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
|
||||
'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:\n',
|
||||
'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:\n",
|
||||
'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:\n',
|
||||
}
|
||||
|
||||
cloze_prompts = {
|
||||
'cloze_cn': [
|
||||
'Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?\nA: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6',
|
||||
'Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?\nA: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5',
|
||||
'Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?\nA: 黎恩有32块巧克力,Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39',
|
||||
'Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?\nA: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8',
|
||||
'Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?\nA: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9',
|
||||
'Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?\nA: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29',
|
||||
'Q: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?\nA: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33',
|
||||
'Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?\nA: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8',
|
||||
'Q: {question}\nA: {answer}',
|
||||
],
|
||||
'cloze_en': [
|
||||
'Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\nA: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.',
|
||||
'Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\nA: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.',
|
||||
"Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\nA: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.",
|
||||
'Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?\nA: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.',
|
||||
        'Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?\nA: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.',
        'Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?\nA: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.',
        'Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\nA: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.',
        'Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\nA: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.',
        'Q: {question}\nA: {answer}',
    ],
}

mathbench_sets = {
    # Practice Part
    'college': ['single_choice_cn', 'single_choice_en'],
    'high': ['single_choice_cn', 'single_choice_en'],
    'middle': ['single_choice_cn', 'single_choice_en'],
    'primary': ['cloze_cn', 'cloze_en'],
    'calculate': ['cloze_en'],
    # Theory part
    'college_knowledge': ['single_choice_cn', 'single_choice_en'],
    'high_knowledge': ['single_choice_cn', 'single_choice_en'],
    'middle_knowledge': ['single_choice_cn', 'single_choice_en'],
    'primary_knowledge': ['single_choice_cn', 'single_choice_en'],
}

# Generate reasoning path or not, only for single choice
with_reasoning = False

# Use circular evaluation or not
with_circular_eval = False

mathbench_datasets = []

for _split in list(mathbench_sets.keys()):
    for _name in mathbench_sets[_split]:
        mathbench_reader_cfg = dict(
            input_columns=['question'],
            output_column='answer',
        )

        if 'single_choice' in _name:
            if with_reasoning:
                mathbench_infer_cfg = dict(
                    prompt_template=dict(type=PromptTemplate, template=single_choice_prompts[_name + '_with_reasoning']),
                    retriever=dict(type=ZeroRetriever),
                    inferencer=dict(type=GenInferencer, max_out_len=512),
                )
            else:
                mathbench_infer_cfg = dict(
                    prompt_template=dict(type=PromptTemplate, template={answer: f'{single_choice_prompts[_name]}{answer}' for answer in ['A', 'B', 'C', 'D']}),
                    retriever=dict(type=ZeroRetriever),
                    inferencer=dict(type=PPLInferencer),
                )
        else:
            mathbench_infer_cfg = dict(
                prompt_template=dict(type=PromptTemplate, template='\n'.join(cloze_prompts[_name])),
                retriever=dict(type=ZeroRetriever),
                inferencer=dict(type=GenInferencer, max_out_len=512),
            )

        if 'single_choice' in _name:
            pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
        else:
            pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
        if 'single_choice' in _name and with_circular_eval:
            evaluator = dict(type=CircularEvaluator)
        else:
            evaluator = dict(type=AccEvaluator)
        mathbench_eval_cfg = dict(
            evaluator=evaluator,
            pred_postprocessor=pred_postprocessor,
        )

        mathbench_datasets.append(
            dict(
                abbr='mathbench-' + _split + '-' + _name,
                type=MathBenchDataset,
                path=f'./data/mathbench_v1_ori/{_split}',
                name=_name,
                with_circular=with_circular_eval,
                reader_cfg=mathbench_reader_cfg,
                infer_cfg=mathbench_infer_cfg,
                eval_cfg=mathbench_eval_cfg,
            )
        )
@ -1,4 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .mathbench_2024_gen_19e486 import mathbench_datasets # noqa: F401, F403
    from .mathbench_2024_gen_1dc21d import mathbench_datasets # noqa: F401, F403
103
configs/datasets/MathBench/mathbench_prompt.py
Normal file
103
configs/datasets/MathBench/mathbench_prompt.py
Normal file
@ -0,0 +1,103 @@
zero_shot_prompts = {
    'single_choice_cn_with_reasoning': [
        dict(role='HUMAN', prompt='问题: 以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:'),
    ],
    'single_choice_cn': [
        dict(role='HUMAN', prompt='问题: 以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n所以答案是:'),
    ],
    'single_choice_en_with_reasoning': [
        dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with "Therefore, the correct answer is option X", Where "X" is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet\'s think step by step:'),
    ],
    'single_choice_en': [
        dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nThe answer is:'),
    ],
}

few_shot_prompts = {
    'single_choice_cn': [
        dict(role='HUMAN', prompt='问题: 已知i是虚数单位,z为复数,$2+\\frac{1}{i}=z(3+i)$,则在复平面内z对应的点位于____?\nA. 第一象限\nB. 第二象限\nC. 第三象限\nD. 第四象限'),
        dict(role='BOT', prompt='回答: D'),
        dict(role='HUMAN', prompt='问题: 将函数$y=\\tan(\\omega x-1)(\\omega>0)$的图像向左平移2个单位长度后,与函数$y=\\tan(\\omega x+3)$的图象重合,则$\\omega$的最小值等于____\nA. $2-\\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
        dict(role='BOT', prompt='回答: D'),
        dict(role='HUMAN', prompt='问题: 设$(1+2i)a+b=2i$,其中a,b为实数,则( )\nA. $a=1,b=-1$\nB. $a=1,b=1$\nC. $a=-1,b=1$\nD. $a=-1,b=-1$'),
        dict(role='BOT', prompt='回答: A'),
        dict(role='HUMAN', prompt='问题: 圆锥的底面半径为2,高为4.一个圆柱的下底面在圆锥的底面上,上底面的圆周在圆锥的侧面上,当圆柱侧面积为$4 \\pi$时,该圆柱的体积为____\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'),
        dict(role='BOT', prompt='回答: B'),
        dict(role='HUMAN', prompt='问题: {question}'),
        dict(role='BOT', prompt='回答: {answer}'),
    ],
    'single_choice_cn_with_reasoning': [
        dict(role='HUMAN', prompt='问题: 已知i是虚数单位,z为复数,$2+\\frac{1}{i}=z(3+i)$,则在复平面内z对应的点位于____\nA. 第一象限\nB. 第二象限\nC. 第三象限\nD. 第四象限'),
        dict(role='BOT', prompt='回答: 因为首先,我们将等式两边同时乘以$(3 + i)$的共轭复数$(3 - i)$,以便消去分母中的虚数部分:$z = \\frac{2 - i}{3 + i} \\cdot \\frac{3 - i}{3 - i}$,这样做的目的是利用复数乘法的性质,从而消去分母中的虚数部分。我们进行乘法运算:$z = \\frac{(2 - i)(3 - i)}{(3 + i)(3 - i)}$,$z = \\frac{6 - 2i - 3i + i^2}{9 - 3i + 3i - i^2}$,由于$i^2 = -1$,我们可以进一步简化:$z = \\frac{6 - 5i - 1}{9 + 1}$ $z = \\frac{1}{2} - \\frac{1}{2}i$,在复平面上,这个点位于第四象限,因为它的实部是正的($\\frac{1}{2}$),而虚部是负的($-\\frac{1}{2}$)\n答案选项:D'),
        dict(role='HUMAN', prompt='问题: 将函数$y=\\tan(\\omega x-1)(\\omega>0)$的图像向左平移2个单位长度后,与函数$y=\\tan(\\omega x+3)$的图象重合,则$\\omega$的最小值等于____\nA. $2-\\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
        dict(role='BOT', prompt='回答: 将函数$y=\\tan(\\omega x-1)$(其中$\\omega>0$)的图像向左平移2个单位长度后,得到的新函数为$y=\\tan(\\omega(x+2)-1)$。为了使这个新函数的图像与$y=\\tan(\\omega x+3)$的图像重合,我们需要满足以下条件:\n$\\tan(\\omega(x+2)-1) = \\tan(\\omega x+3)$\n由于正切函数的周期性,我们知道$\\tan(\\theta) = \\tan(\\theta + k\\pi)$,其中$k$是任意整数。因此,我们可以写出:\n$\\omega(x+2)-1 = \\omega x+3 + k\\pi$\n解这个方程,我们得到:$\\omega x + 2\\omega - 1 = \\omega x + 3 + k\\pi$\n$2\\omega - 1 = 3 + k\\pi$\n$\\omega = 2 + \\frac{k\\pi}{2}$\n由于$\\omega>0$,我们需要找到满足条件的最小正整数$k$。当$k=0$时,我们得到$\\omega=2$,这是满足条件的最小正值。因此,最小值等于2。答案选项:D'),
        dict(role='HUMAN', prompt='问题: 设$(1+2i)a+b=2i$,其中a,b为实数,则( )\nA. $a=1,b=-1$\nB. $a=1,b=1$\nC. $a=-1,b=1$\nD. $a=-1,b=-1$'),
        dict(role='BOT', prompt='回答: 我们可以将这个方程写成:$a + 2ai + b = 2i$,然后将实部和虚部分开:\n$a + b = 0$\n$2a = 2$\n由于$a + b = 0$,我们可以得到$a = -b$。将这个值代入$2a = 2$,我们得到$2(-b) = 2$,因此$b = -1$。因此,$a = 1$。答案选项:A'),
        dict(role='HUMAN', prompt='问题: 一直角三角形的两边长分别为3和4.则第三边的长为____\nA. 5\nB. $\\sqrt 7$\nC. $\\sqrt 5$\nD. 5或者$\\sqrt 7$'),
        dict(role='BOT', prompt='回答: 我们可以使用勾股定理来解决这个问题。勾股定理告诉我们,如果一个三角形的两条边长分别为$a$和$b$,那么第三条边的长度为$c = \\sqrt{a^2 + b^2}$。在这个问题中,$a = 3$,$b = 4$,因此$c = \\sqrt{3^2 + 4^2} = \\sqrt{9 + 16} = \\sqrt{25} = 5$。答案选项:A'),
        dict(role='HUMAN', prompt='问题: {question}'),
        dict(role='BOT', prompt='回答: {answer}'),
    ],
    'single_choice_en': [
        dict(role='HUMAN', prompt='Question: Given that $i$ is the imaginary unit and $z$ is a complex number, $2 + \\frac{1}{i} = z(3 + i)$, the point corresponding to $z$ in the complex plane is located in the ______\nA. First quadrant\nB. Second quadrant\nC. Third quadrant\nD. Fourth quadrant'),
        dict(role='BOT', prompt='Response: D'),
        dict(role='HUMAN', prompt='Question: The graph of the function $y = \\tan(\\omega x - 1)$ is shifted 2 units to the left and coincides with the graph of the function $y = \\tan(\\omega x + 3)$. The minimum value of $\\omega$ is ______\nA. $2 - \\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
        dict(role='BOT', prompt='Response: D'),
        dict(role='HUMAN', prompt='Question: If $(1 + 2i)a + b = 2i$, where $a$ and $b$ are real numbers, then ______\nA. $a = 1, b = -1$\nB. $a = 1, b = 1$\nC. $a = -1, b = 1$\nD. $a = -1, b = -1$'),
        dict(role='BOT', prompt='Response: A'),
        dict(role='HUMAN', prompt='Question: The radius of a cone is 2 and its height is 4. A cylinder has its lower base on the base of the cone and its upper base on the lateral surface of the cone. When the lateral surface area of the cylinder is $4\\pi$, the volume of the cylinder is ______\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'),
        dict(role='BOT', prompt='Response: B'),
        dict(role='HUMAN', prompt='Question: {question}'),
        dict(role='BOT', prompt='Response: {answer}'),
    ],
    'single_choice_en_with_reasoning': [
        dict(role='HUMAN', prompt='Question: Given that $i$ is the imaginary unit and $z$ is a complex number, $2 + \\frac{1}{i} = z(3 + i)$, the point corresponding to $z$ in the complex plane is located in the ______\nA. First quadrant\nB. Second quadrant\nC. Third quadrant\nD. Fourth quadrant'),
        dict(role='BOT', prompt='Response: First, we multiply both sides of the equation by the conjugate of $(3 + i)$: $z = \\frac{2 - i}{3 + i} \\cdot \\frac{3 - i}{3 - i}$. We perform the multiplication: $z = \\frac{(2 - i)(3 - i)}{(3 + i)(3 - i)}$, $z = \\frac{6 - 2i - 3i + i^2}{9 - 3i + 3i - i^2}$. Since $i^2 = -1$, we can simplify this further: $z = \\frac{6 - 5i - 1}{9 + 1}$ $z = \\frac{1}{2} - \\frac{1}{2}i$. In the complex plane, this point is located in the fourth quadrant, because its real part is positive ($\\frac{1}{2}$) and its imaginary part is negative ($-\\frac{1}{2}$)\nAnswer option: D'),
        dict(role='HUMAN', prompt='Question: The graph of the function $y = \\tan(\\omega x - 1)$ is shifted 2 units to the left and coincides with the graph of the function $y = \\tan(\\omega x + 3)$. The minimum value of $\\omega$ is ______\nA. $2 - \\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
        dict(role='BOT', prompt='Response: In order for the graph of this new function to coincide with the graph of $y = \\tan(\\omega x + 3)$, we need to satisfy the following condition: $\\tan(\\omega(x + 2) - 1) = \\tan(\\omega x + 3)$. Therefore, we can write: $\\omega(x + 2) - 1 = \\omega x + 3 + k\\pi$. Solving this equation, we get: $\\omega x + 2\\omega - 1 = \\omega x + 3 + k\\pi$. $2\\omega - 1 = 3 + k\\pi$. $\\omega = 2 + \\frac{k\\pi}{2}$. Since $\\omega > 0$, we need to find the smallest positive integer $k$ that satisfies the condition. When $k = 0$, we get $\\omega = 2$, which is the smallest positive value that satisfies the condition. Therefore, the minimum value is 2. Answer option: D'),
        dict(role='HUMAN', prompt='Question: If $(1 + 2i)a + b = 2i$, where $a$ and $b$ are real numbers, then ______\nA. $a = 1, b = -1$\nB. $a = 1, b = 1$\nC. $a = -1, b = 1$\nD. $a = -1, b = -1$'),
        dict(role='BOT', prompt='Response: We can write this equation as: $a + 2ai + b = 2i$, and then separate the real and imaginary parts: $a + b = 0$. $2a = 2$. Since $a + b = 0$, we can get $a = -b$. Substituting this value into $2a = 2$, we get $2(-b) = 2$, so $b = -1$. Therefore, $a = 1$. Answer option: A'),
        dict(role='HUMAN', prompt='Question: The radius of a cone is 2 and its height is 4. A cylinder has its lower base on the base of the cone and its upper base on the lateral surface of the cone. When the lateral surface area of the cylinder is $4\\pi$, the volume of the cylinder is ______\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'),
        dict(role='BOT', prompt='Response: We can use the Pythagorean theorem to solve this problem. The Pythagorean theorem tells us that if the two sides of a triangle are $a$ and $b$, then the length of the third side is $c = \\sqrt{a^2 + b^2}$. In this problem, $a = 3$ and $b = 4$, so $c = \\sqrt{3^2 + 4^2} = \\sqrt{9 + 16} = \\sqrt{25} = 5$. Answer option: A'),
        dict(role='HUMAN', prompt='Question: {question}'),
        dict(role='BOT', prompt='Response: {answer}'),
    ],
    'cloze_cn': [
        dict(role='HUMAN', prompt='问题: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'),
        dict(role='BOT', prompt='回答: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9'),
        dict(role='HUMAN', prompt='问题: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
        dict(role='BOT', prompt='回答: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29'),
        dict(role='HUMAN', prompt='问题: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'),
        dict(role='BOT', prompt='回答: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33'),
        dict(role='HUMAN', prompt='问题: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'),
        dict(role='BOT', prompt='回答: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8'),
        dict(role='HUMAN', prompt='问题: {question}'),
        dict(role='BOT', prompt='回答: {answer}'),
    ],
    'cloze_en': [
        dict(role='HUMAN', prompt='Question: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
        dict(role='BOT', prompt='Response: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.'),
        dict(role='HUMAN', prompt='Question: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
        dict(role='BOT', prompt='Response: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.'),
        dict(role='HUMAN', prompt='Question: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
        dict(role='BOT', prompt='Response: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.'),
        dict(role='HUMAN', prompt='Question: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
        dict(role='BOT', prompt='Response: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.'),
        dict(role='HUMAN', prompt='Question: {question}'),
        dict(role='BOT', prompt='Response: {answer}'),
    ],
}

mathbench_sets = {
    # Practice Part
    'college': ['single_choice_cn', 'single_choice_en'],
    'high': ['single_choice_cn', 'single_choice_en'],
    'middle': ['single_choice_cn', 'single_choice_en'],
    'primary': ['cloze_cn', 'cloze_en'],
    'arithmetic': ['cloze_en'],
    # Theory part
    'college_knowledge': ['single_choice_cn', 'single_choice_en'],
    'high_knowledge': ['single_choice_cn', 'single_choice_en'],
    'middle_knowledge': ['single_choice_cn', 'single_choice_en'],
    'primary_knowledge': ['single_choice_cn', 'single_choice_en'],
}
64
configs/datasets/TheoremQA/README.md
Normal file
64
configs/datasets/TheoremQA/README.md
Normal file
@ -0,0 +1,64 @@
# TheoremQA

## Base Models

| model | TheoremQA |
|:------------------------:|------------:|
| llama-7b-turbomind | 10.25 |
| llama-13b-turbomind | 11.25 |
| llama-30b-turbomind | 14.25 |
| llama-65b-turbomind | 15.62 |
| llama-2-7b-turbomind | 12.62 |
| llama-2-13b-turbomind | 11.88 |
| llama-2-70b-turbomind | 15.62 |
| llama-3-8b-turbomind | 20.25 |
| llama-3-70b-turbomind | 33.62 |
| internlm2-1.8b-turbomind | 10.50 |
| internlm2-7b-turbomind | 21.88 |
| internlm2-20b-turbomind | 26.00 |
| qwen-1.8b-turbomind | 9.38 |
| qwen-7b-turbomind | 15.00 |
| qwen-14b-turbomind | 21.62 |
| qwen-72b-turbomind | 27.12 |
| qwen1.5-0.5b-hf | 5.88 |
| qwen1.5-1.8b-hf | 12.00 |
| qwen1.5-4b-hf | 13.75 |
| qwen1.5-7b-hf | 4.25 |
| qwen1.5-14b-hf | 12.62 |
| qwen1.5-32b-hf | 26.62 |
| qwen1.5-72b-hf | 26.62 |
| qwen1.5-moe-a2-7b-hf | 7.50 |
| mistral-7b-v0.1-hf | 17.00 |
| mistral-7b-v0.2-hf | 16.25 |
| mixtral-8x7b-v0.1-hf | 24.12 |
| mixtral-8x22b-v0.1-hf | 36.75 |
| yi-6b-hf | 13.88 |
| yi-34b-hf | 24.75 |
| deepseek-7b-base-hf | 12.38 |
| deepseek-67b-base-hf | 21.25 |

## Chat Models

| model | TheoremQA |
|:-----------------------------:|------------:|
| qwen1.5-0.5b-chat-hf | 9.00 |
| qwen1.5-1.8b-chat-hf | 9.25 |
| qwen1.5-4b-chat-hf | 13.88 |
| qwen1.5-7b-chat-hf | 12.25 |
| qwen1.5-14b-chat-hf | 13.63 |
| qwen1.5-32b-chat-hf | 19.25 |
| qwen1.5-72b-chat-hf | 22.75 |
| qwen1.5-110b-chat-hf | 17.50 |
| internlm2-chat-1.8b-hf | 13.63 |
| internlm2-chat-1.8b-sft-hf | 12.88 |
| internlm2-chat-7b-hf | 18.50 |
| internlm2-chat-7b-sft-hf | 18.75 |
| internlm2-chat-20b-hf | 23.00 |
| internlm2-chat-20b-sft-hf | 25.12 |
| llama-3-8b-instruct-hf | 19.38 |
| llama-3-70b-instruct-hf | 36.25 |
| llama-3-8b-instruct-lmdeploy | 19.62 |
| llama-3-70b-instruct-lmdeploy | 34.50 |
| mistral-7b-instruct-v0.1-hf | 12.62 |
| mistral-7b-instruct-v0.2-hf | 11.38 |
| mixtral-8x7b-instruct-v0.1-hf | 26.00 |
245
configs/datasets/bbh/README.md
Normal file
245
configs/datasets/bbh/README.md
Normal file
@ -0,0 +1,245 @@
# BBH

## Base Models

| model | bbh |
|:------------------------:|------:|
| llama-7b-turbomind | 33.34 |
| llama-13b-turbomind | 37.99 |
| llama-30b-turbomind | 49.86 |
| llama-65b-turbomind | 58.26 |
| llama-2-7b-turbomind | 38.27 |
| llama-2-13b-turbomind | 45.68 |
| llama-2-70b-turbomind | 64.78 |
| llama-3-8b-turbomind | 59.69 |
| llama-3-70b-turbomind | 79.16 |
| internlm2-1.8b-turbomind | 36.03 |
| internlm2-7b-turbomind | 63.56 |
| internlm2-20b-turbomind | 71.29 |
| qwen-1.8b-turbomind | 22.53 |
| qwen-7b-turbomind | 45.89 |
| qwen-14b-turbomind | 56.75 |
| qwen-72b-turbomind | 63.35 |
| qwen1.5-0.5b-hf | 20.54 |
| qwen1.5-1.8b-hf | 27.01 |
| qwen1.5-4b-hf | 34.81 |
| qwen1.5-7b-hf | 39.87 |
| qwen1.5-14b-hf | 50.38 |
| qwen1.5-32b-hf | 67.47 |
| qwen1.5-72b-hf | 58.81 |
| qwen1.5-moe-a2-7b-hf | 39.46 |
| mistral-7b-v0.1-hf | 56.71 |
| mistral-7b-v0.2-hf | 57.32 |
| mixtral-8x7b-v0.1-hf | 68.46 |
| mixtral-8x22b-v0.1-hf | 79.48 |
| yi-6b-hf | 44.82 |
| yi-34b-hf | 66.37 |
| deepseek-7b-base-hf | 42.88 |
| deepseek-67b-base-hf | 71.86 |

### Details
| model | temporal_sequences | disambiguation_qa | date_understanding | tracking_shuffled_objects_three_objects | penguins_in_a_table | geometric_shapes | snarks | ruin_names | tracking_shuffled_objects_seven_objects |
|
||||
|:------------------------:|---------------------:|--------------------:|---------------------:|------------------------------------------:|----------------------:|-------------------:|---------:|-------------:|------------------------------------------:|
|
||||
| llama-7b-turbomind | 23.60 | 46.00 | 44.80 | 36.40 | 30.14 | 0.00 | 46.07 | 21.60 | 15.20 |
|
||||
| llama-13b-turbomind | 16.80 | 50.00 | 56.80 | 36.40 | 43.15 | 0.00 | 60.67 | 29.20 | 15.20 |
|
||||
| llama-30b-turbomind | 33.60 | 60.00 | 76.40 | 29.20 | 57.53 | 0.00 | 59.55 | 62.40 | 17.20 |
|
||||
| llama-65b-turbomind | 84.00 | 76.00 | 84.40 | 50.00 | 65.75 | 0.00 | 62.92 | 69.60 | 31.60 |
|
||||
| llama-2-7b-turbomind | 12.00 | 46.80 | 60.00 | 34.00 | 32.19 | 0.00 | 49.44 | 32.80 | 18.40 |
|
||||
| llama-2-13b-turbomind | 24.00 | 40.80 | 73.20 | 36.00 | 45.89 | 0.00 | 55.06 | 37.60 | 22.40 |
|
||||
| llama-2-70b-turbomind | 75.60 | 66.80 | 88.80 | 73.60 | 69.86 | 0.00 | 73.60 | 60.80 | 57.60 |
|
||||
| llama-3-8b-turbomind | 65.60 | 42.00 | 78.80 | 56.80 | 69.86 | 0.00 | 56.18 | 66.00 | 30.80 |
|
||||
| llama-3-70b-turbomind | 100.00 | 82.80 | 91.60 | 100.00 | 86.30 | 0.00 | 81.46 | 77.20 | 94.40 |
|
||||
| internlm2-1.8b-turbomind | 31.20 | 44.00 | 60.00 | 36.00 | 35.62 | 0.00 | 44.94 | 27.20 | 12.80 |
|
||||
| internlm2-7b-turbomind | 94.80 | 75.60 | 86.40 | 53.60 | 69.18 | 0.00 | 59.55 | 68.00 | 46.00 |
|
||||
| internlm2-20b-turbomind | 98.40 | 83.60 | 84.00 | 72.00 | 71.92 | 0.00 | 81.46 | 78.40 | 74.40 |
|
||||
| qwen-1.8b-turbomind | 26.40 | 39.60 | 33.20 | 28.40 | 28.08 | 0.00 | 44.94 | 21.60 | 12.40 |
|
||||
| qwen-7b-turbomind | 38.80 | 42.80 | 64.40 | 30.80 | 45.89 | 0.00 | 55.62 | 44.00 | 14.40 |
|
||||
| qwen-14b-turbomind | 57.60 | 59.20 | 67.20 | 46.40 | 67.12 | 0.00 | 51.12 | 63.60 | 30.40 |
|
||||
| qwen-72b-turbomind | 72.00 | 66.80 | 77.60 | 81.20 | 84.93 | 0.00 | 78.09 | 67.20 | 63.60 |
|
||||
| qwen1.5-0.5b-hf | 15.20 | 37.20 | 20.40 | 30.40 | 18.49 | 8.40 | 44.94 | 11.20 | 14.00 |
|
||||
| qwen1.5-1.8b-hf | 27.60 | 40.80 | 36.00 | 24.40 | 32.19 | 0.00 | 50.56 | 20.80 | 11.20 |
|
||||
| qwen1.5-4b-hf | 10.40 | 44.40 | 47.20 | 36.80 | 44.52 | 24.80 | 46.63 | 20.80 | 14.80 |
|
||||
| qwen1.5-7b-hf | 37.20 | 42.40 | 52.00 | 52.40 | 56.85 | 6.80 | 48.31 | 23.60 | 18.40 |
|
||||
| qwen1.5-14b-hf | 38.80 | 62.80 | 73.60 | 24.80 | 69.86 | 26.80 | 66.29 | 52.80 | 2.00 |
|
||||
| qwen1.5-32b-hf | 93.60 | 77.20 | 68.40 | 70.00 | 82.88 | 36.80 | 47.75 | 70.40 | 71.20 |
|
||||
| qwen1.5-72b-hf | 75.60 | 66.00 | 78.80 | 72.80 | 80.82 | 0.00 | 75.84 | 64.80 | 44.40 |
|
||||
| qwen1.5-moe-a2-7b-hf | 23.20 | 59.60 | 43.20 | 27.60 | 46.58 | 25.20 | 48.88 | 16.80 | 13.20 |
|
||||
| mistral-7b-v0.1-hf | 73.60 | 53.60 | 76.40 | 45.20 | 56.85 | 28.00 | 64.04 | 66.00 | 21.60 |
|
||||
| mistral-7b-v0.2-hf | 76.80 | 42.00 | 73.20 | 47.20 | 60.27 | 26.00 | 66.85 | 60.80 | 26.40 |
|
||||
| mixtral-8x7b-v0.1-hf | 89.60 | 70.80 | 84.80 | 81.20 | 70.55 | 25.60 | 66.29 | 71.20 | 58.80 |
|
||||
| mixtral-8x22b-v0.1-hf | 98.80 | 77.60 | 92.00 | 98.80 | 83.56 | 35.60 | 80.34 | 79.20 | 82.00 |
|
||||
| yi-6b-hf | 32.80 | 46.40 | 64.40 | 34.40 | 47.26 | 28.80 | 60.11 | 45.60 | 14.00 |
|
||||
| yi-34b-hf | 86.00 | 76.00 | 84.80 | 54.80 | 67.81 | 24.80 | 73.60 | 66.00 | 65.60 |
|
||||
| deepseek-7b-base-hf | 27.60 | 42.00 | 64.40 | 31.20 | 40.41 | 33.60 | 52.25 | 46.00 | 13.20 |
|
||||
| deepseek-67b-base-hf | 95.60 | 75.60 | 86.40 | 86.40 | 76.71 | 39.20 | 76.40 | 77.20 | 82.00 |
|
||||
|
||||
| model | tracking_shuffled_objects_five_objects | logical_deduction_three_objects | hyperbaton | logical_deduction_five_objects | logical_deduction_seven_objects | movie_recommendation | salient_translation_error_detection | reasoning_about_colored_objects | multistep_arithmetic_two |
|
||||
|:------------------------:|-----------------------------------------:|----------------------------------:|-------------:|---------------------------------:|----------------------------------:|-----------------------:|--------------------------------------:|----------------------------------:|---------------------------:|
|
||||
| llama-7b-turbomind | 18.40 | 42.80 | 58.00 | 23.20 | 13.20 | 40.00 | 16.40 | 30.40 | 0.00 |
|
||||
| llama-13b-turbomind | 16.00 | 48.80 | 53.60 | 30.40 | 16.40 | 61.60 | 11.20 | 44.80 | 0.80 |
|
||||
| llama-30b-turbomind | 22.40 | 66.40 | 73.20 | 43.60 | 31.60 | 84.40 | 43.60 | 57.60 | 2.80 |
|
||||
| llama-65b-turbomind | 41.60 | 79.20 | 74.40 | 48.40 | 39.20 | 91.20 | 40.40 | 67.20 | 20.00 |
|
||||
| llama-2-7b-turbomind | 17.20 | 54.80 | 51.60 | 32.80 | 23.60 | 74.40 | 19.60 | 45.60 | 1.20 |
|
||||
| llama-2-13b-turbomind | 23.20 | 63.60 | 52.40 | 46.00 | 42.00 | 68.00 | 21.60 | 62.00 | 2.00 |
|
||||
| llama-2-70b-turbomind | 72.40 | 86.40 | 84.40 | 55.20 | 43.20 | 95.60 | 50.80 | 76.80 | 20.80 |
|
||||
| llama-3-8b-turbomind | 40.80 | 76.40 | 93.20 | 45.20 | 36.80 | 88.80 | 53.60 | 72.80 | 30.80 |
|
||||
| llama-3-70b-turbomind | 99.20 | 94.00 | 98.00 | 58.40 | 42.80 | 93.60 | 63.60 | 88.40 | 79.20 |
|
||||
| internlm2-1.8b-turbomind | 16.80 | 47.60 | 63.60 | 21.60 | 12.00 | 69.20 | 16.80 | 45.20 | 5.60 |
|
||||
| internlm2-7b-turbomind | 51.20 | 78.80 | 90.40 | 52.00 | 41.20 | 95.60 | 58.80 | 74.40 | 44.40 |
|
||||
| internlm2-20b-turbomind | 81.20 | 95.60 | 83.60 | 62.40 | 48.00 | 94.80 | 57.60 | 75.60 | 72.80 |
|
||||
| qwen-1.8b-turbomind | 14.80 | 35.60 | 51.20 | 22.40 | 15.20 | 31.20 | 12.40 | 22.00 | 3.20 |
|
||||
| qwen-7b-turbomind | 20.80 | 54.80 | 76.00 | 37.60 | 27.60 | 74.80 | 41.20 | 57.60 | 23.60 |
|
||||
| qwen-14b-turbomind | 35.60 | 81.20 | 78.40 | 45.20 | 40.80 | 80.00 | 44.80 | 70.40 | 65.60 |
|
||||
| qwen-72b-turbomind | 66.40 | 89.20 | 90.40 | 60.00 | 50.80 | 81.60 | 56.40 | 88.00 | 70.40 |
|
||||
| qwen1.5-0.5b-hf | 20.00 | 34.80 | 46.80 | 18.80 | 15.60 | 24.40 | 15.20 | 16.00 | 1.20 |
|
||||
| qwen1.5-1.8b-hf | 18.00 | 32.80 | 66.00 | 18.80 | 11.20 | 24.80 | 13.60 | 27.60 | 4.80 |
|
||||
| qwen1.5-4b-hf | 18.40 | 56.40 | 56.80 | 30.00 | 20.80 | 40.80 | 46.80 | 44.80 | 41.20 |
|
||||
| qwen1.5-7b-hf | 32.40 | 58.40 | 67.20 | 36.00 | 28.00 | 62.80 | 49.20 | 60.40 | 48.00 |
|
||||
| qwen1.5-14b-hf | 7.20 | 78.40 | 75.20 | 41.20 | 27.60 | 74.40 | 46.00 | 81.60 | 8.00 |
|
||||
| qwen1.5-32b-hf | 71.60 | 88.40 | 97.60 | 58.80 | 46.40 | 68.00 | 51.60 | 88.40 | 66.80 |
|
||||
| qwen1.5-72b-hf | 61.20 | 88.40 | 96.00 | 60.40 | 49.20 | 86.40 | 34.80 | 86.80 | 53.60 |
|
||||
| qwen1.5-moe-a2-7b-hf | 22.80 | 49.20 | 68.00 | 28.40 | 22.40 | 58.40 | 40.80 | 42.00 | 33.60 |
|
||||
| mistral-7b-v0.1-hf | 30.40 | 79.60 | 70.80 | 54.40 | 42.80 | 77.60 | 47.20 | 70.00 | 30.40 |
|
||||
| mistral-7b-v0.2-hf | 32.80 | 74.00 | 77.60 | 48.00 | 40.40 | 84.00 | 49.20 | 76.00 | 35.20 |
|
||||
| mixtral-8x7b-v0.1-hf | 66.80 | 86.00 | 94.80 | 50.40 | 40.40 | 86.40 | 53.20 | 82.80 | 60.80 |
|
||||
| mixtral-8x22b-v0.1-hf | 87.60 | 95.20 | 99.60 | 70.00 | 54.00 | 95.20 | 58.40 | 95.20 | 82.00 |
|
||||
| yi-6b-hf | 17.20 | 49.20 | 72.40 | 34.40 | 28.00 | 76.80 | 32.40 | 56.80 | 9.20 |
|
||||
| yi-34b-hf | 67.20 | 85.60 | 79.60 | 49.20 | 39.60 | 86.80 | 56.00 | 81.20 | 33.20 |
|
||||
| deepseek-7b-base-hf | 17.60 | 51.20 | 72.40 | 28.80 | 20.00 | 78.40 | 28.80 | 46.80 | 1.60 |
|
||||
| deepseek-67b-base-hf | 82.40 | 90.00 | 78.80 | 60.40 | 44.80 | 88.80 | 56.80 | 86.40 | 38.00 |
|
||||
|
||||
| model | navigate | dyck_languages | word_sorting | sports_understanding | boolean_expressions | object_counting | formal_fallacies | causal_judgement | web_of_lies |
|
||||
|:------------------------:|-----------:|-----------------:|---------------:|-----------------------:|----------------------:|------------------:|-------------------:|-------------------:|--------------:|
|
||||
| llama-7b-turbomind | 45.20 | 1.60 | 8.40 | 81.60 | 66.00 | 47.20 | 46.00 | 40.64 | 57.20 |
|
||||
| llama-13b-turbomind | 59.20 | 0.80 | 14.40 | 76.40 | 69.20 | 46.40 | 47.20 | 53.48 | 66.80 |
|
||||
| llama-30b-turbomind | 64.80 | 2.40 | 17.20 | 93.60 | 78.40 | 71.20 | 43.20 | 55.61 | 98.40 |
|
||||
| llama-65b-turbomind | 72.40 | 6.80 | 21.60 | 98.80 | 81.60 | 70.00 | 40.80 | 55.61 | 99.60 |
|
||||
| llama-2-7b-turbomind | 54.40 | 1.20 | 10.80 | 88.80 | 68.40 | 49.20 | 48.40 | 52.41 | 53.20 |
|
||||
| llama-2-13b-turbomind | 74.40 | 2.80 | 18.80 | 97.60 | 74.40 | 52.80 | 46.40 | 54.55 | 96.00 |
|
||||
| llama-2-70b-turbomind | 82.40 | 13.60 | 30.40 | 98.40 | 81.60 | 83.20 | 43.60 | 63.64 | 100.00 |
|
||||
| llama-3-8b-turbomind | 90.00 | 9.20 | 38.80 | 95.20 | 87.60 | 84.80 | 51.20 | 50.27 | 100.00 |
|
||||
| llama-3-70b-turbomind | 96.80 | 48.40 | 48.80 | 99.60 | 92.40 | 99.60 | 62.40 | 58.29 | 100.00 |
|
||||
| internlm2-1.8b-turbomind | 64.40 | 0.40 | 3.20 | 66.40 | 54.00 | 50.00 | 49.20 | 48.13 | 46.80 |
|
||||
| internlm2-7b-turbomind | 78.80 | 2.40 | 35.20 | 95.60 | 85.60 | 75.60 | 48.00 | 63.10 | 92.00 |
|
||||
| internlm2-20b-turbomind | 88.80 | 15.60 | 36.00 | 96.80 | 88.80 | 76.00 | 50.40 | 56.68 | 100.00 |
|
||||
| qwen-1.8b-turbomind | 50.00 | 0.00 | 0.80 | 62.80 | 29.20 | 2.40 | 6.00 | 12.83 | 1.60 |
|
||||
| qwen-7b-turbomind | 62.80 | 1.60 | 18.00 | 81.60 | 75.20 | 68.80 | 50.00 | 63.64 | 66.80 |
|
||||
| qwen-14b-turbomind | 75.60 | 1.20 | 26.80 | 88.80 | 80.40 | 74.40 | 50.00 | 53.48 | 96.80 |
|
||||
| qwen-72b-turbomind | 56.00 | 14.40 | 35.20 | 87.60 | 91.60 | 81.60 | 5.60 | 31.55 | 62.40 |
|
||||
| qwen1.5-0.5b-hf | 25.60 | 0.00 | 0.40 | 41.60 | 51.60 | 16.80 | 4.40 | 1.07 | 20.00 |
|
||||
| qwen1.5-1.8b-hf | 55.60 | 0.00 | 1.60 | 63.60 | 55.20 | 47.60 | 4.40 | 28.88 | 11.20 |
|
||||
| qwen1.5-4b-hf | 61.60 | 0.40 | 8.80 | 0.80 | 76.00 | 54.40 | 0.80 | 28.34 | 62.40 |
|
||||
| qwen1.5-7b-hf | 63.60 | 2.40 | 20.80 | 72.40 | 69.60 | 26.80 | 0.00 | 40.64 | 0.00 |
|
||||
| qwen1.5-14b-hf | 82.40 | 1.20 | 27.60 | 78.40 | 87.20 | 48.00 | 54.00 | 24.06 | 100.00 |
|
||||
| qwen1.5-32b-hf | 86.80 | 5.60 | 36.80 | 90.00 | 86.40 | 66.40 | 35.60 | 62.57 | 95.60 |
|
||||
| qwen1.5-72b-hf | 48.40 | 13.20 | 34.40 | 87.60 | 8.00 | 67.60 | 13.60 | 39.57 | 99.60 |
|
||||
| qwen1.5-moe-a2-7b-hf | 56.80 | 2.00 | 8.80 | 79.60 | 73.60 | 66.80 | 4.00 | 53.48 | 50.40 |
|
||||
| mistral-7b-v0.1-hf | 73.60 | 4.00 | 26.40 | 97.20 | 82.00 | 67.60 | 43.20 | 48.66 | 100.00 |
|
||||
| mistral-7b-v0.2-hf | 72.80 | 4.00 | 30.40 | 97.20 | 81.20 | 66.80 | 46.00 | 52.41 | 100.00 |
|
||||
| mixtral-8x7b-v0.1-hf | 85.60 | 18.80 | 33.60 | 98.00 | 90.80 | 85.20 | 49.60 | 55.61 | 90.80 |
|
||||
| mixtral-8x22b-v0.1-hf | 92.80 | 51.60 | 40.00 | 98.40 | 91.60 | 95.60 | 54.80 | 56.15 | 100.00 |
|
||||
| yi-6b-hf | 66.40 | 1.20 | 16.00 | 92.80 | 59.60 | 53.20 | 53.20 | 52.41 | 65.20 |
|
||||
| yi-34b-hf | 81.20 | 18.80 | 36.40 | 97.60 | 85.60 | 84.00 | 51.20 | 59.89 | 99.60 |
|
||||
| deepseek-7b-base-hf | 59.20 | 3.20 | 6.40 | 92.00 | 73.20 | 49.60 | 50.80 | 52.41 | 74.80 |
|
||||
| deepseek-67b-base-hf | 85.20 | 30.00 | 33.20 | 99.60 | 84.80 | 82.40 | 46.80 | 56.68 | 99.60 |

## Chat Models

| model | bbh |
|:-----------------------------:|------:|
| qwen1.5-0.5b-chat-hf | 24.12 |
| qwen1.5-1.8b-chat-hf | 26.82 |
| qwen1.5-4b-chat-hf | 43.15 |
| qwen1.5-7b-chat-hf | 38.12 |
| qwen1.5-14b-chat-hf | 55.38 |
| qwen1.5-32b-chat-hf | 69.28 |
| qwen1.5-72b-chat-hf | 72.97 |
| qwen1.5-110b-chat-hf | 71.04 |
| internlm2-chat-1.8b-hf | 37.69 |
| internlm2-chat-1.8b-sft-hf | 37.12 |
| internlm2-chat-7b-hf | 57.83 |
| internlm2-chat-7b-sft-hf | 57.19 |
| internlm2-chat-20b-hf | 68.24 |
| internlm2-chat-20b-sft-hf | 69.38 |
| llama-3-8b-instruct-hf | 52.85 |
| llama-3-70b-instruct-hf | 82.42 |
| llama-3-8b-instruct-lmdeploy | 53.54 |
| llama-3-70b-instruct-lmdeploy | 82.58 |
| mistral-7b-instruct-v0.1-hf | 32.88 |
| mistral-7b-instruct-v0.2-hf | 48.84 |
| mixtral-8x7b-instruct-v0.1-hf | 59.64 |

### Details
| model | temporal_sequences | disambiguation_qa | date_understanding | tracking_shuffled_objects_three_objects | penguins_in_a_table | geometric_shapes | snarks | ruin_names | tracking_shuffled_objects_seven_objects |
|
||||
|:-----------------------------:|---------------------:|--------------------:|---------------------:|------------------------------------------:|----------------------:|-------------------:|---------:|-------------:|------------------------------------------:|
|
||||
| qwen1.5-0.5b-chat-hf | 25.60 | 42.00 | 20.00 | 31.20 | 15.07 | 14.40 | 46.07 | 24.80 | 13.20 |
|
||||
| qwen1.5-1.8b-chat-hf | 28.80 | 36.00 | 30.40 | 35.20 | 19.18 | 7.60 | 46.63 | 24.00 | 9.60 |
|
||||
| qwen1.5-4b-chat-hf | 8.00 | 56.00 | 64.80 | 28.40 | 48.63 | 19.60 | 60.67 | 34.00 | 14.40 |
|
||||
| qwen1.5-7b-chat-hf | 39.60 | 37.60 | 62.40 | 36.80 | 60.96 | 30.80 | 54.49 | 38.00 | 20.00 |
|
||||
| qwen1.5-14b-chat-hf | 61.60 | 63.60 | 70.00 | 54.00 | 74.66 | 33.60 | 67.42 | 61.20 | 35.60 |
|
||||
| qwen1.5-32b-chat-hf | 94.40 | 77.60 | 78.00 | 66.00 | 93.84 | 46.00 | 82.58 | 73.60 | 61.60 |
|
||||
| qwen1.5-72b-chat-hf | 70.40 | 72.40 | 84.40 | 67.20 | 89.73 | 52.00 | 79.21 | 86.40 | 68.80 |
|
||||
| qwen1.5-110b-chat-hf | 74.80 | 71.20 | 82.80 | 74.80 | 89.04 | 48.00 | 90.45 | 87.60 | 73.60 |
|
||||
| internlm2-chat-1.8b-hf | 35.60 | 52.40 | 48.80 | 29.60 | 39.73 | 24.40 | 51.69 | 27.20 | 13.20 |
|
||||
| internlm2-chat-1.8b-sft-hf | 37.20 | 53.60 | 44.00 | 30.00 | 34.93 | 22.40 | 56.74 | 28.00 | 12.00 |
|
||||
| internlm2-chat-7b-hf | 72.00 | 66.40 | 73.60 | 65.20 | 60.27 | 50.00 | 62.92 | 52.40 | 44.40 |
|
||||
| internlm2-chat-7b-sft-hf | 67.20 | 66.80 | 58.00 | 63.20 | 48.63 | 45.60 | 64.04 | 59.60 | 42.80 |
|
||||
| internlm2-chat-20b-hf | 80.40 | 76.00 | 77.60 | 88.80 | 78.08 | 36.40 | 71.91 | 71.60 | 77.20 |
|
||||
| internlm2-chat-20b-sft-hf | 80.00 | 70.80 | 78.00 | 87.60 | 82.88 | 41.20 | 76.40 | 72.80 | 71.60 |
|
||||
| llama-3-8b-instruct-hf | 70.40 | 42.80 | 28.40 | 81.20 | 13.01 | 49.20 | 44.94 | 73.20 | 42.40 |
|
||||
| llama-3-70b-instruct-hf | 100.00 | 84.00 | 91.60 | 95.60 | 78.08 | 52.40 | 87.08 | 89.60 | 97.60 |
|
||||
| llama-3-8b-instruct-lmdeploy | 73.20 | 45.60 | 34.00 | 79.60 | 31.51 | 48.40 | 47.75 | 76.80 | 47.60 |
|
||||
| llama-3-70b-instruct-lmdeploy | 100.00 | 84.00 | 90.00 | 96.80 | 83.56 | 56.00 | 87.08 | 89.20 | 97.20 |
|
||||
| mistral-7b-instruct-v0.1-hf | 32.00 | 22.40 | 52.40 | 35.20 | 30.82 | 23.20 | 38.76 | 46.00 | 18.40 |
|
||||
| mistral-7b-instruct-v0.2-hf | 66.00 | 58.40 | 50.40 | 48.40 | 48.63 | 37.20 | 65.73 | 40.40 | 29.20 |
|
||||
| mixtral-8x7b-instruct-v0.1-hf | 63.20 | 68.40 | 65.20 | 60.00 | 78.08 | 40.40 | 74.16 | 64.00 | 46.00 |
|
||||
|
||||
| model | tracking_shuffled_objects_five_objects | logical_deduction_three_objects | hyperbaton | logical_deduction_five_objects | logical_deduction_seven_objects | movie_recommendation | salient_translation_error_detection | reasoning_about_colored_objects | multistep_arithmetic_two |
|
||||
|:-----------------------------:|-----------------------------------------:|----------------------------------:|-------------:|---------------------------------:|----------------------------------:|-----------------------:|--------------------------------------:|----------------------------------:|---------------------------:|
|
||||
| qwen1.5-0.5b-chat-hf | 20.40 | 34.40 | 51.60 | 21.20 | 13.20 | 26.00 | 20.80 | 17.20 | 1.20 |
|
||||
| qwen1.5-1.8b-chat-hf | 18.00 | 34.80 | 48.40 | 21.20 | 16.40 | 34.80 | 24.00 | 28.80 | 4.40 |
|
||||
| qwen1.5-4b-chat-hf | 19.20 | 56.80 | 65.20 | 36.40 | 35.60 | 51.60 | 40.40 | 55.20 | 29.20 |
|
||||
| qwen1.5-7b-chat-hf | 31.60 | 58.80 | 53.20 | 35.60 | 27.20 | 56.00 | 44.80 | 62.00 | 50.00 |
|
||||
| qwen1.5-14b-chat-hf | 43.20 | 75.20 | 52.80 | 52.40 | 50.80 | 76.40 | 48.80 | 83.60 | 65.20 |
|
||||
| qwen1.5-32b-chat-hf | 68.40 | 84.00 | 81.20 | 57.20 | 46.00 | 78.80 | 54.40 | 86.00 | 86.00 |
|
||||
| qwen1.5-72b-chat-hf | 76.80 | 94.40 | 85.20 | 62.80 | 54.00 | 78.40 | 63.60 | 86.40 | 82.80 |
|
||||
| qwen1.5-110b-chat-hf | 79.20 | 91.60 | 88.80 | 61.20 | 50.00 | 82.40 | 59.60 | 88.80 | 78.00 |
|
||||
| internlm2-chat-1.8b-hf | 20.00 | 48.40 | 56.00 | 24.40 | 26.80 | 65.20 | 18.00 | 39.60 | 7.60 |
|
||||
| internlm2-chat-1.8b-sft-hf | 18.40 | 48.00 | 51.20 | 20.40 | 25.20 | 63.20 | 22.00 | 38.80 | 6.00 |
|
||||
| internlm2-chat-7b-hf | 48.40 | 75.20 | 84.80 | 42.00 | 36.80 | 79.60 | 53.20 | 65.60 | 26.40 |
|
||||
| internlm2-chat-7b-sft-hf | 44.00 | 72.40 | 85.60 | 41.60 | 37.20 | 82.40 | 55.60 | 52.80 | 32.00 |
|
||||
| internlm2-chat-20b-hf | 88.00 | 88.80 | 88.80 | 52.80 | 50.40 | 85.20 | 56.80 | 79.60 | 40.00 |
|
||||
| internlm2-chat-20b-sft-hf | 83.20 | 90.00 | 90.40 | 55.60 | 48.80 | 84.40 | 57.60 | 79.20 | 38.40 |
|
||||
| llama-3-8b-instruct-hf | 49.60 | 85.60 | 76.00 | 54.00 | 29.20 | 57.60 | 46.00 | 44.80 | 52.00 |
|
||||
| llama-3-70b-instruct-hf | 99.20 | 96.80 | 95.20 | 77.20 | 65.20 | 80.00 | 69.60 | 94.80 | 84.00 |
|
||||
| llama-3-8b-instruct-lmdeploy | 57.20 | 78.00 | 75.60 | 36.00 | 13.20 | 59.20 | 53.60 | 54.80 | 52.80 |
|
||||
| llama-3-70b-instruct-lmdeploy | 98.80 | 96.40 | 96.80 | 75.20 | 68.80 | 79.60 | 67.60 | 94.00 | 84.80 |
|
||||
| mistral-7b-instruct-v0.1-hf | 26.00 | 46.00 | 60.00 | 38.00 | 24.00 | 59.20 | 1.20 | 6.00 | 12.40 |
|
||||
| mistral-7b-instruct-v0.2-hf | 39.60 | 63.60 | 64.00 | 44.00 | 33.20 | 56.00 | 42.40 | 68.40 | 14.00 |
|
||||
| mixtral-8x7b-instruct-v0.1-hf | 46.40 | 71.60 | 88.80 | 48.00 | 36.80 | 60.00 | 50.00 | 81.20 | 59.20 |
|
||||
|
||||
| model | navigate | dyck_languages | word_sorting | sports_understanding | boolean_expressions | object_counting | formal_fallacies | causal_judgement | web_of_lies |
|
||||
|:-----------------------------:|-----------:|-----------------:|---------------:|-----------------------:|----------------------:|------------------:|-------------------:|-------------------:|--------------:|
|
||||
| qwen1.5-0.5b-chat-hf | 45.60 | 0.00 | 1.20 | 17.20 | 50.40 | 16.40 | 11.60 | 42.78 | 27.60 |
|
||||
| qwen1.5-1.8b-chat-hf | 58.40 | 0.00 | 2.00 | 34.00 | 44.80 | 30.40 | 11.60 | 24.60 | 50.00 |
|
||||
| qwen1.5-4b-chat-hf | 64.00 | 3.20 | 6.80 | 80.40 | 77.60 | 48.80 | 41.20 | 55.61 | 63.20 |
|
||||
| qwen1.5-7b-chat-hf | 54.40 | 0.40 | 8.00 | 55.60 | 47.60 | 31.20 | 0.00 | 2.14 | 30.00 |
|
||||
| qwen1.5-14b-chat-hf | 74.40 | 6.40 | 26.40 | 72.40 | 76.40 | 61.60 | 0.80 | 25.67 | 81.20 |
|
||||
| qwen1.5-32b-chat-hf | 90.00 | 10.40 | 28.40 | 82.40 | 92.80 | 76.80 | 32.40 | 41.71 | 100.00 |
|
||||
| qwen1.5-72b-chat-hf | 81.20 | 18.40 | 37.60 | 95.20 | 92.80 | 76.00 | 50.40 | 63.64 | 100.00 |
|
||||
| qwen1.5-110b-chat-hf | 91.60 | 18.00 | 39.60 | 82.80 | 80.80 | 75.20 | 22.40 | 35.83 | 100.00 |
|
||||
| internlm2-chat-1.8b-hf | 63.20 | 0.00 | 6.00 | 58.00 | 56.80 | 48.80 | 54.80 | 52.94 | 48.40 |
|
||||
| internlm2-chat-1.8b-sft-hf | 63.20 | 0.00 | 5.60 | 58.00 | 56.80 | 50.00 | 52.40 | 56.68 | 47.60 |
|
||||
| internlm2-chat-7b-hf | 73.60 | 3.60 | 18.00 | 55.20 | 83.60 | 62.80 | 50.00 | 58.29 | 97.20 |
|
||||
| internlm2-chat-7b-sft-hf | 71.60 | 4.40 | 20.00 | 82.00 | 84.00 | 60.00 | 51.60 | 52.94 | 98.00 |
|
||||
| internlm2-chat-20b-hf | 82.40 | 8.00 | 36.00 | 55.60 | 84.40 | 78.00 | 50.40 | 59.36 | 100.00 |
|
||||
| internlm2-chat-20b-sft-hf | 81.60 | 10.40 | 36.40 | 89.20 | 82.40 | 80.40 | 48.40 | 55.61 | 100.00 |
|
||||
| llama-3-8b-instruct-hf | 82.80 | 8.80 | 37.20 | 94.40 | 78.80 | 89.60 | 45.20 | 24.06 | 25.60 |
|
||||
| llama-3-70b-instruct-hf | 95.20 | 18.80 | 49.20 | 98.00 | 94.00 | 90.00 | 73.20 | 68.98 | 100.00 |
|
||||
| llama-3-8b-instruct-lmdeploy | 83.60 | 10.00 | 40.40 | 96.00 | 77.20 | 89.20 | 43.60 | 37.43 | 3.20 |
|
||||
| llama-3-70b-instruct-lmdeploy | 95.60 | 22.40 | 48.80 | 96.80 | 91.60 | 87.20 | 72.00 | 69.52 | 100.00 |
|
||||
| mistral-7b-instruct-v0.1-hf | 70.80 | 0.80 | 5.20 | 68.80 | 69.60 | 51.60 | 3.20 | 12.30 | 33.60 |
|
||||
| mistral-7b-instruct-v0.2-hf | 62.40 | 4.00 | 15.60 | 81.20 | 70.40 | 50.40 | 32.00 | 34.76 | 98.40 |
|
||||
| mixtral-8x7b-instruct-v0.1-hf | 76.40 | 12.80 | 23.20 | 55.20 | 85.60 | 83.60 | 40.00 | 43.32 | 88.80 |
|
367
configs/datasets/ceval/README.md
Normal file
367
configs/datasets/ceval/README.md
Normal file
@ -0,0 +1,367 @@
# C-Eval

## Base Models

| model | ceval-test | ceval-test-hard | ceval-test-stem | ceval-test-social-science | ceval-test-humanities | ceval-test-other | ceval-dev | ceval-dev-hard | ceval-dev-stem | ceval-dev-social-science | ceval-dev-humanities | ceval-dev-other |
|
||||
|:------------------------:|-------------:|------------------:|------------------:|----------------------------:|------------------------:|-------------------:|------------:|-----------------:|-----------------:|---------------------------:|-----------------------:|------------------:|
|
||||
| llama-7b-turbomind | 26.61 | 27.75 | 27.20 | 26.31 | 25.90 | 26.52 | 27.44 | 27.68 | 27.16 | 29.49 | 24.18 | 29.36 |
|
||||
| llama-13b-turbomind | 29.18 | 25.59 | 27.66 | 33.86 | 28.29 | 28.58 | 31.75 | 30.32 | 31.39 | 35.22 | 30.16 | 30.82 |
|
||||
| llama-30b-turbomind | 35.09 | 31.68 | 34.56 | 39.89 | 33.02 | 33.76 | 37.70 | 31.97 | 34.80 | 42.72 | 41.19 | 34.93 |
|
||||
| llama-65b-turbomind | 37.98 | 29.47 | 36.03 | 45.03 | 36.51 | 36.56 | 40.46 | 33.76 | 36.37 | 46.47 | 42.26 | 40.63 |
|
||||
| llama-2-7b-turbomind | 30.13 | 26.26 | 29.29 | 33.02 | 31.02 | 28.15 | 32.70 | 25.85 | 28.75 | 39.75 | 37.04 | 29.13 |
|
||||
| llama-2-13b-turbomind | 37.38 | 30.81 | 35.85 | 43.98 | 36.81 | 34.75 | 40.43 | 31.34 | 35.67 | 45.75 | 45.32 | 39.36 |
|
||||
| llama-2-70b-turbomind | 49.53 | 33.48 | 44.73 | 60.19 | 50.93 | 47.17 | 50.26 | 32.53 | 44.83 | 59.44 | 54.45 | 47.58 |
|
||||
| llama-3-8b-turbomind | 48.83 | 34.47 | 46.02 | 56.48 | 49.15 | 46.69 | 50.45 | 33.76 | 45.94 | 58.08 | 50.93 | 51.25 |
|
||||
| llama-3-70b-turbomind | 66.56 | 54.09 | 64.08 | 76.43 | 64.38 | 64.25 | 67.30 | 52.35 | 62.67 | 77.89 | 69.76 | 63.65 |
|
||||
| internlm2-1.8b-turbomind | 44.79 | 33.93 | 41.19 | 54.26 | 47.15 | 40.35 | 46.64 | 33.00 | 38.62 | 57.28 | 51.30 | 46.89 |
|
||||
| internlm2-7b-turbomind | 63.54 | 45.32 | 58.10 | 76.40 | 66.94 | 58.32 | 64.23 | 40.09 | 54.37 | 76.88 | 70.11 | 64.77 |
|
||||
| internlm2-20b-turbomind | 67.28 | 50.15 | 62.33 | 79.59 | 70.55 | 61.82 | 66.73 | 42.50 | 59.25 | 79.98 | 73.43 | 61.56 |
|
||||
| qwen-1.8b-turbomind | 54.24 | 38.60 | 50.02 | 68.18 | 55.33 | 48.13 | 53.78 | 33.38 | 46.36 | 68.40 | 57.57 | 50.17 |
|
||||
| qwen-7b-turbomind | 62.06 | 42.73 | 56.21 | 77.12 | 65.28 | 55.76 | 63.23 | 36.99 | 54.74 | 78.55 | 68.94 | 59.02 |
|
||||
| qwen-14b-turbomind | 70.33 | 53.61 | 65.25 | 83.19 | 72.85 | 65.37 | 72.05 | 55.03 | 66.07 | 85.59 | 74.91 | 67.78 |
|
||||
| qwen-72b-turbomind | 83.25 | 66.78 | 78.44 | 91.75 | 83.86 | 83.63 | 83.60 | 63.68 | 78.05 | 90.25 | 87.13 | 84.13 |
|
||||
| qwen1.5-0.5b-hf | 48.36 | 35.55 | 44.72 | 62.00 | 48.51 | 42.41 | 50.43 | 37.00 | 46.28 | 62.64 | 48.11 | 49.18 |
|
||||
| qwen1.5-1.8b-hf | 58.67 | 40.98 | 53.91 | 74.52 | 58.51 | 53.06 | 59.38 | 43.02 | 53.45 | 75.88 | 60.06 | 54.47 |
|
||||
| qwen1.5-4b-hf | 66.55 | 48.50 | 61.45 | 81.12 | 67.90 | 61.22 | 66.46 | 43.12 | 56.76 | 82.89 | 67.61 | 68.03 |
|
||||
| qwen1.5-7b-hf | 72.49 | 52.90 | 66.77 | 85.50 | 74.37 | 69.19 | 73.57 | 49.16 | 66.32 | 84.23 | 77.30 | 73.34 |
|
||||
| qwen1.5-14b-hf | 76.93 | 60.50 | 72.08 | 88.81 | 77.95 | 73.94 | 77.86 | 54.81 | 71.55 | 86.79 | 82.86 | 76.23 |
|
||||
| qwen1.5-32b-hf | 82.50 | 66.67 | 77.97 | 90.93 | 83.66 | 81.88 | 82.79 | 71.06 | 80.01 | 89.02 | 83.36 | 81.62 |
|
||||
| qwen1.5-72b-hf | 83.03 | 65.09 | 77.90 | 91.47 | 83.85 | 83.86 | 83.72 | 64.09 | 77.26 | 91.87 | 87.64 | 84.14 |
|
||||
| qwen1.5-moe-a2-7b-hf | 76.67 | 51.37 | 68.89 | 88.33 | 77.15 | 79.73 | 77.90 | 51.25 | 67.27 | 89.28 | 83.16 | 81.60 |
|
||||
| mistral-7b-v0.1-hf | 43.76 | 33.85 | 42.23 | 49.97 | 41.10 | 43.54 | 47.54 | 33.97 | 44.74 | 54.80 | 51.52 | 42.06 |
|
||||
| mistral-7b-v0.2-hf | 42.81 | 32.84 | 41.00 | 50.19 | 39.45 | 42.77 | 46.44 | 31.67 | 42.89 | 54.50 | 48.75 | 43.23 |
|
||||
| mixtral-8x7b-v0.1-hf | 51.15 | 41.46 | 50.93 | 59.19 | 46.69 | 48.72 | 55.31 | 42.04 | 52.78 | 62.00 | 56.44 | 52.71 |
|
||||
| mixtral-8x22b-v0.1-hf | 58.13 | 48.31 | 58.01 | 66.94 | 53.60 | 54.86 | 60.50 | 45.67 | 57.44 | 71.27 | 61.31 | 55.47 |
|
||||
| yi-6b-hf | 70.78 | 43.72 | 60.54 | 83.29 | 75.39 | 73.40 | 73.13 | 46.87 | 63.14 | 85.52 | 78.70 | 74.45 |
|
||||
| yi-34b-hf | 80.93 | 58.51 | 73.48 | 89.24 | 83.65 | 84.18 | 81.62 | 56.95 | 71.64 | 89.73 | 87.49 | 86.53 |
|
||||
| deepseek-7b-base-hf | 43.68 | 28.90 | 37.03 | 53.55 | 50.14 | 40.34 | 45.07 | 31.94 | 38.81 | 56.68 | 47.10 | 43.85 |
|
||||
| deepseek-67b-base-hf | 66.66 | 44.25 | 57.89 | 79.02 | 72.36 | 65.66 | 66.65 | 38.62 | 56.65 | 79.56 | 73.72 | 66.01 |
|
||||
|
||||
### Details on Test Split
| model | computer_network | operating_system | computer_architecture | college_programming | college_physics | college_chemistry | advanced_mathematics | probability_and_statistics | discrete_mathematics | electrical_engineer | metrology_engineer | high_school_mathematics |
|
||||
|:------------------------:|-------------------:|-------------------:|------------------------:|----------------------:|------------------:|--------------------:|-----------------------:|-----------------------------:|-----------------------:|----------------------:|---------------------:|--------------------------:|
|
||||
| llama-7b-turbomind | 29.82 | 25.70 | 26.94 | 30.99 | 32.95 | 23.66 | 26.01 | 22.89 | 27.45 | 30.09 | 26.48 | 33.13 |
|
||||
| llama-13b-turbomind | 33.33 | 37.99 | 31.09 | 29.82 | 22.16 | 27.23 | 31.79 | 27.11 | 24.84 | 28.02 | 33.33 | 30.72 |
|
||||
| llama-30b-turbomind | 40.94 | 48.60 | 40.41 | 34.21 | 32.95 | 35.71 | 36.42 | 32.53 | 27.45 | 31.56 | 36.07 | 30.12 |
|
||||
| llama-65b-turbomind | 41.52 | 50.84 | 44.04 | 40.94 | 27.84 | 29.46 | 28.32 | 30.72 | 29.41 | 35.10 | 42.47 | 30.12 |
|
||||
| llama-2-7b-turbomind | 33.92 | 37.99 | 34.72 | 30.99 | 26.70 | 21.88 | 31.79 | 25.30 | 24.18 | 31.56 | 39.73 | 30.12 |
|
||||
| llama-2-13b-turbomind | 40.94 | 46.93 | 37.82 | 36.26 | 30.68 | 29.46 | 35.84 | 30.72 | 24.84 | 32.74 | 42.92 | 34.94 |
|
||||
| llama-2-70b-turbomind | 55.56 | 58.66 | 53.89 | 47.95 | 34.09 | 33.48 | 32.95 | 27.11 | 34.64 | 37.76 | 57.99 | 29.52 |
|
||||
| llama-3-8b-turbomind | 55.56 | 58.66 | 55.96 | 51.17 | 27.27 | 35.27 | 36.42 | 31.33 | 34.64 | 40.12 | 50.68 | 30.72 |
|
||||
| llama-3-70b-turbomind | 69.59 | 75.98 | 69.95 | 71.64 | 49.43 | 58.04 | 52.02 | 53.01 | 58.82 | 45.72 | 68.95 | 40.96 |
|
||||
| internlm2-1.8b-turbomind | 40.35 | 40.78 | 39.38 | 32.16 | 34.66 | 34.38 | 31.21 | 31.33 | 35.95 | 35.10 | 51.60 | 27.71 |
|
||||
| internlm2-7b-turbomind | 56.14 | 57.54 | 62.69 | 49.42 | 43.75 | 48.21 | 34.68 | 32.53 | 33.33 | 41.00 | 60.27 | 40.36 |
|
||||
| internlm2-20b-turbomind | 62.57 | 65.36 | 66.84 | 58.77 | 43.18 | 51.79 | 39.31 | 40.36 | 35.95 | 42.77 | 66.67 | 47.59 |
|
||||
| qwen-1.8b-turbomind | 46.20 | 41.90 | 46.63 | 36.84 | 40.34 | 36.61 | 27.75 | 28.92 | 32.68 | 36.58 | 57.08 | 30.12 |
|
||||
| qwen-7b-turbomind | 52.63 | 54.75 | 54.40 | 46.20 | 35.80 | 44.20 | 36.99 | 27.71 | 26.80 | 38.35 | 57.99 | 33.13 |
|
||||
| qwen-14b-turbomind | 58.48 | 64.80 | 59.07 | 54.68 | 45.45 | 57.59 | 45.09 | 33.73 | 39.22 | 49.26 | 67.58 | 45.78 |
|
||||
| qwen-72b-turbomind | 83.04 | 73.74 | 79.27 | 76.61 | 75.00 | 64.29 | 49.13 | 44.58 | 46.41 | 66.37 | 85.84 | 68.07 |
|
||||
| qwen1.5-0.5b-hf | 37.43 | 40.22 | 41.45 | 35.09 | 40.91 | 34.82 | 30.06 | 27.11 | 26.80 | 29.79 | 54.34 | 31.93 |
|
||||
| qwen1.5-1.8b-hf | 47.37 | 50.84 | 47.67 | 38.30 | 43.18 | 35.27 | 29.48 | 30.12 | 33.99 | 39.53 | 58.90 | 28.92 |
|
||||
| qwen1.5-4b-hf | 62.57 | 56.98 | 56.99 | 46.78 | 48.30 | 45.98 | 40.46 | 34.34 | 31.37 | 46.61 | 62.10 | 43.37 |
|
||||
| qwen1.5-7b-hf | 66.08 | 62.57 | 66.32 | 55.56 | 54.55 | 47.77 | 41.62 | 31.93 | 35.95 | 49.85 | 74.43 | 49.40 |
|
||||
| qwen1.5-14b-hf | 71.35 | 66.48 | 68.39 | 64.91 | 57.95 | 65.62 | 41.62 | 40.36 | 47.71 | 56.64 | 79.45 | 56.63 |
|
||||
| qwen1.5-32b-hf | 84.80 | 73.18 | 74.61 | 70.18 | 71.59 | 61.61 | 49.13 | 45.78 | 49.02 | 61.95 | 87.67 | 72.89 |
|
||||
| qwen1.5-72b-hf | 85.38 | 73.74 | 78.24 | 78.36 | 72.73 | 63.39 | 43.35 | 40.96 | 49.02 | 65.78 | 85.84 | 66.27 |
|
||||
| qwen1.5-moe-a2-7b-hf | 77.78 | 73.74 | 68.91 | 64.91 | 66.48 | 49.11 | 33.53 | 36.75 | 35.95 | 61.06 | 91.32 | 40.96 |
|
||||
| mistral-7b-v0.1-hf | 55.56 | 55.31 | 56.99 | 48.25 | 39.77 | 39.29 | 33.53 | 25.90 | 31.37 | 35.99 | 45.21 | 27.11 |
|
||||
| mistral-7b-v0.2-hf | 56.14 | 53.63 | 55.44 | 47.66 | 36.36 | 34.38 | 32.37 | 25.30 | 33.33 | 31.86 | 45.21 | 29.52 |
|
||||
| mixtral-8x7b-v0.1-hf | 62.57 | 64.80 | 60.10 | 60.53 | 38.64 | 42.41 | 40.46 | 37.35 | 45.75 | 35.99 | 60.27 | 34.94 |
|
||||
| mixtral-8x22b-v0.1-hf | 65.50 | 74.86 | 63.73 | 65.79 | 46.59 | 52.68 | 52.02 | 45.78 | 52.94 | 42.77 | 62.56 | 39.16 |
|
||||
| yi-6b-hf | 68.42 | 63.13 | 69.43 | 57.89 | 42.05 | 48.66 | 31.79 | 33.13 | 28.76 | 49.85 | 74.89 | 37.35 |
|
||||
| yi-34b-hf | 83.63 | 80.45 | 74.09 | 68.42 | 62.50 | 60.27 | 45.09 | 38.55 | 50.33 | 65.19 | 88.58 | 49.40 |
|
||||
| deepseek-7b-base-hf | 44.44 | 44.13 | 44.56 | 36.26 | 30.68 | 29.02 | 32.37 | 24.70 | 26.14 | 35.99 | 48.86 | 28.31 |
|
||||
| deepseek-67b-base-hf | 63.16 | 70.39 | 65.80 | 59.36 | 42.61 | 45.54 | 35.84 | 38.55 | 42.48 | 44.54 | 68.95 | 33.73 |
|
||||
|
||||
| model | high_school_physics | high_school_chemistry | high_school_biology | middle_school_mathematics | middle_school_biology | middle_school_physics | middle_school_chemistry | veterinary_medicine | college_economics | business_administration | marxism | mao_zedong_thought |
|
||||
|:------------------------:|----------------------:|------------------------:|----------------------:|----------------------------:|------------------------:|------------------------:|--------------------------:|----------------------:|--------------------:|--------------------------:|----------:|---------------------:|
|
||||
| llama-7b-turbomind | 29.14 | 26.74 | 24.57 | 29.94 | 22.92 | 23.60 | 20.00 | 30.95 | 29.98 | 24.58 | 25.70 | 25.11 |
|
||||
| llama-13b-turbomind | 22.29 | 18.60 | 28.00 | 26.55 | 26.56 | 25.28 | 19.46 | 29.05 | 28.77 | 28.57 | 39.66 | 43.38 |
|
||||
| llama-30b-turbomind | 25.14 | 33.14 | 36.00 | 31.07 | 39.06 | 28.09 | 33.51 | 38.10 | 35.21 | 35.88 | 48.04 | 33.33 |
|
||||
| llama-65b-turbomind | 33.71 | 26.16 | 38.29 | 33.90 | 44.27 | 36.52 | 38.92 | 38.10 | 37.42 | 42.19 | 59.22 | 48.40 |
|
||||
| llama-2-7b-turbomind | 26.86 | 23.26 | 26.86 | 28.81 | 28.12 | 29.78 | 22.70 | 30.48 | 31.79 | 30.56 | 33.52 | 36.07 |
|
||||
| llama-2-13b-turbomind | 28.00 | 31.98 | 36.57 | 36.72 | 38.54 | 36.52 | 37.84 | 46.67 | 37.02 | 36.54 | 57.54 | 41.10 |
|
||||
| llama-2-70b-turbomind | 40.00 | 36.05 | 48.00 | 36.72 | 66.67 | 55.06 | 55.68 | 52.86 | 51.91 | 48.50 | 68.16 | 60.73 |
|
||||
| llama-3-8b-turbomind | 41.71 | 38.37 | 50.86 | 36.16 | 61.98 | 63.48 | 63.78 | 56.19 | 41.65 | 49.17 | 69.27 | 54.34 |
|
||||
| llama-3-70b-turbomind | 63.43 | 56.98 | 69.14 | 59.32 | 84.90 | 75.28 | 78.92 | 79.52 | 68.81 | 59.80 | 86.59 | 79.91 |
|
||||
| internlm2-1.8b-turbomind | 30.29 | 45.93 | 46.29 | 33.33 | 63.02 | 60.11 | 62.70 | 47.62 | 35.61 | 37.87 | 69.27 | 61.64 |
|
||||
| internlm2-7b-turbomind | 64.57 | 65.12 | 76.00 | 54.80 | 91.15 | 85.96 | 90.27 | 74.29 | 57.34 | 50.50 | 86.59 | 83.56 |
|
||||
| internlm2-20b-turbomind | 68.57 | 74.42 | 78.86 | 58.76 | 91.67 | 90.45 | 90.27 | 72.38 | 57.95 | 55.81 | 88.83 | 88.58 |
|
||||
| qwen-1.8b-turbomind | 55.43 | 56.98 | 61.14 | 54.80 | 85.42 | 84.83 | 85.41 | 54.76 | 43.06 | 44.19 | 83.80 | 79.91 |
|
||||
| qwen-7b-turbomind | 68.00 | 69.19 | 82.86 | 57.63 | 93.75 | 87.64 | 92.43 | 63.81 | 47.28 | 57.48 | 86.59 | 82.65 |
|
||||
| qwen-14b-turbomind | 78.86 | 83.14 | 92.57 | 67.23 | 96.88 | 95.51 | 96.76 | 73.33 | 56.94 | 64.45 | 91.62 | 86.76 |
|
||||
| qwen-72b-turbomind | 93.14 | 93.60 | 95.43 | 88.70 | 98.44 | 97.75 | 99.46 | 90.00 | 75.45 | 80.73 | 96.09 | 99.54 |
|
||||
| qwen1.5-0.5b-hf | 48.57 | 44.19 | 60.00 | 40.68 | 73.44 | 69.66 | 78.92 | 49.05 | 34.41 | 40.20 | 79.89 | 74.43 |
|
||||
| qwen1.5-1.8b-hf | 58.86 | 68.02 | 76.00 | 59.32 | 91.15 | 90.45 | 87.03 | 63.81 | 44.87 | 48.50 | 86.03 | 90.41 |
|
||||
| qwen1.5-4b-hf | 66.86 | 77.33 | 82.86 | 68.93 | 95.31 | 92.70 | 97.30 | 71.90 | 51.31 | 61.13 | 91.62 | 94.52 |
|
||||
| qwen1.5-7b-hf | 79.43 | 82.56 | 91.43 | 77.40 | 96.88 | 95.51 | 96.22 | 80.00 | 62.37 | 69.77 | 93.30 | 97.26 |
|
||||
| qwen1.5-14b-hf | 86.29 | 87.79 | 93.14 | 83.05 | 97.92 | 95.51 | 97.84 | 82.86 | 63.78 | 77.08 | 95.53 | 96.35 |
|
||||
| qwen1.5-32b-hf | 88.00 | 95.35 | 94.86 | 91.53 | 97.92 | 99.44 | 100.00 | 90.00 | 73.44 | 78.74 | 94.97 | 98.63 |
|
||||
| qwen1.5-72b-hf | 91.43 | 93.60 | 95.43 | 88.70 | 97.92 | 98.31 | 99.46 | 90.00 | 74.25 | 80.40 | 94.41 | 98.63 |
|
||||
| qwen1.5-moe-a2-7b-hf | 70.86 | 77.33 | 82.86 | 68.36 | 97.92 | 93.26 | 97.30 | 89.52 | 70.22 | 74.75 | 96.09 | 98.17 |
|
||||
| mistral-7b-v0.1-hf | 33.14 | 40.70 | 40.57 | 40.11 | 47.92 | 49.44 | 50.81 | 47.62 | 44.87 | 37.87 | 58.10 | 48.40 |
|
||||
| mistral-7b-v0.2-hf | 34.86 | 36.63 | 45.71 | 36.72 | 46.35 | 46.07 | 48.65 | 43.81 | 43.46 | 39.53 | 57.54 | 48.86 |
|
||||
| mixtral-8x7b-v0.1-hf | 49.71 | 42.44 | 53.71 | 47.46 | 62.50 | 61.24 | 60.00 | 57.62 | 52.52 | 44.52 | 68.72 | 57.99 |
|
||||
| mixtral-8x22b-v0.1-hf | 54.29 | 43.02 | 58.29 | 55.93 | 76.04 | 66.29 | 75.68 | 66.19 | 60.97 | 51.83 | 74.30 | 70.78 |
|
||||
| yi-6b-hf | 58.86 | 69.19 | 78.29 | 43.50 | 92.19 | 89.33 | 90.27 | 83.81 | 59.56 | 70.10 | 93.85 | 97.72 |
|
||||
| yi-34b-hf | 80.00 | 81.98 | 93.14 | 65.54 | 97.40 | 95.51 | 96.76 | 92.86 | 74.04 | 76.08 | 94.97 | 97.26 |
|
||||
| deepseek-7b-base-hf | 29.14 | 30.81 | 33.14 | 24.29 | 53.12 | 45.51 | 48.65 | 50.48 | 38.23 | 44.19 | 62.01 | 65.30 |
|
||||
| deepseek-67b-base-hf | 60.00 | 55.23 | 64.00 | 46.33 | 84.90 | 79.78 | 83.24 | 73.33 | 57.75 | 63.79 | 89.94 | 88.58 |
|
||||
|
||||
| model | education_science | teacher_qualification | high_school_politics | high_school_geography | middle_school_politics | middle_school_geography | modern_chinese_history | ideological_and_moral_cultivation | logic | law | chinese_language_and_literature | art_studies |
|
||||
|:------------------------:|--------------------:|------------------------:|-----------------------:|------------------------:|-------------------------:|--------------------------:|-------------------------:|------------------------------------:|--------:|------:|----------------------------------:|--------------:|
|
||||
| llama-7b-turbomind | 22.96 | 31.58 | 25.57 | 29.78 | 22.80 | 25.00 | 21.70 | 21.51 | 25.00 | 26.24 | 22.49 | 25.84 |
|
||||
| llama-13b-turbomind | 29.26 | 30.83 | 33.52 | 36.52 | 34.72 | 33.33 | 24.06 | 40.12 | 26.47 | 33.48 | 30.14 | 29.87 |
|
||||
| llama-30b-turbomind | 37.41 | 46.37 | 32.95 | 38.20 | 50.78 | 40.74 | 28.77 | 45.93 | 33.33 | 32.13 | 39.23 | 22.82 |
|
||||
| llama-65b-turbomind | 39.63 | 51.13 | 31.82 | 39.89 | 58.03 | 42.59 | 34.91 | 55.23 | 39.71 | 30.32 | 37.80 | 32.89 |
|
||||
| llama-2-7b-turbomind | 27.78 | 34.34 | 31.82 | 34.83 | 35.23 | 34.26 | 28.77 | 38.95 | 32.35 | 33.94 | 27.27 | 30.87 |
|
||||
| llama-2-13b-turbomind | 41.48 | 47.37 | 37.50 | 37.64 | 50.78 | 52.78 | 43.40 | 48.84 | 32.35 | 38.46 | 36.36 | 30.20 |
|
||||
| llama-2-70b-turbomind | 57.78 | 69.17 | 50.57 | 58.43 | 69.95 | 66.67 | 50.94 | 72.09 | 50.98 | 42.53 | 44.98 | 52.01 |
|
||||
| llama-3-8b-turbomind | 56.30 | 65.41 | 47.16 | 56.18 | 64.25 | 61.11 | 55.66 | 67.44 | 41.67 | 40.27 | 45.45 | 50.34 |
|
||||
| llama-3-70b-turbomind | 72.22 | 85.46 | 75.00 | 74.72 | 84.97 | 76.85 | 75.00 | 76.16 | 59.31 | 52.94 | 62.68 | 68.46 |
|
||||
| internlm2-1.8b-turbomind | 47.41 | 61.40 | 55.11 | 47.75 | 61.66 | 64.81 | 61.79 | 63.95 | 32.35 | 32.58 | 48.33 | 36.58 |
|
||||
| internlm2-7b-turbomind | 66.67 | 85.96 | 78.98 | 74.72 | 91.71 | 87.96 | 80.66 | 80.23 | 42.16 | 50.23 | 64.11 | 70.13 |
|
||||
| internlm2-20b-turbomind | 69.26 | 89.22 | 83.52 | 80.34 | 90.67 | 91.67 | 83.02 | 85.47 | 49.02 | 54.30 | 72.25 | 73.15 |
|
||||
| qwen-1.8b-turbomind | 51.11 | 70.68 | 71.02 | 62.36 | 88.60 | 87.04 | 69.81 | 73.26 | 29.90 | 46.15 | 50.24 | 47.32 |
|
||||
| qwen-7b-turbomind | 57.41 | 83.71 | 88.64 | 79.78 | 93.26 | 94.44 | 75.47 | 79.07 | 42.16 | 47.96 | 59.33 | 65.10 |
|
||||
| qwen-14b-turbomind | 72.96 | 89.97 | 93.75 | 83.71 | 96.37 | 95.37 | 86.32 | 87.21 | 50.00 | 60.63 | 66.99 | 72.48 |
|
||||
| qwen-72b-turbomind | 85.56 | 96.24 | 95.45 | 93.26 | 97.93 | 97.22 | 92.45 | 91.86 | 67.65 | 76.92 | 75.12 | 83.89 |
|
||||
| qwen1.5-0.5b-hf | 43.33 | 63.16 | 65.91 | 56.18 | 82.90 | 79.63 | 68.87 | 70.35 | 28.43 | 37.56 | 39.23 | 32.21 |
|
||||
| qwen1.5-1.8b-hf | 57.41 | 76.44 | 81.25 | 75.84 | 92.75 | 91.67 | 79.72 | 81.98 | 34.31 | 47.96 | 47.85 | 43.62 |
|
||||
| qwen1.5-4b-hf | 65.93 | 87.47 | 86.93 | 82.58 | 94.30 | 95.37 | 84.91 | 84.30 | 40.20 | 62.90 | 58.85 | 58.72 |
|
||||
| qwen1.5-7b-hf | 69.26 | 91.98 | 90.91 | 89.89 | 95.85 | 94.44 | 89.15 | 87.21 | 48.04 | 67.87 | 63.16 | 68.12 |
|
||||
| qwen1.5-14b-hf | 78.89 | 94.99 | 94.89 | 91.57 | 96.89 | 98.15 | 91.04 | 88.37 | 57.84 | 69.68 | 66.99 | 73.83 |
|
||||
| qwen1.5-32b-hf | 83.70 | 95.99 | 93.75 | 94.38 | 98.45 | 97.22 | 90.57 | 91.28 | 70.10 | 76.92 | 76.56 | 80.87 |
|
||||
| qwen1.5-72b-hf | 84.44 | 96.49 | 96.59 | 93.82 | 98.45 | 97.22 | 92.92 | 91.28 | 66.67 | 76.92 | 74.16 | 85.23 |
|
||||
| qwen1.5-moe-a2-7b-hf | 80.74 | 95.49 | 89.20 | 89.33 | 94.82 | 94.44 | 92.45 | 91.28 | 52.45 | 75.57 | 67.94 | 79.87 |
|
||||
| mistral-7b-v0.1-hf | 45.19 | 59.15 | 43.75 | 49.44 | 56.48 | 56.48 | 45.28 | 58.14 | 37.75 | 38.91 | 40.67 | 34.56 |
|
||||
| mistral-7b-v0.2-hf | 45.93 | 58.65 | 38.07 | 48.31 | 63.21 | 58.33 | 41.98 | 54.07 | 35.78 | 40.27 | 38.28 | 32.21 |
|
||||
| mixtral-8x7b-v0.1-hf | 57.04 | 67.92 | 53.41 | 55.06 | 69.95 | 64.81 | 47.64 | 70.93 | 42.16 | 38.01 | 46.41 | 36.58 |
|
||||
| mixtral-8x22b-v0.1-hf | 60.37 | 72.68 | 64.77 | 65.17 | 77.20 | 71.30 | 57.08 | 75.00 | 49.51 | 43.44 | 52.63 | 49.33 |
|
||||
| yi-6b-hf | 79.26 | 92.48 | 77.27 | 76.40 | 92.75 | 93.52 | 89.15 | 90.12 | 60.78 | 74.66 | 61.24 | 74.16 |
|
||||
| yi-34b-hf | 84.81 | 96.24 | 88.07 | 88.20 | 96.37 | 96.30 | 91.98 | 91.28 | 75.00 | 78.73 | 80.38 | 82.89 |
|
||||
| deepseek-7b-base-hf | 52.22 | 70.18 | 47.16 | 51.12 | 60.62 | 44.44 | 58.49 | 66.86 | 31.86 | 37.56 | 53.11 | 61.07 |
|
||||
| deepseek-67b-base-hf | 76.67 | 89.22 | 77.27 | 78.65 | 89.64 | 78.70 | 85.85 | 84.30 | 50.00 | 64.25 | 69.38 | 84.23 |
|
||||
|
||||
| model | professional_tour_guide | legal_professional | high_school_chinese | high_school_history | middle_school_history | civil_servant | sports_science | plant_protection | basic_medicine | clinical_medicine | urban_and_rural_planner | accountant |
|:---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
| llama-7b-turbomind | 29.70 | 23.72 | 27.53 | 30.22 | 30.92 | 27.04 | 22.78 | 28.64 | 28.00 | 25.00 | 26.32 | 29.80 |
| llama-13b-turbomind | 25.94 | 20.93 | 25.84 | 29.67 | 24.64 | 29.60 | 26.67 | 29.15 | 33.71 | 25.50 | 28.47 | 28.44 |
| llama-30b-turbomind | 29.32 | 27.91 | 30.34 | 36.26 | 37.20 | 36.13 | 36.11 | 38.69 | 34.29 | 29.50 | 38.52 | 29.35 |
| llama-65b-turbomind | 28.95 | 30.70 | 30.90 | 44.51 | 35.75 | 36.60 | 45.56 | 39.20 | 37.71 | 30.00 | 39.47 | 37.02 |
| llama-2-7b-turbomind | 29.70 | 30.23 | 24.72 | 29.67 | 34.78 | 30.07 | 31.11 | 31.16 | 30.29 | 25.50 | 31.34 | 27.31 |
| llama-2-13b-turbomind | 30.83 | 32.56 | 24.16 | 42.31 | 45.41 | 32.87 | 36.67 | 45.23 | 38.29 | 33.50 | 35.17 | 34.31 |
| llama-2-70b-turbomind | 53.76 | 38.14 | 30.34 | 58.79 | 65.70 | 43.82 | 51.11 | 58.29 | 49.71 | 42.00 | 49.76 | 46.28 |
| llama-3-8b-turbomind | 52.63 | 42.33 | 27.53 | 51.65 | 65.70 | 44.52 | 54.44 | 51.26 | 46.86 | 43.00 | 46.41 | 45.15 |
| llama-3-70b-turbomind | 72.93 | 52.56 | 32.58 | 71.98 | 83.57 | 56.88 | 69.44 | 78.89 | 76.00 | 67.50 | 57.89 | 59.14 |
| internlm2-1.8b-turbomind | 51.50 | 38.14 | 25.84 | 56.04 | 71.50 | 47.32 | 35.00 | 43.72 | 42.29 | 39.00 | 41.15 | 36.57 |
| internlm2-7b-turbomind | 72.56 | 53.49 | 52.25 | 79.67 | 90.82 | 62.00 | 62.78 | 64.32 | 66.86 | 59.50 | 55.74 | 53.50 |
| internlm2-20b-turbomind | 74.06 | 54.42 | 56.18 | 81.87 | 92.27 | 61.77 | 68.33 | 69.85 | 68.00 | 63.50 | 60.77 | 58.92 |
| qwen-1.8b-turbomind | 54.14 | 43.72 | 39.89 | 69.23 | 85.02 | 49.88 | 45.56 | 48.74 | 48.57 | 51.50 | 46.89 | 45.82 |
| qwen-7b-turbomind | 71.05 | 48.37 | 53.93 | 81.87 | 93.72 | 59.67 | 54.44 | 62.31 | 58.29 | 57.50 | 50.24 | 56.66 |
| qwen-14b-turbomind | 79.70 | 53.02 | 63.48 | 87.36 | 94.20 | 71.33 | 63.33 | 71.36 | 73.14 | 68.00 | 59.09 | 67.95 |
| qwen-72b-turbomind | 90.23 | 77.21 | 79.21 | 91.76 | 96.14 | 77.86 | 86.11 | 85.43 | 91.43 | 90.50 | 76.08 | 86.68 |
| qwen1.5-0.5b-hf | 44.36 | 36.74 | 39.33 | 58.24 | 78.26 | 43.36 | 40.00 | 45.23 | 41.71 | 42.50 | 43.54 | 43.12 |
| qwen1.5-1.8b-hf | 59.40 | 47.91 | 37.08 | 72.53 | 91.30 | 53.61 | 53.33 | 51.26 | 49.71 | 58.00 | 51.20 | 56.21 |
| qwen1.5-4b-hf | 65.04 | 58.60 | 55.62 | 83.52 | 94.20 | 62.00 | 63.89 | 65.33 | 65.71 | 64.00 | 55.26 | 61.40 |
| qwen1.5-7b-hf | 78.57 | 66.51 | 66.85 | 87.91 | 94.69 | 68.07 | 65.00 | 64.82 | 77.14 | 77.50 | 60.77 | 74.49 |
| qwen1.5-14b-hf | 83.08 | 72.09 | 70.22 | 90.11 | 94.20 | 69.46 | 73.89 | 70.35 | 82.29 | 83.00 | 65.31 | 78.33 |
| qwen1.5-32b-hf | 87.59 | 78.14 | 79.78 | 92.86 | 95.65 | 78.32 | 80.56 | 79.90 | 90.29 | 89.00 | 77.27 | 86.68 |
| qwen1.5-72b-hf | 91.35 | 76.74 | 79.21 | 91.76 | 96.14 | 79.25 | 85.56 | 86.93 | 92.00 | 90.00 | 75.84 | 86.91 |
| qwen1.5-moe-a2-7b-hf | 88.35 | 75.81 | 51.12 | 79.12 | 94.69 | 67.37 | 80.56 | 73.37 | 87.43 | 84.00 | 78.23 | 82.39 |
| mistral-7b-v0.1-hf | 40.23 | 39.07 | 24.16 | 41.21 | 52.17 | 41.49 | 45.00 | 52.26 | 45.14 | 42.00 | 42.58 | 44.02 |
| mistral-7b-v0.2-hf | 36.84 | 34.88 | 23.03 | 43.96 | 52.66 | 40.79 | 50.00 | 50.75 | 45.14 | 40.50 | 42.58 | 40.86 |
| mixtral-8x7b-v0.1-hf | 47.74 | 40.00 | 28.09 | 57.14 | 58.94 | 44.29 | 58.33 | 53.77 | 48.57 | 46.00 | 51.20 | 46.50 |
| mixtral-8x22b-v0.1-hf | 59.02 | 41.86 | 29.78 | 60.99 | 71.01 | 50.82 | 57.78 | 67.34 | 62.29 | 52.00 | 53.35 | 55.98 |
| yi-6b-hf | 85.34 | 67.91 | 53.93 | 80.22 | 91.79 | 65.97 | 72.22 | 72.36 | 82.29 | 84.50 | 69.86 | 71.56 |
| yi-34b-hf | 94.36 | 76.74 | 65.73 | 87.91 | 95.17 | 79.25 | 85.56 | 90.95 | 90.86 | 92.00 | 76.79 | 82.39 |
| deepseek-7b-base-hf | 65.79 | 29.30 | 32.58 | 47.80 | 67.15 | 37.76 | 44.44 | 52.26 | 43.43 | 36.50 | 41.15 | 37.02 |
| deepseek-67b-base-hf | 83.83 | 58.60 | 45.51 | 79.67 | 90.34 | 62.47 | 70.56 | 70.85 | 81.14 | 71.50 | 61.72 | 60.05 |

| model | fire_engineer | environmental_impact_assessment_engineer | tax_accountant | physician |
|:---:|---:|---:|---:|---:|
| llama-7b-turbomind | 22.34 | 24.91 | 29.12 | 27.77 |
| llama-13b-turbomind | 24.11 | 30.25 | 27.77 | 30.70 |
| llama-30b-turbomind | 28.72 | 31.67 | 31.83 | 36.57 |
| llama-65b-turbomind | 28.37 | 39.15 | 33.63 | 35.44 |
| llama-2-7b-turbomind | 22.70 | 24.91 | 25.51 | 29.80 |
| llama-2-13b-turbomind | 25.53 | 35.94 | 29.35 | 35.44 |
| llama-2-70b-turbomind | 36.52 | 52.67 | 36.12 | 52.60 |
| llama-3-8b-turbomind | 35.46 | 49.82 | 41.31 | 55.30 |
| llama-3-70b-turbomind | 48.58 | 64.41 | 52.60 | 75.40 |
| internlm2-1.8b-turbomind | 32.27 | 42.35 | 39.05 | 45.15 |
| internlm2-7b-turbomind | 46.81 | 55.16 | 47.63 | 67.27 |
| internlm2-20b-turbomind | 45.04 | 62.63 | 51.47 | 69.75 |
| qwen-1.8b-turbomind | 41.84 | 47.69 | 45.60 | 57.34 |
| qwen-7b-turbomind | 41.84 | 54.80 | 48.08 | 69.53 |
| qwen-14b-turbomind | 45.74 | 64.77 | 56.43 | 77.88 |
| qwen-72b-turbomind | 80.50 | 74.73 | 81.04 | 89.62 |
| qwen1.5-0.5b-hf | 39.36 | 41.28 | 38.37 | 48.08 |
| qwen1.5-1.8b-hf | 45.74 | 49.47 | 51.69 | 63.43 |
| qwen1.5-4b-hf | 50.35 | 51.60 | 58.69 | 75.17 |
| qwen1.5-7b-hf | 58.51 | 65.84 | 67.04 | 81.94 |
| qwen1.5-14b-hf | 63.83 | 67.26 | 72.23 | 87.36 |
| qwen1.5-32b-hf | 74.47 | 73.31 | 80.14 | 90.74 |
| qwen1.5-72b-hf | 79.79 | 75.09 | 81.04 | 90.07 |
| qwen1.5-moe-a2-7b-hf | 74.82 | 77.58 | 79.68 | 91.65 |
| mistral-7b-v0.1-hf | 32.27 | 45.91 | 37.70 | 50.56 |
| mistral-7b-v0.2-hf | 32.62 | 44.13 | 36.79 | 46.28 |
| mixtral-8x7b-v0.1-hf | 35.11 | 53.02 | 46.73 | 52.37 |
| mixtral-8x22b-v0.1-hf | 38.65 | 56.23 | 49.21 | 59.82 |
| yi-6b-hf | 67.38 | 68.68 | 69.53 | 83.07 |
| yi-34b-hf | 77.66 | 83.27 | 77.43 | 89.84 |
| deepseek-7b-base-hf | 30.50 | 38.79 | 35.67 | 46.28 |
| deepseek-67b-base-hf | 46.81 | 65.12 | 54.40 | 77.65 |

### Details on Dev Split

## Chat Models

| model | ceval-test | ceval-test-hard | ceval-test-stem | ceval-test-social-science | ceval-test-humanities | ceval-test-other | ceval-dev | ceval-dev-hard | ceval-dev-stem | ceval-dev-social-science | ceval-dev-humanities | ceval-dev-other |
|:---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
| qwen1.5-0.5b-chat-hf | 36.88 | 28.83 | 34.49 | 43.46 | 37.35 | 34.76 | 38.58 | 33.90 | 33.63 | 43.81 | 41.79 | 39.59 |
| qwen1.5-1.8b-chat-hf | 55.17 | 38.21 | 50.63 | 70.26 | 56.04 | 48.82 | 55.93 | 37.60 | 50.31 | 67.59 | 60.90 | 50.59 |
| qwen1.5-4b-chat-hf | 61.54 | 44.79 | 56.86 | 75.84 | 62.13 | 56.46 | 62.76 | 38.32 | 55.39 | 79.53 | 65.67 | 58.00 |
| qwen1.5-7b-chat-hf | 68.71 | 51.77 | 64.27 | 81.23 | 68.22 | 65.88 | 71.10 | 50.13 | 65.42 | 83.99 | 73.77 | 67.02 |
| qwen1.5-14b-chat-hf | 74.80 | 56.54 | 69.46 | 87.47 | 76.46 | 71.32 | 76.35 | 52.08 | 69.68 | 86.70 | 80.56 | 74.87 |
| qwen1.5-32b-chat-hf | 80.47 | 63.17 | 75.66 | 89.58 | 81.98 | 79.43 | 81.27 | 63.51 | 76.64 | 89.39 | 82.97 | 80.59 |
| qwen1.5-72b-chat-hf | 81.53 | 63.62 | 75.86 | 90.74 | 83.18 | 81.84 | 82.88 | 62.44 | 77.54 | 89.80 | 86.11 | 83.07 |
| qwen1.5-110b-chat-hf | 87.33 | 67.27 | 80.70 | 93.58 | 89.67 | 91.35 | 87.59 | 73.64 | 81.94 | 91.47 | 92.12 | 89.80 |
| internlm2-chat-1.8b-hf | 47.04 | 34.81 | 43.28 | 59.34 | 48.24 | 41.50 | 48.51 | 36.75 | 42.23 | 57.79 | 54.83 | 45.15 |
| internlm2-chat-1.8b-sft-hf | 47.19 | 35.34 | 43.49 | 59.56 | 48.30 | 41.58 | 48.75 | 35.83 | 42.04 | 59.80 | 54.84 | 44.83 |
| internlm2-chat-7b-hf | 58.75 | 39.61 | 52.38 | 71.46 | 61.57 | 55.96 | 61.04 | 36.56 | 51.81 | 74.01 | 69.13 | 57.92 |
| internlm2-chat-7b-sft-hf | 58.96 | 40.09 | 52.40 | 71.49 | 62.20 | 56.26 | 61.02 | 37.29 | 52.60 | 74.01 | 68.27 | 57.27 |
| internlm2-chat-20b-hf | 63.12 | 42.65 | 56.21 | 75.64 | 67.15 | 60.27 | 63.45 | 34.96 | 52.84 | 79.27 | 71.50 | 60.32 |
| internlm2-chat-20b-sft-hf | 63.16 | 42.70 | 56.19 | 75.74 | 67.20 | 60.37 | 63.54 | 34.96 | 52.57 | 80.33 | 71.42 | 60.34 |
| llama-3-8b-instruct-hf | 50.90 | 34.54 | 46.73 | 58.73 | 49.24 | 53.04 | 52.55 | 36.37 | 48.47 | 58.03 | 53.26 | 54.26 |
| llama-3-70b-instruct-hf | 67.38 | 54.02 | 65.16 | 76.83 | 62.29 | 67.92 | 67.92 | 54.50 | 66.85 | 76.80 | 65.98 | 63.72 |
| llama-3-8b-instruct-lmdeploy | 49.92 | 34.75 | 46.19 | 58.49 | 47.68 | 51.14 | 50.27 | 33.32 | 46.25 | 56.93 | 49.02 | 52.76 |
| llama-3-70b-instruct-lmdeploy | 66.41 | 52.76 | 64.72 | 75.31 | 61.36 | 66.44 | 68.21 | 52.28 | 65.86 | 75.06 | 68.37 | 66.09 |
| mistral-7b-instruct-v0.1-hf | 36.76 | 27.76 | 35.55 | 42.41 | 34.45 | 36.12 | 40.04 | 30.21 | 35.77 | 45.15 | 40.99 | 42.22 |
| mistral-7b-instruct-v0.2-hf | 40.38 | 30.26 | 38.82 | 47.66 | 37.08 | 39.91 | 43.00 | 25.97 | 38.60 | 47.44 | 48.15 | 41.82 |
| mixtral-8x7b-instruct-v0.1-hf | 49.61 | 37.78 | 47.86 | 58.56 | 46.40 | 47.85 | 51.68 | 37.41 | 49.14 | 59.79 | 52.97 | 47.65 |

### Details on Test Split

| model | computer_network | operating_system | computer_architecture | college_programming | college_physics | college_chemistry | advanced_mathematics | probability_and_statistics | discrete_mathematics | electrical_engineer | metrology_engineer | high_school_mathematics |
|:---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
| qwen1.5-0.5b-chat-hf | 35.67 | 36.87 | 33.68 | 33.92 | 35.23 | 28.12 | 27.17 | 26.51 | 24.84 | 28.91 | 40.18 | 25.90 |
| qwen1.5-1.8b-chat-hf | 46.78 | 47.49 | 50.78 | 39.18 | 41.48 | 31.25 | 32.95 | 27.71 | 28.10 | 34.81 | 55.71 | 27.11 |
| qwen1.5-4b-chat-hf | 54.39 | 54.75 | 54.92 | 44.74 | 46.02 | 43.30 | 39.31 | 31.33 | 28.10 | 45.13 | 58.90 | 43.98 |
| qwen1.5-7b-chat-hf | 60.82 | 60.34 | 63.21 | 55.85 | 48.86 | 45.09 | 46.24 | 36.14 | 39.22 | 47.49 | 70.32 | 45.78 |
| qwen1.5-14b-chat-hf | 69.59 | 62.57 | 64.77 | 64.91 | 55.68 | 57.14 | 49.13 | 32.53 | 43.14 | 55.16 | 76.71 | 46.99 |
| qwen1.5-32b-chat-hf | 81.87 | 74.30 | 73.58 | 71.35 | 63.07 | 60.71 | 50.87 | 46.99 | 47.06 | 59.29 | 83.11 | 60.84 |
| qwen1.5-72b-chat-hf | 77.78 | 75.42 | 76.17 | 73.39 | 63.64 | 62.50 | 45.09 | 45.78 | 48.37 | 59.00 | 81.74 | 60.84 |
| qwen1.5-110b-chat-hf | 83.63 | 86.03 | 81.87 | 77.49 | 76.70 | 67.86 | 49.13 | 47.59 | 55.56 | 79.94 | 95.89 | 62.05 |
| internlm2-chat-1.8b-hf | 42.11 | 43.58 | 44.56 | 35.38 | 32.95 | 34.82 | 32.95 | 28.92 | 32.68 | 34.22 | 53.42 | 31.93 |
| internlm2-chat-1.8b-sft-hf | 42.11 | 44.13 | 43.01 | 35.09 | 34.09 | 36.16 | 32.95 | 27.11 | 33.33 | 35.10 | 51.14 | 33.13 |
| internlm2-chat-7b-hf | 59.65 | 60.89 | 58.03 | 51.46 | 36.93 | 43.75 | 36.99 | 29.52 | 36.60 | 39.82 | 63.47 | 38.55 |
| internlm2-chat-7b-sft-hf | 59.06 | 61.45 | 56.48 | 52.63 | 39.77 | 41.52 | 36.99 | 27.71 | 39.22 | 40.12 | 62.10 | 40.36 |
| internlm2-chat-20b-hf | 61.99 | 70.39 | 63.73 | 54.97 | 33.52 | 47.77 | 43.93 | 40.96 | 44.44 | 44.25 | 61.64 | 34.34 |
| internlm2-chat-20b-sft-hf | 61.40 | 70.39 | 63.21 | 54.97 | 32.95 | 47.77 | 42.20 | 42.17 | 43.14 | 44.25 | 61.64 | 32.53 |
| llama-3-8b-instruct-hf | 57.31 | 58.10 | 57.51 | 51.17 | 28.41 | 35.27 | 39.31 | 32.53 | 35.29 | 38.05 | 55.25 | 27.11 |
| llama-3-70b-instruct-hf | 71.93 | 74.86 | 70.98 | 67.54 | 50.57 | 57.14 | 52.60 | 53.01 | 56.21 | 47.79 | 68.95 | 43.98 |
| llama-3-8b-instruct-lmdeploy | 55.56 | 57.54 | 55.44 | 48.25 | 30.11 | 33.04 | 35.84 | 31.33 | 33.33 | 38.94 | 53.88 | 31.93 |
| llama-3-70b-instruct-lmdeploy | 70.76 | 77.09 | 69.95 | 67.84 | 49.43 | 54.02 | 50.87 | 54.22 | 56.21 | 47.20 | 69.86 | 42.17 |
| mistral-7b-instruct-v0.1-hf | 49.12 | 47.49 | 43.52 | 39.18 | 32.39 | 28.57 | 29.48 | 24.10 | 28.10 | 37.46 | 44.29 | 23.49 |
| mistral-7b-instruct-v0.2-hf | 47.95 | 53.07 | 52.85 | 42.69 | 28.41 | 26.79 | 40.46 | 30.12 | 29.41 | 33.33 | 42.92 | 24.10 |
| mixtral-8x7b-instruct-v0.1-hf | 58.48 | 62.57 | 58.03 | 56.43 | 38.64 | 36.16 | 39.31 | 34.94 | 37.91 | 34.81 | 55.71 | 28.31 |

| model | high_school_physics | high_school_chemistry | high_school_biology | middle_school_mathematics | middle_school_biology | middle_school_physics | middle_school_chemistry | veterinary_medicine | college_economics | business_administration | marxism | mao_zedong_thought |
|:---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
| qwen1.5-0.5b-chat-hf | 30.86 | 31.98 | 44.00 | 27.68 | 47.40 | 40.45 | 55.14 | 35.24 | 32.80 | 30.56 | 58.66 | 57.53 |
| qwen1.5-1.8b-chat-hf | 54.86 | 62.21 | 69.14 | 53.67 | 82.81 | 83.15 | 85.41 | 58.10 | 44.06 | 49.83 | 82.12 | 82.65 |
| qwen1.5-4b-chat-hf | 58.86 | 67.44 | 80.00 | 55.93 | 89.58 | 88.20 | 88.11 | 64.29 | 47.08 | 57.48 | 86.59 | 84.93 |
| qwen1.5-7b-chat-hf | 72.00 | 80.81 | 84.00 | 70.06 | 95.31 | 94.94 | 95.14 | 73.81 | 56.94 | 66.11 | 91.62 | 89.04 |
| qwen1.5-14b-chat-hf | 84.00 | 83.72 | 90.29 | 80.23 | 97.92 | 94.94 | 98.38 | 81.43 | 63.18 | 74.75 | 93.30 | 96.80 |
| qwen1.5-32b-chat-hf | 85.71 | 90.12 | 93.71 | 85.31 | 97.92 | 98.31 | 100.00 | 89.05 | 69.82 | 75.75 | 93.85 | 97.72 |
| qwen1.5-72b-chat-hf | 88.57 | 94.19 | 94.86 | 85.31 | 97.92 | 97.75 | 98.38 | 90.48 | 71.63 | 79.73 | 93.85 | 97.72 |
| qwen1.5-110b-chat-hf | 86.86 | 92.44 | 94.29 | 85.31 | 98.44 | 98.88 | 98.92 | 95.24 | 78.87 | 86.38 | 95.53 | 99.54 |
| internlm2-chat-1.8b-hf | 35.43 | 48.84 | 52.00 | 35.03 | 70.31 | 67.98 | 67.03 | 41.43 | 37.83 | 36.88 | 70.95 | 60.73 |
| internlm2-chat-1.8b-sft-hf | 37.71 | 48.26 | 53.14 | 34.46 | 71.35 | 67.98 | 67.57 | 41.90 | 38.63 | 37.54 | 72.63 | 60.27 |
| internlm2-chat-7b-hf | 46.29 | 48.26 | 60.57 | 46.89 | 78.65 | 71.91 | 71.35 | 68.10 | 50.30 | 50.83 | 77.09 | 76.26 |
| internlm2-chat-7b-sft-hf | 46.86 | 48.26 | 61.14 | 45.76 | 77.60 | 71.91 | 71.35 | 67.62 | 50.10 | 50.50 | 77.09 | 75.80 |
| internlm2-chat-20b-hf | 49.71 | 46.51 | 63.43 | 55.37 | 80.73 | 74.72 | 79.46 | 72.38 | 55.73 | 59.80 | 85.47 | 76.26 |
| internlm2-chat-20b-sft-hf | 53.71 | 47.09 | 64.00 | 55.37 | 80.73 | 73.60 | 78.92 | 73.81 | 55.53 | 60.13 | 85.47 | 75.80 |
| llama-3-8b-instruct-hf | 38.86 | 39.53 | 50.29 | 40.11 | 65.10 | 60.11 | 63.78 | 61.43 | 47.89 | 45.85 | 69.27 | 56.16 |
| llama-3-70b-instruct-hf | 63.43 | 55.23 | 69.71 | 68.36 | 85.42 | 80.90 | 78.38 | 86.19 | 69.01 | 65.12 | 83.24 | 82.65 |
| llama-3-8b-instruct-lmdeploy | 41.71 | 40.70 | 52.00 | 41.24 | 61.46 | 58.43 | 65.41 | 57.62 | 45.27 | 46.18 | 69.27 | 55.71 |
| llama-3-70b-instruct-lmdeploy | 61.71 | 53.49 | 70.86 | 64.97 | 88.02 | 83.71 | 77.30 | 84.76 | 68.21 | 60.80 | 80.45 | 79.91 |
| mistral-7b-instruct-v0.1-hf | 27.43 | 28.49 | 36.00 | 28.25 | 40.10 | 42.70 | 43.78 | 37.14 | 32.80 | 37.87 | 41.90 | 48.86 |
| mistral-7b-instruct-v0.2-hf | 33.14 | 29.65 | 44.00 | 31.07 | 47.92 | 44.94 | 49.19 | 44.29 | 37.02 | 40.86 | 53.63 | 48.40 |
| mixtral-8x7b-instruct-v0.1-hf | 46.29 | 40.70 | 54.86 | 42.37 | 58.85 | 60.67 | 57.84 | 54.29 | 50.10 | 46.51 | 69.27 | 52.51 |

| model | education_science | teacher_qualification | high_school_politics | high_school_geography | middle_school_politics | middle_school_geography | modern_chinese_history | ideological_and_moral_cultivation | logic | law | chinese_language_and_literature | art_studies |
|:---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
| qwen1.5-0.5b-chat-hf | 33.33 | 46.12 | 37.50 | 37.08 | 57.51 | 43.52 | 42.45 | 51.74 | 32.84 | 31.22 | 37.32 | 24.50 |
| qwen1.5-1.8b-chat-hf | 54.07 | 72.43 | 74.43 | 66.85 | 89.12 | 87.04 | 77.36 | 76.16 | 38.24 | 44.34 | 46.89 | 40.94 |
| qwen1.5-4b-chat-hf | 60.00 | 84.71 | 82.39 | 69.66 | 94.82 | 90.74 | 79.72 | 78.49 | 41.67 | 57.47 | 54.07 | 56.38 |
| qwen1.5-7b-chat-hf | 66.30 | 90.73 | 84.66 | 80.90 | 94.30 | 91.67 | 82.55 | 84.88 | 38.73 | 60.18 | 60.77 | 63.42 |
| qwen1.5-14b-chat-hf | 74.81 | 93.73 | 90.91 | 92.13 | 96.89 | 98.15 | 89.62 | 88.37 | 54.41 | 70.14 | 69.86 | 69.13 |
| qwen1.5-32b-chat-hf | 80.37 | 94.49 | 93.75 | 94.94 | 97.93 | 97.22 | 90.09 | 90.70 | 68.63 | 78.73 | 73.21 | 77.52 |
| qwen1.5-72b-chat-hf | 84.07 | 96.74 | 95.45 | 94.94 | 97.93 | 95.37 | 92.92 | 91.28 | 63.73 | 80.09 | 73.68 | 83.89 |
| qwen1.5-110b-chat-hf | 90.37 | 96.99 | 96.02 | 95.51 | 98.45 | 98.15 | 93.87 | 94.19 | 81.37 | 86.88 | 84.69 | 90.94 |
| internlm2-chat-1.8b-hf | 48.15 | 65.41 | 69.32 | 54.49 | 79.27 | 70.37 | 60.85 | 64.53 | 32.35 | 32.58 | 45.45 | 40.60 |
| internlm2-chat-1.8b-sft-hf | 48.15 | 64.91 | 69.89 | 53.93 | 79.27 | 70.37 | 61.32 | 63.95 | 33.82 | 29.86 | 45.45 | 39.93 |
| internlm2-chat-7b-hf | 66.67 | 85.21 | 73.30 | 66.85 | 91.19 | 76.85 | 70.28 | 75.58 | 42.16 | 50.68 | 60.77 | 70.47 |
| internlm2-chat-7b-sft-hf | 67.04 | 85.21 | 73.86 | 66.85 | 90.67 | 77.78 | 71.70 | 75.00 | 42.16 | 51.13 | 60.29 | 72.15 |
| internlm2-chat-20b-hf | 74.07 | 85.96 | 75.57 | 77.53 | 89.12 | 76.85 | 72.64 | 83.72 | 51.96 | 56.11 | 68.42 | 73.49 |
| internlm2-chat-20b-sft-hf | 73.70 | 85.46 | 76.70 | 78.09 | 89.64 | 76.85 | 72.17 | 84.88 | 50.00 | 56.56 | 66.99 | 75.17 |
| llama-3-8b-instruct-hf | 55.93 | 67.42 | 55.68 | 55.06 | 72.02 | 62.04 | 54.25 | 66.86 | 44.12 | 40.72 | 47.37 | 44.63 |
| llama-3-70b-instruct-hf | 71.11 | 84.21 | 74.43 | 73.03 | 84.97 | 80.56 | 69.81 | 78.49 | 57.35 | 50.68 | 57.89 | 64.43 |
| llama-3-8b-instruct-lmdeploy | 54.81 | 67.17 | 58.52 | 53.37 | 72.54 | 62.04 | 57.08 | 63.95 | 44.12 | 37.56 | 46.89 | 42.62 |
| llama-3-70b-instruct-lmdeploy | 70.37 | 82.96 | 72.16 | 71.91 | 83.94 | 82.41 | 69.34 | 77.91 | 55.39 | 50.68 | 56.46 | 64.09 |
| mistral-7b-instruct-v0.1-hf | 39.63 | 46.62 | 33.52 | 41.01 | 56.48 | 45.37 | 36.32 | 43.60 | 29.90 | 31.67 | 39.71 | 31.88 |
| mistral-7b-instruct-v0.2-hf | 46.30 | 54.39 | 39.20 | 43.26 | 61.66 | 51.85 | 35.38 | 55.23 | 28.92 | 35.29 | 37.80 | 29.19 |
| mixtral-8x7b-instruct-v0.1-hf | 58.52 | 66.17 | 56.82 | 57.30 | 66.32 | 62.04 | 48.11 | 66.28 | 41.67 | 37.10 | 46.41 | 35.91 |

| model | professional_tour_guide | legal_professional | high_school_chinese | high_school_history | middle_school_history | civil_servant | sports_science | plant_protection | basic_medicine | clinical_medicine | urban_and_rural_planner | accountant |
|:---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
| qwen1.5-0.5b-chat-hf | 36.47 | 39.07 | 27.53 | 41.76 | 45.89 | 39.63 | 35.56 | 31.66 | 37.71 | 34.00 | 32.78 | 37.25 |
| qwen1.5-1.8b-chat-hf | 56.02 | 45.58 | 39.33 | 67.03 | 84.54 | 49.42 | 48.89 | 51.76 | 47.43 | 50.50 | 45.69 | 52.14 |
| qwen1.5-4b-chat-hf | 61.28 | 52.56 | 42.70 | 73.08 | 85.99 | 55.48 | 59.44 | 55.28 | 60.57 | 57.00 | 50.00 | 58.01 |
| qwen1.5-7b-chat-hf | 73.31 | 56.28 | 58.99 | 82.97 | 88.41 | 64.57 | 66.67 | 63.82 | 77.14 | 75.50 | 57.42 | 69.07 |
| qwen1.5-14b-chat-hf | 80.83 | 65.12 | 70.79 | 89.56 | 93.24 | 67.60 | 72.78 | 68.34 | 80.57 | 80.00 | 61.72 | 75.62 |
| qwen1.5-32b-chat-hf | 87.59 | 72.56 | 76.40 | 90.66 | 95.65 | 74.36 | 80.00 | 80.40 | 86.86 | 84.00 | 74.88 | 85.33 |
| qwen1.5-72b-chat-hf | 90.98 | 76.28 | 75.84 | 90.66 | 95.65 | 75.52 | 84.44 | 82.91 | 91.43 | 89.00 | 73.92 | 85.10 |
| qwen1.5-110b-chat-hf | 95.11 | 88.37 | 82.58 | 91.76 | 96.62 | 87.65 | 91.67 | 90.95 | 93.71 | 95.00 | 87.08 | 91.87 |
| internlm2-chat-1.8b-hf | 54.14 | 40.00 | 27.53 | 62.09 | 70.53 | 44.99 | 41.67 | 51.76 | 45.71 | 39.00 | 40.67 | 39.28 |
| internlm2-chat-1.8b-sft-hf | 54.14 | 42.33 | 26.97 | 61.54 | 71.98 | 45.45 | 41.67 | 50.25 | 45.14 | 37.50 | 41.39 | 40.63 |
| internlm2-chat-7b-hf | 70.68 | 44.19 | 34.83 | 73.63 | 84.06 | 51.98 | 57.22 | 68.34 | 66.86 | 57.50 | 54.55 | 50.11 |
| internlm2-chat-7b-sft-hf | 71.80 | 44.65 | 37.64 | 73.63 | 84.06 | 51.98 | 57.78 | 67.84 | 65.71 | 60.50 | 54.55 | 50.11 |
| internlm2-chat-20b-hf | 75.56 | 54.42 | 42.13 | 74.73 | 85.51 | 57.34 | 65.56 | 67.84 | 73.71 | 64.00 | 57.89 | 55.98 |
| internlm2-chat-20b-sft-hf | 76.32 | 55.35 | 41.01 | 75.27 | 85.51 | 58.28 | 65.56 | 67.34 | 72.57 | 65.00 | 58.37 | 56.43 |
| llama-3-8b-instruct-hf | 53.01 | 44.65 | 33.15 | 46.70 | 66.18 | 45.22 | 58.89 | 61.81 | 62.86 | 57.50 | 48.33 | 49.89 |
| llama-3-70b-instruct-hf | 71.43 | 50.70 | 30.90 | 71.43 | 82.13 | 59.67 | 73.33 | 73.37 | 82.86 | 82.00 | 59.09 | 62.08 |
| llama-3-8b-instruct-lmdeploy | 51.13 | 45.12 | 29.78 | 43.96 | 62.32 | 47.09 | 56.11 | 54.77 | 56.00 | 56.00 | 49.04 | 47.40 |
| llama-3-70b-instruct-lmdeploy | 68.80 | 48.84 | 30.90 | 70.88 | 81.64 | 58.28 | 72.22 | 70.85 | 80.00 | 81.00 | 57.66 | 62.53 |
| mistral-7b-instruct-v0.1-hf | 30.45 | 35.81 | 24.72 | 40.11 | 34.78 | 30.77 | 43.89 | 38.69 | 36.57 | 32.50 | 44.74 | 34.09 |
| mistral-7b-instruct-v0.2-hf | 36.09 | 38.14 | 23.03 | 43.41 | 45.41 | 35.90 | 50.00 | 41.71 | 42.86 | 36.00 | 45.22 | 42.21 |
| mixtral-8x7b-instruct-v0.1-hf | 47.37 | 44.65 | 30.34 | 51.65 | 60.87 | 42.19 | 53.89 | 58.29 | 52.00 | 47.00 | 48.56 | 44.02 |

| model | fire_engineer | environmental_impact_assessment_engineer | tax_accountant | physician |
|:---:|---:|---:|---:|---:|
| qwen1.5-0.5b-chat-hf | 27.66 | 38.43 | 32.28 | 35.44 |
| qwen1.5-1.8b-chat-hf | 38.65 | 46.62 | 46.73 | 59.14 |
| qwen1.5-4b-chat-hf | 49.29 | 54.80 | 51.02 | 70.20 |
| qwen1.5-7b-chat-hf | 53.90 | 62.28 | 57.79 | 76.52 |
| qwen1.5-14b-chat-hf | 58.87 | 65.12 | 67.27 | 86.68 |
| qwen1.5-32b-chat-hf | 74.11 | 70.82 | 74.94 | 88.04 |
| qwen1.5-72b-chat-hf | 74.82 | 75.09 | 78.56 | 89.39 |
| qwen1.5-110b-chat-hf | 88.30 | 88.97 | 94.13 | 95.49 |
| internlm2-chat-1.8b-hf | 30.14 | 41.99 | 34.54 | 46.73 |
| internlm2-chat-1.8b-sft-hf | 30.14 | 43.06 | 34.31 | 47.86 |
| internlm2-chat-7b-hf | 42.20 | 52.31 | 47.63 | 66.82 |
| internlm2-chat-7b-sft-hf | 43.26 | 52.67 | 47.86 | 66.59 |
| internlm2-chat-20b-hf | 45.74 | 54.80 | 51.02 | 69.07 |
| internlm2-chat-20b-sft-hf | 45.74 | 55.16 | 51.02 | 68.62 |
| llama-3-8b-instruct-hf | 37.59 | 50.53 | 42.44 | 68.40 |
| llama-3-70b-instruct-hf | 50.71 | 64.06 | 55.53 | 84.42 |
| llama-3-8b-instruct-lmdeploy | 37.94 | 50.53 | 41.53 | 66.14 |
| llama-3-70b-instruct-lmdeploy | 48.94 | 63.70 | 53.95 | 81.72 |
| mistral-7b-instruct-v0.1-hf | 27.66 | 39.15 | 29.35 | 39.95 |
| mistral-7b-instruct-v0.2-hf | 32.27 | 37.01 | 32.96 | 42.89 |
| mixtral-8x7b-instruct-v0.1-hf | 36.88 | 48.75 | 41.76 | 53.05 |

### Details on Dev Split

64
configs/datasets/gpqa/README.md
Normal file
@ -0,0 +1,64 @@

# GPQA

## Base Models

| model | GPQA_diamond |
|:------------------------:|---------------:|
| llama-7b-turbomind | 24.24 |
| llama-13b-turbomind | 25.25 |
| llama-30b-turbomind | 22.73 |
| llama-65b-turbomind | 21.72 |
| llama-2-7b-turbomind | 25.25 |
| llama-2-13b-turbomind | 23.74 |
| llama-2-70b-turbomind | 28.28 |
| llama-3-8b-turbomind | 31.82 |
| llama-3-70b-turbomind | 40.91 |
| internlm2-1.8b-turbomind | 24.24 |
| internlm2-7b-turbomind | 28.28 |
| internlm2-20b-turbomind | 31.31 |
| qwen-1.8b-turbomind | 28.79 |
| qwen-7b-turbomind | 24.75 |
| qwen-14b-turbomind | 27.78 |
| qwen-72b-turbomind | 31.31 |
| qwen1.5-0.5b-hf | 23.74 |
| qwen1.5-1.8b-hf | 28.79 |
| qwen1.5-4b-hf | 23.23 |
| qwen1.5-7b-hf | 20.71 |
| qwen1.5-14b-hf | 32.32 |
| qwen1.5-32b-hf | 30.81 |
| qwen1.5-72b-hf | 31.82 |
| qwen1.5-moe-a2-7b-hf | 28.79 |
| mistral-7b-v0.1-hf | 24.75 |
| mistral-7b-v0.2-hf | 23.74 |
| mixtral-8x7b-v0.1-hf | 28.79 |
| mixtral-8x22b-v0.1-hf | 36.36 |
| yi-6b-hf | 28.28 |
| yi-34b-hf | 35.86 |
| deepseek-7b-base-hf | 20.71 |
| deepseek-67b-base-hf | 25.25 |

## Chat Models

| model | GPQA_diamond |
|:-----------------------------:|---------------:|
| qwen1.5-0.5b-chat-hf | 19.70 |
| qwen1.5-1.8b-chat-hf | 29.80 |
| qwen1.5-4b-chat-hf | 25.25 |
| qwen1.5-7b-chat-hf | 31.82 |
| qwen1.5-14b-chat-hf | 30.30 |
| qwen1.5-32b-chat-hf | 31.31 |
| qwen1.5-72b-chat-hf | 32.83 |
| qwen1.5-110b-chat-hf | 35.86 |
| internlm2-chat-1.8b-hf | 25.76 |
| internlm2-chat-1.8b-sft-hf | 26.26 |
| internlm2-chat-7b-hf | 28.28 |
| internlm2-chat-7b-sft-hf | 27.27 |
| internlm2-chat-20b-hf | 30.30 |
| internlm2-chat-20b-sft-hf | 29.29 |
| llama-3-8b-instruct-hf | 25.76 |
| llama-3-70b-instruct-hf | 37.88 |
| llama-3-8b-instruct-lmdeploy | 25.76 |
| llama-3-70b-instruct-lmdeploy | 37.88 |
| mistral-7b-instruct-v0.1-hf | 30.30 |
| mistral-7b-instruct-v0.2-hf | 25.25 |
| mixtral-8x7b-instruct-v0.1-hf | 30.30 |

64
configs/datasets/gsm8k/README.md
Normal file
@ -0,0 +1,64 @@

# GSM8K

## Base Models

| model | gsm8k |
|:------------------------:|--------:|
| llama-7b-turbomind | 10.31 |
| llama-13b-turbomind | 20.55 |
| llama-30b-turbomind | 42.08 |
| llama-65b-turbomind | 54.81 |
| llama-2-7b-turbomind | 16.76 |
| llama-2-13b-turbomind | 29.87 |
| llama-2-70b-turbomind | 63.53 |
| llama-3-8b-turbomind | 54.28 |
| llama-3-70b-turbomind | 69.98 |
| internlm2-1.8b-turbomind | 30.40 |
| internlm2-7b-turbomind | 69.98 |
| internlm2-20b-turbomind | 76.80 |
| qwen-1.8b-turbomind | 23.73 |
| qwen-7b-turbomind | 54.36 |
| qwen-14b-turbomind | 61.64 |
| qwen-72b-turbomind | 79.68 |
| qwen1.5-0.5b-hf | 13.27 |
| qwen1.5-1.8b-hf | 34.87 |
| qwen1.5-4b-hf | 47.61 |
| qwen1.5-7b-hf | 54.36 |
| qwen1.5-14b-hf | 63.53 |
| qwen1.5-32b-hf | 72.71 |
| qwen1.5-72b-hf | 79.53 |
| qwen1.5-moe-a2-7b-hf | 61.26 |
| mistral-7b-v0.1-hf | 47.61 |
| mistral-7b-v0.2-hf | 45.19 |
| mixtral-8x7b-v0.1-hf | 66.26 |
| mixtral-8x22b-v0.1-hf | 82.87 |
| yi-6b-hf | 39.58 |
| yi-34b-hf | 67.70 |
| deepseek-7b-base-hf | 20.17 |
| deepseek-67b-base-hf | 68.16 |

## Chat Models

| model | gsm8k |
|:-----------------------------:|--------:|
| qwen1.5-0.5b-chat-hf | 8.79 |
| qwen1.5-1.8b-chat-hf | 27.60 |
| qwen1.5-4b-chat-hf | 47.61 |
| qwen1.5-7b-chat-hf | 56.25 |
| qwen1.5-14b-chat-hf | 64.90 |
| qwen1.5-32b-chat-hf | 79.91 |
| qwen1.5-72b-chat-hf | 77.03 |
| qwen1.5-110b-chat-hf | 79.53 |
| internlm2-chat-1.8b-hf | 39.73 |
| internlm2-chat-1.8b-sft-hf | 36.85 |
| internlm2-chat-7b-hf | 69.90 |
| internlm2-chat-7b-sft-hf | 69.83 |
| internlm2-chat-20b-hf | 75.21 |
| internlm2-chat-20b-sft-hf | 76.95 |
| llama-3-8b-instruct-hf | 79.53 |
| llama-3-70b-instruct-hf | 89.76 |
| llama-3-8b-instruct-lmdeploy | 78.77 |
| llama-3-70b-instruct-lmdeploy | 89.31 |
| mistral-7b-instruct-v0.1-hf | 42.23 |
| mistral-7b-instruct-v0.2-hf | 45.56 |
| mixtral-8x7b-instruct-v0.1-hf | 65.13 |

64
configs/datasets/hellaswag/README.md
Normal file
@ -0,0 +1,64 @@

# HellaSwag

## Base Models

| model | hellaswag |
|:------------------------:|------------:|
| llama-7b-turbomind | 26.99 |
| llama-13b-turbomind | 34.21 |
| llama-30b-turbomind | 35.65 |
| llama-65b-turbomind | 44.63 |
| llama-2-7b-turbomind | 29.29 |
| llama-2-13b-turbomind | 45.06 |
| llama-2-70b-turbomind | 55.91 |
| llama-3-8b-turbomind | 50.86 |
| llama-3-70b-turbomind | 80.60 |
| internlm2-1.8b-turbomind | 44.86 |
| internlm2-7b-turbomind | 89.52 |
| internlm2-20b-turbomind | 91.41 |
| qwen-1.8b-turbomind | 38.04 |
| qwen-7b-turbomind | 64.62 |
| qwen-14b-turbomind | 85.88 |
| qwen-72b-turbomind | 90.40 |
| qwen1.5-0.5b-hf | 29.19 |
| qwen1.5-1.8b-hf | 42.32 |
| qwen1.5-4b-hf | 55.89 |
| qwen1.5-7b-hf | 68.51 |
| qwen1.5-14b-hf | 83.86 |
| qwen1.5-32b-hf | 87.28 |
| qwen1.5-72b-hf | 90.41 |
| qwen1.5-moe-a2-7b-hf | 72.42 |
| mistral-7b-v0.1-hf | 42.04 |
| mistral-7b-v0.2-hf | 46.24 |
| mixtral-8x7b-v0.1-hf | 66.22 |
| mixtral-8x22b-v0.1-hf | 79.66 |
| yi-6b-hf | 66.83 |
| yi-34b-hf | 83.83 |
| deepseek-7b-base-hf | 30.42 |
| deepseek-67b-base-hf | 70.75 |

## Chat Models

| model | hellaswag |
|:-----------------------------:|------------:|
| qwen1.5-0.5b-chat-hf | 29.60 |
| qwen1.5-1.8b-chat-hf | 41.71 |
| qwen1.5-4b-chat-hf | 60.45 |
| qwen1.5-7b-chat-hf | 71.58 |
| qwen1.5-14b-chat-hf | 79.70 |
| qwen1.5-32b-chat-hf | 88.56 |
| qwen1.5-72b-chat-hf | 89.37 |
| qwen1.5-110b-chat-hf | 91.11 |
| internlm2-chat-1.8b-hf | 60.47 |
| internlm2-chat-1.8b-sft-hf | 61.58 |
| internlm2-chat-7b-hf | 84.80 |
| internlm2-chat-7b-sft-hf | 85.21 |
| internlm2-chat-20b-hf | 88.48 |
| internlm2-chat-20b-sft-hf | 88.95 |
| llama-3-8b-instruct-hf | 74.39 |
| llama-3-70b-instruct-hf | 89.07 |
| llama-3-8b-instruct-lmdeploy | 73.31 |
| llama-3-70b-instruct-lmdeploy | 87.28 |
| mistral-7b-instruct-v0.1-hf | 53.00 |
| mistral-7b-instruct-v0.2-hf | 65.72 |
| mixtral-8x7b-instruct-v0.1-hf | 76.16 |

64
configs/datasets/humaneval/README.md
Normal file
@ -0,0 +1,64 @@

# HumanEval

## Base Models

| model | pass@1 |
|:------------------------:|---------:|
| llama-7b-turbomind | 12.80 |
| llama-13b-turbomind | 15.24 |
| llama-30b-turbomind | 9.15 |
| llama-65b-turbomind | 7.32 |
| llama-2-7b-turbomind | 14.02 |
| llama-2-13b-turbomind | 15.24 |
| llama-2-70b-turbomind | 15.24 |
| llama-3-8b-turbomind | 28.05 |
| llama-3-70b-turbomind | 28.05 |
| internlm2-1.8b-turbomind | 30.49 |
| internlm2-7b-turbomind | 48.17 |
| internlm2-20b-turbomind | 51.83 |
| qwen-1.8b-turbomind | 16.46 |
| qwen-7b-turbomind | 23.78 |
| qwen-14b-turbomind | 23.78 |
| qwen-72b-turbomind | 66.46 |
| qwen1.5-0.5b-hf | 8.54 |
| qwen1.5-1.8b-hf | 23.17 |
| qwen1.5-4b-hf | 41.46 |
| qwen1.5-7b-hf | 53.05 |
| qwen1.5-14b-hf | 57.32 |
| qwen1.5-32b-hf | 70.12 |
| qwen1.5-72b-hf | 65.85 |
| qwen1.5-moe-a2-7b-hf | 45.73 |
| mistral-7b-v0.1-hf | 14.02 |
| mistral-7b-v0.2-hf | 9.15 |
| mixtral-8x7b-v0.1-hf | 24.39 |
| mixtral-8x22b-v0.1-hf | 16.46 |
| yi-6b-hf | 14.63 |
| yi-34b-hf | 17.07 |
| deepseek-7b-base-hf | 18.29 |
| deepseek-67b-base-hf | 23.17 |

## Chat Models

| model | pass@1 |
|:-----------------------------:|---------:|
| qwen1.5-0.5b-chat-hf | 9.15 |
| qwen1.5-1.8b-chat-hf | 15.85 |
| qwen1.5-4b-chat-hf | 30.49 |
| qwen1.5-7b-chat-hf | 40.85 |
| qwen1.5-14b-chat-hf | 50.00 |
| qwen1.5-32b-chat-hf | 57.93 |
| qwen1.5-72b-chat-hf | 60.37 |
| qwen1.5-110b-chat-hf | 65.24 |
| internlm2-chat-1.8b-hf | 33.54 |
| internlm2-chat-1.8b-sft-hf | 34.15 |
| internlm2-chat-7b-hf | 56.71 |
| internlm2-chat-7b-sft-hf | 61.59 |
| internlm2-chat-20b-hf | 67.68 |
| internlm2-chat-20b-sft-hf | 67.68 |
| llama-3-8b-instruct-hf | 55.49 |
| llama-3-70b-instruct-hf | 70.73 |
| llama-3-8b-instruct-lmdeploy | 57.93 |
| llama-3-70b-instruct-lmdeploy | 70.73 |
| mistral-7b-instruct-v0.1-hf | 32.32 |
| mistral-7b-instruct-v0.2-hf | 29.27 |
| mixtral-8x7b-instruct-v0.1-hf | 34.15 |

64
configs/datasets/math/README.md
Normal file
@ -0,0 +1,64 @@

# MATH

## Base Models

| model | math |
|:------------------------:|-------:|
| llama-7b-turbomind | 2.94 |
| llama-13b-turbomind | 3.84 |
| llama-30b-turbomind | 6.54 |
| llama-65b-turbomind | 10.66 |
| llama-2-7b-turbomind | 3.58 |
| llama-2-13b-turbomind | 5.30 |
| llama-2-70b-turbomind | 13.26 |
| llama-3-8b-turbomind | 16.42 |
| llama-3-70b-turbomind | 39.64 |
| internlm2-1.8b-turbomind | 9.42 |
| internlm2-7b-turbomind | 25.16 |
| internlm2-20b-turbomind | 32.24 |
| qwen-1.8b-turbomind | 6.30 |
| qwen-7b-turbomind | 15.56 |
| qwen-14b-turbomind | 30.38 |
| qwen-72b-turbomind | 44.18 |
| qwen1.5-0.5b-hf | 4.16 |
| qwen1.5-1.8b-hf | 11.32 |
| qwen1.5-4b-hf | 17.50 |
| qwen1.5-7b-hf | 17.34 |
| qwen1.5-14b-hf | 36.18 |
| qwen1.5-32b-hf | 45.74 |
| qwen1.5-72b-hf | 41.56 |
| qwen1.5-moe-a2-7b-hf | 27.96 |
| mistral-7b-v0.1-hf | 13.44 |
| mistral-7b-v0.2-hf | 12.74 |
| mixtral-8x7b-v0.1-hf | 29.46 |
| mixtral-8x22b-v0.1-hf | 41.82 |
| yi-6b-hf | 6.60 |
| yi-34b-hf | 18.80 |
| deepseek-7b-base-hf | 4.66 |
| deepseek-67b-base-hf | 18.76 |

## Chat Models

| model | math |
|:-----------------------------:|-------:|
| qwen1.5-0.5b-chat-hf | 0.56 |
| qwen1.5-1.8b-chat-hf | 4.94 |
| qwen1.5-4b-chat-hf | 7.34 |
| qwen1.5-7b-chat-hf | 22.14 |
| qwen1.5-14b-chat-hf | 32.22 |
| qwen1.5-32b-chat-hf | 41.80 |
| qwen1.5-72b-chat-hf | 45.22 |
| qwen1.5-110b-chat-hf | 54.38 |
| internlm2-chat-1.8b-hf | 14.06 |
| internlm2-chat-1.8b-sft-hf | 13.10 |
| internlm2-chat-7b-hf | 28.08 |
| internlm2-chat-7b-sft-hf | 27.60 |
| internlm2-chat-20b-hf | 34.68 |
| internlm2-chat-20b-sft-hf | 32.54 |
| llama-3-8b-instruct-hf | 27.50 |
| llama-3-70b-instruct-hf | 47.52 |
| llama-3-8b-instruct-lmdeploy | 27.42 |
| llama-3-70b-instruct-lmdeploy | 46.90 |
| mistral-7b-instruct-v0.1-hf | 8.48 |
| mistral-7b-instruct-v0.2-hf | 10.82 |
| mixtral-8x7b-instruct-v0.1-hf | 27.02 |

64
configs/datasets/mbpp/README.md
Normal file
@ -0,0 +1,64 @@

# MBPP

## Base Models

| model | mbpp/pass@1 | mbpp/pass | mbpp/timeout | mbpp/failed | mbpp/wrong_answer |
|:------------------------:|--------------:|------------:|---------------:|--------------:|--------------------:|
| llama-7b-turbomind | 25.29 | 65 | 8 | 62 | 122 |
| llama-13b-turbomind | 29.96 | 77 | 4 | 74 | 102 |
| llama-30b-turbomind | 37.35 | 96 | 17 | 39 | 105 |
| llama-65b-turbomind | 45.53 | 117 | 10 | 35 | 95 |
| llama-2-7b-turbomind | 26.46 | 68 | 18 | 49 | 122 |
| llama-2-13b-turbomind | 36.58 | 94 | 17 | 45 | 101 |
| llama-2-70b-turbomind | 49.42 | 127 | 12 | 32 | 86 |
| llama-3-8b-turbomind | 54.86 | 141 | 11 | 22 | 83 |
| llama-3-70b-turbomind | 77.82 | 200 | 0 | 10 | 47 |
| internlm2-1.8b-turbomind | 30.74 | 79 | 10 | 61 | 107 |
| internlm2-7b-turbomind | 54.47 | 140 | 11 | 28 | 78 |
| internlm2-20b-turbomind | 59.92 | 154 | 6 | 31 | 66 |
| qwen-1.8b-turbomind | 2.72 | 7 | 16 | 222 | 12 |
| qwen-7b-turbomind | 46.69 | 120 | 10 | 37 | 90 |
| qwen-14b-turbomind | 55.64 | 143 | 0 | 31 | 83 |
| qwen-72b-turbomind | 65.76 | 169 | 0 | 26 | 62 |
| qwen1.5-0.5b-hf | 5.06 | 13 | 13 | 190 | 41 |
| qwen1.5-1.8b-hf | 15.95 | 41 | 19 | 124 | 73 |
| qwen1.5-4b-hf | 45.91 | 118 | 8 | 27 | 104 |
| qwen1.5-7b-hf | 52.14 | 134 | 11 | 24 | 88 |
| qwen1.5-14b-hf | 52.14 | 134 | 16 | 33 | 74 |
| qwen1.5-32b-hf | 59.14 | 152 | 7 | 25 | 73 |
| qwen1.5-72b-hf | 61.09 | 157 | 1 | 21 | 78 |
| qwen1.5-moe-a2-7b-hf | 47.08 | 121 | 0 | 52 | 84 |
| mistral-7b-v0.1-hf | 47.47 | 122 | 9 | 33 | 93 |
| mistral-7b-v0.2-hf | 49.81 | 128 | 9 | 27 | 93 |
| mixtral-8x7b-v0.1-hf | 62.65 | 161 | 10 | 13 | 73 |
| mixtral-8x22b-v0.1-hf | 73.15 | 188 | 1 | 10 | 58 |
| yi-6b-hf | 30.35 | 78 | 8 | 40 | 131 |
| yi-34b-hf | 48.64 | 125 | 0 | 43 | 89 |
| deepseek-7b-base-hf | 43.97 | 113 | 11 | 34 | 99 |
| deepseek-67b-base-hf | 64.98 | 167 | 0 | 24 | 66 |

## Chat Models

| model | mbpp/pass@1 | mbpp/pass | mbpp/timeout | mbpp/failed | mbpp/wrong_answer |
|:-----------------------------:|--------------:|------------:|---------------:|--------------:|--------------------:|
| qwen1.5-0.5b-chat-hf | 11.28 | 29 | 1 | 129 | 98 |
| qwen1.5-1.8b-chat-hf | 22.57 | 58 | 2 | 70 | 127 |
| qwen1.5-4b-chat-hf | 43.58 | 112 | 1 | 33 | 111 |
| qwen1.5-7b-chat-hf | 50.58 | 130 | 0 | 35 | 92 |
| qwen1.5-14b-chat-hf | 56.03 | 144 | 0 | 24 | 89 |
| qwen1.5-32b-chat-hf | 65.37 | 168 | 2 | 13 | 74 |
| qwen1.5-72b-chat-hf | 66.93 | 172 | 0 | 17 | 68 |
| qwen1.5-110b-chat-hf | 68.48 | 176 | 0 | 16 | 65 |
| internlm2-chat-1.8b-hf | 39.69 | 102 | 0 | 48 | 107 |
| internlm2-chat-1.8b-sft-hf | 36.19 | 93 | 1 | 58 | 105 |
| internlm2-chat-7b-hf | 57.59 | 148 | 0 | 21 | 88 |
| internlm2-chat-7b-sft-hf | 55.64 | 143 | 2 | 22 | 90 |
| internlm2-chat-20b-hf | 68.87 | 177 | 0 | 16 | 64 |
| internlm2-chat-20b-sft-hf | 69.65 | 179 | 0 | 16 | 62 |
| llama-3-8b-instruct-hf | 68.87 | 177 | 0 | 8 | 72 |
| llama-3-70b-instruct-hf | 79.77 | 205 | 0 | 2 | 50 |
| llama-3-8b-instruct-lmdeploy | 66.93 | 172 | 0 | 7 | 78 |
| llama-3-70b-instruct-lmdeploy | 77.82 | 200 | 1 | 2 | 54 |
| mistral-7b-instruct-v0.1-hf | 47.86 | 123 | 0 | 29 | 105 |
| mistral-7b-instruct-v0.2-hf | 45.91 | 118 | 0 | 31 | 108 |
| mixtral-8x7b-instruct-v0.1-hf | 61.48 | 158 | 1 | 13 | 85 |

363
configs/datasets/mmlu/README.md
Normal file
@ -0,0 +1,363 @@

# MMLU

## Base Models

| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other |
|:------------------------:|-------:|------------:|----------------------:|------------------:|-------------:|
| llama-7b-turbomind | 35.66 | 31.22 | 37.70 | 38.90 | 37.01 |
| llama-13b-turbomind | 47.76 | 37.68 | 55.36 | 52.43 | 50.83 |
| llama-30b-turbomind | 58.55 | 46.95 | 67.35 | 65.13 | 60.78 |
| llama-65b-turbomind | 63.78 | 52.35 | 73.68 | 70.84 | 64.29 |
| llama-2-7b-turbomind | 46.78 | 37.81 | 52.11 | 51.69 | 50.04 |
| llama-2-13b-turbomind | 55.76 | 44.61 | 63.86 | 62.97 | 57.35 |
| llama-2-70b-turbomind | 69.87 | 58.30 | 79.86 | 75.84 | 71.58 |
| llama-3-8b-turbomind | 66.43 | 55.95 | 76.11 | 70.29 | 68.96 |
| llama-3-70b-turbomind | 79.35 | 70.66 | 87.54 | 83.43 | 80.42 |
| internlm2-1.8b-turbomind | 45.99 | 39.63 | 51.02 | 48.65 | 47.96 |
| internlm2-7b-turbomind | 65.84 | 56.48 | 74.43 | 69.68 | 67.75 |
| internlm2-20b-turbomind | 67.58 | 59.01 | 76.04 | 71.20 | 68.69 |
| qwen-1.8b-turbomind | 46.61 | 38.91 | 51.35 | 49.57 | 50.51 |
| qwen-7b-turbomind | 59.75 | 50.16 | 67.98 | 63.48 | 62.44 |
| qwen-14b-turbomind | 67.85 | 59.13 | 76.18 | 71.62 | 69.12 |
| qwen-72b-turbomind | 77.36 | 68.70 | 85.28 | 80.60 | 79.45 |
| qwen1.5-0.5b-hf | 39.98 | 33.96 | 45.08 | 41.59 | 42.48 |
| qwen1.5-1.8b-hf | 47.14 | 39.47 | 52.70 | 49.01 | 51.33 |
| qwen1.5-4b-hf | 57.03 | 47.80 | 64.86 | 60.10 | 60.20 |
| qwen1.5-7b-hf | 62.15 | 53.22 | 70.25 | 65.62 | 64.26 |
| qwen1.5-14b-hf | 69.10 | 61.46 | 77.57 | 71.25 | 70.29 |
| qwen1.5-32b-hf | 73.88 | 65.60 | 81.41 | 77.10 | 75.79 |
| qwen1.5-72b-hf | 77.02 | 69.00 | 84.55 | 80.60 | 78.21 |
| qwen1.5-moe-a2-7b-hf | 62.09 | 53.27 | 70.74 | 63.80 | 65.28 |
| mistral-7b-v0.1-hf | 64.04 | 53.21 | 73.65 | 68.04 | 67.00 |
| mistral-7b-v0.2-hf | 63.85 | 53.21 | 72.17 | 68.40 | 67.15 |
| mixtral-8x7b-v0.1-hf | 71.80 | 61.70 | 81.03 | 75.51 | 74.35 |
| mixtral-8x22b-v0.1-hf | 77.67 | 68.94 | 86.81 | 81.23 | 78.43 |
| yi-6b-hf | 64.08 | 52.61 | 74.10 | 68.58 | 67.11 |
| yi-34b-hf | 76.26 | 66.73 | 83.74 | 81.78 | 77.77 |
| deepseek-7b-base-hf | 49.22 | 40.17 | 56.73 | 53.46 | 51.26 |
| deepseek-67b-base-hf | 71.95 | 60.57 | 81.69 | 77.11 | 74.42 |

### Details
|
||||
|
||||
| model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts |
|
||||
|:------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:|
|
||||
| llama-7b-turbomind | 37.50 | 30.00 | 30.00 | 33.00 | 23.53 | 23.45 | 34.87 | 37.78 | 25.00 | 27.68 | 34.34 | 31.00 |
|
||||
| llama-13b-turbomind | 46.53 | 30.00 | 42.00 | 36.00 | 18.63 | 42.76 | 46.71 | 46.67 | 30.00 | 32.14 | 45.66 | 37.00 |
|
||||
| llama-30b-turbomind | 59.03 | 45.00 | 47.00 | 35.00 | 26.47 | 53.10 | 61.18 | 51.85 | 37.00 | 41.07 | 57.36 | 38.00 |
|
||||
| llama-65b-turbomind | 68.75 | 49.00 | 47.00 | 37.00 | 35.29 | 55.17 | 73.03 | 57.78 | 30.00 | 48.21 | 66.04 | 38.00 |
|
||||
| llama-2-7b-turbomind | 46.53 | 34.00 | 33.00 | 34.00 | 22.55 | 47.59 | 40.13 | 47.41 | 29.00 | 38.39 | 46.42 | 32.00 |
|
||||
| llama-2-13b-turbomind | 59.03 | 44.00 | 48.00 | 29.00 | 26.47 | 50.34 | 53.29 | 49.63 | 35.00 | 28.57 | 60.00 | 32.00 |
|
||||
| llama-2-70b-turbomind | 84.72 | 51.00 | 60.00 | 39.00 | 37.25 | 65.52 | 81.58 | 63.70 | 32.00 | 52.68 | 72.08 | 46.00 |
|
||||
| llama-3-8b-turbomind | 77.08 | 46.00 | 51.00 | 31.00 | 51.96 | 62.76 | 67.11 | 68.15 | 34.00 | 52.68 | 74.72 | 35.00 |
|
||||
| llama-3-70b-turbomind | 93.75 | 62.00 | 72.00 | 52.00 | 50.98 | 74.48 | 92.11 | 79.26 | 48.00 | 63.39 | 86.42 | 49.00 |
|
||||
| internlm2-1.8b-turbomind | 38.89 | 37.00 | 44.00 | 35.00 | 30.39 | 49.66 | 50.66 | 44.44 | 25.00 | 35.71 | 51.32 | 32.00 |
|
||||
| internlm2-7b-turbomind | 77.08 | 48.00 | 64.00 | 33.00 | 47.06 | 63.45 | 73.68 | 57.78 | 37.00 | 45.54 | 69.81 | 35.00 |
|
||||
| internlm2-20b-turbomind | 83.33 | 51.00 | 61.00 | 36.00 | 45.10 | 64.83 | 75.00 | 59.26 | 39.00 | 53.57 | 73.58 | 32.00 |
|
||||
| qwen-1.8b-turbomind | 42.36 | 36.00 | 39.00 | 34.00 | 27.45 | 51.03 | 50.66 | 42.96 | 31.00 | 31.25 | 53.21 | 28.00 |
|
||||
| qwen-7b-turbomind | 67.36 | 48.00 | 53.00 | 28.00 | 39.22 | 59.31 | 63.82 | 49.63 | 34.00 | 38.39 | 63.02 | 37.00 |
|
||||
| qwen-14b-turbomind | 78.47 | 51.00 | 62.00 | 42.00 | 49.02 | 65.52 | 71.05 | 60.00 | 37.00 | 58.93 | 71.32 | 40.00 |
|
||||
| qwen-72b-turbomind | 93.75 | 56.00 | 66.00 | 56.00 | 50.98 | 80.69 | 85.53 | 73.33 | 41.00 | 62.50 | 83.77 | 54.00 |
|
||||
| qwen1.5-0.5b-hf | 38.89 | 25.00 | 38.00 | 32.00 | 25.49 | 45.52 | 44.74 | 33.33 | 30.00 | 39.29 | 38.11 | 39.00 |
|
||||
| qwen1.5-1.8b-hf | 43.75 | 34.00 | 45.00 | 38.00 | 28.43 | 47.59 | 47.37 | 40.74 | 32.00 | 31.25 | 53.96 | 37.00 |
|
||||
| qwen1.5-4b-hf | 50.00 | 46.00 | 41.00 | 45.00 | 31.37 | 53.10 | 61.18 | 51.85 | 35.00 | 44.64 | 60.38 | 37.00 |
|
||||
| qwen1.5-7b-hf | 66.67 | 48.00 | 55.00 | 37.00 | 41.18 | 60.69 | 65.79 | 52.59 | 39.00 | 41.07 | 68.68 | 43.00 |
|
||||
| qwen1.5-14b-hf | 75.69 | 49.00 | 58.00 | 49.00 | 49.02 | 71.72 | 73.03 | 65.93 | 39.00 | 52.68 | 73.96 | 49.00 |
|
||||
| qwen1.5-32b-hf | 85.42 | 53.00 | 59.00 | 51.00 | 53.92 | 72.41 | 82.24 | 63.70 | 43.00 | 58.04 | 78.11 | 50.00 |
|
||||
| qwen1.5-72b-hf | 90.97 | 54.00 | 65.00 | 57.00 | 52.94 | 80.00 | 87.50 | 73.33 | 43.00 | 64.29 | 81.89 | 50.00 |
|
||||
| qwen1.5-moe-a2-7b-hf | 62.50 | 44.00 | 54.00 | 41.00 | 49.02 | 58.62 | 69.74 | 57.78 | 37.00 | 38.39 | 66.79 | 38.00 |
|
||||
| mistral-7b-v0.1-hf | 72.92 | 50.00 | 51.00 | 40.00 | 39.22 | 57.93 | 65.79 | 62.96 | 29.00 | 49.11 | 69.43 | 36.00 |
|
||||
| mistral-7b-v0.2-hf | 71.53 | 49.00 | 53.00 | 40.00 | 36.27 | 57.24 | 64.47 | 60.00 | 29.00 | 53.57 | 67.92 | 39.00 |
|
||||
| mixtral-8x7b-v0.1-hf | 85.42 | 54.00 | 62.00 | 43.00 | 46.08 | 68.97 | 82.89 | 70.37 | 37.00 | 56.25 | 79.25 | 51.00 |
|
||||
| mixtral-8x22b-v0.1-hf | 89.58 | 56.00 | 69.00 | 48.00 | 52.94 | 76.55 | 86.18 | 77.04 | 53.00 | 62.50 | 82.26 | 56.00 |
|
||||
| yi-6b-hf | 66.67 | 43.00 | 51.00 | 39.00 | 35.29 | 64.83 | 65.79 | 60.00 | 29.00 | 41.96 | 66.79 | 46.00 |
|
||||
| yi-34b-hf | 88.89 | 52.00 | 66.00 | 44.00 | 48.04 | 80.00 | 89.47 | 74.81 | 44.00 | 58.04 | 78.87 | 52.00 |
|
||||
| deepseek-7b-base-hf | 52.08 | 29.00 | 44.00 | 40.00 | 31.37 | 44.83 | 51.97 | 40.74 | 27.00 | 32.14 | 53.58 | 31.00 |
|
||||
| deepseek-67b-base-hf | 84.72 | 52.00 | 62.00 | 42.00 | 42.16 | 70.34 | 80.92 | 65.19 | 39.00 | 50.00 | 78.11 | 42.00 |
|
||||
|
||||
| model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology |
|
||||
|:------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:|
|
||||
| llama-7b-turbomind | 33.01 | 39.22 | 45.73 | 26.24 | 33.33 | 51.24 | 24.25 | 45.00 | 31.09 | 30.05 | 37.00 | 35.13 |
|
||||
| llama-13b-turbomind | 66.02 | 51.63 | 71.79 | 34.75 | 55.05 | 64.46 | 30.06 | 63.00 | 47.48 | 37.22 | 53.00 | 48.53 |
|
||||
| llama-30b-turbomind | 76.70 | 62.42 | 84.19 | 44.68 | 71.72 | 75.21 | 40.56 | 66.00 | 57.98 | 46.48 | 66.00 | 63.73 |
|
||||
| llama-65b-turbomind | 82.52 | 68.95 | 87.18 | 48.94 | 79.29 | 81.82 | 47.82 | 79.00 | 68.49 | 50.07 | 68.00 | 66.67 |
|
||||
| llama-2-7b-turbomind | 53.40 | 48.69 | 68.38 | 36.52 | 49.49 | 65.29 | 24.02 | 60.00 | 44.12 | 36.31 | 55.00 | 43.79 |
|
||||
| llama-2-13b-turbomind | 72.82 | 61.76 | 79.49 | 39.72 | 69.19 | 74.38 | 43.80 | 70.00 | 58.40 | 42.50 | 54.00 | 54.90 |
|
||||
| llama-2-70b-turbomind | 83.50 | 77.12 | 91.03 | 56.03 | 86.87 | 87.60 | 44.69 | 77.00 | 77.31 | 52.93 | 74.00 | 75.65 |
|
||||
| llama-3-8b-turbomind | 87.38 | 75.82 | 89.74 | 48.94 | 80.81 | 84.30 | 40.89 | 81.00 | 73.95 | 46.22 | 77.00 | 71.90 |
|
||||
| llama-3-70b-turbomind | 91.26 | 87.25 | 94.87 | 64.18 | 93.94 | 89.26 | 62.91 | 83.00 | 87.82 | 61.80 | 90.00 | 85.78 |
|
||||
| internlm2-1.8b-turbomind | 60.19 | 58.17 | 63.25 | 31.21 | 56.57 | 56.20 | 24.47 | 52.00 | 50.42 | 36.11 | 53.00 | 41.83 |
|
||||
| internlm2-7b-turbomind | 79.61 | 75.49 | 87.61 | 48.23 | 82.83 | 77.69 | 49.39 | 74.00 | 72.27 | 47.65 | 73.00 | 65.03 |
|
||||
| internlm2-20b-turbomind | 79.61 | 75.49 | 91.88 | 50.00 | 87.88 | 85.95 | 35.08 | 81.00 | 70.59 | 49.48 | 78.00 | 70.10 |
|
||||
| qwen-1.8b-turbomind | 66.02 | 60.46 | 73.50 | 38.30 | 56.57 | 66.94 | 23.91 | 56.00 | 42.02 | 33.96 | 51.00 | 39.54 |
|
||||
| qwen-7b-turbomind | 78.64 | 67.32 | 83.33 | 41.49 | 76.77 | 76.03 | 29.72 | 73.00 | 58.40 | 41.72 | 69.00 | 59.64 |
|
||||
| qwen-14b-turbomind | 78.64 | 73.86 | 88.89 | 48.58 | 83.84 | 84.30 | 45.47 | 77.00 | 73.95 | 50.85 | 74.00 | 69.61 |
|
||||
| qwen-72b-turbomind | 90.29 | 84.97 | 94.87 | 65.96 | 92.93 | 88.43 | 65.70 | 79.00 | 84.87 | 61.21 | 86.00 | 82.19 |
|
||||
| qwen1.5-0.5b-hf | 52.43 | 46.41 | 60.68 | 31.21 | 46.46 | 56.20 | 25.70 | 46.00 | 37.39 | 32.79 | 46.00 | 37.75 |
|
||||
| qwen1.5-1.8b-hf | 66.02 | 58.50 | 75.64 | 33.69 | 56.06 | 72.73 | 24.69 | 57.00 | 39.50 | 36.11 | 53.00 | 42.81 |
|
||||
| qwen1.5-4b-hf | 74.76 | 62.75 | 84.19 | 46.81 | 76.77 | 71.07 | 25.03 | 67.00 | 55.04 | 41.33 | 64.00 | 56.05 |
|
||||
| qwen1.5-7b-hf | 78.64 | 70.92 | 86.32 | 44.68 | 81.82 | 77.69 | 32.74 | 76.00 | 64.29 | 45.37 | 68.00 | 61.27 |
|
||||
| qwen1.5-14b-hf | 80.58 | 75.49 | 85.90 | 51.06 | 86.36 | 80.99 | 45.03 | 80.00 | 76.47 | 48.57 | 78.00 | 69.61 |
|
||||
| qwen1.5-32b-hf | 86.41 | 81.37 | 95.30 | 56.38 | 91.41 | 88.43 | 44.02 | 76.00 | 82.77 | 57.89 | 83.00 | 75.33 |
|
||||
| qwen1.5-72b-hf | 87.38 | 85.29 | 94.87 | 64.89 | 92.42 | 90.08 | 62.12 | 83.00 | 84.03 | 60.76 | 86.00 | 81.05 |
|
||||
| qwen1.5-moe-a2-7b-hf | 78.64 | 70.92 | 86.32 | 46.81 | 81.82 | 77.69 | 25.59 | 71.00 | 65.97 | 45.37 | 65.00 | 61.44 |
|
||||
| mistral-7b-v0.1-hf | 82.52 | 75.49 | 87.61 | 48.94 | 76.77 | 77.69 | 32.51 | 77.00 | 66.39 | 44.98 | 74.00 | 67.97 |
|
||||
| mistral-7b-v0.2-hf | 81.55 | 74.18 | 88.46 | 51.06 | 76.77 | 80.99 | 38.77 | 75.00 | 64.71 | 45.37 | 72.00 | 66.34 |
|
||||
| mixtral-8x7b-v0.1-hf | 87.38 | 81.70 | 91.88 | 51.77 | 85.86 | 85.95 | 40.11 | 80.00 | 79.41 | 53.32 | 77.00 | 77.94 |
|
||||
| mixtral-8x22b-v0.1-hf | 89.32 | 85.95 | 91.88 | 62.06 | 91.41 | 90.08 | 64.58 | 83.00 | 87.82 | 60.82 | 84.00 | 83.17 |
|
||||
| yi-6b-hf | 80.58 | 71.57 | 91.03 | 48.23 | 83.33 | 76.86 | 41.34 | 75.00 | 74.79 | 49.35 | 80.00 | 65.69 |
|
||||
| yi-34b-hf | 91.26 | 85.62 | 92.31 | 65.25 | 89.39 | 91.74 | 64.69 | 82.00 | 85.29 | 59.97 | 87.00 | 82.19 |
|
||||
| deepseek-7b-base-hf | 61.17 | 53.59 | 72.22 | 34.04 | 59.09 | 65.29 | 26.37 | 61.00 | 44.96 | 35.53 | 56.00 | 49.18 |
|
||||
| deepseek-67b-base-hf | 88.35 | 79.74 | 91.88 | 57.09 | 89.39 | 85.12 | 46.15 | 76.00 | 82.35 | 55.93 | 72.00 | 79.58 |
|
||||
|
||||
| model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history |
|
||||
|:------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:|
|
||||
| llama-7b-turbomind | 41.67 | 49.12 | 40.84 | 34.94 | 29.56 | 40.00 | 34.10 | 35.11 | 26.46 | 27.81 | 34.00 | 41.82 |
|
||||
| llama-13b-turbomind | 51.85 | 67.84 | 55.31 | 43.37 | 28.57 | 60.91 | 46.15 | 57.25 | 26.98 | 29.80 | 49.00 | 61.21 |
|
||||
| llama-30b-turbomind | 71.30 | 79.53 | 66.24 | 49.40 | 40.39 | 70.00 | 56.67 | 64.89 | 37.30 | 35.10 | 60.00 | 70.91 |
|
||||
| llama-65b-turbomind | 75.00 | 81.29 | 73.63 | 53.01 | 41.38 | 74.55 | 65.90 | 77.86 | 40.21 | 35.76 | 69.00 | 76.36 |
|
||||
| llama-2-7b-turbomind | 53.70 | 69.01 | 60.13 | 41.57 | 36.95 | 54.55 | 45.90 | 55.73 | 27.25 | 31.13 | 40.00 | 59.39 |
|
||||
| llama-2-13b-turbomind | 74.07 | 76.61 | 63.99 | 45.78 | 44.83 | 62.73 | 50.77 | 62.60 | 34.13 | 36.42 | 57.00 | 63.03 |
|
||||
| llama-2-70b-turbomind | 83.33 | 85.96 | 78.46 | 53.61 | 52.22 | 69.09 | 74.87 | 87.02 | 43.39 | 43.71 | 78.00 | 84.24 |
|
||||
| llama-3-8b-turbomind | 75.00 | 83.04 | 74.28 | 56.02 | 54.68 | 71.82 | 64.87 | 79.39 | 42.06 | 45.03 | 68.00 | 76.36 |
|
||||
| llama-3-70b-turbomind | 86.11 | 91.23 | 86.50 | 57.83 | 71.92 | 74.55 | 82.56 | 88.55 | 62.70 | 56.95 | 86.00 | 86.67 |
|
||||
| internlm2-1.8b-turbomind | 55.56 | 59.65 | 51.13 | 40.96 | 43.35 | 52.73 | 43.33 | 47.33 | 30.42 | 33.11 | 47.00 | 56.36 |
|
||||
| internlm2-7b-turbomind | 79.63 | 82.46 | 73.63 | 51.20 | 55.17 | 70.00 | 66.92 | 70.99 | 46.03 | 42.38 | 70.00 | 78.79 |
|
||||
| internlm2-20b-turbomind | 75.93 | 82.46 | 73.95 | 56.02 | 57.64 | 68.18 | 70.51 | 68.70 | 49.21 | 38.41 | 75.00 | 82.42 |
|
||||
| qwen-1.8b-turbomind | 59.26 | 56.14 | 50.80 | 40.96 | 37.93 | 60.00 | 41.03 | 51.15 | 33.33 | 34.44 | 39.00 | 64.24 |
|
||||
| qwen-7b-turbomind | 73.15 | 76.61 | 67.20 | 47.59 | 51.23 | 65.45 | 60.00 | 69.47 | 43.12 | 38.41 | 67.00 | 66.67 |
|
||||
| qwen-14b-turbomind | 76.85 | 84.21 | 72.03 | 53.01 | 65.52 | 66.36 | 66.92 | 78.63 | 51.32 | 41.72 | 72.00 | 82.42 |
|
||||
| qwen-72b-turbomind | 83.33 | 88.30 | 83.28 | 58.43 | 65.52 | 74.55 | 81.54 | 89.31 | 68.52 | 58.28 | 81.00 | 84.24 |
|
||||
| qwen1.5-0.5b-hf | 40.74 | 40.94 | 41.48 | 40.96 | 28.57 | 50.91 | 36.92 | 41.98 | 28.84 | 22.52 | 37.00 | 52.73 |
|
||||
| qwen1.5-1.8b-hf | 55.56 | 57.31 | 49.84 | 40.96 | 36.45 | 56.36 | 43.59 | 56.49 | 35.19 | 27.81 | 45.00 | 61.21 |
|
||||
| qwen1.5-4b-hf | 70.37 | 70.76 | 61.74 | 44.58 | 45.32 | 65.45 | 54.62 | 64.89 | 47.88 | 32.45 | 62.00 | 70.30 |
|
||||
| qwen1.5-7b-hf | 75.93 | 77.19 | 66.24 | 50.60 | 53.20 | 62.73 | 60.00 | 71.76 | 50.26 | 38.41 | 71.00 | 74.55 |
|
||||
| qwen1.5-14b-hf | 74.07 | 83.63 | 70.74 | 46.39 | 58.62 | 64.55 | 73.59 | 76.34 | 59.26 | 49.01 | 75.00 | 83.64 |
|
||||
| qwen1.5-32b-hf | 83.33 | 85.96 | 82.96 | 56.63 | 61.58 | 63.64 | 77.95 | 83.97 | 69.31 | 50.99 | 85.00 | 86.06 |
|
||||
| qwen1.5-72b-hf | 84.26 | 88.89 | 82.32 | 57.23 | 66.01 | 72.73 | 82.05 | 87.02 | 69.31 | 56.95 | 84.00 | 84.24 |
|
||||
| qwen1.5-moe-a2-7b-hf | 70.37 | 80.12 | 66.56 | 51.20 | 47.78 | 64.55 | 62.31 | 70.99 | 46.30 | 45.03 | 59.00 | 69.70 |
|
||||
| mistral-7b-v0.1-hf | 77.78 | 83.04 | 69.45 | 54.82 | 53.20 | 67.27 | 66.15 | 78.63 | 38.10 | 31.79 | 68.00 | 78.79 |
|
||||
| mistral-7b-v0.2-hf | 73.15 | 82.46 | 72.99 | 53.01 | 55.67 | 66.36 | 62.31 | 77.10 | 40.48 | 34.44 | 66.00 | 76.36 |
|
||||
| mixtral-8x7b-v0.1-hf | 82.41 | 88.30 | 78.14 | 51.20 | 62.56 | 70.00 | 70.77 | 80.92 | 48.68 | 48.34 | 71.00 | 80.61 |
|
||||
| mixtral-8x22b-v0.1-hf | 84.26 | 89.47 | 84.57 | 59.04 | 67.49 | 78.18 | 79.23 | 88.55 | 61.64 | 52.98 | 87.00 | 86.06 |
|
||||
| yi-6b-hf | 78.70 | 81.87 | 69.77 | 46.39 | 52.71 | 73.64 | 65.13 | 74.81 | 46.30 | 38.41 | 66.00 | 71.52 |
|
||||
| yi-34b-hf | 89.81 | 86.55 | 83.92 | 57.23 | 64.04 | 73.64 | 79.49 | 85.50 | 66.40 | 52.32 | 81.00 | 86.06 |
|
||||
| deepseek-7b-base-hf | 55.56 | 73.10 | 56.59 | 46.99 | 34.98 | 62.73 | 48.21 | 58.78 | 28.57 | 29.14 | 50.00 | 61.82 |
|
||||
| deepseek-67b-base-hf | 84.26 | 85.96 | 81.03 | 56.02 | 57.64 | 72.73 | 73.85 | 82.44 | 51.59 | 45.03 | 74.00 | 81.82 |
|
||||
|
||||
| model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | logical_fallacies | high_school_world_history | professional_medicine |
|
||||
|:------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:|
|
||||
| llama-7b-turbomind | 42.00 | 40.46 | 32.87 | 42.78 | 26.19 | 46.11 | 35.19 | 33.47 | 32.90 | 42.33 | 43.88 | 43.75 |
|
||||
| llama-13b-turbomind | 46.00 | 50.00 | 30.56 | 64.88 | 31.75 | 66.84 | 51.85 | 52.65 | 51.94 | 52.76 | 67.51 | 51.10 |
|
||||
| llama-30b-turbomind | 55.00 | 66.76 | 49.07 | 77.91 | 36.51 | 82.90 | 68.21 | 66.12 | 69.35 | 67.48 | 80.59 | 55.88 |
|
||||
| llama-65b-turbomind | 59.00 | 73.70 | 61.57 | 81.35 | 43.65 | 88.60 | 73.46 | 71.84 | 74.19 | 77.30 | 83.97 | 62.13 |
|
||||
| llama-2-7b-turbomind | 53.00 | 51.16 | 27.78 | 63.60 | 27.78 | 67.36 | 48.77 | 47.76 | 50.97 | 51.53 | 64.56 | 52.57 |
|
||||
| llama-2-13b-turbomind | 54.00 | 64.45 | 45.37 | 74.46 | 36.51 | 80.83 | 64.81 | 62.86 | 67.42 | 66.87 | 72.15 | 54.41 |
|
||||
| llama-2-70b-turbomind | 72.00 | 77.17 | 63.43 | 86.08 | 48.41 | 94.30 | 83.64 | 78.37 | 81.61 | 80.98 | 87.76 | 74.63 |
|
||||
| llama-3-8b-turbomind | 62.00 | 73.70 | 54.17 | 82.76 | 48.41 | 90.16 | 72.53 | 75.51 | 77.74 | 73.01 | 82.70 | 72.06 |
|
||||
| llama-3-70b-turbomind | 83.00 | 85.55 | 72.22 | 92.21 | 66.67 | 97.41 | 91.05 | 84.90 | 90.32 | 87.73 | 94.09 | 87.13 |
|
||||
| internlm2-1.8b-turbomind | 44.00 | 45.95 | 38.89 | 59.39 | 32.54 | 60.62 | 50.31 | 54.29 | 52.58 | 45.40 | 62.87 | 37.87 |
|
||||
| internlm2-7b-turbomind | 69.00 | 66.76 | 57.87 | 80.72 | 50.00 | 90.16 | 73.15 | 75.10 | 79.68 | 68.71 | 81.01 | 70.22 |
|
||||
| internlm2-20b-turbomind | 74.00 | 74.57 | 60.19 | 81.48 | 44.44 | 91.71 | 75.31 | 81.63 | 82.58 | 75.46 | 87.76 | 63.60 |
|
||||
| qwen-1.8b-turbomind | 52.00 | 52.31 | 34.72 | 57.98 | 29.37 | 59.07 | 47.22 | 48.57 | 52.26 | 44.17 | 61.18 | 43.38 |
|
||||
| qwen-7b-turbomind | 68.00 | 64.74 | 45.37 | 77.39 | 43.65 | 83.94 | 68.21 | 70.20 | 72.26 | 65.64 | 75.95 | 58.46 |
|
||||
| qwen-14b-turbomind | 75.00 | 74.86 | 57.87 | 84.04 | 51.59 | 91.71 | 70.99 | 77.14 | 83.55 | 73.01 | 83.12 | 67.65 |
|
||||
| qwen-72b-turbomind | 80.00 | 84.97 | 68.98 | 91.44 | 54.76 | 98.96 | 87.04 | 81.63 | 89.03 | 84.05 | 90.30 | 84.93 |
|
||||
| qwen1.5-0.5b-hf | 47.00 | 46.82 | 23.15 | 48.02 | 29.37 | 48.70 | 40.12 | 38.37 | 40.65 | 35.58 | 53.16 | 31.62 |
|
||||
| qwen1.5-1.8b-hf | 54.00 | 54.91 | 28.70 | 61.69 | 23.81 | 58.03 | 48.15 | 51.84 | 55.48 | 45.40 | 59.92 | 39.71 |
|
||||
| qwen1.5-4b-hf | 65.00 | 66.76 | 44.44 | 73.95 | 35.71 | 78.24 | 60.19 | 65.31 | 66.45 | 65.64 | 71.31 | 50.00 |
|
||||
| qwen1.5-7b-hf | 68.00 | 70.81 | 48.61 | 76.50 | 38.89 | 84.97 | 69.44 | 68.16 | 74.52 | 68.10 | 77.22 | 56.25 |
|
||||
| qwen1.5-14b-hf | 77.00 | 73.70 | 62.96 | 83.40 | 53.17 | 90.67 | 71.60 | 80.82 | 84.52 | 76.69 | 83.54 | 71.69 |
|
||||
| qwen1.5-32b-hf | 77.00 | 78.90 | 68.98 | 88.12 | 54.76 | 94.82 | 81.48 | 80.82 | 88.39 | 82.21 | 86.08 | 80.88 |
|
||||
| qwen1.5-72b-hf | 80.00 | 84.39 | 68.98 | 91.44 | 55.56 | 98.96 | 86.73 | 81.63 | 88.71 | 85.89 | 89.87 | 82.72 |
|
||||
| qwen1.5-moe-a2-7b-hf | 74.00 | 65.90 | 56.48 | 82.25 | 34.13 | 84.46 | 70.68 | 74.29 | 73.23 | 68.10 | 76.79 | 66.91 |
|
||||
| mistral-7b-v0.1-hf | 57.00 | 71.10 | 57.41 | 81.61 | 40.48 | 86.53 | 73.46 | 72.65 | 76.77 | 79.14 | 77.22 | 68.75 |
|
||||
| mistral-7b-v0.2-hf | 61.00 | 71.39 | 52.78 | 80.08 | 40.48 | 88.08 | 69.44 | 72.24 | 76.13 | 77.91 | 78.06 | 70.59 |
|
||||
| mixtral-8x7b-v0.1-hf | 77.00 | 80.06 | 63.43 | 87.87 | 54.76 | 93.26 | 83.95 | 80.00 | 84.19 | 79.14 | 88.61 | 81.25 |
|
||||
| mixtral-8x22b-v0.1-hf | 72.00 | 84.10 | 68.52 | 90.68 | 57.14 | 96.37 | 86.73 | 86.53 | 90.32 | 87.73 | 90.30 | 87.87 |
|
||||
| yi-6b-hf | 67.00 | 69.36 | 52.78 | 80.46 | 44.44 | 89.64 | 70.99 | 74.69 | 77.10 | 78.53 | 78.90 | 65.81 |
|
||||
| yi-34b-hf | 79.00 | 83.82 | 66.67 | 90.29 | 57.14 | 97.93 | 87.65 | 84.90 | 88.39 | 87.73 | 92.83 | 81.99 |
|
||||
| deepseek-7b-base-hf | 49.00 | 52.31 | 41.20 | 66.28 | 30.95 | 63.73 | 55.86 | 51.84 | 52.90 | 58.90 | 62.45 | 45.22 |
|
||||
| deepseek-67b-base-hf | 81.00 | 77.17 | 63.89 | 90.04 | 53.17 | 97.93 | 85.49 | 73.88 | 82.26 | 84.05 | 91.56 | 78.31 |
|
||||
|
||||
| model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy |
|
||||
|:------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:|
|
||||
| llama-7b-turbomind | 24.81 | 32.95 | 38.73 | 45.77 | 27.19 | 48.07 | 38.12 | 43.00 |
|
||||
| llama-13b-turbomind | 26.30 | 42.20 | 59.80 | 61.19 | 28.95 | 61.28 | 53.36 | 78.00 |
|
||||
| llama-30b-turbomind | 27.41 | 54.91 | 76.96 | 79.10 | 35.96 | 76.15 | 67.71 | 83.00 |
|
||||
| llama-65b-turbomind | 34.44 | 54.34 | 82.84 | 81.09 | 39.47 | 82.39 | 66.37 | 88.00 |
|
||||
| llama-2-7b-turbomind | 29.63 | 43.35 | 60.29 | 62.69 | 27.19 | 62.75 | 56.05 | 64.00 |
|
||||
| llama-2-13b-turbomind | 27.04 | 52.60 | 75.49 | 73.13 | 32.46 | 76.51 | 64.57 | 82.00 |
|
||||
| llama-2-70b-turbomind | 34.07 | 64.16 | 90.69 | 90.55 | 44.74 | 87.52 | 80.27 | 92.00 |
|
||||
| llama-3-8b-turbomind | 38.15 | 64.16 | 83.33 | 86.57 | 47.37 | 84.04 | 70.85 | 87.00 |
|
||||
| llama-3-70b-turbomind | 48.89 | 79.77 | 95.10 | 94.03 | 72.81 | 94.13 | 82.51 | 94.00 |
|
||||
| internlm2-1.8b-turbomind | 30.37 | 41.04 | 55.88 | 51.74 | 28.95 | 61.47 | 51.12 | 63.00 |
|
||||
| internlm2-7b-turbomind | 39.63 | 68.21 | 76.96 | 84.58 | 44.74 | 84.59 | 72.65 | 86.00 |
|
||||
| internlm2-20b-turbomind | 39.63 | 66.47 | 82.84 | 85.07 | 47.37 | 86.79 | 70.85 | 84.00 |
|
||||
| qwen-1.8b-turbomind | 28.52 | 43.35 | 54.90 | 60.70 | 36.84 | 60.73 | 48.43 | 60.00 |
|
||||
| qwen-7b-turbomind | 30.00 | 57.23 | 75.98 | 79.10 | 32.46 | 79.27 | 63.23 | 81.00 |
|
||||
| qwen-14b-turbomind | 37.41 | 70.52 | 81.37 | 85.07 | 50.00 | 84.95 | 73.09 | 86.00 |
|
||||
| qwen-72b-turbomind | 50.00 | 75.72 | 92.16 | 90.05 | 59.65 | 92.66 | 82.51 | 95.00 |
|
||||
| qwen1.5-0.5b-hf | 29.63 | 33.53 | 45.10 | 59.70 | 28.95 | 44.77 | 37.22 | 69.00 |
|
||||
| qwen1.5-1.8b-hf | 34.07 | 39.31 | 47.55 | 63.18 | 32.46 | 59.08 | 53.81 | 73.00 |
|
||||
| qwen1.5-4b-hf | 35.93 | 55.49 | 71.08 | 73.13 | 37.72 | 72.11 | 63.68 | 79.00 |
|
||||
| qwen1.5-7b-hf | 34.81 | 61.85 | 78.92 | 82.09 | 41.23 | 80.73 | 61.88 | 84.00 |
|
||||
| qwen1.5-14b-hf | 45.93 | 68.21 | 80.88 | 83.08 | 55.26 | 86.06 | 73.09 | 88.00 |
|
||||
| qwen1.5-32b-hf | 47.04 | 76.30 | 90.20 | 86.07 | 57.89 | 90.28 | 75.78 | 92.00 |
|
||||
| qwen1.5-72b-hf | 47.78 | 75.14 | 92.65 | 88.56 | 59.65 | 92.48 | 79.82 | 94.00 |
|
||||
| qwen1.5-moe-a2-7b-hf | 46.30 | 54.91 | 78.43 | 79.10 | 38.60 | 82.39 | 66.82 | 83.00 |
|
||||
| mistral-7b-v0.1-hf | 33.70 | 65.32 | 78.92 | 83.08 | 50.00 | 82.39 | 69.51 | 86.00 |
|
||||
| mistral-7b-v0.2-hf | 38.15 | 64.16 | 81.86 | 82.09 | 43.86 | 80.18 | 69.96 | 86.00 |
|
||||
| mixtral-8x7b-v0.1-hf | 40.37 | 69.94 | 86.27 | 88.56 | 65.79 | 88.81 | 79.37 | 91.00 |
|
||||
| mixtral-8x22b-v0.1-hf | 45.93 | 79.19 | 90.20 | 93.03 | 70.18 | 92.29 | 79.37 | 95.00 |
|
||||
| yi-6b-hf | 32.59 | 61.27 | 79.90 | 82.59 | 35.96 | 82.94 | 67.26 | 86.00 |
|
||||
| yi-34b-hf | 45.19 | 71.68 | 91.18 | 88.56 | 55.26 | 91.74 | 78.48 | 91.00 |
|
||||
| deepseek-7b-base-hf | 28.89 | 41.62 | 60.29 | 70.15 | 26.32 | 69.72 | 55.61 | 76.00 |
|
||||
| deepseek-67b-base-hf | 38.89 | 72.25 | 90.69 | 90.05 | 52.63 | 90.46 | 80.72 | 95.00 |
|
||||
|
||||
## Chat Models
|
||||
|
||||
| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other |
|
||||
|:-----------------------------:|-------:|------------:|----------------------:|------------------:|-------------:|
|
||||
| qwen1.5-0.5b-chat-hf | 35.32 | 30.90 | 37.59 | 37.29 | 37.73 |
|
||||
| qwen1.5-1.8b-chat-hf | 45.62 | 39.20 | 49.21 | 47.67 | 49.63 |
|
||||
| qwen1.5-4b-chat-hf | 55.90 | 48.07 | 62.67 | 59.70 | 57.31 |
|
||||
| qwen1.5-7b-chat-hf | 61.79 | 52.68 | 69.41 | 66.41 | 63.45 |
|
||||
| qwen1.5-14b-chat-hf | 67.96 | 59.79 | 75.46 | 71.23 | 69.72 |
|
||||
| qwen1.5-32b-chat-hf | 75.36 | 67.04 | 82.11 | 80.44 | 76.23 |
|
||||
| qwen1.5-72b-chat-hf | 77.24 | 69.59 | 83.95 | 81.58 | 77.87 |
|
||||
| qwen1.5-110b-chat-hf | 77.95 | 71.56 | 83.77 | 81.44 | 78.41 |
|
||||
| internlm2-chat-1.8b-hf | 47.58 | 40.88 | 53.33 | 49.92 | 49.74 |
|
||||
| internlm2-chat-1.8b-sft-hf | 47.44 | 40.55 | 53.31 | 49.67 | 49.89 |
|
||||
| internlm2-chat-7b-hf | 63.05 | 53.42 | 71.47 | 67.27 | 65.13 |
|
||||
| internlm2-chat-7b-sft-hf | 63.33 | 53.95 | 71.74 | 67.62 | 65.00 |
|
||||
| internlm2-chat-20b-hf | 67.37 | 57.39 | 75.75 | 71.63 | 69.95 |
|
||||
| internlm2-chat-20b-sft-hf | 67.34 | 57.49 | 75.67 | 70.99 | 70.40 |
|
||||
| llama-3-8b-instruct-hf | 68.37 | 58.01 | 77.82 | 71.22 | 71.94 |
|
||||
| llama-3-70b-instruct-hf | 80.93 | 73.86 | 87.71 | 83.90 | 82.01 |
|
||||
| llama-3-8b-instruct-lmdeploy | 67.35 | 56.66 | 75.96 | 70.90 | 71.49 |
|
||||
| llama-3-70b-instruct-lmdeploy | 80.85 | 74.07 | 87.26 | 83.73 | 81.96 |
|
||||
| mistral-7b-instruct-v0.1-hf | 54.36 | 43.74 | 62.96 | 58.87 | 57.46 |
|
||||
| mistral-7b-instruct-v0.2-hf | 59.98 | 49.56 | 69.22 | 64.41 | 62.24 |
|
||||
| mixtral-8x7b-instruct-v0.1-hf | 70.11 | 60.29 | 79.01 | 74.08 | 72.28 |
|
||||
|
||||
### Details
|
||||
|
||||
| model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts |
|
||||
|:-----------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:|
|
||||
| qwen1.5-0.5b-chat-hf | 31.25 | 32.00 | 33.00 | 29.00 | 33.33 | 38.62 | 33.55 | 28.89 | 20.00 | 27.68 | 40.38 | 33.00 |
|
||||
| qwen1.5-1.8b-chat-hf | 42.36 | 28.00 | 45.00 | 33.00 | 27.45 | 44.83 | 51.97 | 42.22 | 32.00 | 38.39 | 48.30 | 30.00 |
|
||||
| qwen1.5-4b-chat-hf | 56.25 | 47.00 | 49.00 | 39.00 | 36.27 | 54.48 | 57.89 | 49.63 | 38.00 | 33.04 | 59.62 | 23.00 |
|
||||
| qwen1.5-7b-chat-hf | 64.58 | 51.00 | 59.00 | 37.00 | 41.18 | 53.79 | 66.45 | 53.33 | 43.00 | 41.07 | 67.92 | 36.00 |
|
||||
| qwen1.5-14b-chat-hf | 77.08 | 51.00 | 64.00 | 42.00 | 45.10 | 64.83 | 77.63 | 65.93 | 39.00 | 46.43 | 73.21 | 45.00 |
|
||||
| qwen1.5-32b-chat-hf | 84.72 | 53.00 | 57.00 | 48.00 | 52.94 | 74.48 | 82.24 | 67.41 | 52.00 | 61.61 | 78.11 | 48.00 |
|
||||
| qwen1.5-72b-chat-hf | 90.97 | 57.00 | 66.00 | 55.00 | 55.88 | 80.00 | 88.16 | 72.59 | 56.00 | 59.82 | 80.00 | 51.00 |
|
||||
| qwen1.5-110b-chat-hf | 88.89 | 62.00 | 66.00 | 64.00 | 58.82 | 75.86 | 89.47 | 68.15 | 59.00 | 63.39 | 79.62 | 59.00 |
|
||||
| internlm2-chat-1.8b-hf | 49.31 | 36.00 | 47.00 | 33.00 | 36.27 | 42.76 | 48.03 | 49.63 | 30.00 | 33.93 | 53.58 | 28.00 |
|
||||
| internlm2-chat-1.8b-sft-hf | 51.39 | 37.00 | 50.00 | 33.00 | 33.33 | 42.76 | 46.05 | 49.63 | 31.00 | 32.14 | 53.21 | 29.00 |
|
||||
| internlm2-chat-7b-hf | 68.75 | 47.00 | 62.00 | 32.00 | 38.24 | 57.24 | 69.74 | 58.52 | 29.00 | 53.57 | 70.19 | 41.00 |
|
||||
| internlm2-chat-7b-sft-hf | 71.53 | 47.00 | 63.00 | 34.00 | 37.25 | 57.24 | 69.74 | 57.78 | 29.00 | 52.68 | 69.43 | 34.00 |
|
||||
| internlm2-chat-20b-hf | 76.39 | 51.00 | 61.00 | 37.00 | 40.20 | 62.76 | 78.95 | 67.41 | 33.00 | 46.43 | 75.09 | 42.00 |
|
||||
| internlm2-chat-20b-sft-hf | 77.08 | 49.00 | 60.00 | 39.00 | 39.22 | 64.14 | 79.61 | 68.15 | 35.00 | 46.43 | 75.09 | 42.00 |
|
||||
| llama-3-8b-instruct-hf | 81.94 | 48.00 | 58.00 | 43.00 | 48.04 | 60.69 | 76.32 | 71.11 | 33.00 | 54.46 | 73.58 | 46.00 |
|
||||
| llama-3-70b-instruct-hf | 93.06 | 56.00 | 70.00 | 60.00 | 60.78 | 77.24 | 93.42 | 79.26 | 53.00 | 71.43 | 86.42 | 66.00 |
|
||||
| llama-3-8b-instruct-lmdeploy | 79.17 | 47.00 | 53.00 | 36.00 | 49.02 | 60.00 | 73.68 | 68.89 | 36.00 | 55.36 | 73.96 | 42.00 |
|
||||
| llama-3-70b-instruct-lmdeploy | 93.75 | 57.00 | 66.00 | 61.00 | 65.69 | 77.93 | 92.11 | 78.52 | 55.00 | 70.54 | 86.42 | 64.00 |
|
||||
| mistral-7b-instruct-v0.1-hf | 57.64 | 35.00 | 50.00 | 31.00 | 24.51 | 51.72 | 58.55 | 45.93 | 35.00 | 41.07 | 56.98 | 32.00 |
|
||||
| mistral-7b-instruct-v0.2-hf | 70.14 | 42.00 | 49.00 | 35.00 | 43.14 | 54.48 | 65.79 | 56.30 | 29.00 | 42.86 | 65.28 | 37.00 |
|
||||
| mixtral-8x7b-instruct-v0.1-hf | 81.25 | 57.00 | 57.00 | 40.00 | 50.00 | 60.69 | 80.92 | 65.93 | 45.00 | 50.89 | 76.60 | 41.00 |
|
||||
|
||||
| model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology |
|
||||
|:-----------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:|
|
||||
| qwen1.5-0.5b-chat-hf | 41.75 | 38.89 | 49.15 | 26.60 | 48.48 | 50.41 | 24.69 | 42.00 | 32.35 | 31.75 | 31.00 | 32.35 |
|
||||
| qwen1.5-1.8b-chat-hf | 62.14 | 55.56 | 76.92 | 34.40 | 58.08 | 61.16 | 21.90 | 56.00 | 42.44 | 35.14 | 50.00 | 44.93 |
|
||||
| qwen1.5-4b-chat-hf | 73.79 | 58.50 | 82.05 | 47.16 | 74.24 | 71.90 | 32.29 | 69.00 | 58.40 | 40.74 | 58.00 | 53.76 |
|
||||
| qwen1.5-7b-chat-hf | 79.61 | 69.28 | 85.47 | 41.49 | 78.79 | 76.86 | 35.75 | 74.00 | 65.13 | 44.78 | 68.00 | 57.68 |
|
||||
| qwen1.5-14b-chat-hf | 82.52 | 70.26 | 87.18 | 51.77 | 85.86 | 82.64 | 53.74 | 81.00 | 76.05 | 47.98 | 76.00 | 67.48 |
|
||||
| qwen1.5-32b-chat-hf | 84.47 | 77.78 | 94.44 | 60.99 | 90.91 | 87.60 | 72.96 | 79.00 | 83.61 | 58.28 | 83.00 | 77.94 |
|
||||
| qwen1.5-72b-chat-hf | 89.32 | 85.95 | 93.59 | 61.35 | 90.91 | 86.78 | 75.98 | 83.00 | 84.87 | 60.30 | 83.00 | 81.05 |
|
||||
| qwen1.5-110b-chat-hf | 86.41 | 80.72 | 92.74 | 69.15 | 93.94 | 84.30 | 77.88 | 83.00 | 88.66 | 61.73 | 84.00 | 82.19 |
|
||||
| internlm2-chat-1.8b-hf | 72.82 | 50.65 | 69.23 | 35.46 | 56.06 | 56.20 | 27.82 | 60.00 | 49.16 | 33.83 | 54.00 | 43.79 |
|
||||
| internlm2-chat-1.8b-sft-hf | 71.84 | 52.61 | 68.80 | 34.75 | 55.56 | 53.72 | 27.04 | 58.00 | 48.74 | 34.09 | 54.00 | 44.61 |
|
||||
| internlm2-chat-7b-hf | 78.64 | 66.67 | 85.90 | 46.81 | 79.29 | 70.25 | 35.31 | 79.00 | 68.07 | 46.41 | 68.00 | 64.87 |
|
||||
| internlm2-chat-7b-sft-hf | 79.61 | 67.97 | 86.75 | 47.52 | 80.30 | 70.25 | 35.98 | 80.00 | 69.33 | 45.83 | 70.00 | 65.36 |
|
||||
| internlm2-chat-20b-hf | 80.58 | 75.16 | 90.17 | 52.13 | 83.84 | 80.99 | 39.33 | 80.00 | 70.59 | 49.67 | 75.00 | 70.26 |
|
||||
| internlm2-chat-20b-sft-hf | 80.58 | 76.14 | 91.03 | 53.19 | 84.34 | 80.99 | 36.31 | 77.00 | 71.85 | 49.61 | 77.00 | 70.59 |
|
||||
| llama-3-8b-instruct-hf | 82.52 | 79.41 | 91.45 | 52.48 | 80.30 | 79.34 | 46.26 | 75.00 | 76.89 | 49.61 | 85.00 | 72.22 |
|
||||
| llama-3-70b-instruct-hf | 89.32 | 87.58 | 93.16 | 66.67 | 92.42 | 90.08 | 76.20 | 83.00 | 89.50 | 64.67 | 92.00 | 87.09 |
|
||||
| llama-3-8b-instruct-lmdeploy | 87.38 | 79.41 | 90.17 | 52.48 | 79.80 | 78.51 | 44.25 | 75.00 | 74.37 | 48.76 | 84.00 | 69.61 |
|
||||
| llama-3-70b-instruct-lmdeploy | 90.29 | 88.56 | 93.59 | 65.96 | 92.93 | 89.26 | 75.75 | 83.00 | 89.92 | 63.95 | 92.00 | 86.60 |
|
||||
| mistral-7b-instruct-v0.1-hf | 69.90 | 59.80 | 85.47 | 38.65 | 69.70 | 65.29 | 37.54 | 69.00 | 51.26 | 37.81 | 65.00 | 52.45 |
|
||||
| mistral-7b-instruct-v0.2-hf | 74.76 | 66.99 | 88.89 | 43.97 | 75.25 | 76.86 | 42.01 | 73.00 | 62.61 | 42.24 | 67.00 | 62.25 |
|
||||
| mixtral-8x7b-instruct-v0.1-hf | 85.44 | 80.39 | 92.74 | 55.32 | 85.35 | 82.64 | 48.38 | 78.00 | 75.21 | 53.52 | 75.00 | 74.02 |
|
||||
|
||||
| model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history |
|
||||
|:-----------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:|
|
||||
| qwen1.5-0.5b-chat-hf | 42.59 | 24.56 | 39.87 | 39.76 | 29.06 | 38.18 | 35.64 | 38.93 | 27.78 | 29.80 | 34.00 | 48.48 |
|
||||
| qwen1.5-1.8b-chat-hf | 50.93 | 56.73 | 44.37 | 42.77 | 35.96 | 51.82 | 38.46 | 49.62 | 35.45 | 27.15 | 47.00 | 63.03 |
|
||||
| qwen1.5-4b-chat-hf | 71.30 | 65.50 | 58.20 | 50.00 | 44.33 | 57.27 | 54.10 | 61.83 | 43.65 | 41.06 | 60.00 | 72.12 |
|
||||
| qwen1.5-7b-chat-hf | 76.85 | 76.61 | 68.49 | 48.80 | 51.72 | 64.55 | 59.23 | 68.70 | 48.94 | 37.09 | 69.00 | 79.39 |
|
||||
| qwen1.5-14b-chat-hf | 75.93 | 80.70 | 69.13 | 51.20 | 55.67 | 64.55 | 67.69 | 74.05 | 57.14 | 47.02 | 74.00 | 82.42 |
|
||||
| qwen1.5-32b-chat-hf | 83.33 | 89.47 | 82.64 | 60.84 | 62.56 | 70.00 | 76.67 | 83.21 | 67.46 | 59.60 | 85.00 | 84.85 |
|
||||
| qwen1.5-72b-chat-hf | 86.11 | 89.47 | 80.71 | 59.04 | 68.47 | 72.73 | 80.00 | 87.79 | 67.72 | 52.32 | 79.00 | 85.45 |
|
||||
| qwen1.5-110b-chat-hf | 83.33 | 87.13 | 81.03 | 54.22 | 69.95 | 73.64 | 78.21 | 87.02 | 75.93 | 57.62 | 84.00 | 88.48 |
|
||||
| internlm2-chat-1.8b-hf | 52.78 | 60.82 | 49.20 | 42.77 | 42.36 | 50.00 | 47.18 | 53.44 | 32.54 | 31.79 | 39.00 | 60.00 |
|
||||
| internlm2-chat-1.8b-sft-hf | 53.70 | 61.40 | 50.16 | 42.17 | 40.89 | 50.00 | 47.69 | 51.15 | 32.54 | 29.14 | 40.00 | 59.39 |
|
||||
| internlm2-chat-7b-hf | 73.15 | 81.87 | 67.85 | 47.59 | 49.75 | 62.73 | 61.79 | 66.41 | 44.97 | 33.77 | 71.00 | 81.82 |
|
||||
| internlm2-chat-7b-sft-hf | 73.15 | 81.87 | 66.88 | 48.19 | 48.77 | 63.64 | 62.31 | 65.65 | 45.77 | 33.77 | 72.00 | 81.82 |
|
||||
| internlm2-chat-20b-hf | 80.56 | 81.87 | 72.99 | 55.42 | 54.19 | 70.00 | 67.95 | 71.76 | 48.15 | 39.74 | 75.00 | 80.00 |
|
||||
| internlm2-chat-20b-sft-hf | 81.48 | 79.53 | 72.99 | 54.82 | 54.19 | 69.09 | 67.95 | 71.76 | 48.94 | 41.06 | 75.00 | 80.00 |
|
||||
| llama-3-8b-instruct-hf | 76.85 | 79.53 | 72.35 | 53.61 | 54.19 | 70.91 | 66.41 | 80.92 | 49.47 | 46.36 | 71.00 | 75.15 |
|
||||
| llama-3-70b-instruct-hf | 87.04 | 88.30 | 82.64 | 56.02 | 67.49 | 74.55 | 86.41 | 88.55 | 74.34 | 65.56 | 91.00 | 86.06 |
|
||||
| llama-3-8b-instruct-lmdeploy | 77.78 | 79.53 | 70.74 | 52.41 | 53.20 | 68.18 | 65.38 | 79.39 | 50.79 | 37.75 | 72.00 | 76.97 |
|
||||
| llama-3-70b-instruct-lmdeploy | 87.96 | 90.64 | 83.28 | 54.82 | 69.46 | 73.64 | 86.92 | 87.02 | 74.87 | 66.23 | 92.00 | 85.45 |
|
||||
| mistral-7b-instruct-v0.1-hf | 64.81 | 70.18 | 63.67 | 41.57 | 38.92 | 68.18 | 49.49 | 61.83 | 33.33 | 32.45 | 55.00 | 66.67 |
|
||||
| mistral-7b-instruct-v0.2-hf | 70.37 | 80.12 | 64.95 | 50.60 | 50.74 | 68.18 | 54.36 | 71.76 | 40.74 | 35.10 | 60.00 | 73.33 |
|
||||
| mixtral-8x7b-instruct-v0.1-hf | 79.63 | 87.72 | 73.63 | 54.82 | 61.58 | 67.27 | 69.49 | 83.21 | 52.91 | 47.02 | 74.00 | 80.61 |
|
||||
|
||||
| model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | logical_fallacies | high_school_world_history | professional_medicine |
|
||||
|:-----------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:|
|
||||
| qwen1.5-0.5b-chat-hf | 45.00 | 41.04 | 30.09 | 39.21 | 24.60 | 35.23 | 33.95 | 25.31 | 36.13 | 31.29 | 49.37 | 38.24 |
|
||||
| qwen1.5-1.8b-chat-hf | 54.00 | 50.29 | 34.26 | 58.49 | 24.60 | 55.96 | 47.53 | 39.18 | 47.74 | 44.17 | 64.98 | 40.81 |
|
||||
| qwen1.5-4b-chat-hf | 61.00 | 64.16 | 46.30 | 71.01 | 39.68 | 72.02 | 54.01 | 65.31 | 63.55 | 63.80 | 71.31 | 51.10 |
|
||||
| qwen1.5-7b-chat-hf | 69.00 | 67.05 | 50.93 | 76.25 | 53.17 | 82.38 | 62.96 | 71.02 | 73.23 | 68.10 | 76.79 | 60.29 |
|
||||
| qwen1.5-14b-chat-hf | 74.00 | 75.14 | 58.33 | 82.89 | 51.59 | 88.60 | 69.44 | 77.96 | 84.19 | 73.62 | 82.70 | 71.32 |
|
||||
| qwen1.5-32b-chat-hf | 80.00 | 80.64 | 70.83 | 89.40 | 60.32 | 94.82 | 81.79 | 79.59 | 90.00 | 86.50 | 88.61 | 80.15 |
|
||||
| qwen1.5-72b-chat-hf | 80.00 | 82.95 | 68.98 | 91.83 | 57.14 | 98.45 | 86.73 | 78.78 | 89.03 | 87.12 | 91.14 | 83.82 |
|
||||
| qwen1.5-110b-chat-hf | 79.00 | 78.03 | 67.13 | 92.98 | 62.70 | 97.93 | 87.04 | 74.29 | 88.71 | 82.82 | 91.14 | 84.93 |
|
||||
| internlm2-chat-1.8b-hf | 48.00 | 49.13 | 44.91 | 57.60 | 26.98 | 61.14 | 50.62 | 51.02 | 52.58 | 57.67 | 67.51 | 37.50 |
|
||||
| internlm2-chat-1.8b-sft-hf | 50.00 | 49.13 | 44.91 | 57.73 | 28.57 | 61.66 | 49.69 | 51.02 | 49.68 | 57.67 | 66.67 | 38.60 |
|
||||
| internlm2-chat-7b-hf | 65.00 | 65.61 | 49.54 | 80.84 | 43.65 | 88.08 | 70.99 | 68.98 | 78.39 | 75.46 | 82.28 | 61.76 |
|
||||
| internlm2-chat-7b-sft-hf | 64.00 | 66.18 | 52.31 | 81.35 | 46.03 | 88.08 | 71.60 | 67.76 | 78.39 | 77.30 | 82.28 | 63.60 |
|
||||
| internlm2-chat-20b-hf | 74.00 | 73.70 | 59.72 | 81.86 | 46.83 | 89.12 | 74.69 | 75.92 | 80.65 | 79.14 | 82.70 | 70.59 |
|
||||
| internlm2-chat-20b-sft-hf | 76.00 | 73.12 | 60.19 | 81.99 | 43.65 | 88.60 | 74.38 | 73.88 | 80.32 | 80.37 | 82.70 | 70.59 |
|
||||
| llama-3-8b-instruct-hf | 72.00 | 73.12 | 55.09 | 84.55 | 50.00 | 90.67 | 77.16 | 77.55 | 81.61 | 77.91 | 84.81 | 75.00 |
|
||||
| llama-3-70b-instruct-hf | 85.00 | 85.26 | 75.00 | 92.72 | 69.05 | 97.41 | 90.43 | 82.04 | 91.61 | 87.12 | 94.09 | 89.71 |
|
||||
| llama-3-8b-instruct-lmdeploy | 72.00 | 72.83 | 52.78 | 82.12 | 51.59 | 89.64 | 76.85 | 76.73 | 80.97 | 76.69 | 84.39 | 74.63 |
|
||||
| llama-3-70b-instruct-lmdeploy | 85.00 | 84.39 | 73.61 | 92.72 | 67.46 | 97.93 | 89.81 | 81.63 | 90.65 | 87.12 | 93.25 | 89.34 |
|
||||
| mistral-7b-instruct-v0.1-hf | 55.00 | 57.51 | 39.81 | 74.07 | 39.68 | 75.65 | 57.72 | 62.04 | 59.35 | 69.33 | 67.93 | 55.88 |
|
||||
| mistral-7b-instruct-v0.2-hf | 61.00 | 66.76 | 46.76 | 78.67 | 36.51 | 84.97 | 68.83 | 70.20 | 68.39 | 69.33 | 73.00 | 58.09 |
|
||||
| mixtral-8x7b-instruct-v0.1-hf | 66.00 | 76.59 | 57.87 | 86.59 | 50.00 | 93.78 | 83.02 | 79.18 | 82.58 | 75.46 | 86.50 | 77.94 |
|
||||
|
||||
| model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy |
|
||||
|:-----------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:|
|
||||
| qwen1.5-0.5b-chat-hf | 24.44 | 35.26 | 42.16 | 47.26 | 29.82 | 40.55 | 32.29 | 47.00 |
|
||||
| qwen1.5-1.8b-chat-hf | 32.22 | 43.35 | 54.90 | 48.26 | 28.95 | 61.83 | 48.43 | 71.00 |
|
||||
| qwen1.5-4b-chat-hf | 36.30 | 51.45 | 71.08 | 76.62 | 34.21 | 72.29 | 58.30 | 72.00 |
|
||||
| qwen1.5-7b-chat-hf | 31.11 | 61.27 | 76.47 | 79.10 | 42.11 | 81.28 | 61.43 | 83.00 |
|
||||
| qwen1.5-14b-chat-hf | 41.48 | 68.79 | 80.88 | 82.59 | 48.25 | 84.40 | 72.20 | 88.00 |
|
||||
| qwen1.5-32b-chat-hf | 48.52 | 75.72 | 88.73 | 86.07 | 57.02 | 90.46 | 78.03 | 95.00 |
|
||||
| qwen1.5-72b-chat-hf | 51.48 | 73.99 | 90.69 | 87.06 | 59.65 | 92.11 | 79.37 | 94.00 |
|
||||
| qwen1.5-110b-chat-hf | 52.22 | 76.30 | 93.14 | 87.56 | 62.28 | 91.56 | 80.27 | 88.00 |
|
||||
| internlm2-chat-1.8b-hf | 31.48 | 46.82 | 56.37 | 65.17 | 28.07 | 65.87 | 50.22 | 69.00 |
|
||||
| internlm2-chat-1.8b-sft-hf | 30.74 | 47.40 | 54.41 | 64.18 | 29.82 | 66.24 | 48.43 | 69.00 |
|
||||
| internlm2-chat-7b-hf | 33.70 | 67.05 | 79.90 | 81.09 | 48.25 | 84.04 | 67.26 | 84.00 |
|
||||
| internlm2-chat-7b-sft-hf | 35.19 | 67.05 | 79.90 | 80.60 | 48.25 | 84.59 | 65.47 | 85.00 |
|
||||
| internlm2-chat-20b-hf | 36.30 | 66.47 | 88.73 | 85.07 | 51.75 | 85.69 | 70.85 | 87.00 |
|
||||
| internlm2-chat-20b-sft-hf | 35.93 | 65.90 | 87.75 | 85.57 | 52.63 | 84.77 | 70.85 | 87.00 |
|
||||
| llama-3-8b-instruct-hf | 36.67 | 68.79 | 83.82 | 86.57 | 61.40 | 84.95 | 70.85 | 85.00 |
|
||||
| llama-3-70b-instruct-hf | 57.41 | 78.61 | 89.71 | 91.54 | 74.56 | 94.50 | 82.96 | 94.00 |
|
||||
| llama-3-8b-instruct-lmdeploy | 38.52 | 68.79 | 82.84 | 85.57 | 54.39 | 85.50 | 69.96 | 83.00 |
|
||||
| llama-3-70b-instruct-lmdeploy | 54.81 | 79.77 | 90.20 | 92.04 | 71.05 | 94.50 | 82.96 | 93.00 |
|
||||
| mistral-7b-instruct-v0.1-hf | 28.89 | 50.29 | 67.16 | 76.12 | 39.47 | 72.29 | 62.33 | 77.00 |
|
||||
| mistral-7b-instruct-v0.2-hf | 30.74 | 53.18 | 73.04 | 77.11 | 42.11 | 79.82 | 63.68 | 82.00 |
|
||||
| mixtral-8x7b-instruct-v0.1-hf | 35.56 | 73.41 | 85.29 | 87.06 | 60.53 | 86.97 | 74.44 | 86.00 |
|
64
configs/datasets/nq/README.md
Normal file
@ -0,0 +1,64 @@
|
||||
# NQ
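The scores below were produced with OpenCompass. As a rough reproduction sketch (it reuses the `nq_open_1shot_gen_20a989` and `nq_open_1shot_gen_01cf41` dataset configs referenced elsewhere in this commit, with the InternLM2 7B configs purely as example models):

```bash
# Base models use the 1-shot NQ config; chat models use the chat-format variant of the same task.
python3 run.py --models hf_internlm2_7b --datasets nq_open_1shot_gen_20a989 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets nq_open_1shot_gen_01cf41 --debug
```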
|
||||
|
||||
## Base Models
|
||||
|
||||
| model | nq |
|
||||
|:------------------------:|------:|
|
||||
| llama-7b-turbomind | 15.10 |
|
||||
| llama-13b-turbomind | 16.43 |
|
||||
| llama-30b-turbomind | 22.11 |
|
||||
| llama-65b-turbomind | 26.09 |
|
||||
| llama-2-7b-turbomind | 14.35 |
|
||||
| llama-2-13b-turbomind | 21.69 |
|
||||
| llama-2-70b-turbomind | 23.27 |
|
||||
| llama-3-8b-turbomind | 18.78 |
|
||||
| llama-3-70b-turbomind | 31.88 |
|
||||
| internlm2-1.8b-turbomind | 20.66 |
|
||||
| internlm2-7b-turbomind | 41.05 |
|
||||
| internlm2-20b-turbomind | 43.55 |
|
||||
| qwen-1.8b-turbomind | 5.68 |
|
||||
| qwen-7b-turbomind | 17.87 |
|
||||
| qwen-14b-turbomind | 13.77 |
|
||||
| qwen-72b-turbomind | 18.20 |
|
||||
| qwen1.5-0.5b-hf | 6.01 |
|
||||
| qwen1.5-1.8b-hf | 10.28 |
|
||||
| qwen1.5-4b-hf | 15.73 |
|
||||
| qwen1.5-7b-hf | 18.61 |
|
||||
| qwen1.5-14b-hf | 16.07 |
|
||||
| qwen1.5-32b-hf | 21.75 |
|
||||
| qwen1.5-72b-hf | 20.53 |
|
||||
| qwen1.5-moe-a2-7b-hf | 16.62 |
|
||||
| mistral-7b-v0.1-hf | 20.66 |
|
||||
| mistral-7b-v0.2-hf | 20.78 |
|
||||
| mixtral-8x7b-v0.1-hf | 24.85 |
|
||||
| mixtral-8x22b-v0.1-hf | 34.43 |
|
||||
| yi-6b-hf | 10.08 |
|
||||
| yi-34b-hf | 13.96 |
|
||||
| deepseek-7b-base-hf | 8.45 |
|
||||
| deepseek-67b-base-hf | 17.59 |
|
||||
|
||||
## Chat Models
|
||||
|
||||
| model | nq |
|
||||
|:-----------------------------:|------:|
|
||||
| qwen1.5-0.5b-chat-hf | 7.42 |
|
||||
| qwen1.5-1.8b-chat-hf | 10.22 |
|
||||
| qwen1.5-4b-chat-hf | 19.31 |
|
||||
| qwen1.5-7b-chat-hf | 16.87 |
|
||||
| qwen1.5-14b-chat-hf | 20.53 |
|
||||
| qwen1.5-32b-chat-hf | 25.26 |
|
||||
| qwen1.5-72b-chat-hf | 35.21 |
|
||||
| qwen1.5-110b-chat-hf | 36.98 |
|
||||
| internlm2-chat-1.8b-hf | 19.09 |
|
||||
| internlm2-chat-1.8b-sft-hf | 18.14 |
|
||||
| internlm2-chat-7b-hf | 28.73 |
|
||||
| internlm2-chat-7b-sft-hf | 30.78 |
|
||||
| internlm2-chat-20b-hf | 28.75 |
|
||||
| internlm2-chat-20b-sft-hf | 34.10 |
|
||||
| llama-3-8b-instruct-hf | 30.17 |
|
||||
| llama-3-70b-instruct-hf | 40.25 |
|
||||
| llama-3-8b-instruct-lmdeploy | 28.28 |
|
||||
| llama-3-70b-instruct-lmdeploy | 39.14 |
|
||||
| mistral-7b-instruct-v0.1-hf | 22.47 |
|
||||
| mistral-7b-instruct-v0.2-hf | 25.18 |
|
||||
| mixtral-8x7b-instruct-v0.1-hf | 32.08 |
|
64
configs/datasets/race/README.md
Normal file
@ -0,0 +1,64 @@
|
||||
# RACE
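The numbers below come from OpenCompass runs. A reproduction command of roughly this form should work (a sketch; the `race_ppl_abed12` and `race_gen_69ee4f` dataset configs are the ones used by the model READMEs in this commit, and the InternLM2 7B configs are only example models):

```bash
# Perplexity-style config for base models, generation-style config for chat models.
python3 run.py --models hf_internlm2_7b --datasets race_ppl_abed12 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets race_gen_69ee4f --debug
```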
|
||||
|
||||
## Base Models
|
||||
|
||||
| model | race-high | race-middle |
|
||||
|:------------------------:|------------:|--------------:|
|
||||
| llama-7b-turbomind | 31.30 | 29.53 |
|
||||
| llama-13b-turbomind | 35.56 | 40.25 |
|
||||
| llama-30b-turbomind | 57.35 | 55.78 |
|
||||
| llama-65b-turbomind | 70.21 | 75.35 |
|
||||
| llama-2-7b-turbomind | 39.74 | 46.73 |
|
||||
| llama-2-13b-turbomind | 57.06 | 60.52 |
|
||||
| llama-2-70b-turbomind | 79.02 | 82.17 |
|
||||
| llama-3-8b-turbomind | 67.75 | 73.61 |
|
||||
| llama-3-70b-turbomind | 85.79 | 90.25 |
|
||||
| internlm2-1.8b-turbomind | 64.72 | 70.40 |
|
||||
| internlm2-7b-turbomind | 72.56 | 74.16 |
|
||||
| internlm2-20b-turbomind | 72.90 | 74.03 |
|
||||
| qwen-1.8b-turbomind | 63.09 | 69.29 |
|
||||
| qwen-7b-turbomind | 80.30 | 85.38 |
|
||||
| qwen-14b-turbomind | 88.11 | 92.06 |
|
||||
| qwen-72b-turbomind | 90.62 | 93.59 |
|
||||
| qwen1.5-0.5b-hf | 54.66 | 60.38 |
|
||||
| qwen1.5-1.8b-hf | 67.27 | 73.33 |
|
||||
| qwen1.5-4b-hf | 78.50 | 83.29 |
|
||||
| qwen1.5-7b-hf | 82.73 | 86.70 |
|
||||
| qwen1.5-14b-hf | 87.99 | 91.85 |
|
||||
| qwen1.5-32b-hf | 90.57 | 93.25 |
|
||||
| qwen1.5-72b-hf | 90.45 | 93.87 |
|
||||
| qwen1.5-moe-a2-7b-hf | 79.56 | 83.57 |
|
||||
| mistral-7b-v0.1-hf | 73.58 | 76.25 |
|
||||
| mistral-7b-v0.2-hf | 73.67 | 77.09 |
|
||||
| mixtral-8x7b-v0.1-hf | 80.13 | 84.61 |
|
||||
| mixtral-8x22b-v0.1-hf | 86.56 | 89.62 |
|
||||
| yi-6b-hf | 82.93 | 85.72 |
|
||||
| yi-34b-hf | 90.94 | 92.76 |
|
||||
| deepseek-7b-base-hf | 50.91 | 56.82 |
|
||||
| deepseek-67b-base-hf | 83.53 | 88.23 |
|
||||
|
||||
## Chat Models
|
||||
|
||||
| model | race-high | race-middle |
|
||||
|:-----------------------------:|------------:|--------------:|
|
||||
| qwen1.5-0.5b-chat-hf | 49.03 | 52.79 |
|
||||
| qwen1.5-1.8b-chat-hf | 66.24 | 72.91 |
|
||||
| qwen1.5-4b-chat-hf | 73.53 | 80.29 |
|
||||
| qwen1.5-7b-chat-hf | 83.28 | 88.09 |
|
||||
| qwen1.5-14b-chat-hf | 87.51 | 91.36 |
|
||||
| qwen1.5-32b-chat-hf | 91.22 | 93.52 |
|
||||
| qwen1.5-72b-chat-hf | 91.11 | 93.38 |
|
||||
| qwen1.5-110b-chat-hf | 92.31 | 93.66 |
|
||||
| internlm2-chat-1.8b-hf | 73.87 | 81.13 |
|
||||
| internlm2-chat-1.8b-sft-hf | 73.81 | 81.69 |
|
||||
| internlm2-chat-7b-hf | 84.51 | 88.72 |
|
||||
| internlm2-chat-7b-sft-hf | 84.88 | 89.90 |
|
||||
| internlm2-chat-20b-hf | 88.02 | 91.43 |
|
||||
| internlm2-chat-20b-sft-hf | 88.11 | 91.57 |
|
||||
| llama-3-8b-instruct-hf | 81.22 | 86.63 |
|
||||
| llama-3-70b-instruct-hf | 89.57 | 93.45 |
|
||||
| llama-3-8b-instruct-lmdeploy | 81.02 | 86.14 |
|
||||
| llama-3-70b-instruct-lmdeploy | 89.34 | 93.25 |
|
||||
| mistral-7b-instruct-v0.1-hf | 69.75 | 74.72 |
|
||||
| mistral-7b-instruct-v0.2-hf | 73.30 | 77.58 |
|
||||
| mixtral-8x7b-instruct-v0.1-hf | 81.88 | 87.26 |
|
64
configs/datasets/triviaqa/README.md
Normal file
@ -0,0 +1,64 @@
|
||||
# TriviaQA
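The results below were produced with OpenCompass. As a reproduction sketch (dataset config names taken from the model READMEs in this commit; the InternLM2 7B configs are only example models):

```bash
# 1-shot Wikipedia-evidence TriviaQA; separate configs for base and chat models.
python3 run.py --models hf_internlm2_7b --datasets triviaqa_wiki_1shot_gen_20a989 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets triviaqa_wiki_1shot_gen_eaf81e --debug
```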
|
||||
|
||||
## Base Models
|
||||
|
||||
| model | triviaqa |
|
||||
|:------------------------:|-----------:|
|
||||
| llama-7b-turbomind | 40.39 |
|
||||
| llama-13b-turbomind | 66.41 |
|
||||
| llama-30b-turbomind | 75.90 |
|
||||
| llama-65b-turbomind | 82.26 |
|
||||
| llama-2-7b-turbomind | 43.21 |
|
||||
| llama-2-13b-turbomind | 71.32 |
|
||||
| llama-2-70b-turbomind | 67.45 |
|
||||
| llama-3-8b-turbomind | 71.24 |
|
||||
| llama-3-70b-turbomind | 88.16 |
|
||||
| internlm2-1.8b-turbomind | 38.42 |
|
||||
| internlm2-7b-turbomind | 69.15 |
|
||||
| internlm2-20b-turbomind | 74.03 |
|
||||
| qwen-1.8b-turbomind | 22.76 |
|
||||
| qwen-7b-turbomind | 53.61 |
|
||||
| qwen-14b-turbomind | 49.72 |
|
||||
| qwen-72b-turbomind | 79.13 |
|
||||
| qwen1.5-0.5b-hf | 21.24 |
|
||||
| qwen1.5-1.8b-hf | 34.32 |
|
||||
| qwen1.5-4b-hf | 44.59 |
|
||||
| qwen1.5-7b-hf | 56.60 |
|
||||
| qwen1.5-14b-hf | 59.96 |
|
||||
| qwen1.5-32b-hf | 56.20 |
|
||||
| qwen1.5-72b-hf | 77.81 |
|
||||
| qwen1.5-moe-a2-7b-hf | 65.49 |
|
||||
| mistral-7b-v0.1-hf | 72.93 |
|
||||
| mistral-7b-v0.2-hf | 70.91 |
|
||||
| mixtral-8x7b-v0.1-hf | 85.05 |
|
||||
| mixtral-8x22b-v0.1-hf | 89.47 |
|
||||
| yi-6b-hf | 23.76 |
|
||||
| yi-34b-hf | 14.73 |
|
||||
| deepseek-7b-base-hf | 59.48 |
|
||||
| deepseek-67b-base-hf | 72.15 |
|
||||
|
||||
## Chat Models
|
||||
|
||||
| model | triviaqa |
|
||||
|:-----------------------------:|-----------:|
|
||||
| qwen1.5-0.5b-chat-hf | 19.84 |
|
||||
| qwen1.5-1.8b-chat-hf | 35.81 |
|
||||
| qwen1.5-4b-chat-hf | 48.93 |
|
||||
| qwen1.5-7b-chat-hf | 53.65 |
|
||||
| qwen1.5-14b-chat-hf | 62.58 |
|
||||
| qwen1.5-32b-chat-hf | 74.72 |
|
||||
| qwen1.5-72b-chat-hf | 83.25 |
|
||||
| qwen1.5-110b-chat-hf | 86.20 |
|
||||
| internlm2-chat-1.8b-hf | 46.69 |
|
||||
| internlm2-chat-1.8b-sft-hf | 46.50 |
|
||||
| internlm2-chat-7b-hf | 69.54 |
|
||||
| internlm2-chat-7b-sft-hf | 70.75 |
|
||||
| internlm2-chat-20b-hf | 75.53 |
|
||||
| internlm2-chat-20b-sft-hf | 75.90 |
|
||||
| llama-3-8b-instruct-hf | 78.99 |
|
||||
| llama-3-70b-instruct-hf | 89.79 |
|
||||
| llama-3-8b-instruct-lmdeploy | 76.77 |
|
||||
| llama-3-70b-instruct-lmdeploy | 89.62 |
|
||||
| mistral-7b-instruct-v0.1-hf | 62.94 |
|
||||
| mistral-7b-instruct-v0.2-hf | 67.72 |
|
||||
| mixtral-8x7b-instruct-v0.1-hf | 79.57 |
|
64
configs/datasets/winogrande/README.md
Normal file
@ -0,0 +1,64 @@
|
||||
# WinoGrande
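The results below were produced with OpenCompass. As a reproduction sketch (dataset config names taken from the model READMEs in this commit; the InternLM2 7B configs are only example models):

```bash
# 5-shot WinoGrande: log-likelihood scoring for base models, generation for chat models.
python3 run.py --models hf_internlm2_7b --datasets winogrande_5shot_ll_252f01 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets winogrande_5shot_gen_b36770 --debug
```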
|
||||
|
||||
## Base Models
|
||||
|
||||
| model | winogrande |
|
||||
|:------------------------:|-------------:|
|
||||
| llama-7b-turbomind | 71.19 |
|
||||
| llama-13b-turbomind | 76.16 |
|
||||
| llama-30b-turbomind | 80.66 |
|
||||
| llama-65b-turbomind | 82.16 |
|
||||
| llama-2-7b-turbomind | 74.03 |
|
||||
| llama-2-13b-turbomind | 76.48 |
|
||||
| llama-2-70b-turbomind | 83.98 |
|
||||
| llama-3-8b-turbomind | 77.82 |
|
||||
| llama-3-70b-turbomind | 83.43 |
|
||||
| internlm2-1.8b-turbomind | 66.77 |
|
||||
| internlm2-7b-turbomind | 83.50 |
|
||||
| internlm2-20b-turbomind | 84.69 |
|
||||
| qwen-1.8b-turbomind | 61.25 |
|
||||
| qwen-7b-turbomind | 72.06 |
|
||||
| qwen-14b-turbomind | 72.45 |
|
||||
| qwen-72b-turbomind | 82.56 |
|
||||
| qwen1.5-0.5b-hf | 57.38 |
|
||||
| qwen1.5-1.8b-hf | 60.46 |
|
||||
| qwen1.5-4b-hf | 65.90 |
|
||||
| qwen1.5-7b-hf | 70.01 |
|
||||
| qwen1.5-14b-hf | 72.93 |
|
||||
| qwen1.5-32b-hf | 78.69 |
|
||||
| qwen1.5-72b-hf | 80.74 |
|
||||
| qwen1.5-moe-a2-7b-hf | 71.43 |
|
||||
| mistral-7b-v0.1-hf | 78.30 |
|
||||
| mistral-7b-v0.2-hf | 77.51 |
|
||||
| mixtral-8x7b-v0.1-hf | 81.53 |
|
||||
| mixtral-8x22b-v0.1-hf | 86.50 |
|
||||
| yi-6b-hf | 74.35 |
|
||||
| yi-34b-hf | 79.01 |
|
||||
| deepseek-7b-base-hf | 74.11 |
|
||||
| deepseek-67b-base-hf | 79.32 |
|
||||
|
||||
## Chat Models
|
||||
|
||||
| model | winogrande |
|
||||
|:-----------------------------:|-------------:|
|
||||
| qwen1.5-0.5b-chat-hf | 50.51 |
|
||||
| qwen1.5-1.8b-chat-hf | 51.07 |
|
||||
| qwen1.5-4b-chat-hf | 57.54 |
|
||||
| qwen1.5-7b-chat-hf | 65.27 |
|
||||
| qwen1.5-14b-chat-hf | 70.09 |
|
||||
| qwen1.5-32b-chat-hf | 77.90 |
|
||||
| qwen1.5-72b-chat-hf | 80.82 |
|
||||
| qwen1.5-110b-chat-hf | 82.32 |
|
||||
| internlm2-chat-1.8b-hf | 57.62 |
|
||||
| internlm2-chat-1.8b-sft-hf | 57.93 |
|
||||
| internlm2-chat-7b-hf | 73.56 |
|
||||
| internlm2-chat-7b-sft-hf | 73.80 |
|
||||
| internlm2-chat-20b-hf | 81.06 |
|
||||
| internlm2-chat-20b-sft-hf | 81.37 |
|
||||
| llama-3-8b-instruct-hf | 66.22 |
|
||||
| llama-3-70b-instruct-hf | 81.29 |
|
||||
| llama-3-8b-instruct-lmdeploy | 66.93 |
|
||||
| llama-3-70b-instruct-lmdeploy | 81.22 |
|
||||
| mistral-7b-instruct-v0.1-hf | 58.56 |
|
||||
| mistral-7b-instruct-v0.2-hf | 59.43 |
|
||||
| mixtral-8x7b-instruct-v0.1-hf | 65.75 |
|
@ -1,7 +1,7 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from .datasets.CHARM.charm_rea_gen_f8fca2 import charm_rea_datasets as datasets
|
||||
from .datasets.CHARM.charm_reason_gen_f8fca2 import charm_reason_datasets as datasets
|
||||
from .models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_7b_chat_model
|
||||
|
||||
# from models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model
|
||||
@ -34,19 +34,19 @@ work_dir = './outputs/CHARM/chat/'
|
||||
|
||||
# dataset version metric mode internlm2-chat-7b-turbomind
|
||||
# ------------------------------------------------------------- --------- ------------- ------ -----------------------------
|
||||
# charm-rea-Direct - naive_average gen 49.51
|
||||
# charm-rea-ZH-CoT - naive_average gen 61.33
|
||||
# charm-rea-EN-CoT - naive_average gen 54.55
|
||||
# charm-rea-XLT - naive_average gen 58.46
|
||||
# charm-rea-Translate-EN - naive_average gen 56.15
|
||||
# charm-reason-Direct - naive_average gen 49.51
|
||||
# charm-reason-ZH-CoT - naive_average gen 61.33
|
||||
# charm-reason-EN-CoT - naive_average gen 54.55
|
||||
# charm-reason-XLT - naive_average gen 58.46
|
||||
# charm-reason-Translate-EN - naive_average gen 56.15
|
||||
# - - - -
|
||||
# charm-rea-Chinese_Direct - naive_average gen 47.14
|
||||
# charm-rea-Chinese_ZH-CoT - naive_average gen 58.40
|
||||
# charm-rea-Chinese_EN-CoT - naive_average gen 48.31
|
||||
# charm-rea-Chinese_XLT - naive_average gen 53.57
|
||||
# charm-rea-Chinese_Translate-EN - naive_average gen 48.21
|
||||
# charm-rea-Global_Direct - naive_average gen 51.88
|
||||
# charm-rea-Global_ZH-CoT - naive_average gen 64.26
|
||||
# charm-rea-Global_EN-CoT - naive_average gen 60.79
|
||||
# charm-rea-Global_XLT - naive_average gen 63.36
|
||||
# charm-rea-Global_Translate-EN - naive_average gen 64.10
|
||||
# charm-reason-Chinese_Direct - naive_average gen 47.14
|
||||
# charm-reason-Chinese_ZH-CoT - naive_average gen 58.40
|
||||
# charm-reason-Chinese_EN-CoT - naive_average gen 48.31
|
||||
# charm-reason-Chinese_XLT - naive_average gen 53.57
|
||||
# charm-reason-Chinese_Translate-EN - naive_average gen 48.21
|
||||
# charm-reason-Global_Direct - naive_average gen 51.88
|
||||
# charm-reason-Global_ZH-CoT - naive_average gen 64.26
|
||||
# charm-reason-Global_EN-CoT - naive_average gen 60.79
|
||||
# charm-reason-Global_XLT - naive_average gen 63.36
|
||||
# charm-reason-Global_Translate-EN - naive_average gen 64.10
|
||||
|
@ -4,7 +4,6 @@ with read_base():
|
||||
from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets
|
||||
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
|
||||
from opencompass.models.openai_api import OpenAIAllesAPIN
|
||||
from opencompass.partitioners import NaivePartitioner, SizePartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
||||
|
@ -5,7 +5,7 @@ with read_base():
|
||||
from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
|
||||
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
|
||||
from opencompass.models.openai_api import OpenAI, OpenAIAllesAPIN
|
||||
from opencompass.models.openai_api import OpenAI
|
||||
from opencompass.partitioners import NaivePartitioner, SizePartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
||||
|
@ -4,7 +4,7 @@ with read_base():
|
||||
from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
|
||||
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
|
||||
from opencompass.models.openai_api import OpenAI, OpenAIAllesAPIN
|
||||
from opencompass.models.openai_api import OpenAI
|
||||
from opencompass.partitioners import NaivePartitioner, SizePartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
||||
|
@ -6,7 +6,6 @@ with read_base():
|
||||
from .datasets.subjective.compassarena.compassarena_compare import subjective_datasets
|
||||
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
|
||||
from opencompass.models.openai_api import OpenAIAllesAPIN
|
||||
from opencompass.partitioners import NaivePartitioner, SizePartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
||||
|
@ -4,7 +4,6 @@ with read_base():
|
||||
from .datasets.subjective.creationbench.creationbench_judgeby_gpt4_withref import subjective_datasets
|
||||
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
|
||||
from opencompass.models.openai_api import OpenAIAllesAPIN
|
||||
from opencompass.partitioners import NaivePartitioner, SizePartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
||||
|
@ -3,7 +3,7 @@ from mmengine.config import read_base
|
||||
with read_base():
|
||||
from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets
|
||||
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAIAllesAPIN, HuggingFaceChatGLM3
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
|
||||
from opencompass.partitioners import NaivePartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.runners import LocalRunner
|
||||
|
@ -4,7 +4,6 @@ with read_base():
|
||||
from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets
|
||||
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
|
||||
from opencompass.models.openai_api import OpenAIAllesAPIN
|
||||
from opencompass.partitioners import NaivePartitioner, SizePartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
||||
|
124
configs/models/hf_internlm/README.md
Normal file
@ -0,0 +1,124 @@
|
||||
# InternLM Model Details
|
||||
|
||||
## InternLM
|
||||
|
||||
InternLM is pre-trained on a large corpus of 1.6T tokens with a multi-phase progressive process, and then fine-tuned to align with human preferences. We also developed a training system called Uniscale-LLM for efficient large language model training. The evaluation on a number of benchmarks shows that InternLM achieves state-of-the-art performance in multiple aspects, including knowledge understanding, reading comprehension, mathematics, and coding. With such well-rounded capabilities, InternLM achieves outstanding performance on comprehensive exams, including MMLU, AGIEval, C-Eval and GAOKAO-Bench, without resorting to external tools. On these benchmarks, InternLM not only significantly outperforms open-source models, but also obtains superior performance compared to ChatGPT. InternLM also demonstrates an excellent capability of understanding the Chinese language and Chinese culture, which makes it a suitable foundation model to support Chinese-oriented language applications.
|
||||
|
||||
## InternLM2
|
||||
|
||||
The evolution of Large Language Models (LLMs) like ChatGPT and GPT-4 has sparked discussions on the advent of Artificial General Intelligence (AGI). However, replicating such advancements in open-source models has been challenging. This paper introduces InternLM2, an open-source LLM that outperforms its predecessors in comprehensive evaluations across 6 dimensions and 30 benchmarks, long-context modeling, and open-ended subjective evaluations through innovative pre-training and optimization techniques. The pre-training process of InternLM2 is meticulously detailed, highlighting the preparation of diverse data types including text, code, and long-context data. InternLM2 efficiently captures long-term dependencies, initially trained on 4k tokens before advancing to 32k tokens in pre-training and fine-tuning stages, exhibiting remarkable performance on the 200k "Needle-in-a-Haystack" test. InternLM2 is further aligned using Supervised Fine-Tuning (SFT) and a novel Conditional Online Reinforcement Learning from Human Feedback (COOL RLHF) strategy that addresses conflicting human preferences and reward hacking. By releasing InternLM2 models in different training stages and model sizes, we provide the community with insights into the model's evolution.
|
||||
|
||||
# Evaluation Command
|
||||
|
||||
## Base Models
|
||||
|
||||
```bash
python3 run.py --models hf_internlm2_7b --datasets mmlu_ppl_ac766d --debug
python3 run.py --models hf_internlm2_7b --datasets cmmlu_ppl_041cbf --debug
python3 run.py --models hf_internlm2_7b --datasets ceval_internal_ppl_93e5ce --debug
python3 run.py --models hf_internlm2_7b --datasets GaokaoBench_no_subjective_gen_d21e37 --debug
python3 run.py --models hf_internlm2_7b --datasets triviaqa_wiki_1shot_gen_20a989 --debug
python3 run.py --models hf_internlm2_7b --datasets nq_open_1shot_gen_20a989 --debug
python3 run.py --models hf_internlm2_7b --datasets race_ppl_abed12 --debug
python3 run.py --models hf_internlm2_7b --datasets winogrande_5shot_ll_252f01 --debug
python3 run.py --models hf_internlm2_7b --datasets hellaswag_10shot_ppl_59c85e --debug
python3 run.py --models hf_internlm2_7b --datasets bbh_gen_98fba6 --debug
python3 run.py --models hf_internlm2_7b --datasets gsm8k_gen_17d0dc --debug
python3 run.py --models hf_internlm2_7b --datasets math_4shot_base_gen_db136b --debug
python3 run.py --models hf_internlm2_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug
python3 run.py --models hf_internlm2_7b --datasets humaneval_gen_d2537e --debug
python3 run.py --models hf_internlm2_7b --datasets sanitized_mbpp_gen_742f0c --debug
python3 run.py --models hf_internlm2_7b --datasets lcbench_gen_5ff288 --debug
python3 run.py --models hf_internlm2_7b --datasets gpqa_ppl_6bf57a --debug
```
|
||||
|
||||
## Chat Models
|
||||
|
||||
```bash
python3 run.py --models hf_internlm2_chat_7b --datasets mmlu_gen_4d595a --debug
python3 run.py --models hf_internlm2_chat_7b --datasets cmmlu_gen_c13365 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets ceval_internal_gen_2daf24 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets GaokaoBench_no_subjective_gen_4c31db --debug
python3 run.py --models hf_internlm2_chat_7b --datasets triviaqa_wiki_1shot_gen_eaf81e --debug
python3 run.py --models hf_internlm2_chat_7b --datasets nq_open_1shot_gen_01cf41 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets race_gen_69ee4f --debug
python3 run.py --models hf_internlm2_chat_7b --datasets winogrande_5shot_gen_b36770 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets hellaswag_10shot_gen_e42710 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets bbh_gen_5b92b0 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets gsm8k_gen_1d7fe4 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets math_0shot_gen_393424 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug
python3 run.py --models hf_internlm2_chat_7b --datasets sanitized_mbpp_mdblock_gen_a447ff --debug
python3 run.py --models hf_internlm2_chat_7b --datasets lcbench_gen_5ff288 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets gpqa_gen_4baadb --debug
python3 run.py --models hf_internlm2_chat_7b --datasets IFEval_gen_3321a3 --debug
```
|
||||
|
||||
# Benchmarks
|
||||
|
||||
We provide reference results for the classical models; you can reproduce these results by following the aforementioned commands.
|
||||
|
||||
## Base Models
|
||||
|
||||
| dataset | internlm2-1.8b-turbomind | internlm2-7b-turbomind | internlm2-20b-turbomind |
|:------------:|---------------------------:|-------------------------:|--------------------------:|
| mmlu | 45.99 | 65.84 | 67.58 |
| cmmlu | 45.27 | 66.17 | 68.28 |
| ceval-test | 44.79 | 63.54 | 67.28 |
| GaokaoBench | 23.78 | 41.41 | 58.99 |
| triviaqa | 38.42 | 69.15 | 74.03 |
| nq | 20.66 | 41.05 | 43.55 |
| race-high | 64.72 | 72.56 | 72.90 |
| winogrande | 66.77 | 83.50 | 84.69 |
| hellaswag | 44.86 | 89.52 | 91.41 |
| bbh | 36.03 | 63.56 | 71.29 |
| gsm8k | 30.40 | 69.98 | 76.80 |
| math | 9.42 | 25.16 | 32.24 |
| TheoremQA | 10.50 | 21.88 | 26.00 |
| humaneval | 30.49 | 48.17 | 51.83 |
| mbpp | 30.74 | 54.47 | 59.92 |
| lcbench | 4.34 | 12.16 | 18.46 |
| GPQA_diamond | 24.24 | 28.28 | 31.31 |
|
||||
|
||||
## Chat Models
|
||||
|
||||
| dataset | internlm2-chat-1.8b-hf | internlm2-chat-1.8b-sft-hf | internlm2-chat-7b-hf | internlm2-chat-7b-sft-hf | internlm2-chat-20b-hf | internlm2-chat-20b-sft-hf |
|:------------:|-------------------------:|-----------------------------:|-----------------------:|---------------------------:|------------------------:|----------------------------:|
| mmlu | 47.58 | 47.44 | 63.05 | 63.33 | 67.37 | 67.34 |
| cmmlu | 46.11 | 46.27 | 62.10 | 62.38 | 66.26 | 66.39 |
| ceval-test | 47.04 | 47.19 | 58.75 | 58.96 | 63.12 | 63.16 |
| GaokaoBench | 29.73 | 28.79 | 54.54 | 55.39 | 57.95 | 57.62 |
| triviaqa | 46.69 | 46.50 | 69.54 | 70.75 | 75.53 | 75.90 |
| nq | 19.09 | 18.14 | 28.73 | 30.78 | 28.75 | 34.10 |
| race-high | 73.87 | 73.81 | 84.51 | 84.88 | 88.02 | 88.11 |
| winogrande | 57.62 | 57.93 | 73.56 | 73.80 | 81.06 | 81.37 |
| hellaswag | 60.47 | 61.58 | 84.80 | 85.21 | 88.48 | 88.95 |
| bbh | 37.69 | 37.12 | 57.83 | 57.19 | 68.24 | 69.38 |
| gsm8k | 39.73 | 36.85 | 69.90 | 69.83 | 75.21 | 76.95 |
| math | 14.06 | 13.10 | 28.08 | 27.60 | 34.68 | 32.54 |
| TheoremQA | 13.63 | 12.88 | 18.50 | 18.75 | 23.00 | 25.12 |
| humaneval | 33.54 | 34.15 | 56.71 | 61.59 | 67.68 | 67.68 |
| mbpp | 39.69 | 36.19 | 57.59 | 55.64 | 68.87 | 69.65 |
| lcbench | 4.52 | 3.56 | 14.60 | 14.34 | 19.64 | 20.55 |
| GPQA_diamond | 25.76 | 26.26 | 28.28 | 27.27 | 30.30 | 29.29 |
| IFEval | 18.30 | 18.67 | 34.75 | 39.19 | 36.41 | 44.55 |
|
||||
|
||||
# Citation
|
||||
|
||||
```BibTeX
@misc{2023internlm,
title={InternLM: A Multilingual Language Model with Progressively Enhanced Capabilities},
author={InternLM Team},
howpublished = {\url{https://github.com/InternLM/InternLM-techreport}},
year={2023}
}
@misc{cai2024internlm2,
title={InternLM2 Technical Report},
author={Zheng Cai and Maosong Cao and Haojiong Chen and Kai Chen and Keyu Chen and Xin Chen and Xun Chen and Zehui Chen and Zhi Chen and Pei Chu and Xiaoyi Dong and Haodong Duan and Qi Fan and Zhaoye Fei and Yang Gao and Jiaye Ge and Chenya Gu and Yuzhe Gu and Tao Gui and Aijia Guo and Qipeng Guo and Conghui He and Yingfan Hu and Ting Huang and Tao Jiang and Penglong Jiao and Zhenjiang Jin and Zhikai Lei and Jiaxing Li and Jingwen Li and Linyang Li and Shuaibin Li and Wei Li and Yining Li and Hongwei Liu and Jiangning Liu and Jiawei Hong and Kaiwen Liu and Kuikun Liu and Xiaoran Liu and Chengqi Lv and Haijun Lv and Kai Lv and Li Ma and Runyuan Ma and Zerun Ma and Wenchang Ning and Linke Ouyang and Jiantao Qiu and Yuan Qu and Fukai Shang and Yunfan Shao and Demin Song and Zifan Song and Zhihao Sui and Peng Sun and Yu Sun and Huanze Tang and Bin Wang and Guoteng Wang and Jiaqi Wang and Jiayu Wang and Rui Wang and Yudong Wang and Ziyi Wang and Xingjian Wei and Qizhen Weng and Fan Wu and Yingtong Xiong and Chao Xu and Ruiliang Xu and Hang Yan and Yirong Yan and Xiaogui Yang and Haochen Ye and Huaiyuan Ying and Jia Yu and Jing Yu and Yuhang Zang and Chuyu Zhang and Li Zhang and Pan Zhang and Peng Zhang and Ruijie Zhang and Shuo Zhang and Songyang Zhang and Wenjian Zhang and Wenwei Zhang and Xingcheng Zhang and Xinyue Zhang and Hui Zhao and Qian Zhao and Xiaomeng Zhao and Fengzhe Zhou and Zaida Zhou and Jingming Zhuo and Yicheng Zou and Xipeng Qiu and Yu Qiao and Dahua Lin},
year={2024},
eprint={2403.17297},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
|
23
configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py
Normal file
@ -0,0 +1,23 @@
|
||||
from opencompass.models import TurboMindModelwithChatTemplate

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='internlm2-chat-1.8b-turbomind',
        path='internlm/internlm2-chat-1_8b',
        engine_config=dict(
            max_batch_size=16,
            tp=1,
        ),
        gen_config=dict(
            top_k=1,
            temperature=1e-6,
            top_p=0.9,
        ),
        max_seq_len=2048,
        max_out_len=1024,
        batch_size=32768,
        run_cfg=dict(num_gpus=1),
        stop_words=['</s>', '<|im_end|>'],
    )
]
|
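As a usage sketch (assuming, as with the other model configs in this commit, that the file stem doubles as the `--models` name), the new TurboMind config can be smoke-tested directly from the CLI:

```bash
# Hypothetical smoke test; the MMLU config name is taken from the chat-model commands above.
python3 run.py --models lmdeploy_internlm2_chat_1_8b --datasets mmlu_gen_4d595a --debug
```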
@ -3,7 +3,9 @@ from opencompass.models import TurboMindModel
|
||||
settings = [
|
||||
('internlm2-1.8b-turbomind', 'internlm/internlm2-1_8b', 1),
|
||||
('internlm2-7b-turbomind', 'internlm/internlm2-7b', 1),
|
||||
('internlm2-base-7b-turbomind', 'internlm/internlm2-base-7b', 1),
|
||||
('internlm2-20b-turbomind', 'internlm/internlm2-20b', 2),
|
||||
('internlm2-base-20b-turbomind', 'internlm/internlm2-base-20b', 2),
|
||||
]
|
||||
|
||||
models = []
|
||||
|
@ -7,6 +7,6 @@ models = [
|
||||
path='meta-llama/Llama-2-13b-hf',
|
||||
max_out_len=1024,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
run_cfg=dict(num_gpus=2),
|
||||
)
|
||||
]
|
||||
|
@ -7,6 +7,6 @@ models = [
|
||||
path='meta-llama/Llama-2-13b-chat-hf',
|
||||
max_out_len=1024,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
run_cfg=dict(num_gpus=2),
|
||||
)
|
||||
]
|
||||
|
@ -7,6 +7,6 @@ models = [
|
||||
path='meta-llama/Llama-2-70b-hf',
|
||||
max_out_len=1024,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=4),
|
||||
run_cfg=dict(num_gpus=8),
|
||||
)
|
||||
]
|
||||
|
@ -7,6 +7,6 @@ models = [
|
||||
path='meta-llama/Meta-Llama-3-70B',
|
||||
max_out_len=1024,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
run_cfg=dict(num_gpus=8),
|
||||
)
|
||||
]
|
||||
|
@ -7,6 +7,6 @@ models = [
|
||||
path='huggyllama/llama-13b',
|
||||
max_out_len=1024,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
run_cfg=dict(num_gpus=2),
|
||||
)
|
||||
]
|
||||
|
@ -7,6 +7,6 @@ models = [
|
||||
path='huggyllama/llama-30b',
|
||||
max_out_len=1024,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=2),
|
||||
run_cfg=dict(num_gpus=4),
|
||||
)
|
||||
]
|
||||
|
@ -7,6 +7,6 @@ models = [
|
||||
path='huggyllama/llama-65b',
|
||||
max_out_len=1024,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=4),
|
||||
run_cfg=dict(num_gpus=8),
|
||||
)
|
||||
]
|
||||
|
12
configs/models/mistral/hf_mistral_7b_instruct_v0_3.py
Normal file
@ -0,0 +1,12 @@
|
||||
from opencompass.models import HuggingFacewithChatTemplate

models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='mistral-7b-instruct-v0.3-hf',
        path='mistralai/Mistral-7B-Instruct-v0.3',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
    )
]
|
13
configs/models/mistral/hf_mistral_7b_v0_3.py
Normal file
@ -0,0 +1,13 @@
|
||||
from opencompass.models import HuggingFaceBaseModel


models = [
    dict(
        type=HuggingFaceBaseModel,
        abbr='mistral-7b-v0.3-hf',
        path='mistralai/Mistral-7B-v0.3',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
    )
]
|
142
configs/models/qwen/README.md
Normal file
@ -0,0 +1,142 @@
|
||||
# Qwen Model Details
|
||||
|
||||
## Qwen
|
||||
|
||||
Large language models (LLMs) have revolutionized the field of artificial intelligence, enabling natural language processing tasks that were previously thought to be exclusive to humans. In this work, we introduce Qwen, the first installment of our large language model series. Qwen is a comprehensive language model series that encompasses distinct models with varying parameter counts. It includes Qwen, the base pretrained language models, and Qwen-Chat, the chat models finetuned with human alignment techniques. The base language models consistently demonstrate superior performance across a multitude of downstream tasks, and the chat models, particularly those trained using Reinforcement Learning from Human Feedback (RLHF), are highly competitive. The chat models possess advanced tool-use and planning capabilities for creating agent applications, showcasing impressive performance even when compared to bigger models on complex tasks like utilizing a code interpreter. Furthermore, we have developed coding-specialized models, Code-Qwen and Code-Qwen-Chat, as well as mathematics-focused models, Math-Qwen-Chat, which are built upon base language models. These models demonstrate significantly improved performance in comparison with open-source models, and slightly fall behind the proprietary models.
|
||||
|
||||
## Qwen1.5
|
||||
|
||||
Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model pretrained on a large amount of data. In comparison with the previous released Qwen, the improvements include:
|
||||
|
||||
- 8 model sizes, including 0.5B, 1.8B, 4B, 7B, 14B, 32B and 72B dense models, and an MoE model of 14B with 2.7B activated;
|
||||
- Significant performance improvement in human preference for chat models;
|
||||
- Multilingual support of both base and chat models;
|
||||
- Stable support of 32K context length for models of all sizes
|
||||
- No need of trust_remote_code; see the loading sketch after this list.
|
||||
|
||||
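To illustrate the last point, a minimal, hypothetical loading sketch with plain `transformers` (the checkpoint name and generation settings are assumptions, not part of these configs):

```python
# Minimal sketch: Qwen1.5 loads with stock transformers, no trust_remote_code needed.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen1.5-7B-Chat')
model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen1.5-7B-Chat', device_map='auto')

inputs = tokenizer('Hello, Qwen1.5!', return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```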
# Evaluation Command
|
||||
|
||||
## Base Models
|
||||
|
||||
```bash
|
||||
python3 run.py --models hf_qwen1_5_7b --datasets mmlu_ppl_ac766d --debug
|
||||
python3 run.py --models hf_qwen1_5_7b --datasets cmmlu_ppl_041cbf --debug
|
||||
python3 run.py --models hf_qwen1_5_7b --datasets ceval_internal_ppl_93e5ce --debug
|
||||
python3 run.py --models hf_qwen1_5_7b --datasets GaokaoBench_no_subjective_gen_d21e37 --debug
|
||||
python3 run.py --models hf_qwen1_5_7b --datasets triviaqa_wiki_1shot_gen_20a989 --debug
|
||||
python3 run.py --models hf_qwen1_5_7b --datasets nq_open_1shot_gen_20a989 --debug
|
||||
python3 run.py --models hf_qwen1_5_7b --datasets race_ppl_abed12 --debug
|
||||
python3 run.py --models hf_qwen1_5_7b --datasets winogrande_5shot_ll_252f01 --debug
|
||||
python3 run.py --models hf_qwen1_5_7b --datasets hellaswag_10shot_ppl_59c85e --debug
|
||||
python3 run.py --models hf_qwen1_5_7b --datasets bbh_gen_98fba6 --debug
|
||||
python3 run.py --models hf_qwen1_5_7b --datasets gsm8k_gen_17d0dc --debug
|
||||
python3 run.py --models hf_qwen1_5_7b --datasets math_4shot_base_gen_db136b --debug
|
||||
python3 run.py --models hf_qwen1_5_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug
|
||||
python3 run.py --models hf_qwen1_5_7b --datasets humaneval_gen_d2537e --debug
|
||||
python3 run.py --models hf_qwen1_5_7b --datasets sanitized_mbpp_gen_742f0c --debug
|
||||
python3 run.py --models hf_qwen1_5_7b --datasets lcbench_gen_5ff288 --debug
|
||||
python3 run.py --models hf_qwen1_5_7b --datasets gpqa_ppl_6bf57a --debug
|
||||
```
|
||||
|
||||
## Chat Models
|
||||
|
||||
```bash
|
||||
python3 run.py --models hf_qwen1_5_7b_chat --datasets mmlu_gen_4d595a --debug
|
||||
python3 run.py --models hf_qwen1_5_7b_chat --datasets cmmlu_gen_c13365 --debug
|
||||
python3 run.py --models hf_qwen1_5_7b_chat --datasets ceval_internal_gen_2daf24 --debug
|
||||
python3 run.py --models hf_qwen1_5_7b_chat --datasets GaokaoBench_no_subjective_gen_4c31db --debug
|
||||
python3 run.py --models hf_qwen1_5_7b_chat --datasets triviaqa_wiki_1shot_gen_eaf81e --debug
|
||||
python3 run.py --models hf_qwen1_5_7b_chat --datasets nq_open_1shot_gen_01cf41 --debug
|
||||
python3 run.py --models hf_qwen1_5_7b_chat --datasets race_gen_69ee4f --debug
|
||||
python3 run.py --models hf_qwen1_5_7b_chat --datasets winogrande_5shot_gen_b36770 --debug
|
||||
python3 run.py --models hf_qwen1_5_7b_chat --datasets hellaswag_10shot_gen_e42710 --debug
|
||||
python3 run.py --models hf_qwen1_5_7b_chat --datasets bbh_gen_5b92b0 --debug
|
||||
python3 run.py --models hf_qwen1_5_7b_chat --datasets gsm8k_gen_1d7fe4 --debug
|
||||
python3 run.py --models hf_qwen1_5_7b_chat --datasets math_0shot_gen_393424 --debug
|
||||
python3 run.py --models hf_qwen1_5_7b_chat --datasets TheoremQA_5shot_gen_6f0af8 --debug
|
||||
python3 run.py --models hf_qwen1_5_7b_chat --datasets humaneval_gen_8e312c --debug
|
||||
python3 run.py --models hf_qwen1_5_7b_chat --datasets sanitized_mbpp_mdblock_gen_a447ff --debug
|
||||
python3 run.py --models hf_qwen1_5_7b_chat --datasets lcbench_gen_5ff288 --debug
|
||||
python3 run.py --models hf_qwen1_5_7b_chat --datasets gpqa_gen_4baadb --debug
|
||||
python3 run.py --models hf_qwen1_5_7b_chat --datasets IFEval_gen_3321a3 --debug
|
||||
```
|
||||
|
||||
# Benchmarks
|
||||
|
||||
We provide reference results for the classical models; you can reproduce these results by following the aforementioned commands.
|
||||
|
||||
## Base Models
|
||||
|
||||
| dataset | qwen-1.8b-turbomind | qwen-7b-turbomind | qwen-14b-turbomind | qwen-72b-turbomind |
|
||||
|:------------:|----------------------:|--------------------:|---------------------:|---------------------:|
|
||||
| mmlu | 46.61 | 59.75 | 67.85 | 77.36 |
|
||||
| cmmlu | 51.98 | 62.10 | 70.05 | 83.32 |
|
||||
| ceval-test | 54.24 | 62.06 | 70.33 | 83.25 |
|
||||
| GaokaoBench | 22.11 | 35.32 | 54.07 | 77.56 |
|
||||
| triviaqa | 22.76 | 53.61 | 49.72 | 79.13 |
|
||||
| nq | 5.68 | 17.87 | 13.77 | 18.20 |
|
||||
| race-high | 63.09 | 80.30 | 88.11 | 90.62 |
|
||||
| winogrande | 61.25 | 72.06 | 72.45 | 82.56 |
|
||||
| hellaswag | 38.04 | 64.62 | 85.88 | 90.40 |
|
||||
| bbh | 22.53 | 45.89 | 56.75 | 63.35 |
|
||||
| gsm8k | 23.73 | 54.36 | 61.64 | 79.68 |
|
||||
| math | 6.30 | 15.56 | 30.38 | 44.18 |
|
||||
| TheoremQA | 9.38 | 15.00 | 21.62 | 27.12 |
|
||||
| humaneval | 16.46 | 23.78 | 23.78 | 66.46 |
|
||||
| mbpp | 2.72 | 46.69 | 55.64 | 65.76 |
|
||||
| lcbench | 1.82 | 4.95 | 8.86 | 16.86 |
|
||||
| GPQA_diamond | 28.79 | 24.75 | 27.78 | 31.31 |
|
||||
|
||||
| dataset | qwen1.5-0.5b-hf | qwen1.5-1.8b-hf | qwen1.5-4b-hf | qwen1.5-7b-hf | qwen1.5-14b-hf | qwen1.5-32b-hf | qwen1.5-72b-hf |
|
||||
|:------------:|------------------:|------------------:|----------------:|----------------:|-----------------:|-----------------:|-----------------:|
|
||||
| mmlu | 39.98 | 47.14 | 57.03 | 62.15 | 69.10 | 73.88 | 77.02 |
|
||||
| cmmlu | 46.05 | 57.45 | 66.38 | 71.86 | 76.95 | 81.58 | 83.00 |
|
||||
| ceval-test | 48.36 | 58.67 | 66.55 | 72.49 | 76.93 | 82.50 | 83.03 |
|
||||
| GaokaoBench | 30.67 | 35.66 | 54.31 | 65.99 | 66.60 | 79.01 | 80.26 |
|
||||
| triviaqa | 21.24 | 34.32 | 44.59 | 56.60 | 59.96 | 56.20 | 77.81 |
|
||||
| nq | 6.01 | 10.28 | 15.73 | 18.61 | 16.07 | 21.75 | 20.53 |
|
||||
| race-high | 54.66 | 67.27 | 78.50 | 82.73 | 87.99 | 90.57 | 90.45 |
|
||||
| winogrande | 57.38 | 60.46 | 65.90 | 70.01 | 72.93 | 78.69 | 80.74 |
|
||||
| hellaswag | 29.19 | 42.32 | 55.89 | 68.51 | 83.86 | 87.28 | 90.41 |
|
||||
| bbh | 20.54 | 27.01 | 34.81 | 39.87 | 50.38 | 67.47 | 58.81 |
|
||||
| gsm8k | 13.27 | 34.87 | 47.61 | 54.36 | 63.53 | 72.71 | 79.53 |
|
||||
| math | 4.16 | 11.32 | 17.50 | 17.34 | 36.18 | 45.74 | 41.56 |
|
||||
| TheoremQA | 5.88 | 12.00 | 13.75 | 4.25 | 12.62 | 26.62 | 26.62 |
|
||||
| humaneval | 8.54 | 23.17 | 41.46 | 53.05 | 57.32 | 70.12 | 65.85 |
|
||||
| mbpp | 5.06 | 15.95 | 45.91 | 52.14 | 52.14 | 59.14 | 61.09 |
|
||||
| lcbench | 0.87 | 2.00 | 5.65 | 6.69 | 12.69 | 14.34 | 15.29 |
|
||||
| GPQA_diamond | 23.74 | 28.79 | 23.23 | 20.71 | 32.32 | 30.81 | 31.82 |
|
||||
|
||||
## Chat Models
|
||||
|
||||
| dataset | qwen1.5-0.5b-chat-hf | qwen1.5-1.8b-chat-hf | qwen1.5-4b-chat-hf | qwen1.5-7b-chat-hf | qwen1.5-14b-chat-hf | qwen1.5-32b-chat-hf | qwen1.5-72b-chat-hf | qwen1.5-110b-chat-hf |
|
||||
|:------------:|-----------------------:|-----------------------:|---------------------:|---------------------:|----------------------:|----------------------:|----------------------:|-----------------------:|
|
||||
| mmlu | 35.32 | 45.62 | 55.90 | 61.79 | 67.96 | 75.36 | 77.24 | 77.95 |
|
||||
| cmmlu | 31.55 | 48.93 | 58.53 | 68.78 | 75.07 | 80.39 | 82.48 | 86.46 |
|
||||
| ceval-test | 36.88 | 55.17 | 61.54 | 68.71 | 74.80 | 80.47 | 81.53 | 87.33 |
|
||||
| GaokaoBench | 21.51 | 46.19 | 59.11 | 70.55 | 80.39 | 86.15 | 88.58 | 89.59 |
|
||||
| triviaqa | 19.84 | 35.81 | 48.93 | 53.65 | 62.58 | 74.72 | 83.25 | 86.20 |
|
||||
| nq | 7.42 | 10.22 | 19.31 | 16.87 | 20.53 | 25.26 | 35.21 | 36.98 |
|
||||
| race-high | 49.03 | 66.24 | 73.53 | 83.28 | 87.51 | 91.22 | 91.11 | 92.31 |
|
||||
| winogrande | 50.51 | 51.07 | 57.54 | 65.27 | 70.09 | 77.90 | 80.82 | 82.32 |
|
||||
| hellaswag | 29.60 | 41.71 | 60.45 | 71.58 | 79.70 | 88.56 | 89.37 | 91.11 |
|
||||
| bbh | 24.12 | 26.82 | 43.15 | 38.12 | 55.38 | 69.28 | 72.97 | 71.04 |
|
||||
| gsm8k | 8.79 | 27.60 | 47.61 | 56.25 | 64.90 | 79.91 | 77.03 | 79.53 |
|
||||
| math | 0.56 | 4.94 | 7.34 | 22.14 | 32.22 | 41.80 | 45.22 | 54.38 |
|
||||
| TheoremQA | 9.00 | 9.25 | 13.88 | 12.25 | 13.63 | 19.25 | 22.75 | 17.50 |
|
||||
| humaneval | 9.15 | 15.85 | 30.49 | 40.85 | 50.00 | 57.93 | 60.37 | 65.24 |
|
||||
| mbpp | 11.28 | 22.57 | 43.58 | 50.58 | 56.03 | 65.37 | 66.93 | 68.48 |
|
||||
| lcbench | 0.00 | 1.65 | 5.56 | 8.78 | 14.42 | 10.78 | 18.77 | 34.58 |
|
||||
| GPQA_diamond | 19.70 | 29.80 | 25.25 | 31.82 | 30.30 | 31.31 | 32.83 | 35.86 |
|
||||
| IFEval | 13.12 | 16.08 | 25.51 | 38.82 | 42.51 | 49.54 | 51.02 | 55.08 |
|
||||
|
||||
# Citation
|
||||
|
||||
```BibTeX
|
||||
@article{qwen,
|
||||
title={Qwen Technical Report},
|
||||
author={Jinze Bai and Shuai Bai and Yunfei Chu and Zeyu Cui and Kai Dang and Xiaodong Deng and Yang Fan and Wenbin Ge and Yu Han and Fei Huang and Binyuan Hui and Luo Ji and Mei Li and Junyang Lin and Runji Lin and Dayiheng Liu and Gao Liu and Chengqiang Lu and Keming Lu and Jianxin Ma and Rui Men and Xingzhang Ren and Xuancheng Ren and Chuanqi Tan and Sinan Tan and Jianhong Tu and Peng Wang and Shijie Wang and Wei Wang and Shengguang Wu and Benfeng Xu and Jin Xu and An Yang and Hao Yang and Jian Yang and Shusheng Yang and Yang Yao and Bowen Yu and Hongyi Yuan and Zheng Yuan and Jianwei Zhang and Xingxuan Zhang and Yichang Zhang and Zhenru Zhang and Chang Zhou and Jingren Zhou and Xiaohuan Zhou and Tianhang Zhu},
|
||||
journal={arXiv preprint arXiv:2309.16609},
|
||||
year={2023}
|
||||
}
|
||||
```
|
@ -1,25 +1,12 @@
|
||||
from opencompass.models import HuggingFaceCausalLM
|
||||
from opencompass.models import HuggingFaceBaseModel
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFaceCausalLM,
|
||||
abbr='qwen1.5-moe-a2-7b-hf',
|
||||
type=HuggingFaceBaseModel,
|
||||
abbr='qwen1.5-moe-a2.7b-hf',
|
||||
path='Qwen/Qwen1.5-MoE-A2.7B',
|
||||
tokenizer_path='Qwen/Qwen1.5-MoE-A2.7B',
|
||||
model_kwargs=dict(
|
||||
device_map='auto',
|
||||
trust_remote_code=True
|
||||
),
|
||||
tokenizer_kwargs=dict(
|
||||
padding_side='left',
|
||||
truncation_side='left',
|
||||
trust_remote_code=True,
|
||||
use_fast=False,
|
||||
),
|
||||
pad_token_id=151645,
|
||||
max_out_len=100,
|
||||
max_seq_len=2048,
|
||||
max_out_len=1024,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
||||
|
@ -1,33 +1,12 @@
|
||||
from opencompass.models import HuggingFaceCausalLM
|
||||
|
||||
_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
||||
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
|
||||
],
|
||||
)
|
||||
from opencompass.models import HuggingFacewithChatTemplate
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFaceCausalLM,
|
||||
abbr='qwen1.5-moe-a2-7b-chat-hf',
|
||||
type=HuggingFacewithChatTemplate,
|
||||
abbr='qwen1.5-moe-a2.7b-chat-hf',
|
||||
path='Qwen/Qwen1.5-MoE-A2.7B-Chat',
|
||||
model_kwargs=dict(
|
||||
device_map='auto',
|
||||
trust_remote_code=True
|
||||
),
|
||||
tokenizer_kwargs=dict(
|
||||
padding_side='left',
|
||||
truncation_side='left',
|
||||
trust_remote_code=True,
|
||||
use_fast=False,
|
||||
),
|
||||
meta_template=_meta_template,
|
||||
max_out_len=100,
|
||||
max_seq_len=2048,
|
||||
max_out_len=1024,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=4, num_procs=1),
|
||||
end_str='<|im_end|>',
|
||||
batch_padding=True,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
||||
|
12
configs/models/qwen/hf_qwen2_0_5b.py
Normal file
@ -0,0 +1,12 @@
|
||||
from opencompass.models import HuggingFaceBaseModel
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFaceBaseModel,
|
||||
abbr='qwen2-0.5b-hf',
|
||||
path='Qwen/Qwen2-0.5B',
|
||||
max_out_len=1024,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
12
configs/models/qwen/hf_qwen2_1_5b.py
Normal file
@ -0,0 +1,12 @@
|
||||
from opencompass.models import HuggingFaceBaseModel
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFaceBaseModel,
|
||||
abbr='qwen2-1.5b-hf',
|
||||
path='Qwen/Qwen2-1.5B',
|
||||
max_out_len=1024,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
12
configs/models/qwen/hf_qwen2_72b.py
Normal file
@ -0,0 +1,12 @@
|
||||
from opencompass.models import HuggingFaceBaseModel
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFaceBaseModel,
|
||||
abbr='qwen2-72b-hf',
|
||||
path='Qwen/Qwen2-72B',
|
||||
max_out_len=1024,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=8),
|
||||
)
|
||||
]
|
12
configs/models/qwen/hf_qwen2_7b.py
Normal file
@ -0,0 +1,12 @@
|
||||
from opencompass.models import HuggingFaceBaseModel
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFaceBaseModel,
|
||||
abbr='qwen2-7b-hf',
|
||||
path='Qwen/Qwen2-7B',
|
||||
max_out_len=1024,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
12
configs/models/qwen/hf_qwen2_moe_57b_a14b.py
Normal file
@ -0,0 +1,12 @@
|
||||
from opencompass.models import HuggingFaceBaseModel
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFaceBaseModel,
|
||||
abbr='qwen2-moe-57b-a14b-hf',
|
||||
path='Qwen/Qwen2-MoE-57B-A14B',
|
||||
max_out_len=1024,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=4),
|
||||
)
|
||||
]
|
@ -7,6 +7,6 @@ models = [
|
||||
path='Qwen/Qwen-14B',
|
||||
max_out_len=1024,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
run_cfg=dict(num_gpus=2),
|
||||
)
|
||||
]
|
||||
|
@ -1,12 +1,31 @@
|
||||
from opencompass.models import HuggingFacewithChatTemplate
|
||||
from opencompass.models import HuggingFaceCausalLM
|
||||
|
||||
_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
|
||||
dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
|
||||
],
|
||||
)
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFacewithChatTemplate,
|
||||
type=HuggingFaceCausalLM,
|
||||
abbr='qwen-14b-chat-hf',
|
||||
path='Qwen/Qwen-14B-Chat',
|
||||
model_kwargs=dict(device_map='auto', trust_remote_code=True),
|
||||
tokenizer_kwargs=dict(
|
||||
padding_side='left',
|
||||
truncation_side='left',
|
||||
trust_remote_code=True,
|
||||
use_fast=False,
|
||||
),
|
||||
pad_token_id=151643,
|
||||
max_out_len=1024,
|
||||
max_seq_len=8192,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
batch_padding=True,
|
||||
meta_template=_meta_template,
|
||||
run_cfg=dict(num_gpus=2),
|
||||
end_str='<|im_end|>',
|
||||
)
|
||||
]
|
||||
|
@ -1,12 +1,31 @@
|
||||
from opencompass.models import HuggingFacewithChatTemplate
|
||||
from opencompass.models import HuggingFaceCausalLM
|
||||
|
||||
_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
|
||||
dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
|
||||
],
|
||||
)
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFacewithChatTemplate,
|
||||
type=HuggingFaceCausalLM,
|
||||
abbr='qwen-1.8b-chat-hf',
|
||||
path='Qwen/Qwen-1_8B-Chat',
|
||||
model_kwargs=dict(device_map='auto', trust_remote_code=True),
|
||||
tokenizer_kwargs=dict(
|
||||
padding_side='left',
|
||||
truncation_side='left',
|
||||
trust_remote_code=True,
|
||||
use_fast=False,
|
||||
),
|
||||
pad_token_id=151643,
|
||||
max_out_len=1024,
|
||||
max_seq_len=8192,
|
||||
batch_size=8,
|
||||
batch_padding=True,
|
||||
meta_template=_meta_template,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
end_str='<|im_end|>',
|
||||
)
|
||||
]
|
||||
|
@ -7,6 +7,6 @@ models = [
|
||||
path='Qwen/Qwen-72B',
|
||||
max_out_len=1024,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=4),
|
||||
run_cfg=dict(num_gpus=8),
|
||||
)
|
||||
]
|
||||
|
@ -1,12 +1,31 @@
|
||||
from opencompass.models import HuggingFacewithChatTemplate
|
||||
from opencompass.models import HuggingFaceCausalLM
|
||||
|
||||
_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
|
||||
dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
|
||||
],
|
||||
)
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFacewithChatTemplate,
|
||||
type=HuggingFaceCausalLM,
|
||||
abbr='qwen-72b-chat-hf',
|
||||
path='Qwen/Qwen-72B-Chat',
|
||||
model_kwargs=dict(device_map='auto', trust_remote_code=True),
|
||||
tokenizer_kwargs=dict(
|
||||
padding_side='left',
|
||||
truncation_side='left',
|
||||
trust_remote_code=True,
|
||||
use_fast=False,
|
||||
),
|
||||
pad_token_id=151643,
|
||||
max_out_len=1024,
|
||||
max_seq_len=8192,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=4),
|
||||
batch_padding=True,
|
||||
meta_template=_meta_template,
|
||||
run_cfg=dict(num_gpus=8),
|
||||
end_str='<|im_end|>',
|
||||
)
|
||||
]
|
||||
|
@ -1,12 +1,31 @@
|
||||
from opencompass.models import HuggingFacewithChatTemplate
|
||||
from opencompass.models import HuggingFaceCausalLM
|
||||
|
||||
_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
|
||||
dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
|
||||
],
|
||||
)
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFacewithChatTemplate,
|
||||
type=HuggingFaceCausalLM,
|
||||
abbr='qwen-7b-chat-hf',
|
||||
path='Qwen/Qwen-7B-Chat',
|
||||
model_kwargs=dict(device_map='auto', trust_remote_code=True),
|
||||
tokenizer_kwargs=dict(
|
||||
padding_side='left',
|
||||
truncation_side='left',
|
||||
trust_remote_code=True,
|
||||
use_fast=False,
|
||||
),
|
||||
pad_token_id=151643,
|
||||
max_out_len=1024,
|
||||
max_seq_len=8192,
|
||||
batch_size=8,
|
||||
batch_padding=True,
|
||||
meta_template=_meta_template,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
end_str='<|im_end|>',
|
||||
)
|
||||
]
|
||||
|
12
configs/models/yi/hf_yi_1_5_34b.py
Normal file
@ -0,0 +1,12 @@
|
||||
from opencompass.models import HuggingFaceBaseModel
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFaceBaseModel,
|
||||
abbr='yi-1.5-34b-hf',
|
||||
path='01-ai/Yi-1.5-34B',
|
||||
max_out_len=1024,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=2),
|
||||
)
|
||||
]
|
12
configs/models/yi/hf_yi_1_5_6b.py
Normal file
@ -0,0 +1,12 @@
|
||||
from opencompass.models import HuggingFaceBaseModel
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFaceBaseModel,
|
||||
abbr='yi-1.5-6b-hf',
|
||||
path='01-ai/Yi-1.5-6B',
|
||||
max_out_len=1024,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
12
configs/models/yi/hf_yi_1_5_9b.py
Normal file
@ -0,0 +1,12 @@
|
||||
from opencompass.models import HuggingFaceBaseModel
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFaceBaseModel,
|
||||
abbr='yi-1.5-9b-hf',
|
||||
path='01-ai/Yi-1.5-9B',
|
||||
max_out_len=1024,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
@ -1,5 +1,5 @@
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
|
||||
from opencompass.models.openai_api import OpenAIAllesAPIN
|
||||
from opencompass.models.openai_api import OpenAI
|
||||
from opencompass.partitioners import NaivePartitioner, SizePartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
||||
@ -44,7 +44,7 @@ models = [
|
||||
|
||||
judge_model = dict(
|
||||
abbr='GPT4-Turbo',
|
||||
type=OpenAIAllesAPIN, path='gpt-4-1106-preview',
|
||||
type=OpenAI, path='gpt-4-1106-preview',
|
||||
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
|
||||
url='',
|
||||
meta_template=api_meta_template,
|
||||
@ -70,7 +70,7 @@ runner=dict(type=LocalRunner, max_num_workers=12, task=dict(type=SubjectiveEvalT
|
||||
|
||||
gpt4 = dict(
|
||||
abbr='gpt4-turbo',
|
||||
type=OpenAIAllesAPIN,
|
||||
type=OpenAI,
|
||||
path='gpt-4-1106-preview',
|
||||
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
|
||||
meta_template=api_meta_template,
|
||||
|
@ -1,98 +0,0 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from .groups.charm_rea import charm_rea_summary_groups
|
||||
|
||||
summarizer = dict(
|
||||
dataset_abbrs=[
|
||||
'charm-rea-Direct',
|
||||
'charm-rea-ZH-CoT',
|
||||
'charm-rea-EN-CoT',
|
||||
'charm-rea-XLT',
|
||||
'charm-rea-Translate-EN',
|
||||
'',
|
||||
'charm-rea-Chinese_Direct',
|
||||
'charm-rea-Chinese_ZH-CoT',
|
||||
'charm-rea-Chinese_EN-CoT',
|
||||
'charm-rea-Chinese_XLT',
|
||||
'charm-rea-Chinese_Translate-EN',
|
||||
'charm-rea-Global_Direct',
|
||||
'charm-rea-Global_ZH-CoT',
|
||||
'charm-rea-Global_EN-CoT',
|
||||
'charm-rea-Global_XLT',
|
||||
'charm-rea-Global_Translate-EN',
|
||||
'',
|
||||
'charm-rea-Chinese_Anachronisms_Judgment_Direct',
|
||||
'charm-rea-Chinese_Movie_and_Music_Recommendation_Direct',
|
||||
'charm-rea-Chinese_Natural_Language_Inference_Direct',
|
||||
'charm-rea-Chinese_Reading_Comprehension_Direct',
|
||||
'charm-rea-Chinese_Sequence_Understanding_Direct',
|
||||
'charm-rea-Chinese_Sport_Understanding_Direct',
|
||||
'charm-rea-Chinese_Time_Understanding_Direct',
|
||||
'charm-rea-Global_Anachronisms_Judgment_Direct',
|
||||
'charm-rea-Global_Movie_and_Music_Recommendation_Direct',
|
||||
'charm-rea-Global_Natural_Language_Inference_Direct',
|
||||
'charm-rea-Global_Reading_Comprehension_Direct',
|
||||
'charm-rea-Global_Sequence_Understanding_Direct',
|
||||
'charm-rea-Global_Sport_Understanding_Direct',
|
||||
'charm-rea-Global_Time_Understanding_Direct',
|
||||
'charm-rea-Chinese_Anachronisms_Judgment_ZH-CoT',
|
||||
'charm-rea-Chinese_Movie_and_Music_Recommendation_ZH-CoT',
|
||||
'charm-rea-Chinese_Natural_Language_Inference_ZH-CoT',
|
||||
'charm-rea-Chinese_Reading_Comprehension_ZH-CoT',
|
||||
'charm-rea-Chinese_Sequence_Understanding_ZH-CoT',
|
||||
'charm-rea-Chinese_Sport_Understanding_ZH-CoT',
|
||||
'charm-rea-Chinese_Time_Understanding_ZH-CoT',
|
||||
'charm-rea-Global_Anachronisms_Judgment_ZH-CoT',
|
||||
'charm-rea-Global_Movie_and_Music_Recommendation_ZH-CoT',
|
||||
'charm-rea-Global_Natural_Language_Inference_ZH-CoT',
|
||||
'charm-rea-Global_Reading_Comprehension_ZH-CoT',
|
||||
'charm-rea-Global_Sequence_Understanding_ZH-CoT',
|
||||
'charm-rea-Global_Sport_Understanding_ZH-CoT',
|
||||
'charm-rea-Global_Time_Understanding_ZH-CoT',
|
||||
'charm-rea-Chinese_Anachronisms_Judgment_EN-CoT',
|
||||
'charm-rea-Chinese_Movie_and_Music_Recommendation_EN-CoT',
|
||||
'charm-rea-Chinese_Natural_Language_Inference_EN-CoT',
|
||||
'charm-rea-Chinese_Reading_Comprehension_EN-CoT',
|
||||
'charm-rea-Chinese_Sequence_Understanding_EN-CoT',
|
||||
'charm-rea-Chinese_Sport_Understanding_EN-CoT',
|
||||
'charm-rea-Chinese_Time_Understanding_EN-CoT',
|
||||
'charm-rea-Global_Anachronisms_Judgment_EN-CoT',
|
||||
'charm-rea-Global_Movie_and_Music_Recommendation_EN-CoT',
|
||||
'charm-rea-Global_Natural_Language_Inference_EN-CoT',
|
||||
'charm-rea-Global_Reading_Comprehension_EN-CoT',
|
||||
'charm-rea-Global_Sequence_Understanding_EN-CoT',
|
||||
'charm-rea-Global_Sport_Understanding_EN-CoT',
|
||||
'charm-rea-Global_Time_Understanding_EN-CoT',
|
||||
'charm-rea-Chinese_Anachronisms_Judgment_XLT',
|
||||
'charm-rea-Chinese_Movie_and_Music_Recommendation_XLT',
|
||||
'charm-rea-Chinese_Natural_Language_Inference_XLT',
|
||||
'charm-rea-Chinese_Reading_Comprehension_XLT',
|
||||
'charm-rea-Chinese_Sequence_Understanding_XLT',
|
||||
'charm-rea-Chinese_Sport_Understanding_XLT',
|
||||
'charm-rea-Chinese_Time_Understanding_XLT',
|
||||
'charm-rea-Global_Anachronisms_Judgment_XLT',
|
||||
'charm-rea-Global_Movie_and_Music_Recommendation_XLT',
|
||||
'charm-rea-Global_Natural_Language_Inference_XLT',
|
||||
'charm-rea-Global_Reading_Comprehension_XLT',
|
||||
'charm-rea-Global_Sequence_Understanding_XLT',
|
||||
'charm-rea-Global_Sport_Understanding_XLT',
|
||||
'charm-rea-Global_Time_Understanding_XLT',
|
||||
'charm-rea-Chinese_Anachronisms_Judgment_Translate-EN',
|
||||
'charm-rea-Chinese_Movie_and_Music_Recommendation_Translate-EN',
|
||||
'charm-rea-Chinese_Natural_Language_Inference_Translate-EN',
|
||||
'charm-rea-Chinese_Reading_Comprehension_Translate-EN',
|
||||
'charm-rea-Chinese_Sequence_Understanding_Translate-EN',
|
||||
'charm-rea-Chinese_Sport_Understanding_Translate-EN',
|
||||
'charm-rea-Chinese_Time_Understanding_Translate-EN',
|
||||
'charm-rea-Global_Anachronisms_Judgment_Translate-EN',
|
||||
'charm-rea-Global_Movie_and_Music_Recommendation_Translate-EN',
|
||||
'charm-rea-Global_Natural_Language_Inference_Translate-EN',
|
||||
'charm-rea-Global_Reading_Comprehension_Translate-EN',
|
||||
'charm-rea-Global_Sequence_Understanding_Translate-EN',
|
||||
'charm-rea-Global_Sport_Understanding_Translate-EN',
|
||||
'charm-rea-Global_Time_Understanding_Translate-EN',
|
||||
],
|
||||
summary_groups=sum(
|
||||
[v for k, v in locals().items() if k.endswith('_summary_groups')], [])
|
||||
)
|
98
configs/summarizers/charm_reason.py
Normal file
@ -0,0 +1,98 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from .groups.charm_reason import charm_reason_summary_groups
|
||||
|
||||
summarizer = dict(
|
||||
dataset_abbrs=[
|
||||
'charm-reason-Direct',
|
||||
'charm-reason-ZH-CoT',
|
||||
'charm-reason-EN-CoT',
|
||||
'charm-reason-XLT',
|
||||
'charm-reason-Translate-EN',
|
||||
'',
|
||||
'charm-reason-Chinese_Direct',
|
||||
'charm-reason-Chinese_ZH-CoT',
|
||||
'charm-reason-Chinese_EN-CoT',
|
||||
'charm-reason-Chinese_XLT',
|
||||
'charm-reason-Chinese_Translate-EN',
|
||||
'charm-reason-Global_Direct',
|
||||
'charm-reason-Global_ZH-CoT',
|
||||
'charm-reason-Global_EN-CoT',
|
||||
'charm-reason-Global_XLT',
|
||||
'charm-reason-Global_Translate-EN',
|
||||
'',
|
||||
'charm-reason-Chinese_Anachronisms_Judgment_Direct',
|
||||
'charm-reason-Chinese_Movie_and_Music_Recommendation_Direct',
|
||||
'charm-reason-Chinese_Natural_Language_Inference_Direct',
|
||||
'charm-reason-Chinese_Reading_Comprehension_Direct',
|
||||
'charm-reason-Chinese_Sequence_Understanding_Direct',
|
||||
'charm-reason-Chinese_Sport_Understanding_Direct',
|
||||
'charm-reason-Chinese_Time_Understanding_Direct',
|
||||
'charm-reason-Global_Anachronisms_Judgment_Direct',
|
||||
'charm-reason-Global_Movie_and_Music_Recommendation_Direct',
|
||||
'charm-reason-Global_Natural_Language_Inference_Direct',
|
||||
'charm-reason-Global_Reading_Comprehension_Direct',
|
||||
'charm-reason-Global_Sequence_Understanding_Direct',
|
||||
'charm-reason-Global_Sport_Understanding_Direct',
|
||||
'charm-reason-Global_Time_Understanding_Direct',
|
||||
'charm-reason-Chinese_Anachronisms_Judgment_ZH-CoT',
|
||||
'charm-reason-Chinese_Movie_and_Music_Recommendation_ZH-CoT',
|
||||
'charm-reason-Chinese_Natural_Language_Inference_ZH-CoT',
|
||||
'charm-reason-Chinese_Reading_Comprehension_ZH-CoT',
|
||||
'charm-reason-Chinese_Sequence_Understanding_ZH-CoT',
|
||||
'charm-reason-Chinese_Sport_Understanding_ZH-CoT',
|
||||
'charm-reason-Chinese_Time_Understanding_ZH-CoT',
|
||||
'charm-reason-Global_Anachronisms_Judgment_ZH-CoT',
|
||||
'charm-reason-Global_Movie_and_Music_Recommendation_ZH-CoT',
|
||||
'charm-reason-Global_Natural_Language_Inference_ZH-CoT',
|
||||
'charm-reason-Global_Reading_Comprehension_ZH-CoT',
|
||||
'charm-reason-Global_Sequence_Understanding_ZH-CoT',
|
||||
'charm-reason-Global_Sport_Understanding_ZH-CoT',
|
||||
'charm-reason-Global_Time_Understanding_ZH-CoT',
|
||||
'charm-reason-Chinese_Anachronisms_Judgment_EN-CoT',
|
||||
'charm-reason-Chinese_Movie_and_Music_Recommendation_EN-CoT',
|
||||
'charm-reason-Chinese_Natural_Language_Inference_EN-CoT',
|
||||
'charm-reason-Chinese_Reading_Comprehension_EN-CoT',
|
||||
'charm-reason-Chinese_Sequence_Understanding_EN-CoT',
|
||||
'charm-reason-Chinese_Sport_Understanding_EN-CoT',
|
||||
'charm-reason-Chinese_Time_Understanding_EN-CoT',
|
||||
'charm-reason-Global_Anachronisms_Judgment_EN-CoT',
|
||||
'charm-reason-Global_Movie_and_Music_Recommendation_EN-CoT',
|
||||
'charm-reason-Global_Natural_Language_Inference_EN-CoT',
|
||||
'charm-reason-Global_Reading_Comprehension_EN-CoT',
|
||||
'charm-reason-Global_Sequence_Understanding_EN-CoT',
|
||||
'charm-reason-Global_Sport_Understanding_EN-CoT',
|
||||
'charm-reason-Global_Time_Understanding_EN-CoT',
|
||||
'charm-reason-Chinese_Anachronisms_Judgment_XLT',
|
||||
'charm-reason-Chinese_Movie_and_Music_Recommendation_XLT',
|
||||
'charm-reason-Chinese_Natural_Language_Inference_XLT',
|
||||
'charm-reason-Chinese_Reading_Comprehension_XLT',
|
||||
'charm-reason-Chinese_Sequence_Understanding_XLT',
|
||||
'charm-reason-Chinese_Sport_Understanding_XLT',
|
||||
'charm-reason-Chinese_Time_Understanding_XLT',
|
||||
'charm-reason-Global_Anachronisms_Judgment_XLT',
|
||||
'charm-reason-Global_Movie_and_Music_Recommendation_XLT',
|
||||
'charm-reason-Global_Natural_Language_Inference_XLT',
|
||||
'charm-reason-Global_Reading_Comprehension_XLT',
|
||||
'charm-reason-Global_Sequence_Understanding_XLT',
|
||||
'charm-reason-Global_Sport_Understanding_XLT',
|
||||
'charm-reason-Global_Time_Understanding_XLT',
|
||||
'charm-reason-Chinese_Anachronisms_Judgment_Translate-EN',
|
||||
'charm-reason-Chinese_Movie_and_Music_Recommendation_Translate-EN',
|
||||
'charm-reason-Chinese_Natural_Language_Inference_Translate-EN',
|
||||
'charm-reason-Chinese_Reading_Comprehension_Translate-EN',
|
||||
'charm-reason-Chinese_Sequence_Understanding_Translate-EN',
|
||||
'charm-reason-Chinese_Sport_Understanding_Translate-EN',
|
||||
'charm-reason-Chinese_Time_Understanding_Translate-EN',
|
||||
'charm-reason-Global_Anachronisms_Judgment_Translate-EN',
|
||||
'charm-reason-Global_Movie_and_Music_Recommendation_Translate-EN',
|
||||
'charm-reason-Global_Natural_Language_Inference_Translate-EN',
|
||||
'charm-reason-Global_Reading_Comprehension_Translate-EN',
|
||||
'charm-reason-Global_Sequence_Understanding_Translate-EN',
|
||||
'charm-reason-Global_Sport_Understanding_Translate-EN',
|
||||
'charm-reason-Global_Time_Understanding_Translate-EN',
|
||||
],
|
||||
summary_groups=sum(
|
||||
[v for k, v in locals().items() if k.endswith('_summary_groups')], [])
|
||||
)
|
@ -122,10 +122,10 @@ IFEval_dataset_abbrs = [
|
||||
['IFEval', 'Inst-level-loose-accuracy'],
|
||||
]
|
||||
|
||||
|
||||
summarizer = dict(
|
||||
type=MultiFacetedSummarizer,
|
||||
dataset_abbrs_list=[
|
||||
{'name': 'overall', 'dataset_abbrs': overall_dataset_abbrs},
|
||||
{'name': 'mmlu', 'dataset_abbrs': mmlu_dataset_abbrs},
|
||||
{'name': 'cmmlu', 'dataset_abbrs': cmmlu_dataset_abbrs},
|
||||
{'name': 'ceval', 'dataset_abbrs': ceval_dataset_abbrs},
|
||||
@ -143,7 +143,6 @@ summarizer = dict(
|
||||
{'name': 'humaneval', 'dataset_abbrs': [['openai_humaneval', 'humaneval_pass@1']]},
|
||||
{'name': 'GPQA', 'dataset_abbrs': [['GPQA_diamond', 'accuracy']]},
|
||||
{'name': 'IFEval', 'dataset_abbrs': IFEval_dataset_abbrs},
|
||||
{'name': 'overall', 'dataset_abbrs': overall_dataset_abbrs},
|
||||
],
|
||||
summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
|
||||
)
|
||||
|
@ -1,30 +0,0 @@
|
||||
charm_tasks = [
|
||||
'Anachronisms_Judgment',
|
||||
'Movie_and_Music_Recommendation',
|
||||
'Natural_Language_Inference',
|
||||
'Reading_Comprehension',
|
||||
'Sequence_Understanding',
|
||||
'Sport_Understanding',
|
||||
'Time_Understanding',
|
||||
]
|
||||
regions = [
|
||||
'Chinese',
|
||||
'Global',
|
||||
]
|
||||
prompts = [
|
||||
'Direct',
|
||||
'ZH-CoT',
|
||||
'EN-CoT',
|
||||
'XLT',
|
||||
'Translate-EN',
|
||||
]
|
||||
|
||||
|
||||
charm_rea_summary_groups = []
|
||||
for prompt in prompts:
|
||||
for region in regions:
|
||||
subsets = ['charm-rea-' + region + '_' + task + '_' + prompt for task in charm_tasks]
|
||||
charm_rea_summary_groups.append({'name': 'charm-rea-' + region + '_' + prompt, 'subsets': subsets})
|
||||
for prompt in prompts:
|
||||
subsets = ['charm-rea-' + region + '_' + prompt for region in regions]
|
||||
charm_rea_summary_groups.append({'name': 'charm-rea-' + prompt, 'subsets': subsets})
|
35
configs/summarizers/groups/charm_reason.py
Normal file
@ -0,0 +1,35 @@
|
||||
charm_tasks = [
|
||||
'Anachronisms_Judgment',
|
||||
'Movie_and_Music_Recommendation',
|
||||
'Natural_Language_Inference',
|
||||
'Reading_Comprehension',
|
||||
'Sequence_Understanding',
|
||||
'Sport_Understanding',
|
||||
'Time_Understanding',
|
||||
]
|
||||
regions = [
|
||||
'Chinese',
|
||||
'Global',
|
||||
]
|
||||
prompts = [
|
||||
'Direct',
|
||||
'ZH-CoT',
|
||||
'EN-CoT',
|
||||
'XLT',
|
||||
'Translate-EN',
|
||||
]
|
||||
|
||||
|
||||
charm_reason_summary_groups = []
|
||||
for prompt in prompts:
|
||||
for region in regions:
|
||||
subsets = ['charm-reason-' + region + '_' + task + '_' + prompt for task in charm_tasks]
|
||||
charm_reason_summary_groups.append({'name': 'charm-reason-' + region + '_' + prompt, 'subsets': subsets})
|
||||
|
||||
for prompt in prompts:
|
||||
subsets = ['charm-reason-' + region + '_' + prompt for region in regions]
|
||||
charm_reason_summary_groups.append({'name': 'charm-reason-' + prompt, 'subsets': subsets})
|
||||
|
||||
charm_reason_summary_groups.append(
|
||||
{'name': 'charm-reason-CoT', 'subsets': ['charm-reason-ZH-CoT', 'charm-reason-EN-CoT']}
|
||||
)
|
26
configs/summarizers/groups/mathbench_2024.py
Normal file
@ -0,0 +1,26 @@
|
||||
|
||||
mathbench_2024_wocircular_summary_groups = [
|
||||
{'name': 'college', 'subsets': ['college-single_choice_cn', 'college-single_choice_en']},
|
||||
{'name': 'high', 'subsets': ['high-single_choice_cn', 'high-single_choice_en']},
|
||||
{'name': 'middle', 'subsets': ['middle-single_choice_cn', 'middle-single_choice_en']},
|
||||
{'name': 'primary', 'subsets': ['primary-cloze_cn', 'primary-cloze_en']},
|
||||
{'name': 'cn', 'subsets': ['college-single_choice_cn', 'high-single_choice_cn', 'middle-single_choice_cn', 'primary-cloze_cn']},
|
||||
{'name': 'en', 'subsets': ['college-single_choice_en', 'high-single_choice_en', 'middle-single_choice_en', 'primary-cloze_en']},
|
||||
{'name': 'a', 'subsets': ['college', 'high', 'middle', 'primary', 'arithmetic-cloze_en']},
|
||||
|
||||
{'name': 'college_knowledge', 'subsets': ['college_knowledge-single_choice_cn', 'college_knowledge-single_choice_en']},
|
||||
{'name': 'high_knowledge', 'subsets': ['high_knowledge-single_choice_cn', 'high_knowledge-single_choice_en']},
|
||||
{'name': 'middle_knowledge', 'subsets': ['middle_knowledge-single_choice_cn', 'middle_knowledge-single_choice_en']},
|
||||
{'name': 'primary_knowledge', 'subsets': ['primary_knowledge-single_choice_cn', 'primary_knowledge-single_choice_en']},
|
||||
{'name': 'knowledge-cn', 'subsets': ['college_knowledge-single_choice_cn', 'high_knowledge-single_choice_cn', 'middle_knowledge-single_choice_cn', 'primary_knowledge-single_choice_cn']},
|
||||
{'name': 'knowledge-en', 'subsets': ['college_knowledge-single_choice_en', 'high_knowledge-single_choice_en', 'middle_knowledge-single_choice_en', 'primary_knowledge-single_choice_en']},
|
||||
{'name': 't', 'subsets': ['college_knowledge', 'high_knowledge', 'middle_knowledge', 'primary_knowledge']},
|
||||
|
||||
{'name': 'overall', 'subsets': ['a', 't']},
|
||||
]
|
||||
|
||||
for g in mathbench_2024_wocircular_summary_groups:
|
||||
g['name'] = 'mathbench-wocircular-' + g['name']
|
||||
g['subsets'] = ['mathbench-wocircular-' + s for s in g['subsets']]
|
||||
|
||||
mathbench_2024_summary_groups = mathbench_2024_wocircular_summary_groups
|
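For reference, a hypothetical look at what the prefixing loop above produces for the first group:

```python
# Illustrative only: the 'college' group after the renaming loop runs.
expected = {
    'name': 'mathbench-wocircular-college',
    'subsets': [
        'mathbench-wocircular-college-single_choice_cn',
        'mathbench-wocircular-college-single_choice_en',
    ],
}
```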
@ -1 +1 @@
|
||||
__version__ = '0.2.4'
|
||||
__version__ = '0.2.5'
|
||||
|
@ -11,8 +11,8 @@ from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
|
||||
from .base import BaseDataset
|
||||
|
||||
|
||||
@TEXT_POSTPROCESSORS.register_module('charm-rea')
|
||||
def charm_rea_postprocess(text: str) -> str:
|
||||
@TEXT_POSTPROCESSORS.register_module('charm-reason')
|
||||
def charm_reason_postprocess(text: str) -> str:
|
||||
ans = text
|
||||
ans_line = ans.split('answer is ')
|
||||
if len(ans_line) != 1:
|
||||
@ -27,14 +27,11 @@ def charm_rea_postprocess(text: str) -> str:
|
||||
|
||||
|
||||
@ICL_EVALUATORS.register_module()
|
||||
class CharmReaEvaluator(BaseEvaluator):
|
||||
class CharmReasonEvaluator(BaseEvaluator):
|
||||
|
||||
def score(self, predictions, references):
|
||||
if len(predictions) != len(references):
|
||||
return {
|
||||
'error': 'predictions and references have different '
|
||||
'length'
|
||||
}
|
||||
return {'error': 'preds and refrs have different length'}
|
||||
details = []
|
||||
cnt = 0
|
||||
for pred, ref in zip(predictions, references):
|
||||
@ -43,9 +40,7 @@ class CharmReaEvaluator(BaseEvaluator):
|
||||
cnt += 1
|
||||
detail['correct'] = True
|
||||
details.append(detail)
|
||||
|
||||
score = cnt / len(predictions) * 100
|
||||
|
||||
return {'score': score, 'details': details}
|
||||
|
||||
|
||||
|
@ -208,7 +208,8 @@ class MBPPEvaluator(BaseEvaluator):
|
||||
assert self.metric in ['MBPP', 'MBPPPlus']
|
||||
|
||||
def score(self, predictions, references):
|
||||
assert len(predictions) == len(references)
|
||||
if len(predictions) != len(references):
|
||||
return {'error': 'preds and refrs have different length'}
|
||||
|
||||
if self.metric == 'MBPP':
|
||||
result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
|
||||
@ -286,6 +287,8 @@ class MBPPEvaluator(BaseEvaluator):
|
||||
r'BEGIN\s*(.*)\s*DONE',
|
||||
r'```python\s*(.*)\s*```',
|
||||
r'```\s*(.*)\s*```',
|
||||
r'```python\s*(.*)\s*$',
|
||||
r'```\s*(.*)\s*$',
|
||||
r'(.*)\s*```.*',
|
||||
r"\[BEGIN\]\s*'(.*)",
|
||||
r'\[BEGIN\](.*)',
|
||||
|
@ -18,7 +18,7 @@ from .huggingface_above_v4_33 import HuggingFacewithChatTemplate # noqa: F401
|
||||
from .hunyuan_api import Hunyuan # noqa: F401
|
||||
from .intern_model import InternLM # noqa: F401
|
||||
from .krgpt_api import KrGPT # noqa: F401
|
||||
from .lightllm_api import LightllmAPI # noqa: F401
|
||||
from .lightllm_api import LightllmAPI, LightllmChatAPI # noqa: F401
|
||||
from .llama2 import Llama2, Llama2Chat # noqa: F401
|
||||
from .lmdeploy_pytorch import LmdeployPytorchModel # noqa: F401
|
||||
from .lmdeploy_tis import LmdeployTisModel # noqa: F401
|
||||
|
@ -84,7 +84,6 @@ def _convert_chat_messages(inputs, merge_role=True):
|
||||
messages = merged_messages
|
||||
|
||||
outputs.append(messages)
|
||||
print(messages)
|
||||
return outputs
|
||||
|
||||
|
||||
|
@ -1,16 +1,20 @@
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import Dict, List, Optional
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
|
||||
from opencompass.registry import MODELS
|
||||
from opencompass.utils.logging import get_logger
|
||||
from opencompass.utils.prompt import PromptList
|
||||
|
||||
from .base import BaseModel
|
||||
from .base_api import TokenBucket
|
||||
from .base_api import BaseAPIModel, TokenBucket
|
||||
|
||||
PromptType = Union[PromptList, str]
|
||||
|
||||
|
||||
@MODELS.register_module()
|
||||
@ -189,3 +193,158 @@ class LightllmAPI(BaseModel):
|
||||
chinese_count = sum(len(part) for part in chinese_parts)
|
||||
|
||||
return english_count + chinese_count
|
||||
|
||||
|
||||
class LightllmChatAPI(BaseAPIModel):
|
||||
"""Model wrapper around YiAPI.
|
||||
|
||||
Documentation:
|
||||
|
||||
Args:
|
||||
path (str): The name of the model served by the Lightllm endpoint.
|
||||
key (str): Authorization key.
|
||||
query_per_second (int): The maximum queries allowed per second
|
||||
between two consecutive calls of the API. Defaults to 1.
|
||||
max_seq_len (int): Unused here.
|
||||
meta_template (Dict, optional): The model's meta prompt
|
||||
template if needed, in case the requirement of injecting or
|
||||
wrapping of any meta instructions.
|
||||
retry (int): Number of retries if the API call fails. Defaults to 2.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path: str,
|
||||
url: str,
|
||||
query_per_second: int = 2,
|
||||
max_seq_len: int = 2048,
|
||||
meta_template: Optional[Dict] = None,
|
||||
retry: int = 2,
|
||||
):
|
||||
super().__init__(path=path,
|
||||
max_seq_len=max_seq_len,
|
||||
query_per_second=query_per_second,
|
||||
meta_template=meta_template,
|
||||
retry=retry)
|
||||
self.url = url
|
||||
self.model = path
|
||||
|
||||
def generate(
|
||||
self,
|
||||
inputs: List[PromptType],
|
||||
max_out_len: int = 512,
|
||||
) -> List[str]:
|
||||
"""Generate results given a list of inputs.
|
||||
|
||||
Args:
|
||||
inputs (List[PromptType]): A list of strings or PromptDicts.
|
||||
The PromptDict should be organized in OpenCompass'
|
||||
API format.
|
||||
max_out_len (int): The maximum length of the output.
|
||||
|
||||
Returns:
|
||||
List[str]: A list of generated strings.
|
||||
"""
|
||||
with ThreadPoolExecutor() as executor:
|
||||
results = list(
|
||||
executor.map(self._generate, inputs,
|
||||
[max_out_len] * len(inputs)))
|
||||
self.flush()
|
||||
return results
|
||||
|
||||
def _generate(
|
||||
self,
|
||||
input: PromptType,
|
||||
max_out_len: int = 512,
|
||||
) -> str:
|
||||
"""Generate results given an input.
|
||||
|
||||
Args:
|
||||
inputs (PromptType): A string or PromptDict.
|
||||
The PromptDict should be organized in OpenCompass'
|
||||
API format.
|
||||
max_out_len (int): The maximum length of the output.
|
||||
|
||||
Returns:
|
||||
str: The generated string.
|
||||
"""
|
||||
assert isinstance(input, (str, PromptList))
|
||||
|
||||
if isinstance(input, str):
|
||||
messages = [{'role': 'user', 'content': input}]
|
||||
else:
|
||||
messages = []
|
||||
msg_buffer, last_role = [], None
|
||||
for item in input:
|
||||
item['role'] = 'assistant' if item['role'] == 'BOT' else 'user'
|
||||
if item['role'] != last_role and last_role is not None:
|
||||
messages.append({
|
||||
'content': '\n'.join(msg_buffer),
|
||||
'role': last_role
|
||||
})
|
||||
msg_buffer = []
|
||||
msg_buffer.append(item['prompt'])
|
||||
last_role = item['role']
|
||||
messages.append({
|
||||
'content': '\n'.join(msg_buffer),
|
||||
'role': last_role
|
||||
})
|
||||
|
||||
data = {'messages': messages}
|
||||
|
||||
max_num_retries = 0
|
||||
while max_num_retries < self.retry:
|
||||
self.acquire()
|
||||
try:
|
||||
raw_response = requests.request('POST',
|
||||
url=self.url,
|
||||
json=data)
|
||||
except Exception as err:
|
||||
print('Request Error:{}'.format(err))
|
||||
time.sleep(2)
|
||||
continue
|
||||
|
||||
try:
|
||||
response = raw_response.json()
|
||||
except Exception as err:
|
||||
print('Response Error:{}'.format(err))
|
||||
response = None
|
||||
self.release()
|
||||
|
||||
if response is None:
|
||||
print('Connection error, reconnect.')
|
||||
# if a connection error occurs, frequent requests will cause
|
||||
# continuous unstable network, therefore wait here
|
||||
# to slow down the request
|
||||
self.wait()
|
||||
continue
|
||||
|
||||
if raw_response.status_code == 200:
|
||||
# msg = json.load(response.text)
|
||||
# response
|
||||
msg = response['choices'][0]['message']['content']
|
||||
self.logger.debug(f'Generated: {msg}')
|
||||
return msg
|
||||
|
||||
if raw_response.status_code == 401:
|
||||
print('请求被拒绝 api_key错误')
|
||||
continue
|
||||
elif raw_response.status_code == 400:
|
||||
print(messages, response)
|
||||
print('请求失败,状态码:', raw_response)
|
||||
msg = 'The request was rejected because high risk'
|
||||
return msg
|
||||
elif raw_response.status_code == 429:
|
||||
print(messages, response)
|
||||
print('请求失败,状态码:', raw_response)
|
||||
time.sleep(5)
|
||||
continue
|
||||
else:
|
||||
print(messages, response)
|
||||
print('请求失败,状态码:', raw_response)
|
||||
time.sleep(1)
|
||||
|
||||
max_num_retries += 1
|
||||
|
||||
raise RuntimeError(raw_response)
|
||||
|
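For orientation, a minimal, hypothetical instantiation of the new `LightllmChatAPI` wrapper (the endpoint URL is an assumption):

```python
# Minimal sketch: querying a locally served Lightllm chat endpoint.
# The url below is an assumption; path is a free-form model name.
from opencompass.models import LightllmChatAPI

api = LightllmChatAPI(
    path='internlm2-chat-1_8b',
    url='http://127.0.0.1:8080/v1/chat/completions',
    query_per_second=2,
    retry=2,
)
print(api.generate(['Hello!'], max_out_len=128)[0])
```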
@ -50,6 +50,9 @@ class OpenAI(BaseAPIModel):
|
||||
temperature (float, optional): What sampling temperature to use.
|
||||
If not None, will override the temperature in the `generate()`
|
||||
call. Defaults to None.
|
||||
tokenizer_path (str, optional): The path to the tokenizer. Use path if
|
||||
'tokenizer_path' is None, otherwise use the 'tokenizer_path'.
|
||||
Defaults to None.
|
||||
"""
|
||||
|
||||
is_api: bool = True
|
||||
@ -67,7 +70,8 @@ class OpenAI(BaseAPIModel):
|
||||
mode: str = 'none',
|
||||
logprobs: Optional[bool] = False,
|
||||
top_logprobs: Optional[int] = None,
|
||||
temperature: Optional[float] = None):
|
||||
temperature: Optional[float] = None,
|
||||
tokenizer_path: Optional[str] = None):
|
||||
|
||||
super().__init__(path=path,
|
||||
max_seq_len=max_seq_len,
|
||||
@ -82,6 +86,7 @@ class OpenAI(BaseAPIModel):
|
||||
self.mode = mode
|
||||
self.logprobs = logprobs
|
||||
self.top_logprobs = top_logprobs
|
||||
self.tokenizer_path = tokenizer_path
|
||||
|
||||
if isinstance(key, str):
|
||||
if key == 'ENV':
|
||||
@ -287,7 +292,8 @@ class OpenAI(BaseAPIModel):
|
||||
Returns:
|
||||
int: Length of the input tokens
|
||||
"""
|
||||
enc = self.tiktoken.encoding_for_model(self.path)
|
||||
enc = self.tiktoken.encoding_for_model(self.path
|
||||
or self.tokenizer_path)
|
||||
return len(enc.encode(prompt))
|
||||
|
||||
def bin_trim(self, prompt: str, num_token: int) -> str:
|
||||
@ -333,140 +339,3 @@ class OpenAI(BaseAPIModel):
|
||||
elif self.mode == 'rear':
|
||||
prompt = sep.join(words[:l])
|
||||
return prompt
|
||||
|
||||
|
||||
class OpenAIAllesAPIN(OpenAI):
|
||||
"""Model wrapper around OpenAI-AllesAPIN.
|
||||
|
||||
Args:
|
||||
path (str): The name of OpenAI's model.
|
||||
url (str): URL to AllesAPIN.
|
||||
key (str): AllesAPIN key.
|
||||
query_per_second (int): The maximum queries allowed per second
|
||||
between two consecutive calls of the API. Defaults to 1.
|
||||
max_seq_len (int): Unused here.
|
||||
meta_template (Dict, optional): The model's meta prompt
|
||||
template if needed, in case the requirement of injecting or
|
||||
wrapping of any meta instructions.
|
||||
retry (int): Number of retires if the API call fails. Defaults to 2.
|
||||
"""
|
||||
|
||||
is_api: bool = True
|
||||
|
||||
def __init__(self,
|
||||
path: str,
|
||||
url: str,
|
||||
key: str,
|
||||
temperature: float = 1.0,
|
||||
query_per_second: int = 1,
|
||||
rpm_verbose: bool = False,
|
||||
max_seq_len: int = 2048,
|
||||
meta_template: Optional[Dict] = None,
|
||||
retry: int = 2):
|
||||
super().__init__(path=path,
|
||||
max_seq_len=max_seq_len,
|
||||
query_per_second=query_per_second,
|
||||
rpm_verbose=rpm_verbose,
|
||||
meta_template=meta_template,
|
||||
retry=retry)
|
||||
self.url = url
|
||||
self.temperature = temperature
|
||||
self.headers = {
|
||||
'alles-apin-token': key,
|
||||
'content-type': 'application/json',
|
||||
}
|
||||
|
||||
def _generate(self, input: PromptType, max_out_len: int,
|
||||
temperature: float) -> str:
|
||||
"""Generate results given an input.
|
||||
|
||||
Args:
|
||||
inputs (PromptType): A string or PromptDict.
|
||||
The PromptDict should be organized in OpenCompass'
|
||||
API format.
|
||||
max_out_len (int): The maximum length of the output.
|
||||
temperature (float): What sampling temperature to use,
|
||||
between 0 and 2. Higher values like 0.8 will make the output
|
||||
more random, while lower values like 0.2 will make it more
|
||||
focused and deterministic.
|
||||
|
||||
Returns:
|
||||
str: The generated string.
|
||||
"""
|
||||
assert isinstance(input, (str, PromptList))
|
||||
|
||||
if isinstance(input, str):
|
||||
messages = [{'role': 'user', 'content': input}]
|
||||
else:
|
||||
messages = []
|
||||
for item in input:
|
||||
msg = {'content': item['prompt']}
|
||||
if item['role'] == 'HUMAN':
|
||||
msg['role'] = 'user'
|
||||
elif item['role'] == 'BOT':
|
||||
msg['role'] = 'assistant'
|
||||
elif item['role'] == 'SYSTEM':
|
||||
msg['role'] = 'system'
|
||||
messages.append(msg)
|
||||
|
||||
# model can be response with user and system
|
||||
# when it comes with agent involved.
|
||||
assert msg['role'] in ['user', 'system']
|
||||
|
||||
data = {
|
||||
'model': self.path,
|
||||
'messages': messages,
|
||||
'temperature': temperature
|
||||
}
|
||||
for _ in range(self.retry):
|
||||
self.wait()
|
||||
try:
|
||||
raw_response = requests.post(self.url,
|
||||
headers=self.headers,
|
||||
data=json.dumps(data))
|
||||
except requests.ConnectionError:
|
||||
self.logger.error('Request error, got',
|
||||
str(raw_response.content))
|
||||
time.sleep(1)
|
||||
continue
|
||||
try:
|
||||
response = raw_response.json()
|
||||
except requests.JSONDecodeError:
|
||||
self.logger.error('JsonDecode error, got',
|
||||
str(raw_response.content))
|
||||
time.sleep(1)
|
||||
continue
|
||||
if raw_response.status_code == 200 and response[
|
||||
'msgCode'] == '10000':
|
||||
data = response['data']
|
||||
choices = data['choices']
|
||||
if choices is None:
|
||||
self.logger.error(data)
|
||||
else:
|
||||
return choices[0]['message']['content'].strip()
|
||||
try:
|
||||
match = re.match(r'Error code: \d+ - (.*)', response['data'])
|
||||
err = eval(match.group(1))['error']
|
||||
if err['code'] == 'content_filter' and err['status'] == 400:
|
||||
return err['message']
|
||||
except Exception:
|
||||
pass
|
||||
self.logger.error(response['msg'])
|
||||
self.logger.error(response)
|
||||
time.sleep(1)
|
||||
|
||||
raise RuntimeError('API call failed.')
|
||||
|
||||
def get_token_len(self, prompt: str) -> int:
|
||||
"""Get lengths of the tokenized string. Only English and Chinese
|
||||
characters are counted for now. Users are encouraged to override this
|
||||
method if more accurate length is needed.
|
||||
|
||||
Args:
|
||||
prompt (str): Input string.
|
||||
|
||||
Returns:
|
||||
int: Length of the input tokens
|
||||
"""
|
||||
enc = self.tiktoken.encoding_for_model(self.path)
|
||||
return len(enc.encode(prompt))
|
||||
|
@ -145,6 +145,8 @@ class PPLInferencerOutputHandler:
|
||||
def save_prompt_and_ppl(self, label, input, prompt, ppl, idx):
|
||||
if str(idx) not in self.results_dict.keys():
|
||||
self.results_dict[str(idx)] = {}
|
||||
if 'origin_prompt' not in self.results_dict[str(idx)]:
|
||||
self.results_dict[str(idx)]['origin_prompt'] = input
|
||||
if 'label: ' + str(label) not in self.results_dict[str(idx)].keys():
|
||||
self.results_dict[str(idx)]['label: ' + str(label)] = {}
|
||||
self.results_dict[str(idx)]['label: ' +
|
||||
|
@ -161,6 +161,9 @@ class DLCRunner(BaseRunner):
|
||||
shell_cmd += 'umask 0000; '
|
||||
shell_cmd += '{task_cmd}'
|
||||
|
||||
# set priority to 1 as default
|
||||
task_priority = self.aliyun_cfg.get('priority', 1)
|
||||
|
||||
tmpl = (
|
||||
'dlc submit pytorchjob'
|
||||
f" --command '{shell_cmd}'"
|
||||
@ -168,6 +171,7 @@ class DLCRunner(BaseRunner):
|
||||
f" --config {self.aliyun_cfg['dlc_config_path']}"
|
||||
f" --workspace_id {self.aliyun_cfg['workspace_id']}"
|
||||
f" --resource_id {self.aliyun_cfg['resource_id']}"
|
||||
f' --priority {task_priority}'
|
||||
' --workers 1'
|
||||
f' --worker_cpu {max(num_gpus * 8, 12)}'
|
||||
f' --worker_gpu {num_gpus}'
|
||||
|
@ -226,7 +226,7 @@ class DefaultSummarizer:
|
||||
|
||||
return raw_results, parsed_results, dataset_metrics, dataset_eval_mode
|
||||
|
||||
def _format_table(self, parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=None):
|
||||
def _format_table(self, parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=None, skip_all_slash=False):
|
||||
dataset_abbrs = [dataset_abbr_from_cfg(dataset) for dataset in self.dataset_cfgs]
|
||||
prompt_version = {dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6] for d in self.dataset_cfgs}
|
||||
|
||||
@ -257,14 +257,16 @@ class DefaultSummarizer:
|
||||
table.append(header)
|
||||
for dataset_abbr, metric in summarizer_dataset_abbrs:
|
||||
if dataset_abbr not in dataset_metrics:
|
||||
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
|
||||
if not skip_all_slash:
|
||||
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
|
||||
continue
|
||||
if metric is None:
|
||||
metric = dataset_metrics[dataset_abbr][0]
|
||||
elif metric in dataset_metrics[dataset_abbr]:
|
||||
pass
|
||||
else:
|
||||
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
|
||||
if not skip_all_slash:
|
||||
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
|
||||
continue
|
||||
|
||||
row = [dataset_abbr, prompt_version.get(dataset_abbr, '-'), metric, dataset_eval_mode.get(dataset_abbr, '-')]
|
||||
|
@ -1,11 +1,9 @@
|
||||
# flake8: noqa
|
||||
# yapf: disable
|
||||
import functools
|
||||
import getpass
|
||||
import math
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import tabulate
|
||||
from mmengine import ConfigDict
|
||||
@ -33,7 +31,9 @@ class MultiFacetedSummarizer(DefaultSummarizer):
|
||||
profile_dataset_abbrs = dataset_abbrs_item['dataset_abbrs']
|
||||
|
||||
# format table
|
||||
table = self._format_table(parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=profile_dataset_abbrs)
|
||||
table = self._format_table(parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=profile_dataset_abbrs, skip_all_slash=True)
|
||||
if len(table) == 1:
|
||||
continue
|
||||
|
||||
# output to screen
|
||||
print(tabulate.tabulate(table, headers='firstrow', floatfmt='.2f'))
|
||||
|
@ -214,6 +214,8 @@ class OpenICLEvalTask(BaseTask):
|
||||
preds['references'] = (test_set[self.output_column]
|
||||
if self.output_column else None)
|
||||
preds['test_set'] = test_set
|
||||
if 'origin_prompt' not in preds:
|
||||
preds['origin_prompt'] = [None for _ in range(len(pred_strs))]
|
||||
preds = {
|
||||
k: preds[k]
|
||||
for k in signature(icl_evaluator.score).parameters
|
||||
|
@ -258,7 +258,7 @@ def change_accelerator(models, accelerator):
|
||||
path=model['path'],
|
||||
model_kwargs=dict(tensor_parallel_size=model['run_cfg']['num_gpus']),
|
||||
max_out_len=model['max_out_len'],
|
||||
batch_size=32768,
|
||||
batch_size=16,
|
||||
run_cfg=model['run_cfg'],
|
||||
stop_words=model.get('stop_words', []),
|
||||
)
|
||||
@ -272,7 +272,7 @@ def change_accelerator(models, accelerator):
|
||||
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9),
|
||||
max_seq_len=model.get('max_seq_len', 2048),
|
||||
max_out_len=model['max_out_len'],
|
||||
batch_size=32768,
|
||||
batch_size=16,
|
||||
run_cfg=model['run_cfg'],
|
||||
stop_words=model.get('stop_words', []),
|
||||
)
|
||||
|
@ -63,15 +63,15 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
|
||||
# yapf: disable
|
||||
# flake8: noqa: W605
|
||||
patterns = [
|
||||
f'答案是?\s?([{options}])',
|
||||
f'答案是?\s?:([{options}])',
|
||||
f'答案是?\s?:([{options}])',
|
||||
f'答案应该?是\s?([{options}])',
|
||||
f'答案应该?选\s?([{options}])',
|
||||
f'答案为\s?([{options}])',
|
||||
f'答案选\s?([{options}])',
|
||||
f'选择?\s?([{options}])',
|
||||
f'故选?\s?([{options}])'
|
||||
f'答案是?\s*([{options}])',
|
||||
f'答案是?\s*:\s*([{options}])',
|
||||
f'答案是?\s*:\s*([{options}])',
|
||||
f'答案应该?是\s*([{options}])',
|
||||
f'答案应该?选\s*([{options}])',
|
||||
f'答案为\s*([{options}])',
|
||||
f'答案选\s*([{options}])',
|
||||
f'选择?\s*([{options}])',
|
||||
f'故选?\s*([{options}])'
|
||||
f'只有选?项?\s?([{options}])\s?是?对',
|
||||
f'只有选?项?\s?([{options}])\s?是?错',
|
||||
f'只有选?项?\s?([{options}])\s?不?正确',
|
||||
|
@ -22,6 +22,7 @@ OpenCC
|
||||
opencv-python-headless
|
||||
pandas<2.0.0
|
||||
prettytable
|
||||
protobuf
|
||||
pyext
|
||||
pypinyin
|
||||
python-Levenshtein
|
||||
|