Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
Commit 76ea7b4d4d: Merge branch 'open-compass:main' into main
@@ -59,6 +59,8 @@ Just like a compass guides us on our journey, OpenCompass will guide you through

## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>

- **\[2024.09.19\]** We now support [Qwen2.5](https://huggingface.co/Qwen) (0.5B to 72B) with multiple backends (HuggingFace/vLLM/LMDeploy); a minimal evaluation config sketch follows this list. Feel free to give them a try! 🔥🔥🔥
- **\[2024.09.17\]** We now support OpenAI o1 (`o1-mini-2024-09-12` and `o1-preview-2024-09-12`). Feel free to give them a try! 🔥🔥🔥
- **\[2024.09.05\]** We now support answer extraction through model post-processing, which gives a more accurate picture of a model's capabilities. As part of this update, we have integrated [XFinder](https://github.com/IAAR-Shanghai/xFinder) as our first post-processing model. For more details, please refer to the [documentation](opencompass/utils/postprocessors/xfinder/README.md) and give it a try! 🔥🔥🔥
- **\[2024.08.20\]** OpenCompass now supports [SciCode](https://github.com/scicode-bench/SciCode): A Research Coding Benchmark Curated by Scientists. 🔥🔥🔥
- **\[2024.08.16\]** OpenCompass now supports the brand-new long-context evaluation benchmark [RULER](https://arxiv.org/pdf/2404.06654). RULER evaluates long-context abilities, including retrieval, multi-hop tracing, aggregation, and question answering, through flexible configurations. Check out the [RULER](configs/datasets/ruler/README.md) evaluation config now! 🔥🔥🔥
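A minimal sketch of how the new Qwen2.5 model configs from this commit can be combined with a dataset config; the demo GSM8K import and the `eval_qwen2_5_demo.py` file name are assumptions, not part of this commit:

```python
# configs/eval_qwen2_5_demo.py -- hypothetical file name
from mmengine.config import read_base

with read_base():
    # dataset import is an assumption; point it at whichever dataset configs you use
    from .datasets.demo.demo_gsm8k_chat_gen import gsm8k_datasets
    # model config added in this commit; swap in the hf_/vllm_ variant for other backends
    from .models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b

datasets = gsm8k_datasets
models = lmdeploy_qwen2_5_7b
```

It would be run the same way as the API demo below, e.g. `opencompass ./configs/eval_qwen2_5_demo.py`.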
@@ -191,6 +193,8 @@ After ensuring that OpenCompass is installed correctly according to the above steps

# Python scripts
opencompass ./configs/eval_api_demo.py

# You can use o1_mini_2024_09_12 / o1_preview_2024_09_12 for the o1 models; max_completion_tokens defaults to 8192.
```
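For reference, a sketch of raising the completion budget mentioned above; the fields mirror `configs/models/openai/o1_mini_2024_09_12.py` added in this commit, and the value 16384 is only an illustration:

```python
# Sketch based on configs/models/openai/o1_mini_2024_09_12.py from this commit.
from opencompass.models import OpenAISDK

models = [
    dict(
        abbr='o1-mini-2024-09-12',
        type=OpenAISDK,
        path='o1-mini-2024-09-12',
        key='ENV',                    # read from $OPENAI_API_KEY
        temperature=1,
        max_completion_tokens=16384,  # raise the 8192 default if reasoning output gets truncated
    ),
]
```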
- Accelerated Evaluation
@@ -59,6 +59,8 @@

## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>

- **\[2024.09.19\]** [Qwen2.5](https://huggingface.co/Qwen) (0.5B to 72B) is now supported with multiple inference backends (HuggingFace/vLLM/LMDeploy). Feel free to give it a try! 🔥🔥🔥
- **\[2024.09.17\]** The OpenAI o1 models (`o1-mini-2024-09-12` and `o1-preview-2024-09-12`) are now supported. Feel free to give them a try! 🔥🔥🔥
- **\[2024.09.05\]** OpenCompass now supports answer extraction through model post-processing, which gives a more accurate picture of a model's capabilities. As part of this update, [XFinder](https://github.com/IAAR-Shanghai/xFinder) is integrated as the first post-processing model. See the [documentation](opencompass/utils/postprocessors/xfinder/README.md) for details and give it a try! 🔥🔥🔥
- **\[2024.08.20\]** OpenCompass now supports [SciCode](https://github.com/scicode-bench/SciCode): A Research Coding Benchmark Curated by Scientists. 🔥🔥🔥
- **\[2024.08.16\]** OpenCompass now supports the brand-new long-context evaluation benchmark [RULER](https://arxiv.org/pdf/2404.06654). Through flexible configurations, RULER evaluates long-context abilities such as retrieval, multi-hop tracing, aggregation, and question answering; see the [RULER](configs/datasets/ruler/README.md) config. 🔥🔥🔥
@@ -187,6 +189,9 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce

# Python scripts
opencompass ./configs/eval_api_demo.py

# The o1_mini_2024_09_12 / o1_preview_2024_09_12 models are now supported; max_completion_tokens defaults to 8192.
```
- ### Inference Backends
configs/datasets/MathBench/mathbench_2024_gen_50a320.py (new file, 81 lines)

from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, math_postprocess_v2
from opencompass.utils.text_postprocessors import first_option_postprocess

with read_base():
    from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets

# Max for this dataset is 4
num_shot = 0
# Generate reasoning path or not, only for single choice
with_reasoning = True
# Use circular evaluation or not
with_circular_eval = True
# Use PPL mode in single choice test or not
use_ppl_single_choice = False

assert 0 <= num_shot <= 4
if num_shot == 0:
    prompts = zero_shot_prompts
else:
    prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}

mathbench_datasets = []
for _split in mathbench_sets:
    for _name in mathbench_sets[_split]:
        if 'single_choice' in _name:
            if with_reasoning:
                template_round = prompts[_name + '_with_reasoning']
            else:
                template_round = prompts[_name]
        else:
            template_round = prompts[_name]

        if 'single_choice' in _name:
            pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
        else:
            pred_postprocessor = dict(type=math_postprocess_v2)

        if 'single_choice' in _name and with_circular_eval:
            evaluator = dict(type=CircularEvaluator)
        else:
            evaluator = dict(type=AccEvaluator)

        # assemble the final config
        mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
        if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
            template = {}
            for answer in ['A', 'B', 'C', 'D']:
                one_template_round = deepcopy(template_round)
                one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer)
                template[answer] = dict(round=one_template_round)
            mathbench_infer_cfg = dict(
                prompt_template=dict(type=PromptTemplate, template=template),
                retriever=dict(type=ZeroRetriever),
                inferencer=dict(type=PPLInferencer),
            )
        else:
            mathbench_infer_cfg = dict(
                prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
                retriever=dict(type=ZeroRetriever),
                inferencer=dict(type=GenInferencer, max_out_len=2048),
            )
        mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)

        mathbench_datasets.append(
            dict(
                abbr='mathbench-' + _split + '-' + _name,
                type=MathBenchDataset,
                path=f'data/mathbench_v1/{_split}',
                name=_name,
                with_circular=with_circular_eval,
                reader_cfg=mathbench_reader_cfg,
                infer_cfg=mathbench_infer_cfg,
                eval_cfg=mathbench_eval_cfg,
            )
        )
@@ -11,6 +11,12 @@ zero_shot_prompts = {
    'single_choice_en': [
        dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nThe answer is:'),
    ],
    'cloze_en': [
        dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
    ],
    'cloze_cn': [
        dict(role='HUMAN', prompt='{question}\n请一步一步推理,并在最后用\\boxed{}给出你的答案。'),
    ]
}

few_shot_prompts = {
configs/datasets/gsm8k/gsm8k_model_postprocess_gen_a58960.py (new file, 52 lines)

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GSM8KDataset, gsm8k_dataset_postprocess
from opencompass.datasets import MATHEvaluator, math_postprocess_v2
from opencompass.utils.model_postprocessors import navie_model_postprocess
from opencompass.utils.postprocessors.naive import MATH_NAVIE_PROMPT_TEMPLATE

gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')

gsm8k_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

# # You can write your own postprocess prompt like:
# GSM8K_NAVIE_PROMPT_TEMPLATE = """
# There is a detailed explanation of the final answer you should extract:
# 1. ...
# 2. ...
# ...
# """

gsm8k_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'),
    pred_postprocessor=dict(type=math_postprocess_v2),
    dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
    model_postprocessor=dict(
        type=navie_model_postprocess,
        custom_instruction=MATH_NAVIE_PROMPT_TEMPLATE,
        model_name='',
        api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1')
)

gsm8k_datasets = [
    dict(
        abbr='gsm8k',
        type=GSM8KDataset,
        path='opencompass/gsm8k',
        reader_cfg=gsm8k_reader_cfg,
        infer_cfg=gsm8k_infer_cfg,
        eval_cfg=gsm8k_eval_cfg,
    )
]
configs/datasets/mmlu/mmlu_model_postprocess_gen_4d595a.py (new file, 141 lines)

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import MMLUDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
from opencompass.utils.model_postprocessors import navie_model_postprocess
from opencompass.utils.postprocessors.naive import OPTION_NAVIE_PROMPT_TEMPLATE


# None of the MMLU datasets on HuggingFace is parsed correctly, so we use our own dataset reader.
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar

mmlu_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

mmlu_all_sets = [
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_physics',
    'electrical_engineering',
    'astronomy',
    'anatomy',
    'abstract_algebra',
    'machine_learning',
    'clinical_knowledge',
    'global_facts',
    'management',
    'nutrition',
    'marketing',
    'professional_accounting',
    'high_school_geography',
    'international_law',
    'moral_scenarios',
    'computer_security',
    'high_school_microeconomics',
    'professional_law',
    'medical_genetics',
    'professional_psychology',
    'jurisprudence',
    'world_religions',
    'philosophy',
    'virology',
    'high_school_chemistry',
    'public_relations',
    'high_school_macroeconomics',
    'human_sexuality',
    'elementary_mathematics',
    'high_school_physics',
    'high_school_computer_science',
    'high_school_european_history',
    'business_ethics',
    'moral_disputes',
    'high_school_statistics',
    'miscellaneous',
    'formal_logic',
    'high_school_government_and_politics',
    'prehistory',
    'security_studies',
    'high_school_biology',
    'logical_fallacies',
    'high_school_world_history',
    'professional_medicine',
    'high_school_mathematics',
    'college_medicine',
    'high_school_us_history',
    'sociology',
    'econometrics',
    'high_school_psychology',
    'human_aging',
    'us_foreign_policy',
    'conceptual_physics',
]

mmlu_datasets = []
for _name in mmlu_all_sets:
    _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
    mmlu_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
                ),
                dict(role='BOT', prompt='{target}\n')
            ]),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(
                        role='HUMAN',
                        prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
                    ),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )

    # # You can write your own postprocess prompt like:
    # MMLU_NAVIE_PROMPT_TEMPLATE = """
    # There is a detailed explanation of the final answer you should extract:
    # 1. ...
    # 2. ...
    # ...
    # """

    mmlu_eval_cfg = dict(
        evaluator=dict(type=AccwithDetailsEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
        model_postprocessor=dict(
            type=navie_model_postprocess,
            custom_instruction=OPTION_NAVIE_PROMPT_TEMPLATE,
            model_name='',
            api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1')
    )

    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{_name}',
            type=MMLUDataset,
            path='opencompass/mmlu',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))

del _name, _hint
configs/datasets/wikibench/wikibench_gen_0978ad.py (new file, 56 lines)

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import WikiBenchDataset
from opencompass.utils.text_postprocessors import first_option_postprocess


single_choice_prompts = {
    'single_choice_cn': '以下是一道单项选择题,请你根据你了解的知识一步步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。\n下面是你要回答的题目:\n{question}\n让我们一步步推理:',
}

wikibench_sets = {
    'wiki': ['single_choice_cn'],
}

do_circular = True

wikibench_datasets = []

for _split in list(wikibench_sets.keys()):
    for _name in wikibench_sets[_split]:
        wikibench_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin='</E>',
                    round=[
                        dict(role='HUMAN', prompt=single_choice_prompts[_name]),
                        dict(role='BOT', prompt='{answer}'),
                    ],
                ),
                ice_token='</E>',
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer),
        )
        wikibench_eval_cfg = dict(
            evaluator=dict(type=CircularEvaluator if do_circular else AccEvaluator),
            pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
        )

        wikibench_datasets.append(
            dict(
                type=WikiBenchDataset,
                path=f'./data/WikiBench/{_name}.jsonl',
                name='circular_' + _name if do_circular else _name,
                abbr=('wikibench-' + _split + '-' + _name + '-circular') if do_circular else ('wikibench-' + _split + '-' + _name),
                reader_cfg=dict(
                    input_columns=['question'],
                    output_column='answer',
                ),
                infer_cfg=wikibench_infer_cfg,
                eval_cfg=wikibench_eval_cfg,
            )
        )
configs/models/openai/o1_mini_2024_09_12.py (new file, 20 lines)

from opencompass.models import OpenAISDK

api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
], )

models = [
    dict(
        abbr='o1-mini-2024-09-12',
        type=OpenAISDK,
        path='o1-mini-2024-09-12',
        key='ENV',  # 'ENV' reads the key from $OPENAI_API_KEY; you can also put your key here directly
        meta_template=api_meta_template,
        query_per_second=1,
        batch_size=1,
        temperature=1,
        max_completion_tokens=8192),  # increase this if the reasoning trace needs more tokens; see https://platform.openai.com/docs/guides/reasoning
]

configs/models/openai/o1_preview_2024_09_12.py (new file, 20 lines)

from opencompass.models import OpenAISDK

api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
], )

models = [
    dict(
        abbr='o1-preview-2024-09-12',
        type=OpenAISDK,
        path='o1-preview-2024-09-12',
        key='ENV',  # 'ENV' reads the key from $OPENAI_API_KEY; you can also put your key here directly
        meta_template=api_meta_template,
        query_per_second=1,
        batch_size=1,
        temperature=1,
        max_completion_tokens=8192),  # increase this if the reasoning trace needs more tokens; see https://platform.openai.com/docs/guides/reasoning
]
configs/models/qwen2_5/hf_qwen2_5_0_5b_instruct.py (new file, 12 lines)

from opencompass.models import HuggingFacewithChatTemplate

models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='qwen2.5-0.5b-instruct-hf',
        path='Qwen/Qwen2.5-0.5B-Instruct',
        max_out_len=4096,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
    )
]

configs/models/qwen2_5/hf_qwen2_5_14b_instruct.py (new file, 12 lines)

from opencompass.models import HuggingFacewithChatTemplate

models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='qwen2.5-14b-instruct-hf',
        path='Qwen/Qwen2.5-14B-Instruct',
        max_out_len=4096,
        batch_size=8,
        run_cfg=dict(num_gpus=2),
    )
]

configs/models/qwen2_5/hf_qwen2_5_1_5b_instruct.py (new file, 12 lines)

from opencompass.models import HuggingFacewithChatTemplate

models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='qwen2.5-1.5b-instruct-hf',
        path='Qwen/Qwen2.5-1.5B-Instruct',
        max_out_len=4096,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
    )
]

configs/models/qwen2_5/hf_qwen2_5_32b_instruct.py (new file, 12 lines)

from opencompass.models import HuggingFacewithChatTemplate

models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='qwen2.5-32b-instruct-hf',
        path='Qwen/Qwen2.5-32B-Instruct',
        max_out_len=4096,
        batch_size=8,
        run_cfg=dict(num_gpus=2),
    )
]

configs/models/qwen2_5/hf_qwen2_5_3b_instruct.py (new file, 12 lines)

from opencompass.models import HuggingFacewithChatTemplate

models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='qwen2.5-3b-instruct-hf',
        path='Qwen/Qwen2.5-3B-Instruct',
        max_out_len=4096,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
    )
]

configs/models/qwen2_5/hf_qwen2_5_72b_instruct.py (new file, 12 lines)

from opencompass.models import HuggingFacewithChatTemplate

models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='qwen2.5-72b-instruct-hf',
        path='Qwen/Qwen2.5-72B-Instruct',
        max_out_len=4096,
        batch_size=8,
        run_cfg=dict(num_gpus=4),
    )
]

configs/models/qwen2_5/hf_qwen2_5_7b_instruct.py (new file, 12 lines)

from opencompass.models import HuggingFacewithChatTemplate

models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='qwen2.5-7b-instruct-hf',
        path='Qwen/Qwen2.5-7B-Instruct',
        max_out_len=4096,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
    )
]
configs/models/qwen2_5/lmdeploy_qwen2_5_0_5b_instruct.py (new file, 15 lines)

from opencompass.models import TurboMindModelwithChatTemplate

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen2.5-0.5b-instruct-turbomind',
        path='Qwen/Qwen2.5-0.5B-Instruct',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
        max_seq_len=16384,
        max_out_len=4096,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    )
]

configs/models/qwen2_5/lmdeploy_qwen2_5_14b_instruct.py (new file, 15 lines)

from opencompass.models import TurboMindModelwithChatTemplate

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen2.5-14b-instruct-turbomind',
        path='Qwen/Qwen2.5-14B-Instruct',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=2),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
        max_seq_len=16384,
        max_out_len=4096,
        batch_size=16,
        run_cfg=dict(num_gpus=2),
    )
]

configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b_instruct.py (new file, 15 lines)

from opencompass.models import TurboMindModelwithChatTemplate

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen2.5-1.5b-instruct-turbomind',
        path='Qwen/Qwen2.5-1.5B-Instruct',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
        max_seq_len=16384,
        max_out_len=4096,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    )
]

configs/models/qwen2_5/lmdeploy_qwen2_5_32b_instruct.py (new file, 15 lines)

from opencompass.models import TurboMindModelwithChatTemplate

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen2.5-32b-instruct-turbomind',
        path='Qwen/Qwen2.5-32B-Instruct',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=2),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
        max_seq_len=16384,
        max_out_len=4096,
        batch_size=16,
        run_cfg=dict(num_gpus=2),
    )
]

configs/models/qwen2_5/lmdeploy_qwen2_5_3b_instruct.py (new file, 15 lines)

from opencompass.models import TurboMindModelwithChatTemplate

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen2.5-3b-instruct-turbomind',
        path='Qwen/Qwen2.5-3B-Instruct',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
        max_seq_len=16384,
        max_out_len=4096,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    )
]

configs/models/qwen2_5/lmdeploy_qwen2_5_72b_instruct.py (new file, 15 lines)

from opencompass.models import TurboMindModelwithChatTemplate

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen2.5-72b-instruct-turbomind',
        path='Qwen/Qwen2.5-72B-Instruct',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=4),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
        max_seq_len=16384,
        max_out_len=4096,
        batch_size=16,
        run_cfg=dict(num_gpus=4),
    )
]

configs/models/qwen2_5/lmdeploy_qwen2_5_7b_instruct.py (new file, 15 lines)

from opencompass.models import TurboMindModelwithChatTemplate

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen2.5-7b-instruct-turbomind',
        path='Qwen/Qwen2.5-7B-Instruct',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
        max_seq_len=16384,
        max_out_len=4096,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    )
]
configs/models/qwen2_5/vllm_qwen2_5_0_5b_instruct.py (new file, 14 lines)

from opencompass.models import VLLMwithChatTemplate

models = [
    dict(
        type=VLLMwithChatTemplate,
        abbr='qwen2.5-0.5b-instruct-vllm',
        path='Qwen/Qwen2.5-0.5B-Instruct',
        model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.5),
        max_out_len=4096,
        batch_size=16,
        generation_kwargs=dict(temperature=0),
        run_cfg=dict(num_gpus=1),
    )
]

configs/models/qwen2_5/vllm_qwen2_5_14b_instruct.py (new file, 14 lines)

from opencompass.models import VLLMwithChatTemplate

models = [
    dict(
        type=VLLMwithChatTemplate,
        abbr='qwen2.5-14b-instruct-vllm',
        path='Qwen/Qwen2.5-14B-Instruct',
        model_kwargs=dict(tensor_parallel_size=2),
        max_out_len=4096,
        batch_size=16,
        generation_kwargs=dict(temperature=0),
        run_cfg=dict(num_gpus=2),
    )
]

configs/models/qwen2_5/vllm_qwen2_5_1_5b_instruct.py (new file, 14 lines)

from opencompass.models import VLLMwithChatTemplate

models = [
    dict(
        type=VLLMwithChatTemplate,
        abbr='qwen2.5-1.5b-instruct-vllm',
        path='Qwen/Qwen2.5-1.5B-Instruct',
        model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.5),
        max_out_len=4096,
        batch_size=16,
        generation_kwargs=dict(temperature=0),
        run_cfg=dict(num_gpus=1),
    )
]

configs/models/qwen2_5/vllm_qwen2_5_32b_instruct.py (new file, 14 lines)

from opencompass.models import VLLMwithChatTemplate

models = [
    dict(
        type=VLLMwithChatTemplate,
        abbr='qwen2.5-32b-instruct-vllm',
        path='Qwen/Qwen2.5-32B-Instruct',
        model_kwargs=dict(tensor_parallel_size=2),
        max_out_len=4096,
        batch_size=16,
        generation_kwargs=dict(temperature=0),
        run_cfg=dict(num_gpus=2),
    )
]

configs/models/qwen2_5/vllm_qwen2_5_3b_instruct.py (new file, 14 lines)

from opencompass.models import VLLMwithChatTemplate

models = [
    dict(
        type=VLLMwithChatTemplate,
        abbr='qwen2.5-3b-instruct-vllm',
        path='Qwen/Qwen2.5-3B-Instruct',
        model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.5),
        max_out_len=4096,
        batch_size=16,
        generation_kwargs=dict(temperature=0),
        run_cfg=dict(num_gpus=1),
    )
]

configs/models/qwen2_5/vllm_qwen2_5_72b_instruct.py (new file, 14 lines)

from opencompass.models import VLLMwithChatTemplate

models = [
    dict(
        type=VLLMwithChatTemplate,
        abbr='qwen2_5-72b-instruct-vllm',
        path='Qwen/Qwen2.5-72B-Instruct',
        model_kwargs=dict(tensor_parallel_size=4),
        max_out_len=4096,
        batch_size=16,
        generation_kwargs=dict(temperature=0),
        run_cfg=dict(num_gpus=4),
    )
]

configs/models/qwen2_5/vllm_qwen2_5_7b_instruct.py (new file, 14 lines)

from opencompass.models import VLLMwithChatTemplate

models = [
    dict(
        type=VLLMwithChatTemplate,
        abbr='qwen2.5-7b-instruct-vllm',
        path='Qwen/Qwen2.5-7B-Instruct',
        model_kwargs=dict(tensor_parallel_size=1),
        max_out_len=4096,
        batch_size=16,
        generation_kwargs=dict(temperature=0),
        run_cfg=dict(num_gpus=1),
    )
]
@@ -0,0 +1,49 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import GPQADataset, GPQAEvaluator
from opencompass.utils import first_option_postprocess

gpqa_reader_cfg = dict(
    input_columns=['question', 'A', 'B', 'C', 'D'],
    output_column='answer')

hint = f'对下面的单项选择题,请直接给出正确答案的选项。'
question_and_options = 'Question: {question}\n(A){A}\n(B){B}\n(C){C}\n(D){D}\n'
gpqa_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template={
            opt: f'{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']},
    ),
    prompt_template=dict(
        type=PromptTemplate,
        template={
            opt: f'{hint}\n</E>{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']
        },
        ice_token='</E>'
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
    inferencer=dict(type=PPLInferencer))

gpqa_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))

gpqa_datasets = []
gpqa_subsets = {
    # 'extended': 'gpqa_extended.csv',
    # 'main': 'gpqa_main.csv',
    'diamond': 'gpqa_diamond.csv'
}

for split in list(gpqa_subsets.keys()):
    gpqa_datasets.append(
        dict(
            abbr='GPQA_' + split,
            type=GPQADataset,
            path='./data/gpqa/',
            name=gpqa_subsets[split],
            reader_cfg=gpqa_reader_cfg,
            infer_cfg=gpqa_infer_cfg,
            eval_cfg=gpqa_eval_cfg)
    )
@@ -0,0 +1,47 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MMLUProDataset, MMLUProBaseEvaluator

with read_base():
    from .mmlu_pro_categories import categories

mmlu_pro_datasets = []

for category in categories:
    hint = f'Answer the following multiple choice question about {category}, and give your answer option directly.'
    question_and_options = 'Question:\n{question}\nOptions:\n{options_str}'
    mmlu_pro_reader_cfg = dict(
        input_columns=['question', 'cot_content', 'options_str'],
        output_column='answer_string',
        train_split='validation',
        test_split='test',
    )
    mmlu_pro_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=f'{question_and_options}\nAnswer: {{answer}}'),
        prompt_template=dict(
            type=PromptTemplate,
            template=f'{hint}\n</E>{question_and_options}\nAnswer: ',
            ice_token='</E>'
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer, max_out_len=100)
    )

    mmlu_pro_eval_cfg = dict(
        evaluator=dict(type=MMLUProBaseEvaluator)
    )

    mmlu_pro_datasets.append(
        dict(
            abbr=f'mmlu_pro_{category.replace(" ", "_")}',
            type=MMLUProDataset,
            path='opencompass/mmlu_pro',
            category=category,
            reader_cfg=mmlu_pro_reader_cfg,
            infer_cfg=mmlu_pro_infer_cfg,
            eval_cfg=mmlu_pro_eval_cfg,
        ))
opencompass/configs/models/openai/o1_mini_2024_09_12.py (new file, 20 lines; same content as configs/models/openai/o1_mini_2024_09_12.py above)

opencompass/configs/models/openai/o1_preview_2024_09_12.py (new file, 20 lines; same content as configs/models/openai/o1_preview_2024_09_12.py above)
opencompass/configs/models/qwen2_5/hf_qwen2_5_3b_instruct.py (new file, 12 lines; same content as configs/models/qwen2_5/hf_qwen2_5_3b_instruct.py above)

opencompass/configs/models/qwen2_5/hf_qwen2_5_7b_instruct.py (new file, 12 lines; same content as configs/models/qwen2_5/hf_qwen2_5_7b_instruct.py above)
@ -3,19 +3,26 @@

from datasets import load_dataset

from opencompass.openicl import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path

from .base import BaseDataset

CHOICES=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P']

def _parse(item):
    choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P']

    s = ''
    item['answer_string'] = ''
    for i, opt in enumerate(item['options']):
        if opt == 'N/A':
            continue
        s += '{}. {}\n'.format(choices[i], opt)
        option = '{}. {}\n'.format(CHOICES[i], opt)
        s += option
        if item['answer'] == CHOICES[i]:
            item['answer_string'] = option

    item['options_str'] = s.strip()
    item['cot_content'] = item['cot_content'].removeprefix("A: Let's think step by step.").strip()
    return item
@ -31,3 +38,38 @@ class MMLUProDataset(BaseDataset):
        mmlu_pro = mmlu_pro.filter(lambda x: x['category'] == category)
        mmlu_pro = mmlu_pro.map(_parse)
        return mmlu_pro

class MMLUProBaseEvaluator(BaseEvaluator):

    def is_equal(self, pred, refer):
        try:
            refer_option, refer_string = refer.split('. ')
            if pred in CHOICES and refer_option == pred:
                return True
            elif refer_string.strip() == pred:
                return True
            else:
                return False
        except Exception:
            pass
        return False

    def score(self, predictions, references):
        if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different '
                'length'
            }
        correct = 0
        count = 0
        details = []
        for i, j in zip(predictions, references):
            i = i.split('\n')[0].strip()
            detail = {'pred': i, 'answer': j, 'correct': False}
            count += 1
            if self.is_equal(i, j):
                correct += 1
                detail['correct'] = True
            details.append(detail)
        result = {'accuracy': 100 * correct / count, 'details': details}
        return result
@ -43,7 +43,8 @@ class BaseAPIModel(BaseModel):
                 retry: int = 2,
                 max_seq_len: int = 2048,
                 meta_template: Optional[Dict] = None,
                 generation_kwargs: Dict = dict()):
                 generation_kwargs: Dict = dict(),
                 verbose: bool = False):
        self.path = path
        self.max_seq_len = max_seq_len
        self.meta_template = meta_template
@ -53,6 +54,7 @@ class BaseAPIModel(BaseModel):
        self.template_parser = APITemplateParser(meta_template)
        self.logger = get_logger()
        self.generation_kwargs = generation_kwargs
        self.verbose = verbose

    @abstractmethod
    def generate(self, inputs: List[PromptType],
@ -281,6 +283,9 @@ class APITemplateParser:
                    new_prompt.append(item)
                prompt = new_prompt

                if self.meta_template.get('begin', None):
                    prompt.insert(0, self.meta_template['begin'])

            else:
                # in case the model does not have any meta template
                prompt = ''
@ -20,6 +20,13 @@ OPENAI_API_BASE = os.path.join(
    os.environ.get('OPENAI_BASE_URL', 'https://api.openai.com/v1/'),
    'chat/completions')

O1_MODEL_LIST = [
    'o1-preview-2024-09-12',
    'o1-mini-2024-09-12',
    'o1-preview',
    'o1-mini',
]


@MODELS.register_module()
class OpenAI(BaseAPIModel):
@ -82,14 +89,17 @@ class OpenAI(BaseAPIModel):
                 top_logprobs: Optional[int] = None,
                 temperature: Optional[float] = None,
                 tokenizer_path: Optional[str] = None,
                 extra_body: Optional[Dict] = None):
                 extra_body: Optional[Dict] = None,
                 max_completion_tokens: int = 16384,
                 verbose: bool = False):

        super().__init__(path=path,
                         max_seq_len=max_seq_len,
                         meta_template=meta_template,
                         query_per_second=query_per_second,
                         rpm_verbose=rpm_verbose,
                         retry=retry)
                         retry=retry,
                         verbose=verbose)
        import tiktoken
        self.tiktoken = tiktoken
        self.temperature = temperature
@ -131,6 +141,9 @@ class OpenAI(BaseAPIModel):
            self.proxy_url = openai_proxy_url

        self.path = path
        self.max_completion_tokens = max_completion_tokens
        self.logger.warning(
            f'Max Completion tokens for {path} is: {max_completion_tokens}')

    def generate(self,
                 inputs: List[PromptType],
@ -255,16 +268,33 @@ class OpenAI(BaseAPIModel):
                header['OpenAI-Organization'] = self.orgs[self.org_ctr]

            try:
                data = dict(
                    model=self.path,
                    messages=messages,
                    max_tokens=max_out_len,
                    n=1,
                    logprobs=self.logprobs,
                    top_logprobs=self.top_logprobs,
                    stop=None,
                    temperature=temperature,
                )
                if self.path in O1_MODEL_LIST:
                    self.logger.warning(
                        f"'max_tokens' is unsupported for model {self.path}")
                    self.logger.warning(
                        f'We use max_completion_tokens: '
                        f'{self.max_completion_tokens} for this query')
                    data = dict(
                        model=self.path,
                        messages=messages,
                        max_completion_tokens=self.max_completion_tokens,
                        n=1,
                        logprobs=self.logprobs,
                        top_logprobs=self.top_logprobs,
                        stop=None,
                        temperature=temperature,
                    )
                else:
                    data = dict(
                        model=self.path,
                        messages=messages,
                        max_tokens=max_out_len,
                        n=1,
                        logprobs=self.logprobs,
                        top_logprobs=self.top_logprobs,
                        stop=None,
                        temperature=temperature,
                    )
                if self.extra_body:
                    data.update(self.extra_body)
                if isinstance(self.url, list):
@ -282,7 +312,9 @@ class OpenAI(BaseAPIModel):
                        'http': self.proxy_url,
                        'https': self.proxy_url,
                    }

                if self.verbose:
                    self.logger.debug(
                        f'Start send query to {self.proxy_url}')
                raw_response = requests.post(
                    url,
                    headers=header,
@ -290,6 +322,10 @@ class OpenAI(BaseAPIModel):
                    proxies=proxies,
                )

                if self.verbose:
                    self.logger.debug(
                        f'Get response from {self.proxy_url}')

            except requests.ConnectionError:
                self.logger.error('Got connection error, retrying...')
                continue
@ -343,27 +379,44 @@ class OpenAI(BaseAPIModel):
        """
        assert self.tokenizer_path or self.path
        try:
            if self.verbose:
                self.logger.info(f'Used tokenizer_path: {self.tokenizer_path}')
            tokenizer_path = self.tokenizer_path if self.tokenizer_path \
                else self.path
            try:
                if self.verbose:
                    self.logger.info(
                        f'Start load tiktoken encoding: {tokenizer_path}')
                enc = self.tiktoken.encoding_for_model(tokenizer_path)
                if self.verbose:
                    self.logger.info(
                        f'Successfully tiktoken encoding: {tokenizer_path}')
                return len(enc.encode(prompt))
            except Exception as e:
                self.logger.warn(f'{e}, tiktoken encoding cannot load '
                                 f'{tokenizer_path}')
                from transformers import AutoTokenizer
                if self.hf_tokenizer is None:
                    if self.verbose:
                        self.logger.info(
                            f'Start load hf tokenizer: {tokenizer_path}')
                    self.hf_tokenizer = AutoTokenizer.from_pretrained(
                        tokenizer_path, trust_remote_code=True)
                    self.logger.info(
                        f'Tokenizer is loaded from {tokenizer_path}')
                        f'Successfully load HF Tokenizer from {tokenizer_path}'
                    )
                return len(self.hf_tokenizer(prompt).input_ids)
        except Exception:
            self.logger.warn(
                'Can not get tokenizer automatically, '
                'will use default tokenizer gpt-4 for length calculation.')
            default_tokenizer = 'gpt-4'

            enc = self.tiktoken.encoding_for_model(default_tokenizer)
            if self.verbose:
                self.logger.info(
                    f'Successfully load default tiktoken tokenizer: '
                    f' {default_tokenizer}')
            return len(enc.encode(prompt))

    def bin_trim(self, prompt: str, num_token: int) -> str:
@ -429,11 +482,27 @@ class OpenAISDK(OpenAI):
                 top_logprobs: int | None = None,
                 temperature: float | None = None,
                 tokenizer_path: str | None = None,
                 extra_body: Dict | None = None):
        super().__init__(path, max_seq_len, query_per_second, rpm_verbose,
                         retry, key, org, meta_template, openai_api_base,
                         openai_proxy_url, mode, logprobs, top_logprobs,
                         temperature, tokenizer_path, extra_body)
                 extra_body: Dict | None = None,
                 max_completion_tokens: int = 16384,
                 verbose: bool = False):
        super().__init__(path,
                         max_seq_len,
                         query_per_second,
                         rpm_verbose,
                         retry,
                         key,
                         org,
                         meta_template,
                         openai_api_base,
                         openai_proxy_url,
                         mode,
                         logprobs,
                         top_logprobs,
                         temperature,
                         tokenizer_path,
                         extra_body,
                         verbose=verbose,
                         max_completion_tokens=max_completion_tokens)
        from openai import OpenAI

        if self.proxy_url is None:
@ -448,6 +517,8 @@ class OpenAISDK(OpenAI):
                base_url=openai_api_base,
                api_key=key,
                http_client=httpx.Client(proxies=proxies))
        if self.verbose:
            self.logger.info(f'Used openai_client: {self.openai_client}')

    def _generate(self, input: PromptList | str, max_out_len: int,
                  temperature: float) -> str:
@ -497,8 +568,23 @@ class OpenAISDK(OpenAI):
        num_retries = 0
        while num_retries < self.retry:
            self.wait()
            try:
                responses = self.openai_client.chat.completions.create(

            if self.path in O1_MODEL_LIST:
                self.logger.warning(
                    f"'max_tokens' is unsupported for model {self.path}")
                self.logger.warning(
                    f'We use max_completion_tokens: '
                    f'{self.max_completion_tokens} for this query')
                query_data = dict(
                    model=self.path,
                    max_completion_tokens=self.max_completion_tokens,
                    n=1,
                    temperature=self.temperature,
                    messages=messages,
                    extra_body=self.extra_body,
                )
            else:
                query_data = dict(
                    model=self.path,
                    max_tokens=max_out_len,
                    n=1,
@ -506,6 +592,15 @@ class OpenAISDK(OpenAI):
                    messages=messages,
                    extra_body=self.extra_body,
                )

            try:
                if self.verbose:
                    self.logger.info('Start calling OpenAI API')
                responses = self.openai_client.chat.completions.create(
                    **query_data)
                if self.verbose:
                    self.logger.info(
                        'Successfully get response from OpenAI API')
                return responses.choices[0].message.content
            except Exception as e:
                self.logger.error(e)
@ -127,6 +127,7 @@ class GenInferencer(BaseInferencer):
                index = len(tmp_result_dict)

        # 4. Wrap prompts with Dataloader
        logger.info('Starting build dataloader')
        dataloader = self.get_dataloader(prompt_list[index:], self.batch_size)

        # 5. Inference for prompts in each batch
@ -6,12 +6,66 @@ from tqdm import tqdm

from opencompass.registry import TEXT_POSTPROCESSORS

from .postprocessors.naive import NaiveExtractor, format_input_naive
from .postprocessors.xfinder.extractor import Extractor
from .postprocessors.xfinder.xfinder_utils import (DataProcessor,
                                                   convert_to_xfinder_format)


def gen_output(ori_data, extractor):
def gen_output_naive(ori_data, extractor):
    extracted_answers = []
    for item in tqdm(ori_data):
        user_input = extractor.prepare_input(item)
        extracted_answer = extractor.gen_output(user_input)
        item['extracted_answer'] = extracted_answer
        extracted_answers.append(extracted_answer)

    return extracted_answers


@TEXT_POSTPROCESSORS.register_module('naive')
def navie_model_postprocess(preds: list, model_name: str,
                            custom_instruction: str, api_url: Union[str, list],
                            **kwargs) -> list:
    """Postprocess the text extracted by custom model.
    Args:
        preds (list): The question, reference answer and model prediction.
        model_name (str): The name of the model.
        custom_instruction (str): Custom instruction for the dataset.
        api_url (Union[str, list]): The api url of the model.

    Returns:
        list: The postprocessed answers.
    """

    def _eval_pred(texts, extractor, num_processes=8):
        ori_data = texts
        extracted_answers = []
        batched_ori_data = []
        # Split data into batches
        num_processes = min(num_processes, len(ori_data))
        batch_size = len(ori_data) // num_processes
        for i in range(0, len(ori_data), batch_size):
            batched_ori_data.append(ori_data[i:i + batch_size])
        with Pool(num_processes) as p:
            results = p.map(partial(gen_output_naive, extractor=extractor),
                            batched_ori_data)
        for result in results:
            extracted_answers.extend(result)
        return extracted_answers

    format_data = format_input_naive(preds)
    assert api_url is not None, 'Please provide the api url.'
    extractor = NaiveExtractor(
        model_name=model_name,
        custom_instruction=custom_instruction,
        url=api_url.split(',') if ',' in api_url else api_url)
    calc_acc_func = partial(_eval_pred, extractor=extractor)
    extracted_answers = calc_acc_func(format_data)
    return extracted_answers


def gen_output_xfinder(ori_data, extractor):
    ext_cor_pairs = []
    extracted_data = []
    extracted_answers = []
@ -30,9 +84,8 @@ def gen_output(ori_data, extractor):


@TEXT_POSTPROCESSORS.register_module('xfinder')
def xfinder_postprocess(preds: list, question_type: str,
                        xfinder_model_name: str,
                        xfiner_api_url: Union[str, list], **kwargs) -> list:
def xfinder_postprocess(preds: list, question_type: str, model_name: str,
                        api_url: Union[str, list], **kwargs) -> list:
    """Postprocess the text extracted by xFinder model.
    Args:
        preds (list): The question, reference answer and model prediction.
@ -56,7 +109,7 @@ def xfinder_postprocess(preds: list, question_type: str,
        for i in range(0, len(ori_data), batch_size):
            batched_ori_data.append(ori_data[i:i + batch_size])
        with Pool(num_processes) as p:
            results = p.map(partial(gen_output, extractor=extractor),
            results = p.map(partial(gen_output_xfinder, extractor=extractor),
                            batched_ori_data)
        for result in results:
            extracted_answers += result[0]
@ -65,11 +118,11 @@ def xfinder_postprocess(preds: list, question_type: str,
        return extracted_answers

    format_data = convert_to_xfinder_format(question_type, preds)
    assert xfiner_api_url is not None, 'Please provide the api url.'
    assert api_url is not None, 'Please provide the api url.'
    data_processor = DataProcessor()
    extractor = Extractor(model_name=xfinder_model_name,
                          url=xfiner_api_url.split(',')
                          if ',' in xfiner_api_url else xfiner_api_url)
    extractor = Extractor(
        model_name=model_name,
        url=api_url.split(',') if ',' in api_url else api_url)
    calc_acc_func = partial(_eval_pred,
                            data_processor=data_processor,
                            extractor=extractor)
11
opencompass/utils/postprocessors/naive/PROMPT_TEMPLATE.py
Normal file
@ -0,0 +1,11 @@
OPTION_NAVIE_PROMPT_TEMPLATE = """
There is a detailed explanation of the final answer you should extract:
1. You should extract the final answer option like 'A', 'B', 'C', 'D' ... from the given output sentences.
2. The question is a single choice question, so the final answer option should be one of the options, not a combination of options.
""" # noqa

MATH_NAVIE_PROMPT_TEMPLATE = """
This is a detailed explanation of the final answer you should extract:
1. The question type is a math question, so the final answer should be a number, set, vector, matrix, interval, expression, function, equation, or inequality and any combination of them.
2. If the final answer includes additional symbols, such as units, you should exclude them and only extract the pure final answer.
""" # noqa
71
opencompass/utils/postprocessors/naive/README.md
Normal file
@ -0,0 +1,71 @@
## Short Usage Introduction for Naive Model Postprocessor with Custom Model

<!-- Now OC can use -->

### Step 1: Deploy an API server using vLLM or LMDeploy

```bash
lmdeploy serve api_server meta-llama/Meta-Llama-3-8B-Instruct --model-name llama3-8b-instruct --server-port 23333 --backend turbomind --tp 1
```
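
If you prefer vLLM, its OpenAI-compatible server can be launched in much the same way; the exact flags can vary between vLLM versions, so treat the command below as a rough equivalent of the LMDeploy one above rather than a fixed recipe:

```bash
python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-8B-Instruct --served-model-name llama3-8b-instruct --port 23333 --tensor-parallel-size 1
```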

### Step 2: Add Naive Model Postprocessor to the configuration file

Taking GSM8K as an example, you can add the following lines to the configuration file and replace `api_url` with the actual address of the API server.

```python
...
from opencompass.utils.model_postprocessors import navie_model_postprocess
from opencompass.utils.postprocessors.naive import MATH_NAVIE_PROMPT_TEMPLATE

...

gsm8k_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'),
    pred_postprocessor=dict(type=math_postprocess_v2),
    dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
    # Add the following line to use the naive model postprocessor
    model_postprocessor=dict(
        type=navie_model_postprocess,
        custom_instruction=MATH_NAVIE_PROMPT_TEMPLATE,
        model_name='llama3-8b-instruct',
        api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1')
)
...

```

The prompt for extraction can also be customized by changing the `custom_instruction` parameter. Two default templates are currently supported: `MATH_NAVIE_PROMPT_TEMPLATE` for extracting answers to math problems such as GSM8K and MATH, and `OPTION_NAVIE_PROMPT_TEMPLATE` for extracting answers to multiple-choice problems such as MMLU. You can also write your own prompt template, like:

```python
OPTION_NAVIE_PROMPT_TEMPLATE = """
There is a detailed explanation of the final answer you should extract:
1. You should extract the final answer option like 'A', 'B', 'C', 'D' ... from the given output sentences.
2. The question is a single choice question, so the final answer option should be one of the options, not a combination of options.
"""
```

Your prompt should start with `There is a detailed explanation of the final answer you should extract:` and be followed by your customized instructions.

### Step 3: Run the Evaluation as Usual

Now you can run the evaluation as usual with the modified configuration file. The evaluation will use the custom model as the post-processing model, and the final result will be reported as `model_postprocess_accuracy`, like:

```Markdown
dataset version metric mode llama-3-8b-instruct-turbomind
------------------------------------------------- --------- -------------------------- ------ -------------------------------
gsm8k a58960 accuracy gen 73.46
gsm8k a58960 model_postprocess_accuracy gen 78.77
```
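
For example, assuming the modified GSM8K config has been saved as `configs/eval_gsm8k_model_postprocess.py` (a placeholder name used only for illustration), the run is launched like any other OpenCompass config:

```bash
opencompass configs/eval_gsm8k_model_postprocess.py
```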

## Experiment Results

We have tested the model post-processing method with different models (Qwen2-72B-Chat, Llama3-8b-Chat) as the post-processing model on the GSM8K and MMLU datasets for `Meta-Llama-3-8B-Instruct` with the above settings, and the results are as follows:

```Markdown
| Dataset | Type | Config ID | Regex Postprocess Score | Model Postprocess Score (Llama3-8b-Instruct) | Model Postprocess Score (Qwen2-72B-Chat) |
| ------- | ------ | --------- | ----------------------- | -------------------------------------------- | ---------------------------------------- |
| gsm8k | math | a58960 | 73.46 | 79.08 | 78.77 |
| mmlu | option | 4d595a | 67.89 | 65.26 | 67.94 |
```

The `metric` column with `model_postprocess_accuracy` is the final result after the `Naive Model Postprocessor` is applied.
2
opencompass/utils/postprocessors/naive/__init__.py
Normal file
@ -0,0 +1,2 @@
from .extractor import * # noqa
from .PROMPT_TEMPLATE import * # noqa
121
opencompass/utils/postprocessors/naive/extractor.py
Normal file
@ -0,0 +1,121 @@
# Naive model extractor for OpenCompass, modified from xFinder: https://github.com/IAAR-Shanghai/xFinder # noqa
import json
import time
from logging import getLogger

from openai import OpenAI

Meta_Instruction = """I will provide you with a question, output sentences along with an answer range. The output sentences are the response of the question provided. The answer range could either describe the type of answer expected or list all possible valid answers. Using the information provided, you must accurately and precisely determine and extract the intended key answer from the output sentences. Please don't have your subjective thoughts about the question.
First, you need to determine whether the content of the output sentences is relevant to the given question. If the entire output sentences are unrelated to the question (meaning the output sentences are not addressing the question), then output [No valid answer].
Otherwise, ignore the parts of the output sentences that have no relevance to the question and then extract the key answer that matches the answer range.
Below are some special cases you need to be aware of:
(1) If the output sentences present multiple different answers, carefully determine if the later provided answer is a correction or modification of a previous one. If so, extract this corrected or modified answer as the final response. Conversely, if the output sentences fluctuate between multiple answers without a clear final answer, you should output [No valid answer].
(2) If the answer range is a list and the key answer in the output sentences is not explicitly listed among the candidate options in the answer range, also output [No valid answer].
(3) You should only return the precise answer you extract, without processing the answer. Please return only the answer and do not add any additional content.

""" # noqa


def format_input_naive(data):
    format_data = []
    for item in data:
        template = {}
        question = item['origin_prompt'][-1]['prompt']
        llm_output = item['prediction']
        correct_answer = item['reference'] if item['reference'] else item[
            'gold']
        template['correct_answer'] = correct_answer
        template['question'] = question
        template['llm_output'] = llm_output

        format_data.append(template)
    return format_data


class NaiveExtractor:

    def __init__(
            self,
            model_name,
            model_path=None,
            url=None,
            temperature=0,
            max_tokens=3000,
            api_key='EMPTY',
            SYSTEM='You are a helpful assistant tasked with extracting the precise key answer from given output sentences. You must only provide the extracted key answer without including any additional text.', # noqa
            custom_instruction=''):
        self.model_name = model_name
        self.SYSTEM = SYSTEM
        self.model_path = model_path
        self.url = url
        self.api_key = api_key
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.custom_instruction = custom_instruction
        self.logger = getLogger(__name__)

    def prepare_input(self, item):
        user_input = Meta_Instruction + self.custom_instruction + \
            "Question: \"\"\"" + item['question'] + "\"\"\"\n\n" + \
            "Output sentences: \"\"\"" + item['llm_output'] + "\"\"\"\n\n" + \
            'Key extracted answer: '

        return user_input

    def gen_output(self, query):
        return self.openai_infer(query)

    def openai_infer(self, query: str, retry=9) -> str:
        """Perform inference on the OpenAI model.

        Args:
            query (str): The input query.

        Returns:
            str: The extracted answer (xFinder's output).
        """
        if isinstance(self.url, list):
            # Randomly choose an api url for better load balancing
            import random
            self.url = random.choice(self.url)
        self.client = OpenAI(
            api_key=self.api_key,
            base_url=self.url,
        )
        self.retry = retry

        t = time.time()
        retry = self.retry
        response = ''
        while retry > 0:
            try:
                chat_response = self.client.chat.completions.create(
                    model=self.client.models.list().data[0].id
                    if self.model_name == '' else self.model_name,
                    messages=[
                        {
                            'role': 'system',
                            'content': self.SYSTEM
                        },
                        {
                            'role': 'user',
                            'content': query
                        },
                    ],
                    temperature=self.temperature,
                    max_tokens=self.max_tokens,
                )
                js_response = json.loads(chat_response.model_dump_json())
                response = js_response['choices'][0]['message']['content']
                break
            except Exception as e:
                self.logger.info(f'Error: {e}')
                self.logger.info(f'{self.url} is down. Retrying...')
                self.logger.info(f'Time elapsed: {time.time() - t} seconds')
                time.sleep(6)
                retry -= 1
        if retry == 0:
            response = 'Error: Failed to get response.'
            self.logger.info(f'{response} after {self.retry} tries.')
            raise ValueError('The api is down')
        return response.strip()