[Fix] Compatible with old versions (#1616)

* fix pip version

* Compatible with old versions

* update configs
bittersweet1999 2024-10-21 10:16:29 +08:00 committed by GitHub
parent 6e8adf5221
commit a11e2b2fd4
52 changed files with 2337 additions and 54 deletions


@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .compassbench_v1_3_objective_gen_068af0 import compassbench_aug_datasets # noqa: F401, F403


@ -0,0 +1,74 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets.compassbench_obj import CompassBenchObjectiveV1_3, compassbench_objective_v1_3_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
prompt_cn = {
'single_choice_cn': '以下是一道单项选择题请你根据你了解的知识给出正确的答案选项。请你一步步推理并在最后用“答案选项为X”来回答其中X是ABCD中你认为正确的选项序号\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
'cloze_cn': '以下是一道填空题,请你根据你了解的知识一步步思考后把你的最终答案放到\\boxed{}中。\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
}
prompt_en = {
'single_choice_en': "Here is a single-choice question. Please give the correct answer based on your knowledge. Please reason step by step and answer with 'The answer is X' at the end, where X is the option number you think is correct.\nHere is the question you need to answer:\n{question}\nLet's solve this problem step by step:",
'cloze_en': "Here is a fill-in-the-blank question. Please think step by step based on your knowledge and put your final answer in \\boxed{}. Here is the question you need to answer:\n{question}\nLet's solve this problem step by step:",
}
douknow_sets = {
'knowledge': ['single_choice_cn'],
'math': ['single_choice_cn'],
}
# Set up the prompts
CircularEval = True
compassbench_aug_datasets = []
for _split in list(douknow_sets.keys()):
for _name in douknow_sets[_split]:
if 'cn' in _name:
single_choice_prompts = prompt_cn
cloze_prompts = prompt_cn
else:
single_choice_prompts = prompt_en
cloze_prompts = prompt_en
douknow_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt= single_choice_prompts[_name],
),
dict(role='BOT', prompt='{answer}'),] if 'choice' in _name else cloze_prompts[_name],
),
ice_token='</E>',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
douknow_eval_cfg = dict(
evaluator=dict(type=CircularEvaluator if CircularEval else AccEvaluator) if 'single_choice' in _name else dict(type=AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD' ) if 'single_choice' in _name else dict(type=compassbench_objective_v1_3_postprocess, name=_name))
compassbench_aug_datasets.append(
dict(
type=CompassBenchObjectiveV1_3,
path=f'./data/compassbench_v1_3/{_split}/{_name}.jsonl',
name='circular_' + _name if CircularEval else _name,
abbr='compassbench-' + _split + '-' + _name + 'circular'if CircularEval else '',
reader_cfg=dict(
input_columns=['question'],
output_column='answer'
),
infer_cfg=douknow_infer_cfg,
eval_cfg=douknow_eval_cfg,
))
del _split, _name
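For context, the `CircularEvaluator` enabled above via `CircularEval = True` scores multiple-choice items under option rotation: each question is asked several times with the answer options cyclically shifted, and in the strictest setting an item only counts as correct when every rotation is answered correctly. A framework-agnostic sketch of that idea (not the OpenCompass implementation):

```python
# Conceptual sketch of circular evaluation (assumed strictest criterion):
# rotate the options of a multiple-choice question and require the model to
# pick the relabelled correct answer in every rotation.
def circular_correct(ask_model, question, options, answer_idx):
    # ask_model(question, options) -> index of the option the model chooses
    n = len(options)
    for shift in range(n):
        rotated = options[shift:] + options[:shift]
        correct = (answer_idx - shift) % n
        if ask_model(question, rotated) != correct:
            return False
    return True
```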


@ -0,0 +1,49 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import GPQADataset, GPQAEvaluator
from opencompass.utils import first_option_postprocess
gpqa_reader_cfg = dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer')
hint = f'对下面的单项选择题,请直接给出正确答案的选项。'
question_and_options = 'Question: {question}\n(A){A}\n(B){B}\n(C){C}\n(D){D}\n'
gpqa_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template={
opt: f'{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']},
),
prompt_template=dict(
type=PromptTemplate,
template={
opt: f'{hint}\n</E>{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']
},
ice_token='</E>'
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=PPLInferencer))
gpqa_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
gpqa_datasets = []
gpqa_subsets = {
# 'extended': 'gpqa_extended.csv',
# 'main': 'gpqa_main.csv',
'diamond': 'gpqa_diamond.csv'
}
for split in list(gpqa_subsets.keys()):
gpqa_datasets.append(
dict(
abbr='GPQA_' + split,
type=GPQADataset,
path='./data/gpqa/',
name=gpqa_subsets[split],
reader_cfg=gpqa_reader_cfg,
infer_cfg=gpqa_infer_cfg,
eval_cfg=gpqa_eval_cfg)
)
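The per-option templates above are what drive the `PPLInferencer`: every candidate letter is rendered into a complete prompt that ends with that answer, and the option whose rendered text the model scores as most likely (lowest perplexity) is selected. A rough, framework-agnostic sketch of that selection step, not the OpenCompass implementation:

```python
# Conceptual sketch: score each option's fully rendered prompt and keep the one
# with the highest total log-probability (equivalently, the lowest perplexity).
def pick_option(logprob_fn, rendered_prompts):
    # logprob_fn: callable returning the summed log-probability of a text
    # rendered_prompts: e.g. {'A': '...Answer: A', 'B': '...Answer: B', ...}
    scores = {opt: logprob_fn(text) for opt, text in rendered_prompts.items()}
    return max(scores, key=scores.get)
```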


@ -0,0 +1,47 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MMLUProDataset, MMLUProBaseEvaluator
with read_base():
from .mmlu_pro_categories import categories
mmlu_pro_datasets = []
for category in categories:
hint = f'Answer the following multiple choice question about {category}, and give your answer option directly.'
question_and_options = 'Question:\n{question}\nOptions:\n{options_str}'
mmlu_pro_reader_cfg = dict(
input_columns=['question', 'cot_content', 'options_str'],
output_column='answer_string',
train_split='validation',
test_split='test',
)
mmlu_pro_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=f'{question_and_options}\nAnswer: {{answer}}'),
prompt_template=dict(
type=PromptTemplate,
template=f'{hint}\n</E>{question_and_options}\nAnswer: ',
ice_token='</E>'
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=GenInferencer, max_out_len=100)
)
mmlu_pro_eval_cfg = dict(
evaluator=dict(type=MMLUProBaseEvaluator)
)
mmlu_pro_datasets.append(
dict(
abbr=f'mmlu_pro_{category.replace(" ", "_")}',
type=MMLUProDataset,
path='opencompass/mmlu_pro',
category=category,
reader_cfg=mmlu_pro_reader_cfg,
infer_cfg=mmlu_pro_infer_cfg,
eval_cfg=mmlu_pro_eval_cfg,
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
from opencompass.summarizers import AlignmentBenchSummarizer
from opencompass.datasets import AlignmentBenchDataset
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
@ -46,7 +47,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
@ -63,4 +63,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
))


@ -0,0 +1,66 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
output_column='judge',
)
subjective_all_sets = [
'alignment_bench',
]
data_path ='data/subjective/alignment_bench'
alignment_bench_config_path = 'data/subjective/alignment_bench/config'
alignment_bench_config_name = 'multi-dimension'
alignbench_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = '{critiquellm_prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n'
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
alignbench_datasets.append(
dict(
abbr=f'{_name}',
type=AlignmentBenchDataset,
path=data_path,
name=_name,
alignment_bench_config_path=alignment_bench_config_path,
alignment_bench_config_name=alignment_bench_config_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
from opencompass.summarizers import AlignmentBenchSummarizer
from opencompass.datasets import AlignmentBenchDataset
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
@ -46,7 +47,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
@ -63,4 +63,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
))


@ -0,0 +1,66 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
output_column='judge',
)
subjective_all_sets = [
'alignment_bench_v1_1', # Changed to Alignbench_v1_1 since 06/15/2024, refer to https://github.com/THUDM/AlignBench
]
data_path ='data/subjective/alignment_bench'
alignment_bench_config_path = 'data/subjective/alignment_bench/config'
alignment_bench_config_name = 'multi-dimension'
alignbench_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = '{critiquellm_prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n'
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
alignbench_datasets.append(
dict(
abbr=f'{_name}',
type=AlignmentBenchDataset,
path=data_path,
name=_name,
alignment_bench_config_path=alignment_bench_config_path,
alignment_bench_config_name=alignment_bench_config_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlpacaEvalDataset, alpacaeval_postprocess
from opencompass.datasets import SubjectiveCmpDataset
from opencompass.summarizers import AlpacaSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -94,7 +95,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=alpacaeval_postprocess),
),
pred_role='BOT',
)
@ -102,7 +102,7 @@ for _name in subjective_all_sets:
alpacav2_datasets.append(
dict(
abbr=f'{_name}',
type=AlpacaEvalDataset,
type=SubjectiveCmpDataset,
path='./data/subjective/alpaca_eval',
name=_name,
reader_cfg=subjective_reader_cfg,
@ -111,5 +111,6 @@ for _name in subjective_all_sets:
mode='m2n',
infer_order='random',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}],
summarizer=dict(type=AlpacaSummarizer, judge_type='v2'),
))


@ -0,0 +1,115 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlpacaEvalDataset, alpacaeval_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'alpaca_eval',
]
alpacav2_datasets = []
gpt4_prompt = """
I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
## Instruction
{
"instruction": "{question}",
}
## Model Outputs
Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.
{
{
"model_identifier": "m",
"output": "{prediction}"
},
{
"model_identifier": "M",
"output": "{prediction2}"
}
}
## Task
Evaluate the models based on the quality and relevance of their outputs, and select the model that generated the best output. Answer by providing the model identifier of the best model. We will use your output as the name of the best model, so make sure your output only contains one of the following model identifiers and nothing else (no quotes, no spaces, no new lines, ...): m or M.
## Best Model Identifier
"""
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers.')
],
round=[
dict(
role='HUMAN',
prompt = gpt4_prompt
),
]),
),
dict_postprocessor=dict(type=alpacaeval_postprocess),
),
pred_role='BOT',
)
alpacav2_datasets.append(
dict(
abbr=f'{_name}',
type=AlpacaEvalDataset,
path='./data/subjective/alpaca_eval',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='random',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import ArenaHardDataset, arenahard_postprocess
from opencompass.datasets import ArenaHardDataset
from opencompass.summarizers import ArenaHardSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -59,7 +60,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=arenahard_postprocess),
),
pred_role='BOT',
)
@ -76,5 +76,6 @@ for _name in subjective_all_sets:
mode='m2n',
infer_order='double',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}]
given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}],
summarizer = dict(type=ArenaHardSummarizer),
))


@ -0,0 +1,80 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import ArenaHardDataset, arenahard_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'arenahard',
]
arenahard_datasets = []
system_prompt = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."
judge_prompt = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{prediction}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{prediction2}\n<|The End of Assistant B's Answer|>"
gpt4 = [dict(
abbr='gpt4-0314',
)]
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt=system_prompt)
],
round=[
dict(
role='HUMAN',
prompt = judge_prompt
),
]),
),
dict_postprocessor=dict(type=arenahard_postprocess),
),
pred_role='BOT',
)
arenahard_datasets.append(
dict(
abbr='arenahard',
type=ArenaHardDataset,
path='./data/subjective/arena_hard',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}]
))


@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassArenaDataset, compassarena_postprocess
from opencompass.datasets import CompassArenaDataset
from opencompass.summarizers import CompassArenaSummarizer
subjective_reader_cfg = dict(
@ -15,23 +15,29 @@ data_path ='data/subjective/compass_arena'
compassarena_datasets = []
base_prompt = """
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答12平局
并提供你的解释原因
如果你认为回答1更好你的输出应形如
选择A
原因blahblah blahblah\n
如果你认为回答2更好你的输出应形如
选择B
原因blahblah blahblah\n
如果你认为回答12打成平手你的输出应形如
选择C
原因blahblah blahblah\n
@ -43,8 +49,10 @@ knowledge_prompt = """
1. 更好的回答能与参考答案吻合或表明参考答案的意思
2. 在都准确答对问题的前提下更好的回答能对知识点进行额外补充且补充的知识准确无误
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
@ -56,8 +64,10 @@ language_prompt = """
1. 在有明确的参考答案的情况下越贴近参考答案或表明了参考答案的意思的回答越好
2. 更好的回答在语言表达上更流畅更加符合与人类对话的习惯包括语气情调等
3. 在都准确答对问题的前提下更好的回答能进行额外补充且补充的内容准确无误
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
@ -69,8 +79,10 @@ math_prompt = """
1. 更好的回答的答案能和参考答案一致
2. 若两个回答的答案都与参考答案不一致则更好的回答的推理过程应更加合理
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
@ -83,6 +95,7 @@ creation_prompt = """
1. 好的回答必须首先符合用户问题里的各种需求不能跑题
2. 好的回答必须具有逻辑连贯性围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度
[用户问题]
{question}
""" + base_prompt
@ -120,7 +133,6 @@ for _name, _prompt in sub_map.items():
),
]),
),
dict_postprocessor=dict(type=compassarena_postprocess, summary_type='half_add', check_pos_bias=True),
),
pred_role='BOT',
)
@ -137,6 +149,6 @@ for _name, _prompt in sub_map.items():
mode='m2n',
infer_order='double',
base_models=gpt4,
# summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/compass_arena/gpt4-turbo'}]
))


@ -0,0 +1,142 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassArenaDataset, compassarena_postprocess
from opencompass.summarizers import CompassArenaSummarizer
subjective_reader_cfg = dict(
input_columns=['question', 'ref'],
output_column='judge',
)
data_path ='data/subjective/compass_arena'
compassarena_datasets = []
base_prompt = """
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答12平局
并提供你的解释原因
如果你认为回答1更好你的输出应形如
选择A
原因blahblah blahblah\n
如果你认为回答2更好你的输出应形如
选择B
原因blahblah blahblah\n
如果你认为回答12打成平手你的输出应形如
选择C
原因blahblah blahblah\n
"""
knowledge_prompt = """
请根据提供的 评分要求用户问题参考答案 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 更好的回答能与参考答案吻合或表明参考答案的意思
2. 在都准确答对问题的前提下更好的回答能对知识点进行额外补充且补充的知识准确无误
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
language_prompt = """
请根据提供的 评分要求用户问题 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 在有明确的参考答案的情况下越贴近参考答案或表明了参考答案的意思的回答越好
2. 更好的回答在语言表达上更流畅更加符合与人类对话的习惯包括语气情调等
3. 在都准确答对问题的前提下更好的回答能进行额外补充且补充的内容准确无误
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
math_prompt = """
请根据提供的 评分要求用户问题参考答案 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 更好的回答的答案能和参考答案一致
2. 若两个回答的答案都与参考答案不一致则更好的回答的推理过程应更加合理
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
reason_prompt = math_prompt
creation_prompt = """
请根据提供的 评分要求用户问题 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 好的回答必须首先符合用户问题里的各种需求不能跑题
2. 好的回答必须具有逻辑连贯性围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度
[用户问题]
{question}
""" + base_prompt
sub_map = {'language': language_prompt, 'knowledge': knowledge_prompt, 'reason_v2': reason_prompt, 'math_v2': math_prompt, 'creationv2_zh': creation_prompt}
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for _name, _prompt in sub_map.items():
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = _prompt
),
]),
),
dict_postprocessor=dict(type=compassarena_postprocess, summary_type='half_add', check_pos_bias=True),
),
pred_role='BOT',
)
compassarena_datasets.append(
dict(
abbr=f'compassarena_{_name}',
type=CompassArenaDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
# summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/compass_arena/gpt4-turbo'}]
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from opencompass.datasets import FofoDataset
from opencompass.summarizers import FofoSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -112,7 +113,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
@ -127,4 +127,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=FofoSummarizer, judge_type='general')
))


@ -0,0 +1,130 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'fofo_test_prompts', 'fofo_test_prompts_cn',
]
base_prompt_en = """
I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose.
Here is the prompt:
{
"instruction": "{question}",
}
Here are the outputs of the models:
[
{
"model": "model",
"answer": "{prediction}"
},
]
Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output:
```json
[
{
'model': <model-name>,
'format_correctness': <correctness>,
'reasons': <reasons-of-format-correctness>
}
]
```
Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
"""
base_prompt_cn = """
我希望你创建一个排行榜用于评估来自各种大型语言模型的回答格式的正确性为了完成这个任务你将需要分析给模型的文本提示以及它们对应的回答具体来说请确保你的评估输出正确地格式化为JSON字符串我将为此提供提示和回答
以下是提示内容
{
"instruction": "{question}",
}
以下是模型的输出结果
[
{
"model": "model",
"answer": "{prediction}"
},
]
请通过检查模型回答是否符合提示中声明的格式规范来评估模型回答的格式进行彻底的格式检查并提供格式正确或错误的详细解释你的反馈应包括模型的名称接着是格式正确性的状态'1'表示正确'0'表示错误将你的推理以每个评估模型的单个字符串中的 bullet 点形式呈现换句话说你应该生成以下输出
```json
[
{
'model': <模型名称>,
'format_correctness': <正确性>,
'reasons': <格式正确性的原因>
}
]
```
请注意你的回答应是一个正确格式化的JSON字符串不应包含任何额外的内容我们将在Python中直接将其作为JSON字符串加载
"""
fofo_datasets = []
for _name in subjective_all_sets:
if '_cn' in _name:
base_prompt = base_prompt_cn
else:
base_prompt = base_prompt_en
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = base_prompt
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
fofo_datasets.append(
dict(
abbr=f'{_name}',
type=FofoDataset,
path='./data/subjective/fofo',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from opencompass.datasets import FofoDataset
from opencompass.summarizers import FofoSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -80,7 +81,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
@ -95,4 +95,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=FofoSummarizer, judge_type='general')
))


@ -0,0 +1,98 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'fofo_test_prompts'
]
base_prompt = """
I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose.
Here is the prompt:
{
"instruction": "{question}",
}
Here are the outputs of the models:
[
{
"model": "model",
"answer": "{prediction}"
},
]
Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output:
```json
[
{
'model': <model-name>,
'format_correctness': <correctness>,
'reasons': <reasons-of-format-correctness>
}
]
```
Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
"""
fofo_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = base_prompt
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
fofo_datasets.append(
dict(
abbr=f'{_name}',
type=FofoDataset,
path='./data/subjective/fofo',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FollowBenchDataset, followbench_postprocess
from opencompass.datasets import FollowBenchDataset
from opencompass.summarizers import FollowBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['instruction', 'judge_prompt',],
@ -43,7 +44,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=followbench_postprocess),
),
pred_role='BOT',
)
@ -59,4 +59,5 @@ for _name in subjective_all_sets:
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
summarizer = dict(type=FollowBenchSummarizer,)
))


@ -0,0 +1,62 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FollowBenchDataset, followbench_postprocess
subjective_reader_cfg = dict(
input_columns=['instruction', 'judge_prompt',],
output_column='judge',
)
subjective_all_sets = [
'followbench_llmeval_cn', 'followbench_llmeval_en',
]
data_path ='data/subjective/followbench/converted_data'
followbench_llmeval_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{instruction}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = '{judge_prompt}'
),
]),
),
dict_postprocessor=dict(type=followbench_postprocess),
),
pred_role='BOT',
)
followbench_llmeval_datasets.append(
dict(
abbr=f'{_name}',
type=FollowBenchDataset,
path=data_path,
name=_name,
mode='singlescore',
cate='llm',
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
))


@ -0,0 +1,51 @@
# Guidelines for Evaluating HelloBench on Diverse LLMs
HelloBench is a comprehensive, in-the-wild, and open-ended benchmark for evaluating LLMs' performance in long text generation. More details can be found in the [🌐Github Repo](https://github.com/Quehry/HelloBench) and the [📖Paper](https://arxiv.org/abs/2409.16191).
## Detailed instructions for evaluating HelloBench in OpenCompass
1. Git clone OpenCompass
```shell
cd ~
git clone git@github.com:open-compass/opencompass.git
cd opencompass
```
2. Download the HelloBench data from this [Google Drive URL](https://drive.google.com/file/d/1EJTmMFgCs2pDy9l0wB5idvp3XzjYEsi9/view?usp=sharing), unzip it, and put it under `OPENCOMPASS_PATH/data/HelloBench`, so that you end up with a layout like this:
```
~/opencompass/data/
└── HelloBench
├── chat.jsonl
├── heuristic_text_generation.jsonl
├── length_constrained_data
│ ├── heuristic_text_generation_16k.jsonl
│ ├── heuristic_text_generation_2k.jsonl
│ ├── heuristic_text_generation_4k.jsonl
│ └── heuristic_text_generation_8k.jsonl
├── open_ended_qa.jsonl
├── summarization.jsonl
└── text_completion.jsonl
```
3. Set up OpenCompass
```shell
cd ~/opencompass
pip install -e .
```
4. Configure your run in configs/eval_hellobench.py (a minimal sketch is given after this list):
- set the models to be evaluated
- set your judge model (we recommend gpt4o-mini)
5. Launch it:
```shell
python run.py configs/eval_hellobench.py
```
6. After the run finishes, you can find the results in outputs/hellobench/xxx/summary
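The launch config referenced in step 4 is not part of this commit; a minimal sketch of what `configs/eval_hellobench.py` could look like is shown below. The import path and the empty `models` / `judge_models` placeholders are assumptions to adapt to your setup; only `hellobench_datasets` comes from the dataset config added in this PR.

```python
# Minimal sketch of configs/eval_hellobench.py (assumed layout; adjust the
# import path and fill in real model configs before running).
from mmengine.config import read_base

with read_base():
    # Hypothetical import path for the HelloBench dataset config in this PR.
    from .datasets.subjective.hellobench.hellobench import hellobench_datasets

datasets = [*hellobench_datasets]

models = []        # the models you want to evaluate
judge_models = []  # your judge model (gpt4o-mini is recommended)
```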


@ -0,0 +1,111 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import HelloBenchDataset, hellobench_postprocess
system_prompt = """You are a helpful evaluator. Your task is to evaluate the checklists of the responses given by the Large Language Models (LLMs) based on user instructions. These checklists consist of yes or no questions."""
user_prompt = """Your core task is to evaluate the checklists based on the user's instruction and the LLM's response, with each checklist item being a yes or no question indicating a specific aspect that the LLM's response should meet. You need to judge each checklist item based on the instruction and response. The evaluation results are scored from 0 to 1, with 5 scores in total, which are:
0: The response fails to meet the checklist requirements, demonstrating substantial need for improvement across multiple areas.
0.25: The response partially meets some checklist requirements, but significant elements remain unaddressed.
0.5: The response meets several checklist requirements, yet the overall evaluation appears ambiguous or unclear.
0.75: The response aligns with most checklist requirements, though there are still minor areas that could be refined or enhanced.
1: The response fully satisfies all checklist requirements, with no identifiable issues or areas for improvement. It means this response is already perfect; you can't find any significant flaws in it.
Here is the instruction:
{{\"instruction\": {instruction}}}
Here is the response given by LLM:
{{\"response\": {prediction}}}
Since the response may be rather long, I am specifically reminding you here that the response has ended.
Here are checklists of this instruction:
{{\"checklists\": {formatted_checklists}}}
To further remind you, I will repeat my requirements:
Your core task is to evaluate the checklists based on the user's instruction and the LLM's response, with each checklist item being a yes or no question indicating a specific aspect that the LLM's response should meet. You need to judge each checklist item based on the instruction and response. The evaluation results are scored from 0 to 1, with 5 scores in total, which are:
0: The response fails to meet the checklist requirements, demonstrating substantial need for improvement across multiple areas.
0.25: The response partially meets some checklist requirements, but significant elements remain unaddressed.
0.5: The response meets several checklist requirements, yet the overall evaluation appears ambiguous or unclear.
0.75: The response aligns with most checklist requirements, though there are still minor areas that could be refined or enhanced.
1: The response fully satisfies all checklist requirements, with no identifiable issues or areas for improvement. It means this response is already perfect; you can't find any significant flaws in it.
Always provide the reason for your evaluation results. You should be strict but fair in your evaluation. A score of 1 means that the response perfectly meets all the checklist requirements and you think there is really no room for improvement. When giving a score of 1, you need to carefully consider whether this checklist has been perfectly satisfied.
Evaluate all the checklists and return the evaluation results of the checklists. Output a Python List consisting of the Python Dictionary formatted as follows:
[{{\"checklist_id\": \"the id of the checklist\", \"reason\": \"The reason for your evaluation results\", \"evaluation_score\": \"Your evaluation score for this checklist\"}},{{\"checklist_id\": \"the id of the checklist\",
\"reason\": \"The reason for your evaluation results\", \"evaluation_score\": \"Your evaluation score for this checklist\"}}]
There are a total of {num_checklist} checklists that you need to evaluate. The length of the output list is equal to the number of checklists and you should give an evaluation score for each checklist. You should be very very very strict in the evaluation to further compare the responses from different models. Your response must be a valid Python List and should contain nothing else, as it will be directly executed in Python."""
subjective_reader_cfg = dict(
input_columns=['instruction', 'formatted_checklists', 'num_checklist'],
output_column='judgement',
)
hellobench_categories = [
'open_ended_qa',
'summarization',
'chat',
'text_completion',
'heuristic_text_generation',
]
data_path ='data/HelloBench'
hellobench_datasets = []
for category_name in hellobench_categories:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{instruction}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=16384),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt=system_prompt)
],
round=[
dict(
role='HUMAN',
prompt = user_prompt
),
]),
),
dict_postprocessor=dict(type=hellobench_postprocess,),
),
pred_role='BOT',
)
hellobench_datasets.append(
dict(
abbr=f'HelloBench-{category_name}',
type=HelloBenchDataset,
path=data_path,
category_name=category_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBench101Dataset, mtbench101_postprocess
from opencompass.datasets import MTBench101Dataset
from opencompass.summarizers import MTBench101Summarizer
subjective_reader_cfg = dict(
input_columns=['dialogue','task','multi_id','turn_id','system_prompt','prompt_template'],
@ -45,7 +46,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=mtbench101_postprocess),
),
pred_role='BOT',
)
@ -60,4 +60,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=MTBench101Summarizer, judge_type='single')
))


@ -0,0 +1,63 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBench101Dataset, mtbench101_postprocess
subjective_reader_cfg = dict(
input_columns=['dialogue','task','multi_id','turn_id','system_prompt','prompt_template'],
output_column='judge',
)
subjective_all_sets = [
'mtbench101',
]
data_path ='data/subjective/'
mtbench101_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{dialogue}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}')
],
round=[
dict(
role='HUMAN',
prompt = '{prompt_template}'
),
]),
),
dict_postprocessor=dict(type=mtbench101_postprocess),
),
pred_role='BOT',
)
mtbench101_datasets.append(
dict(
abbr=f'{_name}',
type=MTBench101Dataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBenchDataset, mtbench_postprocess
from opencompass.datasets import MTBenchDataset
from opencompass.summarizers import MTBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['dialogue', 'capability', 'system_prompt', 'prompt_template'],
@ -47,7 +48,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=mtbench_postprocess),
),
pred_role='BOT',
)
@ -62,4 +62,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=MTBenchSummarizer, judge_type='single')
))


@ -0,0 +1,65 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBenchDataset, mtbench_postprocess
subjective_reader_cfg = dict(
input_columns=['dialogue', 'capability', 'system_prompt', 'prompt_template'],
output_column='judge',
)
subjective_all_sets = [
'mtbench_0.0','mtbench_0.1','mtbench_0.7'
]
data_path ='data/subjective/mtbench'
mtbench_datasets = []
for _name in subjective_all_sets:
temperature = float(_name.split('_')[1])
do_sample = False if temperature == 0.0 else True
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{dialogue}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=1024, temperature=temperature, do_sample=do_sample,infer_mode='every'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}')
],
round=[
dict(
role='HUMAN',
prompt = '{prompt_template}'
),
]),
),
dict_postprocessor=dict(type=mtbench_postprocess),
),
pred_role='BOT',
)
mtbench_datasets.append(
dict(
abbr=f'{_name}',
type=MTBenchDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))


@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import WildBenchDataset, wildbench_postprocess
from opencompass.datasets import WildBenchDataset
from opencompass.summarizers import WildBenchPairSummarizer
subjective_reader_cfg = dict(
@ -30,7 +30,6 @@ subjective_eval_cfg = dict(
type=PromptTemplate,
template="""{prompt}"""
),
dict_postprocessor=dict(type=wildbench_postprocess),
),
pred_role='BOT',
)
@ -63,4 +62,5 @@ wildbench_datasets.append(
        mode='m2n',  # m models are matched against n models
infer_order='random',
base_models = [llama_2_70b, gpt4, claude],
summarizer = dict(type=WildBenchPairSummarizer),
))


@ -0,0 +1,66 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import WildBenchDataset, wildbench_postprocess
from opencompass.summarizers import WildBenchPairSummarizer
subjective_reader_cfg = dict(
input_columns=['dialogue', 'prompt'],
output_column='judge',
)
data_path ='./data/subjective/WildBench/wildbench.jsonl'
wildbench_datasets = []
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{dialogue}"""
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=512, infer_mode='last'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template="""{prompt}"""
),
dict_postprocessor=dict(type=wildbench_postprocess),
),
pred_role='BOT',
)
gpt4 = dict(
abbr='gpt4-turbo',
)
claude = dict(
abbr='HaiKu',
)
llama_2_70b = dict(
abbr='llama-2-70b-chat-hf',
)
wildbench_datasets.append(
dict(
abbr='wildbench',
type=WildBenchDataset,
path=data_path,
eval_mode='pair',
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/subjective/WildBench/gpt4'},
{'abbr': 'llama-2-70b-chat-hf', 'path':'./data/subjective/WildBench/llama2-70b'},
{'abbr': 'HaiKu', 'path':'./data/subjective/WildBench/claude'},
{'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/subjective/WildBench/llama2-70b'},
{'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}],
        mode='m2n',  # m models are matched against n models
infer_order='random',
base_models = [llama_2_70b, gpt4, claude],
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
from opencompass.summarizers import AlignmentBenchSummarizer
from opencompass.datasets import AlignmentBenchDataset
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
@ -46,7 +47,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
@ -63,4 +63,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
))


@ -0,0 +1,66 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
output_column='judge',
)
subjective_all_sets = [
'alignment_bench',
]
data_path ='data/subjective/alignment_bench'
alignment_bench_config_path = 'data/subjective/alignment_bench/config'
alignment_bench_config_name = 'multi-dimension'
alignbench_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = '{critiquellm_prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n'
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
alignbench_datasets.append(
dict(
abbr=f'{_name}',
type=AlignmentBenchDataset,
path=data_path,
name=_name,
alignment_bench_config_path=alignment_bench_config_path,
alignment_bench_config_name=alignment_bench_config_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
from opencompass.summarizers import AlignmentBenchSummarizer
from opencompass.datasets import AlignmentBenchDataset
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
@ -46,7 +47,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
@ -63,4 +63,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
))


@ -0,0 +1,66 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
output_column='judge',
)
subjective_all_sets = [
'alignment_bench_v1_1', # Changed to Alignbench_v1_1 since 06/15/2024, refer to https://github.com/THUDM/AlignBench
]
data_path ='data/subjective/alignment_bench'
alignment_bench_config_path = 'data/subjective/alignment_bench/config'
alignment_bench_config_name = 'multi-dimension'
alignbench_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = '{critiquellm_prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n'
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
alignbench_datasets.append(
dict(
abbr=f'{_name}',
type=AlignmentBenchDataset,
path=data_path,
name=_name,
alignment_bench_config_path=alignment_bench_config_path,
alignment_bench_config_name=alignment_bench_config_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))


@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlpacaEvalDataset, alpacaeval_postprocess
from opencompass.datasets import SubjectiveCmpDataset
from opencompass.summarizers import AlpacaSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -94,7 +95,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=alpacaeval_postprocess),
),
pred_role='BOT',
)
@ -102,7 +102,7 @@ for _name in subjective_all_sets:
alpacav2_datasets.append(
dict(
abbr=f'{_name}',
type=AlpacaEvalDataset,
type=SubjectiveCmpDataset,
path='./data/subjective/alpaca_eval',
name=_name,
reader_cfg=subjective_reader_cfg,
@ -111,5 +111,6 @@ for _name in subjective_all_sets:
mode='m2n',
infer_order='random',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}],
summarizer=dict(type=AlpacaSummarizer, judge_type='v2'),
))


@ -0,0 +1,115 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlpacaEvalDataset, alpacaeval_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'alpaca_eval',
]
alpacav2_datasets = []
gpt4_prompt = """
I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
## Instruction
{
"instruction": "{question}",
}
## Model Outputs
Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.
{
{
"model_identifier": "m",
"output": "{prediction}"
},
{
"model_identifier": "M",
"output": "{prediction2}"
}
}
## Task
Evaluate the models based on the quality and relevance of their outputs, and select the model that generated the best output. Answer by providing the model identifier of the best model. We will use your output as the name of the best model, so make sure your output only contains one of the following model identifiers and nothing else (no quotes, no spaces, no new lines, ...): m or M.
## Best Model Identifier
"""
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers.')
],
round=[
dict(
role='HUMAN',
prompt = gpt4_prompt
),
]),
),
dict_postprocessor=dict(type=alpacaeval_postprocess),
),
pred_role='BOT',
)
alpacav2_datasets.append(
dict(
abbr=f'{_name}',
type=AlpacaEvalDataset,
path='./data/subjective/alpaca_eval',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='random',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import ArenaHardDataset, arenahard_postprocess
from opencompass.datasets import ArenaHardDataset
from opencompass.summarizers import ArenaHardSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -59,7 +60,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=arenahard_postprocess),
),
pred_role='BOT',
)
@ -76,5 +76,6 @@ for _name in subjective_all_sets:
mode='m2n',
infer_order='double',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}]
given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}],
summarizer = dict(type=ArenaHardSummarizer),
))

View File

@ -0,0 +1,80 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import ArenaHardDataset, arenahard_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'arenahard',
]
arenahard_datasets = []
system_prompt = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."
judge_prompt = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{prediction}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{prediction2}\n<|The End of Assistant B's Answer|>"
gpt4 = [dict(
abbr='gpt4-0314',
)]
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt=system_prompt)
],
round=[
dict(
role='HUMAN',
prompt = judge_prompt
),
]),
),
dict_postprocessor=dict(type=arenahard_postprocess),
),
pred_role='BOT',
)
arenahard_datasets.append(
dict(
abbr='arenahard',
type=ArenaHardDataset,
path='./data/subjective/arena_hard',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}]
))

View File

@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassArenaDataset, compassarena_postprocess
from opencompass.datasets import CompassArenaDataset
from opencompass.summarizers import CompassArenaSummarizer
subjective_reader_cfg = dict(
@ -15,23 +15,29 @@ data_path ='data/subjective/compass_arena'
compassarena_datasets = []
base_prompt = """
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求,在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答1、2平局
并提供你的解释原因。
如果你认为回答1更好,你的输出应形如:
选择:A
原因:blahblah blahblah\n
如果你认为回答2更好,你的输出应形如:
选择:B
原因:blahblah blahblah\n
如果你认为回答1、2打成平手,你的输出应形如:
选择:C
原因:blahblah blahblah\n
@ -43,8 +49,10 @@ knowledge_prompt = """
1. 更好的回答能与参考答案吻合或表明参考答案的意思
2. 在都准确答对问题的前提下,更好的回答能对知识点进行额外补充,且补充的知识准确无误
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
@ -56,8 +64,10 @@ language_prompt = """
1. 在有明确的参考答案的情况下,越贴近参考答案或表明了参考答案的意思的回答越好
2. 更好的回答在语言表达上更流畅,更加符合与人类对话的习惯,包括语气、情调等
3. 在都准确答对问题的前提下,更好的回答能进行额外补充,且补充的内容准确无误
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
@ -69,8 +79,10 @@ math_prompt = """
1. 更好的回答的答案能和参考答案一致
2. 若两个回答的答案都与参考答案不一致,则更好的回答的推理过程应更加合理
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
@ -83,6 +95,7 @@ creation_prompt = """
1. 好的回答必须首先符合用户问题里的各种需求,不能跑题
2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度
[用户问题]
{question}
""" + base_prompt
@ -120,7 +133,6 @@ for _name, _prompt in sub_map.items():
),
]),
),
dict_postprocessor=dict(type=compassarena_postprocess, summary_type='half_add', check_pos_bias=True),
),
pred_role='BOT',
)
@ -137,6 +149,6 @@ for _name, _prompt in sub_map.items():
mode='m2n',
infer_order='double',
base_models=gpt4,
# summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/compass_arena/gpt4-turbo'}]
))

View File

@ -0,0 +1,142 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassArenaDataset, compassarena_postprocess
from opencompass.summarizers import CompassArenaSummarizer
subjective_reader_cfg = dict(
input_columns=['question', 'ref'],
output_column='judge',
)
data_path ='data/subjective/compass_arena'
compassarena_datasets = []
base_prompt = """
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求,在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答1、2平局
并提供你的解释原因。
如果你认为回答1更好,你的输出应形如:
选择:A
原因:blahblah blahblah\n
如果你认为回答2更好,你的输出应形如:
选择:B
原因:blahblah blahblah\n
如果你认为回答1、2打成平手,你的输出应形如:
选择:C
原因:blahblah blahblah\n
"""
knowledge_prompt = """
请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 更好的回答能与参考答案吻合或表明参考答案的意思
2. 在都准确答对问题的前提下,更好的回答能对知识点进行额外补充,且补充的知识准确无误
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
language_prompt = """
请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 在有明确的参考答案的情况下,越贴近参考答案或表明了参考答案的意思的回答越好
2. 更好的回答在语言表达上更流畅,更加符合与人类对话的习惯,包括语气、情调等
3. 在都准确答对问题的前提下,更好的回答能进行额外补充,且补充的内容准确无误
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
math_prompt = """
请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 更好的回答的答案能和参考答案一致
2. 若两个回答的答案都与参考答案不一致,则更好的回答的推理过程应更加合理
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
reason_prompt = math_prompt
creation_prompt = """
请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 好的回答必须首先符合用户问题里的各种需求,不能跑题
2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度
[用户问题]
{question}
""" + base_prompt
sub_map = {'language': language_prompt, 'knowledge': knowledge_prompt, 'reason_v2': reason_prompt, 'math_v2': math_prompt, 'creationv2_zh': creation_prompt}
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for _name, _prompt in sub_map.items():
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = _prompt
),
]),
),
dict_postprocessor=dict(type=compassarena_postprocess, summary_type='half_add', check_pos_bias=True),
),
pred_role='BOT',
)
compassarena_datasets.append(
dict(
abbr=f'compassarena_{_name}',
type=CompassArenaDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
# summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/compass_arena/gpt4-turbo'}]
))
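The judge prompts above instruct the evaluator model to reply with a literal "选择:A/B/C" line followed by a rationale, which the configured compassarena_postprocess then scores. As a rough, hand-written illustration of that extraction step (an assumption sketched here, not the actual postprocessor):

```python
# Minimal verdict-extraction sketch (assumption for illustration; the real
# logic lives in opencompass's compassarena_postprocess).
import re
from typing import Optional

def extract_choice(judge_reply: str) -> Optional[str]:
    """Pull the A/B/C verdict out of a reply shaped like '选择:A 原因:...'."""
    match = re.search(r'选择[::]\s*([ABC])', judge_reply)
    return match.group(1) if match else None

assert extract_choice('选择:A\n原因:回答1更贴近参考答案') == 'A'
```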

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from opencompass.datasets import FofoDataset
from opencompass.summarizers import FofoSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -112,7 +113,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
@ -127,4 +127,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=FofoSummarizer, judge_type='general')
))

View File

@ -0,0 +1,130 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'fofo_test_prompts', 'fofo_test_prompts_cn',
]
base_prompt_en = """
I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose.
Here is the prompt:
{
"instruction": "{question}",
}
Here are the outputs of the models:
[
{
"model": "model",
"answer": "{prediction}"
},
]
Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output:
```json
[
{
'model': <model-name>,
'format_correctness': <correctness>,
'reasons': <reasons-of-format-correctness>
}
]
```
Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
"""
base_prompt_cn = """
我希望你创建一个排行榜,用于评估来自各种大型语言模型的回答格式的正确性。为了完成这个任务,你将需要分析给模型的文本提示以及它们对应的回答。具体来说,请确保你的评估输出正确地格式化为JSON字符串。我将为此提供提示和回答。
以下是提示内容:
{
"instruction": "{question}",
}
以下是模型的输出结果:
[
{
"model": "model",
"answer": "{prediction}"
},
]
请通过检查模型回答是否符合提示中声明的格式规范,来评估模型回答的格式。进行彻底的格式检查,并提供格式正确或错误的详细解释。你的反馈应包括模型的名称,接着是格式正确性的状态,'1'表示正确,'0'表示错误。将你的推理以每个评估模型的单个字符串中的 bullet 点形式呈现。换句话说,你应该生成以下输出:
```json
[
{
'model': <模型名称>,
'format_correctness': <正确性>,
'reasons': <格式正确性的原因>
}
]
```
请注意,你的回答应是一个正确格式化的JSON字符串,不应包含任何额外的内容。我们将在Python中直接将其作为JSON字符串加载。
"""
fofo_datasets = []
for _name in subjective_all_sets:
if '_cn' in _name:
base_prompt = base_prompt_cn
else:
base_prompt = base_prompt_en
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = base_prompt
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
fofo_datasets.append(
dict(
abbr=f'{_name}',
type=FofoDataset,
path='./data/subjective/fofo',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))
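The FoFo judge prompt above pins the verdict to an exact JSON schema and states that the reply will be loaded directly as a JSON string in Python. The snippet below is an illustration of that round trip with a hand-written sample reply; both the sample and the parsing are assumptions for clarity, since the configured fofo_postprocess handles this in practice.

```python
# Illustration only: a hand-written judge reply matching the schema requested
# above, parsed the way the prompt describes. Real handling: fofo_postprocess.
import json

sample_judge_reply = """
[
    {
        "model": "model",
        "format_correctness": 1,
        "reasons": "- follows the requested json block format; - no extra prose"
    }
]
"""

records = json.loads(sample_judge_reply)
assert records[0]["format_correctness"] in (0, 1)
```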

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from opencompass.datasets import FofoDataset
from opencompass.summarizers import FofoSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -80,7 +81,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
@ -95,4 +95,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=FofoSummarizer, judge_type='general')
))

View File

@ -0,0 +1,98 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'fofo_test_prompts'
]
base_prompt = """
I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose.
Here is the prompt:
{
"instruction": "{question}",
}
Here are the outputs of the models:
[
{
"model": "model",
"answer": "{prediction}"
},
]
Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output:
```json
[
{
'model': <model-name>,
'format_correctness': <correctness>,
'reasons': <reasons-of-format-correctness>
}
]
```
Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
"""
fofo_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = base_prompt
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
fofo_datasets.append(
dict(
abbr=f'{_name}',
type=FofoDataset,
path='./data/subjective/fofo',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FollowBenchDataset, followbench_postprocess
from opencompass.datasets import FollowBenchDataset
from opencompass.summarizers import FollowBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['instruction', 'judge_prompt',],
@ -43,7 +44,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=followbench_postprocess),
),
pred_role='BOT',
)
@ -59,4 +59,5 @@ for _name in subjective_all_sets:
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
summarizer = dict(type=FollowBenchSummarizer,)
))

View File

@ -0,0 +1,62 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FollowBenchDataset, followbench_postprocess
subjective_reader_cfg = dict(
input_columns=['instruction', 'judge_prompt',],
output_column='judge',
)
subjective_all_sets = [
'followbench_llmeval_cn', 'followbench_llmeval_en',
]
data_path ='data/subjective/followbench/converted_data'
followbench_llmeval_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{instruction}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = '{judge_prompt}'
),
]),
),
dict_postprocessor=dict(type=followbench_postprocess),
),
pred_role='BOT',
)
followbench_llmeval_datasets.append(
dict(
abbr=f'{_name}',
type=FollowBenchDataset,
path=data_path,
name=_name,
mode='singlescore',
cate='llm',
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
))

View File

@ -48,4 +48,4 @@ pip install -e .
python run.py configs/eval_hellobench.py
```
6. After that, you could find the results in outputs/hellobench/xxx/summary
6. After that, you could find the results in outputs/hellobench/xxx/summary

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBench101Dataset, mtbench101_postprocess
from opencompass.datasets import MTBench101Dataset
from opencompass.summarizers import MTBench101Summarizer
subjective_reader_cfg = dict(
input_columns=['dialogue','task','multi_id','turn_id','system_prompt','prompt_template'],
@ -45,7 +46,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=mtbench101_postprocess),
),
pred_role='BOT',
)
@ -60,4 +60,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=MTBench101Summarizer, judge_type='single')
))

View File

@ -0,0 +1,63 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBench101Dataset, mtbench101_postprocess
subjective_reader_cfg = dict(
input_columns=['dialogue','task','multi_id','turn_id','system_prompt','prompt_template'],
output_column='judge',
)
subjective_all_sets = [
'mtbench101',
]
data_path ='data/subjective/'
mtbench101_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{dialogue}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}')
],
round=[
dict(
role='HUMAN',
prompt = '{prompt_template}'
),
]),
),
dict_postprocessor=dict(type=mtbench101_postprocess),
),
pred_role='BOT',
)
mtbench101_datasets.append(
dict(
abbr=f'{_name}',
type=MTBench101Dataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBenchDataset, mtbench_postprocess
from opencompass.datasets import MTBenchDataset
from opencompass.summarizers import MTBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['dialogue', 'capability', 'system_prompt', 'prompt_template'],
@ -47,7 +48,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=mtbench_postprocess),
),
pred_role='BOT',
)
@ -62,4 +62,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=MTBenchSummarizer, judge_type='single')
))

View File

@ -0,0 +1,65 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBenchDataset, mtbench_postprocess
subjective_reader_cfg = dict(
input_columns=['dialogue', 'capability', 'system_prompt', 'prompt_template'],
output_column='judge',
)
subjective_all_sets = [
'mtbench_0.0','mtbench_0.1','mtbench_0.7'
]
data_path ='data/subjective/mtbench'
mtbench_datasets = []
for _name in subjective_all_sets:
temperature = float(_name.split('_')[1])
do_sample = False if temperature == 0.0 else True
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{dialogue}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=1024, temperature=temperature, do_sample=do_sample,infer_mode='every'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}')
],
round=[
dict(
role='HUMAN',
prompt = '{prompt_template}'
),
]),
),
dict_postprocessor=dict(type=mtbench_postprocess),
),
pred_role='BOT',
)
mtbench_datasets.append(
dict(
abbr=f'{_name}',
type=MTBenchDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))
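This config runs ChatInferencer with infer_mode='every' (one reply per user turn), whereas the mtbench101 config earlier in this diff uses infer_mode='last' (reply only to the final turn); the branch that routes between the two is visible in the ChatInferencer hunk at the end of this diff. Below is a conceptual sketch of that dispatch, an assumption for illustration rather than the actual implementation.

```python
# Conceptual sketch of the infer_mode dispatch (assumption; see the
# ChatInferencer hunk at the end of this diff for the real branch).
from typing import Callable, List

def run_dialogue(chat: List[dict], infer_mode: str,
                 infer_last: Callable[[List[dict]], None],
                 infer_every: Callable[[List[dict]], None]) -> None:
    """Route a multi-turn dialogue to the chosen inference strategy."""
    if infer_mode == 'last':
        # answer only the final user turn (mtbench101-style)
        infer_last(chat)
    elif infer_mode == 'every':
        # answer every user turn in sequence (as configured above)
        infer_every(chat)
    else:
        raise ValueError(f'unknown infer_mode: {infer_mode}')
```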

View File

@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import WildBenchDataset, wildbench_postprocess
from opencompass.datasets import WildBenchDataset
from opencompass.summarizers import WildBenchPairSummarizer
subjective_reader_cfg = dict(
@ -30,7 +30,6 @@ subjective_eval_cfg = dict(
type=PromptTemplate,
template="""{prompt}"""
),
dict_postprocessor=dict(type=wildbench_postprocess),
),
pred_role='BOT',
)
@ -63,4 +62,5 @@ wildbench_datasets.append(
mode='m2n', # m models are pitted against n models
infer_order='random',
base_models = [llama_2_70b, gpt4, claude],
summarizer = dict(type=WildBenchPairSummarizer),
))

View File

@ -0,0 +1,66 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import WildBenchDataset, wildbench_postprocess
from opencompass.summarizers import WildBenchPairSummarizer
subjective_reader_cfg = dict(
input_columns=['dialogue', 'prompt'],
output_column='judge',
)
data_path ='./data/subjective/WildBench/wildbench.jsonl'
wildbench_datasets = []
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{dialogue}"""
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=512, infer_mode='last'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template="""{prompt}"""
),
dict_postprocessor=dict(type=wildbench_postprocess),
),
pred_role='BOT',
)
gpt4 = dict(
abbr='gpt4-turbo',
)
claude = dict(
abbr='HaiKu',
)
llama_2_70b = dict(
abbr='llama-2-70b-chat-hf',
)
wildbench_datasets.append(
dict(
abbr='wildbench',
type=WildBenchDataset,
path=data_path,
eval_mode='pair',
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/subjective/WildBench/gpt4'},
{'abbr': 'llama-2-70b-chat-hf', 'path':'./data/subjective/WildBench/llama2-70b'},
{'abbr': 'HaiKu', 'path':'./data/subjective/WildBench/claude'},
{'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/subjective/WildBench/llama2-70b'},
{'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}],
mode='m2n', # m models are pitted against n models
infer_order='random',
base_models = [llama_2_70b, gpt4, claude],
))

View File

@ -245,7 +245,6 @@ class ChatInferencer(BaseInferencer):
logger.info('Starting inference process...')
for datum in tqdm(dataloader, disable=not self.is_main_process):
chat = datum[0]
if self.infer_mode == 'last':
self.infer_last(chat, index, output_handler)
elif self.infer_mode == 'every':