[Feature] Added Bradley-Terry subjective evaluation

Alexander Lam 2024-12-31 11:01:23 +08:00 committed by GitHub
parent 98435dd98e
commit dc6035cfcb
13 changed files with 1230 additions and 45 deletions
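
Background: the Bradley-Terry model treats every pairwise judge verdict as the outcome of a match and estimates a latent strength theta_m for each model, with P(a beats b) = 1 / (1 + exp(theta_b - theta_a)). The sketch below is illustrative only (it is not the code added in this commit); it assumes scikit-learn and made-up model names, and shows the core idea behind the new CompassArenaBradleyTerrySummarizer: turn judge verdicts into win/loss rows and fit an intercept-free logistic model.

# Minimal illustrative sketch (NOT this commit's implementation): fit
# Bradley-Terry ratings from pairwise match records via logistic regression.
import numpy as np
from sklearn.linear_model import LogisticRegression

matches = [  # hypothetical records, shaped like the new postprocessors' output
    {'model_a': 'model-x', 'model_b': 'gpt4-turbo', 'winner': 'model_a'},
    {'model_a': 'model-y', 'model_b': 'gpt4-turbo', 'winner': 'model_b'},
    {'model_a': 'model-x', 'model_b': 'model-y', 'winner': 'model_a'},
]
models = sorted({m['model_a'] for m in matches} | {m['model_b'] for m in matches})
index = {name: i for i, name in enumerate(models)}

X = np.zeros((len(matches), len(models)))
y = np.zeros(len(matches))
for row, match in enumerate(matches):       # ties omitted for brevity
    X[row, index[match['model_a']]] = 1.0   # +1 for the "a" side
    X[row, index[match['model_b']]] = -1.0  # -1 for the "b" side
    y[row] = 1.0 if match['winner'] == 'model_a' else 0.0

clf = LogisticRegression(fit_intercept=False).fit(X, y)
ratings = 400.0 * clf.coef_[0] / np.log(10) + 1000.0  # Elo-like rescaling
print(dict(zip(models, ratings.round(1))))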

View File

@@ -0,0 +1,125 @@
from mmengine.config import read_base
from opencompass.datasets import AlpacaEvalDataset, alpacaeval_bradleyterry_postprocess
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'alpaca_eval',
]
alpacav2_datasets = []
gpt4_prompt = """
I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
## Instruction
{
"instruction": "{question}",
}
## Model Outputs
Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.
{
{
"model_identifier": "m",
"output": "{prediction}"
},
{
"model_identifier": "M",
"output": "{prediction2}"
}
}
## Task
Evaluate the models based on the quality and relevance of their outputs, and select the model that generated the best output. Answer by providing the model identifier of the best model. We will use your output as the name of the best model, so make sure your output only contains one of the following model identifiers and nothing else (no quotes, no spaces, no new lines, ...): m or M.
## Best Model Identifier
"""
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
gpt4 = [
dict(
abbr='gpt4-turbo',
)
]
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers.',
)
],
round=[
dict(role='HUMAN', prompt=gpt4_prompt),
],
),
),
dict_postprocessor=dict(
type=alpacaeval_bradleyterry_postprocess,
),
keep_predictions=True, # Must be turned on to save predictions from model pairs to calculate style features in postprocessor
),
pred_role='BOT',
)
alpacav2_datasets.append(
dict(
abbr=f'{_name}',
type=AlpacaEvalDataset,
path='./data/subjective/alpaca_eval',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='random',
base_models=gpt4,
given_pred=[
{
'abbr': 'gpt4-turbo',
'path': './data/subjective/alpaca_eval/gpt4-turbo',
}
],
)
)
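
Usage note (illustrative): like other OpenCompass subjective configs, the dataset list above is meant to be pulled into a top-level eval config via read_base(); the example eval config added later in this commit does exactly that, using the import path below.

# Sketch of consuming the config above; the import path mirrors the example
# eval config that appears later in this commit.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_bradleyterry import (
        alpacav2_datasets,
    )

datasets = [*alpacav2_datasets]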

View File

@@ -0,0 +1,173 @@
from opencompass.datasets import (
CompassArenaDataset,
compassarena_bradleyterry_postprocess,
)
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.summarizers import CompassArenaSummarizer
subjective_reader_cfg = dict(
input_columns=['question', 'ref'],
output_column='judge',
)
data_path = 'data/subjective/compass_arena'
compassarena_datasets = []
base_prompt = """
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答12平局
并提供你的解释原因
如果你认为回答1更好你的输出应形如
选择A
原因blahblah blahblah\n
如果你认为回答2更好你的输出应形如
选择B
原因blahblah blahblah\n
如果你认为回答12打成平手你的输出应形如
选择C
原因blahblah blahblah\n
"""
knowledge_prompt = (
"""
请根据提供的 评分要求用户问题参考答案 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 更好的回答能与参考答案吻合或表明参考答案的意思
2. 在都准确答对问题的前提下更好的回答能对知识点进行额外补充且补充的知识准确无误
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
"""
+ base_prompt
)
language_prompt = (
"""
请根据提供的 评分要求用户问题 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 在有明确的参考答案的情况下越贴近参考答案或表明了参考答案的意思的回答越好
2. 更好的回答在语言表达上更流畅更加符合与人类对话的习惯包括语气情调等
3. 在都准确答对问题的前提下更好的回答能进行额外补充且补充的内容准确无误
[用户问题]
{question}
[参考答案]
{ref}
"""
+ base_prompt
)
math_prompt = (
"""
请根据提供的 评分要求用户问题参考答案 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 更好的回答的答案能和参考答案一致
2. 若两个回答的答案都与参考答案不一致则更好的回答的推理过程应更加合理
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
"""
+ base_prompt
)
reason_prompt = math_prompt
creation_prompt = (
"""
请根据提供的 评分要求用户问题 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 好的回答必须首先符合用户问题里的各种需求不能跑题
2. 好的回答必须具有逻辑连贯性围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度
[用户问题]
{question}
"""
+ base_prompt
)
sub_map = {
'language': language_prompt,
'knowledge': knowledge_prompt,
'reason_v2': reason_prompt,
'math_v2': math_prompt,
'creationv2_zh': creation_prompt,
}
gpt4 = [
dict(
abbr='gpt4-turbo',
)
]
for _name, _prompt in sub_map.items():
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=_prompt),
]
),
),
dict_postprocessor=dict(
type=compassarena_bradleyterry_postprocess,
count_ties=True,
),
keep_predictions=True, # Must be turned on to save predictions from model pairs to calculate style features in postprocessor
),
pred_role='BOT',
)
compassarena_datasets.append(
dict(
abbr=f'compassarena_{_name}',
type=CompassArenaDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='random',
base_models=gpt4,
# summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
given_pred=[
{
'abbr': 'gpt4-turbo',
'path': './data/subjective/compass_arena/gpt4-turbo',
}
],
)
)
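
For reference, the judge is instructed to answer with 选择：A/B/C (choice A, B, or C). The compassarena_bradleyterry_postprocess added later in this commit maps that choice onto a Bradley-Terry outcome; a condensed sketch of the mapping:

# Condensed sketch of the winner mapping in compassarena_bradleyterry_postprocess.
def choice_to_winner(choice: str, count_ties: bool = True):
    choice = choice.strip()
    if choice == 'A':
        return 'model_a'  # 回答1 (answer 1) judged better
    if choice == 'B':
        return 'model_b'  # 回答2 (answer 2) judged better
    if choice == 'C' and count_ties:
        return 'tie'
    return None           # the record is skipped otherwise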

View File

@@ -0,0 +1,74 @@
from opencompass.datasets import WildBenchDataset, wildbench_bradleyterry_postprocess
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.summarizers import WildBenchPairSummarizer
subjective_reader_cfg = dict(
input_columns=['dialogue', 'prompt'],
output_column='judge',
)
data_path = './data/subjective/WildBench/wildbench.jsonl'
wildbench_datasets = []
subjective_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template="""{dialogue}"""),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=32768, infer_mode='last'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(type=PromptTemplate, template="""{prompt}"""),
dict_postprocessor=dict(type=wildbench_bradleyterry_postprocess),
keep_predictions=True, # Must be turned on to save predictions from model pairs to calculate style features in postprocessor
),
pred_role='BOT',
)
base_models = [
dict(
abbr='gpt4-turbo',
),
dict(
abbr='HaiKu',
),
dict(
abbr='llama-2-70b-chat-hf',
),
]
wildbench_datasets.append(
dict(
abbr='wildbench',
type=WildBenchDataset,
path=data_path,
eval_mode='pair',
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
given_pred=[
{'abbr': 'gpt4-turbo', 'path': './data/subjective/WildBench/gpt4'},
{
'abbr': 'llama-2-70b-chat-hf',
'path': './data/subjective/WildBench/llama2-70b',
},
{'abbr': 'HaiKu', 'path': './data/subjective/WildBench/claude'},
{
'abbr': 'llama-2-70b-chat-turbomind',
'path': './data/subjective/WildBench/llama2-70b',
},
{
'abbr': 'llama-2-70b-chat-vllm',
'path': './data/subjective/WildBench/llama2-70b',
},
],
mode='m2n', # m models are pitted against n base models in pairwise battles
infer_order='random',
base_models=base_models,
)
)
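
WildBench judges grade each pair on the A++/A+/A=B/B+/B++ scale; for Bradley-Terry fitting these grades are collapsed to a win/tie/loss outcome, as in the wildbench_bradleyterry_postprocess added later in this commit (condensed below).

# Condensed from wildbench_bradleyterry_postprocess later in this commit.
GRADE_TO_WINNER = {
    'A++': 'model_a',  # strong preference for answer A
    'A+': 'model_a',   # slight preference for answer A
    'A=B': 'tie',
    'B+': 'model_b',   # slight preference for answer B
    'B++': 'model_b',  # strong preference for answer B
}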

View File

@@ -0,0 +1,126 @@
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_bradleyterry import (
alpacav2_datasets,
)
from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_bradleyterry import (
compassarena_datasets,
)
from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge_bradleyterry import (
wildbench_datasets,
)
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
models as lmdeploy_internlm2_5_7b_chat,
)
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import (
models as lmdeploy_internlm2_5_20b_chat,
)
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct,
)
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
models as lmdeploy_qwen2_5_14b_instruct,
)
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
models as lmdeploy_qwen2_7b_instruct,
)
from opencompass.models import (
HuggingFace,
HuggingFaceCausalLM,
HuggingFaceChatGLM3,
OpenAI,
TurboMindModelwithChatTemplate,
)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import (
CompassArenaBradleyTerrySummarizer,
SubjectiveSummarizer,
)
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we usually enable sampling (do_sample) for the models
models = [
*lmdeploy_internlm2_5_7b_chat,
*lmdeploy_internlm2_5_20b_chat,
*lmdeploy_qwen2_5_14b_instruct,
*lmdeploy_qwen2_5_7b_instruct,
*lmdeploy_qwen2_7b_instruct,
]
datasets = [
*alpacav2_datasets,
*compassarena_datasets,
*wildbench_datasets,
]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=OpenICLInferTask)),
)
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='CompassJudger-1-32B-Instruct',
path='opencompass/CompassJudger-1-32B-Instruct',
engine_config=dict(session_len=16384, max_batch_size=16, tp=4),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
max_seq_len=16384,
max_out_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=4),
)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(
type=LocalRunner, max_num_workers=16, task=dict(type=SubjectiveEvalTask)
),
)
## ------------- Summary Configuration
# This step fits a Bradley-Terry (statistical) model, with the option to include
# style features and group-based control variables
# (the group variables must be available in the input dataset for each observation).
summarizer = dict(
type=CompassArenaBradleyTerrySummarizer,
rating_system='bradleyterry',
num_bootstrap=100,
num_cpu=None,
with_control_vars=True,
normalize_style_features=False,
odds_ratio=True,
)
work_dir = 'outputs/subjective/bradleyterry'
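
The num_bootstrap=100 setting controls how many bootstrap rounds are run, typically to estimate uncertainty in the fitted ratings. A conceptual sketch of the idea (this is not the actual compute_bootstrap_style_control implementation; fit_fn is a hypothetical callback that maps a list of match records to {model_name: rating}):

# Conceptual sketch only: resample the matches with replacement, refit each
# round, then take percentiles of the per-model ratings across rounds.
import numpy as np

def bootstrap_ratings(matches, fit_fn, num_round=100, seed=0):
    rng = np.random.default_rng(seed)
    rounds = []
    for _ in range(num_round):
        idx = rng.integers(0, len(matches), len(matches))
        rounds.append(fit_fn([matches[i] for i in idx]))
    return rounds  # e.g. take the 2.5%/97.5% percentiles per model for a 95% CI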

View File

@@ -2,10 +2,14 @@
from .alignbench import AlignmentBenchDataset # noqa: F401, F403
from .alignbench import alignbench_postprocess # noqa: F401, F403
from .alpacaeval import AlpacaEvalDataset # noqa: F401, F403
from .alpacaeval import alpacaeval_bradleyterry_postprocess # noqa: F401, F403
from .alpacaeval import alpacaeval_postprocess # noqa: F401, F403
from .arena_hard import ArenaHardDataset # noqa: F401, F403
from .arena_hard import arenahard_postprocess # noqa: F401, F403
from .compass_arena import CompassArenaDataset, compassarena_postprocess
from .compass_arena import CompassArenaDataset # noqa: F401, F403
from .compass_arena import \
compassarena_bradleyterry_postprocess # noqa: F401, F403
from .compass_arena import compassarena_postprocess # noqa: F401, F403
from .compass_arena_subjective_bench import *
from .compassbench import CompassBenchDataset # noqa: F401, F403
from .compassbench_checklist import \
@@ -27,4 +31,5 @@ from .mtbench101 import mtbench101_postprocess
from .multiround import MultiroundDataset # noqa: F401, F403
from .subjective_cmp import SubjectiveCmpDataset # noqa: F401, F403
from .wildbench import WildBenchDataset # noqa: F401, F403
from .wildbench import wildbench_bradleyterry_postprocess # noqa: F401, F403
from .wildbench import wildbench_postprocess # noqa: F401, F403

View File

@@ -5,6 +5,8 @@ from collections import defaultdict
from datasets import Dataset, DatasetDict
from opencompass.datasets.subjective.compass_arena_subjective_bench import \
get_element_counts
from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path
@@ -33,7 +35,7 @@ class AlpacaEvalDataset(BaseDataset):
'judge': {
'capability': capability,
'question': question
}
},
})
dataset = Dataset.from_list(raw_data)
return dataset
@@ -64,33 +66,54 @@ def post_process_alpacav2(completion: str):
@DICT_POSTPROCESSORS.register_module('alpacaeval')
def alpacaeval_postprocess(output: dict, output_path: str) -> dict:
def alpacaeval_postprocess(
output: dict,
output_path: str,
) -> dict:
judged_answers, references = get_judgeanswer_and_reference(
output, output_path, post_process_alpacav2)
result=output,
filename=output_path,
post_process=post_process_alpacav2,
)
if len(judged_answers) == 0:
scores = None
win_model1, win_model2, categories = defaultdict(float), defaultdict(
float), defaultdict(float)
model1, model2 = references[0]['answer1'], references[0]['answer2']
for prediction, reference in zip(judged_answers, references):
win_model1, win_model2, categories = (
defaultdict(float),
defaultdict(float),
defaultdict(float),
)
if 'base_models' in references[0]:
base_models = references[0]['base_models']
else:
# TODO: Assuming the first model in the first record is the base model.
# This might not be the case if infer_order == 'random'
base_models = [references[0]['answer1']]
if isinstance(base_models, str):
base_models = [base_models]
for judged_answer, reference in zip(judged_answers, references):
categories['total'] += 1
categories[reference['capability']] += 1
if prediction['rank'] == 1:
if reference['answer1'] == model1:
if judged_answer['rank'] == 1:
if reference['answer1'] in base_models:
win_model1[reference['capability']] += 1
win_model1['total'] += 1
else:
win_model2[reference['capability']] += 1
win_model2['total'] += 1
else:
if reference['answer1'] == model1:
if reference['answer1'] in base_models:
win_model2[reference['capability']] += 1
win_model2['total'] += 1
else:
win_model1[reference['capability']] += 1
win_model1['total'] += 1
for capability in categories:
if capability not in win_model1:
win_model1[capability] = 0.0
@@ -106,3 +129,78 @@ def alpacaeval_postprocess(output: dict, output_path: str) -> dict:
results = win_model2
results['details'] = output
return results
@DICT_POSTPROCESSORS.register_module('alpacaeval_bradleyterry')
def alpacaeval_bradleyterry_postprocess(
output: dict,
output_path: str,
) -> dict:
judged_answers, references = get_judgeanswer_and_reference(
result=output,
filename=output_path,
post_process=post_process_alpacav2,
)
if 'prediction1' not in references[0]:
raise ValueError(
'prediction1 not in references. Set `keep_predictions=True` for LMEvaluator in dataset config and retry.'
)
if 'prediction2' not in references[0]:
raise ValueError(
'prediction2 not in references. Set `keep_predictions=True` for LMEvaluator in dataset config and retry.'
)
if 'base_models' in references[0]:
base_models = references[0]['base_models']
else:
# TODO: Assuming the first model in the first record is the base model.
# This might not be the case if infer_order == 'random'
base_models = [references[0]['answer1']]
if isinstance(base_models, str):
base_models = [base_models]
results = {}
matches = []
for judged_answer, reference in zip(judged_answers, references):
cur_dict = {}
if judged_answer['rank'] == 1:
if reference['answer1'] in base_models:
cur_dict['winner'] = 'model_a'
else:
cur_dict['winner'] = 'model_b'
elif judged_answer['rank'] == 2:
if reference['answer1'] in base_models:
cur_dict['winner'] = 'model_b'
else:
cur_dict['winner'] = 'model_a'
else:
cur_dict['winner'] = 'tie'
cur_dict['capability'] = reference['capability']
cur_dict['model_a'] = reference['answer1']
cur_dict['model_b'] = reference['answer2']
cur_dict['prediction1'] = reference['prediction1']
cur_dict['prediction2'] = reference['prediction2']
matches.append(cur_dict)
### ---------- Add Style Metadata ---------- ###
matches = get_element_counts(
data=matches,
column='prediction1',
suffix='_a',
)
matches = get_element_counts(
data=matches,
column='prediction2',
suffix='_b',
)
results['matches'] = matches
# results["details"] = output
return results
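
For clarity, each record appended to results['matches'] above has roughly the shape shown below (placeholder values); get_element_counts then appends style-feature counts computed from prediction1/prediction2 under the '_a'/'_b' suffixes (the exact feature names live in compass_arena_subjective_bench and are not shown in this diff).

# Illustrative match record (placeholder values) as assembled above, before
# get_element_counts appends the '_a'/'_b' style-feature columns.
example_match = {
    'winner': 'model_b',                  # 'model_a' | 'model_b' | 'tie'
    'capability': '<capability label>',   # reference['capability']
    'model_a': 'gpt4-turbo',              # reference['answer1']
    'model_b': '<candidate model abbr>',  # reference['answer2']
    'prediction1': '<full text of answer 1>',
    'prediction2': '<full text of answer 2>',
}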

View File

@@ -4,6 +4,8 @@ from collections import defaultdict
from datasets import Dataset
from opencompass.datasets.subjective.compass_arena_subjective_bench import \
get_element_counts
from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from .subjective_cmp import SubjectiveCmpDataset
@@ -48,7 +50,8 @@ def check_position_bias(judged_answers, references, banned_choice=['C']):
}
else:
first_judge = position_bias_dict[question_hash]['judge']
if judge == first_judge and first_judge not in banned_choice and judge not in banned_choice:
if (judge == first_judge and first_judge not in banned_choice
and judge not in banned_choice):
# If second choice is same with first choice, there has position bias.
position_bias_flag += 1
return position_bias_flag
@@ -63,10 +66,12 @@ def post_process_compassarena(item):
@DICT_POSTPROCESSORS.register_module('compassarena')
def compassarena_postprocess(output: dict,
output_path: str,
summary_type='single',
check_pos_bias=True) -> dict:
def compassarena_postprocess(
output: dict,
output_path: str,
summary_type='single',
check_pos_bias=True,
) -> dict:
judged_answers, references = get_judgeanswer_and_reference(
output, output_path, post_process_compassarena)
@@ -116,3 +121,65 @@ def compassarena_postprocess(output: dict,
results = win_model2
results['details'] = output
return results
@DICT_POSTPROCESSORS.register_module('compassarena_bradleyterry')
def compassarena_bradleyterry_postprocess(
output: dict,
output_path: str,
count_ties: bool = True,
) -> dict:
judged_answers, references = get_judgeanswer_and_reference(
result=output,
filename=output_path,
post_process=post_process_compassarena,
)
if 'prediction1' not in references[0]:
raise ValueError(
'prediction1 not in references. Set `keep_predictions=True` for LMEvaluator in dataset config and retry.'
)
if 'prediction2' not in references[0]:
raise ValueError(
'prediction2 not in references. Set `keep_predictions=True` for LMEvaluator in dataset config and retry.'
)
results = {}
matches = []
for judged_answer, reference in zip(judged_answers, references):
cur_dict = {}
if judged_answer.strip() == 'A':
cur_dict['winner'] = 'model_a'
elif judged_answer.strip() == 'B':
cur_dict['winner'] = 'model_b'
elif judged_answer.strip() == 'C' and count_ties:
cur_dict['winner'] = 'tie'
else:
continue
cur_dict['capability'] = reference['capability']
cur_dict['model_a'] = reference['answer1']
cur_dict['model_b'] = reference['answer2']
cur_dict['prediction1'] = reference['prediction1']
cur_dict['prediction2'] = reference['prediction2']
matches.append(cur_dict)
### ---------- Add Style Metadata ---------- ###
matches = get_element_counts(
data=matches,
column='prediction1',
suffix='_a',
)
matches = get_element_counts(
data=matches,
column='prediction2',
suffix='_b',
)
results['matches'] = matches
# results["details"] = output
return results

View File

@@ -5,6 +5,8 @@ from collections import defaultdict
from datasets import Dataset, DatasetDict
from opencompass.datasets.subjective.compass_arena_subjective_bench import \
get_element_counts
from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path
@@ -204,7 +206,7 @@ def parse_conversation(conversation):
last_query = conversation[-1]['content']
chat_round.append({
'role': role_dict[conversation[-1]['role']],
'content': conversation[-1]['content']
'content': conversation[-1]['content'],
})
chat_round.append({'role': 'assistant', 'content': ''})
@@ -249,7 +251,7 @@ class WildBenchDataset(BaseDataset):
'primary_tag': item['primary_tag'],
'secondary_tag': item['secondary_tag'],
'question_id': item['session_id'],
}
},
})
dataset = Dataset.from_list(raw_data)
return dataset
@@ -267,13 +269,13 @@ task_group_new = {
'Role playing': 'Creative Tasks',
'Advice seeking': 'Information/Advice seeking',
'Data Analysis': 'Math & Data Analysis',
'Others': 'Creative Tasks'
'Others': 'Creative Tasks',
}
def post_process_wildbench_pair(judgement: dict):
judgement = judgement['prediction']
pattern = r'\"choice\": \"(.*?)\"'
pattern = r"\"choice\": \"(.*?)\""
matched_result = re.findall(pattern, judgement)
if matched_result:
return matched_result[0]
@@ -283,7 +285,7 @@ def post_process_wildbench_pair(judgement: dict):
def post_process_wildbench_single(judgement: dict):
judgement = judgement['prediction']
pattern = r'\"score\": \"(.*?)\"'
pattern = r"\"score\": \"(.*?)\""
matched_result = re.findall(pattern, judgement)
try:
score = float(matched_result[0])
@@ -299,23 +301,36 @@ def post_process_wildbench_single(judgement: dict):
@DICT_POSTPROCESSORS.register_module('wildbench')
def wildbench_postprocess(output: dict, output_path: str) -> dict:
def wildbench_postprocess(
output: dict,
output_path: str,
) -> dict:
judged_answers, references = get_judgeanswer_and_reference(
output, output_path, post_process_wildbench_pair)
result=output,
filename=output_path,
post_process=post_process_wildbench_pair,
)
if 'base_models' in references[0]:
base_models = references[0]['base_models']
else:
base_models = ['HaiKu', 'gpt4-turbo', 'llama-2-70b-chat-hf']
if isinstance(base_models, str):
base_models = [base_models]
win_base_model = defaultdict(float)
win_compare_model = defaultdict(float)
categories = defaultdict(float)
score_mapping = {'A++': 1, 'A+': 0.5, 'A=B': 0, 'B+': -0.5, 'B++': -1}
for prediction, reference in zip(judged_answers, references):
if prediction not in score_mapping:
for judged_answer, reference in zip(judged_answers, references):
if judged_answer not in score_mapping:
continue
flag = 1 if reference['answer1'] in [
'HaiKu', 'gpt4-turbo', 'llama-2-70b-chat-hf'
] else -1
score_1 = score_mapping[prediction] * flag
flag = 1 if reference['answer1'] in base_models else -1
score_1 = score_mapping[judged_answer] * flag
score_2 = -score_1
tags = [reference['primary_tag']] + reference['secondary_tag']
@@ -325,11 +340,11 @@ def wildbench_postprocess(output: dict, output_path: str) -> dict:
categories[task_group_new[tag]] += 1
for capability in categories:
win_base_model[capability] = win_base_model[capability] / categories[
capability] * 100
win_base_model[capability] = (win_base_model[capability] /
categories[capability] * 100)
win_base_model[capability] = round(win_base_model[capability], 2)
win_compare_model[capability] = win_compare_model[
capability] / categories[capability] * 100
win_compare_model[capability] = (win_compare_model[capability] /
categories[capability] * 100)
win_compare_model[capability] = round(win_compare_model[capability], 2)
# Calculating the mean of the values
@@ -341,3 +356,82 @@ def wildbench_postprocess(output: dict, output_path: str) -> dict:
results = win_compare_model
results['details'] = output
return results
@DICT_POSTPROCESSORS.register_module('wildbench_bradleyterry')
def wildbench_bradleyterry_postprocess(
output: dict,
output_path: str,
) -> dict:
judged_answers, references = get_judgeanswer_and_reference(
result=output,
filename=output_path,
post_process=post_process_wildbench_pair,
)
if 'prediction1' not in references[0]:
raise ValueError(
'prediction1 not in references. Set `keep_predictions=True` for LMEvaluator in dataset config and retry.'
)
if 'prediction2' not in references[0]:
raise ValueError(
'prediction2 not in references. Set `keep_predictions=True` for LMEvaluator in dataset config and retry.'
)
score_mapping = {
'A++': 'model_a',
'A+': 'model_a',
'A=B': 'tie',
'B+': 'model_b',
'B++': 'model_b',
}
results = {}
matches = []
for judged_answer, reference in zip(judged_answers, references):
cur_dict = {}
if judged_answer in score_mapping:
cur_dict['winner'] = score_mapping[judged_answer]
else:
# cur_dict["winner"] = (
# "tie" # Count match as tie if judge answer cannot be parsed.
# )
# Skip if judge answer cannot be parsed
print('Judge answer cannot be parsed. Skipping record...')
continue
cur_dict['primary_tag'] = reference['primary_tag']
# Extract the first tag from the list and use it as the categorical level.
# This can serve as a categorical variable in the Bradley-Terry model.
cur_dict['secondary_tag'] = (reference['secondary_tag'][0]
if len(reference['secondary_tag']) > 0
else 'Others')
# Keep original secondary tag list for reference
cur_dict['secondary_tags'] = reference['secondary_tag']
cur_dict['model_a'] = reference['answer1']
cur_dict['model_b'] = reference['answer2']
cur_dict['prediction1'] = reference['prediction1']
cur_dict['prediction2'] = reference['prediction2']
matches.append(cur_dict)
### ---------- Add Style Metadata ---------- ###
matches = get_element_counts(
data=matches,
column='prediction1',
suffix='_a',
)
matches = get_element_counts(
data=matches,
column='prediction2',
suffix='_b',
)
results['matches'] = matches
# results["details"] = output
return results
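
A quick illustration of the parsing done by post_process_wildbench_pair above: the regex pulls the "choice" value out of the judge's JSON-like reply (the reply text here is made up).

# Illustrative only: extract the "choice" field from a judge reply.
import re

judge_reply = '{"analysis": "...", "choice": "A+"}'  # made-up judge output
choice = re.findall(r'\"choice\": \"(.*?)\"', judge_reply)[0]
print(choice)  # -> A+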

View File

@@ -2,7 +2,7 @@
import os.path as osp
import random
import re
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Union
import mmengine
from datasets import Dataset
@@ -32,6 +32,7 @@ def order_preds_and_record_references(
infer_order: List,
seed: int = 666,
keep_preds: bool = False,
base_model_abbrs: List[str] = None,
):
"""Order predictions based on args and recording regrading references.
@@ -41,6 +42,7 @@ infer_order (str, optional): The mode of inference order.
infer_order (str, optional): The mode of inference order.
seed (int, optional): Random seed.
keep_preds (bool, optional): Whether to save model predictions in references. These will be available as input to the postprocessor. Defaults to False.
base_model_abbrs (List[str], optional): List of base models passed from dataset cfg.
"""
random.seed(seed)
list_of_preds = [[] for _ in range(len(predictions))]
@@ -56,6 +58,12 @@ if keep_preds:
if keep_preds:
references[i][f'prediction{j+1}'] = preds[j][0]
if base_model_abbrs is not None:
if isinstance(base_model_abbrs, str):
base_model_abbrs = [base_model_abbrs]
references[i]['base_models'] = base_model_abbrs
if infer_order == 'double':
assert len(predictions) == 2
list_of_preds = [
@@ -77,6 +85,7 @@ reversed_references.append(reversed_item)
reversed_references.append(reversed_item)
references += reversed_references
return list_of_preds, references
@@ -164,11 +173,24 @@ class LMEvaluator:
{} for _ in range(len(predictions[0]['model_preds']))
]
base_model_abbrs = None
if self.dataset_cfg is not None:
if 'base_models' in self.dataset_cfg:
base_models = self.dataset_cfg['base_models']
if isinstance(base_models, Dict):
base_models = [base_models]
base_model_abbrs = [
base_mdl['abbr'] for base_mdl in base_models
]
predictions, references = order_preds_and_record_references(
predictions=predictions,
references=references,
infer_order=infer_order,
keep_preds=self.keep_predictions,
base_model_abbrs=base_model_abbrs,
)
# calculate the number of duplicated predictions
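
Roughly, with keep_preds=True and base_model_abbrs supplied, each reference record leaving order_preds_and_record_references carries the fields the new Bradley-Terry postprocessors check for (placeholder values below).

# Illustrative reference record (placeholder values) after ordering, assuming
# keep_preds=True and base_model_abbrs=['gpt4-turbo'].
example_reference = {
    'answer1': 'gpt4-turbo',              # model behind prediction1 after ordering
    'answer2': '<candidate model abbr>',  # model behind prediction2
    'prediction1': '<text of answer 1>',  # kept because keep_preds=True
    'prediction2': '<text of answer 2>',
    'base_models': ['gpt4-turbo'],        # injected from the dataset cfg abbrs
    # ...plus the dataset's original judge fields (e.g. 'capability', 'question')
}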

View File

@@ -757,16 +757,17 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
# if rating_system == "bradleyterry":
if with_control_vars:
bootstrap_df, bootstrap_coef = compute_bootstrap_style_control(
elo_rating_final, coef_final = compute_style_control(
df=matches_df,
num_round=num_bootstrap,
baseline_model=base_model,
normalize_style_features=self.normalize_style_features,
control_variables=groups,
odds_ratio=self.odds_ratio,
)
elo_rating_final, coef_final = compute_style_control(
bootstrap_df, bootstrap_coef = compute_bootstrap_style_control(
df=matches_df,
num_round=num_bootstrap,
baseline_model=base_model,
normalize_style_features=self.normalize_style_features,
control_variables=groups,
@@ -953,7 +954,9 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
time_str (str, optional): Timestamp for file suffix. Defaults to
datetime.now().strftime('%Y%m%d_%H%M%S').
"""
all_scores_df_list = []
all_scores = {}
all_scores_ctrl_coefs = {}
for judge_model in self.judge_models:
control_coefficients = {}
leaderboard_tables = {}
@@ -986,7 +989,7 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
print('-' * 10 +
f"{dataset_abbr + ':' + base_model_abbr}\n" +
'-' * 10)
# print(cur_table_df)
print(cur_table_df)
print(cur_ctrl_coefs)
leaderboard_tables = self.flip_dict_levels(leaderboard_tables)
@@ -1002,17 +1005,43 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
)
# Fit another BT model using the first base_model, combining matches from all datasets
all_scores_df, all_scores_ctrl_coefs = self._calculate_ratings(
matches=all_matches,
base_model=list(base_models)[0],
groups=self.groups,
)
cur_judge_all_scores_df, cur_judge_all_scores_ctrl_coefs = (
self._calculate_ratings(
matches=all_matches,
base_model=list(base_models)[0],
groups=self.groups,
))
cur_judge_all_scores_df['judge'] = judge_abbr
all_scores_df_list.append(cur_judge_all_scores_df)
all_scores[judge_abbr] = pd.Series(
all_scores_df['rating'],
index=all_scores_df['model_name'],
cur_judge_all_scores_df['rating'],
index=cur_judge_all_scores_df['model_name'],
).to_dict()
all_scores_ctrl_coefs[judge_abbr] = cur_judge_all_scores_ctrl_coefs
all_scores_df = pd.concat(all_scores_df_list)
output_path_all_scores_df = osp.join(
self.work_dir, 'summary', f'summary_{time_str}_all_scores_df.csv')
output_path_all_scores = osp.join(
self.work_dir, 'summary', f'summary_{time_str}_all_scores.json')
output_path_all_scores_ctrl_coefs = osp.join(
self.work_dir, 'summary',
f'summary_{time_str}_all_scores_ctrl_coefs.json')
all_scores_df.to_csv(output_path_all_scores_df)
with open(output_path_all_scores, 'w', encoding='utf-8') as f:
json.dump(all_scores, f, ensure_ascii=False, indent=4)
with open(output_path_all_scores_ctrl_coefs, 'w',
encoding='utf-8') as f:
json.dump(all_scores_ctrl_coefs, f, ensure_ascii=False, indent=4)
print(f'{all_scores_df=}')
print(f'{all_scores=}')
print(f'{all_scores_ctrl_coefs=}')
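
Once a run finishes, the summary artifacts written above can be inspected directly; a hedged sketch (the directory layout and timestamp are placeholders for the actual run's work_dir and time_str):

# Illustrative only: load the files the summarizer writes above.
import json
import pandas as pd

summary_dir = 'outputs/subjective/bradleyterry/<run>/summary'  # placeholder path
all_scores_df = pd.read_csv(f'{summary_dir}/summary_<time_str>_all_scores_df.csv')
with open(f'{summary_dir}/summary_<time_str>_all_scores.json', encoding='utf-8') as f:
    all_scores = json.load(f)       # {judge_abbr: {model_name: rating}}
with open(f'{summary_dir}/summary_<time_str>_all_scores_ctrl_coefs.json',
          encoding='utf-8') as f:
    ctrl_coefs = json.load(f)       # {judge_abbr: control-variable coefficients}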