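# Subjective evaluation config for AlpacaEval v2: candidate model outputs are
# compared pairwise against a GPT-4-Turbo baseline by an LLM judge.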
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import SubjectiveCmpDataset
from opencompass.summarizers import AlpacaSummarizer
from mmengine.config import read_base
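
# Reader config: each sample exposes the original 'question' as input; the
# 'judge' column carries the fields consumed by the LM judge at evaluation time.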
subjective_reader_cfg = dict(
    input_columns=['question'],
    output_column='judge',
)

subjective_all_sets = [
    'alpaca_eval',
]

alpacav2_datasets = []
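
# Pairwise judge prompt: '{question}' is the original instruction, while
# '{prediction}' and '{prediction2}' are the two models' outputs. The judge
# must answer with a single identifier, 'm' or 'M'.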
gpt4_prompt = """
I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.

## Instruction

{
    "instruction": "{question}",
}

## Model Outputs

Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.

{
    {
        "model_identifier": "m",
        "output": "{prediction}"
    },
    {
        "model_identifier": "M",
        "output": "{prediction2}"
    }
}

## Task

Evaluate the models based on the quality and relevance of their outputs, and select the model that generated the best output. Answer by providing the model identifier of the best model. We will use your output as the name of the best model, so make sure your output only contains one of the following model identifiers and nothing else (no quotes, no spaces, no new lines, ...): m or M.

## Best Model Identifier
"""
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
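
# Baseline side of the m2n comparison; its reference predictions are supplied
# via `given_pred` below rather than re-generated at inference time.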
gpt4 = [dict(
    abbr='gpt4-turbo',
)]
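
# Build one dataset config per subset listed in subjective_all_sets.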
for _name in subjective_all_sets:
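    # Inference config: feed each question verbatim to the candidate model
    # and collect a free-form generation.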
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt='{question}'
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )
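
    # Evaluation config: an LM judge scores each output pair using the system
    # prompt below; `fallback_role` downgrades the SYSTEM turn to HUMAN for
    # judges without system-message support.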
    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt='You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers.')
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=gpt4_prompt
                        ),
                    ]),
            ),
        ),
        pred_role='BOT',
    )
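
    # Assemble the dataset entry: 'm2n' mode compares every candidate model
    # against each base model, with 'random' ordering to counter position bias
    # in the judge; precomputed baseline outputs are read from `given_pred`.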
    alpacav2_datasets.append(
        dict(
            abbr=f'{_name}',
            type=SubjectiveCmpDataset,
            path='./data/subjective/alpaca_eval',
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg,
            mode='m2n',
            infer_order='random',
            base_models=gpt4,
            given_pred=[{'abbr': 'gpt4-turbo', 'path': './data/subjective/alpaca_eval/gpt4-turbo'}],
            summarizer=dict(type=AlpacaSummarizer, judge_type='v2'),
        ))
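
# A minimal usage sketch, kept as comments: the importing module path below is
# an assumption about where this config file lives, not something the config
# itself defines. A top-level OpenCompass config would typically pull the
# dataset list in via mmengine's read_base() and expose it as `datasets`:
#
#     with read_base():
#         from .alpacav2_judgeby_gpt4 import alpacav2_datasets
#
#     datasets = [*alpacav2_datasets]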