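# Subjective evaluation config for AlpacaEval v2: candidate model outputs are
# compared pairwise against a GPT-4-Turbo baseline by an LLM judge.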
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import SubjectiveCmpDataset
from opencompass.summarizers import AlpacaSummarizer
from mmengine.config import read_base
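
# Reader config: each sample exposes the original 'question' as input; the
# 'judge' column carries the fields consumed by the LM judge at evaluation time.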
subjective_reader_cfg = dict(
    input_columns=['question'],
    output_column='judge',
)

subjective_all_sets = [
    'alpaca_eval',
]

alpacav2_datasets = []
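
# Pairwise judge prompt: '{question}' is the original instruction, while
# '{prediction}' and '{prediction2}' are the two models' outputs. The judge
# must answer with a single identifier, 'm' or 'M'.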
gpt4_prompt = """
I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.

## Instruction

{
    "instruction": "{question}",
}

## Model Outputs

Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.

{
    {
        "model_identifier": "m",
        "output": "{prediction}"
    },
    {
        "model_identifier": "M",
        "output": "{prediction2}"
    }
}

## Task

Evaluate the models based on the quality and relevance of their outputs, and select the model that generated the best output. Answer by providing the model identifier of the best model. We will use your output as the name of the best model, so make sure your output only contains one of the following model identifiers and nothing else (no quotes, no spaces, no new lines, ...): m or M.

## Best Model Identifier
"""
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
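
# Baseline side of the m2n comparison; its reference predictions are supplied
# via `given_pred` below rather than re-generated at inference time.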
gpt4 = [dict(
    abbr='gpt4-turbo',
)]
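
# Build one dataset config per subset listed in subjective_all_sets.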
for _name in subjective_all_sets:
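    # Inference config: feed each question verbatim to the candidate model
    # and collect a free-form generation.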
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt='{question}'
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )
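
    # Evaluation config: an LM judge scores each output pair using the system
    # prompt below; `fallback_role` downgrades the SYSTEM turn to HUMAN for
    # judges without system-message support.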
    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt='You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers.')
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=gpt4_prompt
                        ),
                    ]),
            ),
        ),
        pred_role='BOT',
    )
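
    # Assemble the dataset entry: 'm2n' mode compares every candidate model
    # against each base model, with 'random' ordering to counter position bias
    # in the judge; precomputed baseline outputs are read from `given_pred`.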
    alpacav2_datasets.append(
        dict(
            abbr=f'{_name}',
            type=SubjectiveCmpDataset,
            path='./data/subjective/alpaca_eval',
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg,
            mode='m2n',
            infer_order='random',
            base_models=gpt4,
            given_pred=[{'abbr': 'gpt4-turbo', 'path': './data/subjective/alpaca_eval/gpt4-turbo'}],
            summarizer=dict(type=AlpacaSummarizer, judge_type='v2'),
        ))
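
# A minimal usage sketch, kept as comments: the importing module path below is
# an assumption about where this config file lives, not something the config
# itself defines. A top-level OpenCompass config would typically pull the
# dataset list in via mmengine's read_base() and expose it as `datasets`:
#
#     with read_base():
#         from .alpacav2_judgeby_gpt4 import alpacav2_datasets
#
#     datasets = [*alpacav2_datasets]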