OpenCompass/opencompass/datasets/subjective/wildbench.py

# flake8: noqa: F401, F403
import json
import re
from collections import defaultdict

from datasets import Dataset, DatasetDict

from opencompass.datasets.subjective.compass_arena_subjective_bench import \
    get_element_counts
from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset
from .utils import get_judgeanswer_and_reference

score_prompt = """# Instruction

You are an expert evaluator. Your task is to evaluate the quality of \
the responses generated by AI models.
We will provide you with the user query and an AI-generated responses.
You should first read the user query and the conversation history \
carefully for analyzing the task, and then evaluate the quality of \
the responses based on and rules provided below.

# Conversation between User and AI

## History
<|begin_of_history|>

{history}

<|end_of_history|>

## Current User Query
<|begin_of_query|>

{user_query}

<|end_of_query|>

## AI Response
<|begin_of_response|>

{prediction}

<|end_of_response|>


# Evaluation

## Checklist

<|begin_of_checklist|>

{checklist}

<|end_of_checklist|>

Please use this checklist to guide your evaluation, but do \
not limit your assessment to the checklist.

## Rules

You should compare the above response based on your analysis\
 of the user queries and the conversation history.
You should first write down your analysis and the checklist \
that you used for the evaluation, and then provide your \
assessment according to the checklist.
The scores are in the range of 1~10, where 1 means the \
response is very poor and 10 means the response is perfect.
Here are more detailed criteria for the scores:

- Score 1~2: The response is very poor and does not make sense at all.
- Score 3~4: The response is poor and does help user solve the problem\
 in a meaningful way.
- Score 5~6: The response is fair but has some issues (e.g., factual \
errors, hallucinations, missing key information).
- Score 7~8: The response is good enough but could be improved in some ways.
- Score 9~10: The response is perfect and provides helpful information that\
 can help user solve the problem.

## Output Format
First, please output your analysis for the model response, and then summarize\
 your assessment to two aspects: "strengths" and "weaknesses"; Finally, please\
 write down your rating for the assessment.

Please provide your evaluation results in the following json format by filling\
 in the placeholders in []:
```
{
    "strengths": "[analysis for the strengths of the response]",
    "weaknesses": "[analysis for the weaknesses of the response]",
    "score": "[1~10]"
}
```"""

pair_prompt = """# Instruction

You are an expert evaluator. Your task is to evaluate the quality of the \
responses generated by two AI models.
We will provide you with the user query and a pair of AI-generated \
responses (Response A and Response B).
You should first read the user query and the conversation history \
carefully for analyzing the task, and then evaluate the quality of the \
responses based on and rules provided below.

# Conversation between User and AI

## History
<|begin_of_history|>

{history}

<|end_of_history|>

## Current User Query
<|begin_of_query|>

{user_query}

<|end_of_query|>

## Response A
<|begin_of_response_A|>

{prediction}

<|end_of_response_A|>

## Response B
<|begin_of_response_B|>

{prediction2}

<|end_of_response_B|>

# Evaluation

## Checklist

<|begin_of_checklist|>

{checklist}

<|end_of_checklist|>

Please use this checklist to guide your evaluation, but do not limit your \
assessment to the checklist.

## Rules

You should compare the above two responses based on your analysis of the \
user queries and the conversation history.
You should first write down your analysis and the checklist that you used \
for the evaluation, and then provide your assessment according to the \
checklist.
There are five choices to give your final assessment: ["A++", "A+", \
"A=B", "B+", "B++"], which correspond to the following meanings:

- `A++`: Response A is much better than Response B.
- `A+`: Response A is only slightly better than Response B.
- `A=B`: Response A and B are of the same quality. Please use this \
choice sparingly.
- `B+`: Response B is only slightly better than Response A.
- `B++`: Response B is much better than Response A.


## Output Format
First, please output your analysis for each model response, and \
then summarize your assessment to three aspects: "reason A=B", \
"reason A>B", and "reason B>A", and finally make your choice for \
the final assessment.

Please provide your evaluation results in the following json \
format by filling in the placeholders in []:
```
{
    "analysis of A": "[analysis of Response A]",
    "analysis of B": "[analysis of Response B]",
    "reason of A=B": "[where Response A and B perform equally well]",
    "reason of A>B": "[where Response A is better than Response B]",
    "reason of B>A": "[where Response B is better than Response A]",
    "choice": "[A++ or A+ or A=B or B+ or B++]",
}
```
"""


def parse_conversation(conversation):
    # parse conversation into chat dialogue
    role_dict = {'user': 'HUMAN', 'assistant': 'assistant'}
    chat_round = []
    history = ''
    if len(conversation) > 0:
        for x in conversation[:-1]:
            if x['role'] == 'user':
                history += 'USER: ' + x['content'] + '\n\n'
            elif x['role'] == 'assistant':
                history += 'ASSISTANT: ' + x['content'] + '\n\n'

            chat_round.append({
                'role': role_dict[x['role']],
                'content': x['content']
            })

    last_query = conversation[-1]['content']
    chat_round.append({
        'role': role_dict[conversation[-1]['role']],
        'content': conversation[-1]['content'],
    })
    chat_round.append({'role': 'assistant', 'content': ''})

    return chat_round, last_query, history


@LOAD_DATASET.register_module()
class WildBenchDataset(BaseDataset):

    def load(self, path: str, K=-1, eval_mode='pair', *args, **kwargs):
        path = get_data_path(path, local_mode=True)
        dataset = DatasetDict()
        raw_data = []
        with open(path, 'r', encoding='utf-8') as file:
            for line in file:
                item = json.loads(line)
                chat_round, last_query, history = parse_conversation(
                    item['turn'])

                checklist_mardkdown = ''
                for checklist_item in item['checklist']:
                    checklist_mardkdown += f'- {checklist_item}\n'

                if eval_mode == 'single':
                    prompt = score_prompt
                elif eval_mode == 'pair':
                    prompt = pair_prompt
                else:
                    assert NotImplementedError(
                        f'Eval mode {eval_mode} not in single or pair.')

                prompt = prompt.replace('{history}', history)
                prompt = prompt.replace('{user_query}', last_query)
                prompt = prompt.replace('{checklist}', checklist_mardkdown)

                raw_data.append({
                    'dialogue': chat_round,
                    'history': history,
                    'prompt': prompt,
                    'judge': {
                        'other': None,
                        'primary_tag': item['primary_tag'],
                        'secondary_tag': item['secondary_tag'],
                        'question_id': item['session_id'],
                    },
                })
        dataset = Dataset.from_list(raw_data)
        return dataset


task_group_new = {
    'Information seeking': 'Information/Advice seeking',
    'Creative Writing': 'Creative Tasks',
    'Coding & Debugging': 'Coding & Debugging',
    'Reasoning': 'Planning & Reasoning',
    'Editing': 'Creative Tasks',
    'Math': 'Math & Data Analysis',
    'Planning': 'Planning & Reasoning',
    'Brainstorming': 'Creative Tasks',
    'Role playing': 'Creative Tasks',
    'Advice seeking': 'Information/Advice seeking',
    'Data Analysis': 'Math & Data Analysis',
    'Others': 'Creative Tasks',
}


def post_process_wildbench_pair(judgement: dict):
    judgement = judgement['prediction']
    pattern = r"\"choice\": \"(.*?)\""
    matched_result = re.findall(pattern, judgement)
    if matched_result:
        return matched_result[0]
    else:
        return None


def post_process_wildbench_single(judgement: dict):
    judgement = judgement['prediction']
    pattern = r"\"score\": \"(.*?)\""
    matched_result = re.findall(pattern, judgement)
    try:
        score = float(matched_result[0])
        return {'score': score}
    except (ValueError, IndexError) as e:
        return None

    # if matched_result:
    #     score = float(matched_result[0])
    # else:
    #     return None
    # return {'score': score}


@DICT_POSTPROCESSORS.register_module('wildbench')
def wildbench_postprocess(
    output: dict,
    output_path: str,
) -> dict:

    judged_answers, references = get_judgeanswer_and_reference(
        result=output,
        filename=output_path,
        post_process=post_process_wildbench_pair,
    )

    if 'base_models' in references[0]:
        base_models = references[0]['base_models']
    else:
        base_models = ['HaiKu', 'gpt4-turbo', 'llama-2-70b-chat-hf']

    if isinstance(base_models, str):
        base_models = [base_models]

    win_base_model = defaultdict(float)
    win_compare_model = defaultdict(float)
    categories = defaultdict(float)

    score_mapping = {'A++': 1, 'A+': 0.5, 'A=B': 0, 'B+': -0.5, 'B++': -1}
    for judged_answer, reference in zip(judged_answers, references):
        if judged_answer not in score_mapping:
            continue

        flag = 1 if reference['answer1'] in base_models else -1
        score_1 = score_mapping[judged_answer] * flag
        score_2 = -score_1

        tags = [reference['primary_tag']] + reference['secondary_tag']
        for tag in tags:
            win_base_model[task_group_new[tag]] += score_1
            win_compare_model[task_group_new[tag]] += score_2
            categories[task_group_new[tag]] += 1

    for capability in categories:
        win_base_model[capability] = (win_base_model[capability] /
                                      categories[capability] * 100)
        win_base_model[capability] = round(win_base_model[capability], 2)
        win_compare_model[capability] = (win_compare_model[capability] /
                                         categories[capability] * 100)
        win_compare_model[capability] = round(win_compare_model[capability], 2)

    # Calculating the mean of the values
    average = sum(win_compare_model.values()) / len(win_compare_model)

    # Adding the mean to the dictionary at the beginning
    win_compare_model['average'] = average

    results = win_compare_model
    results['details'] = output
    return results


@DICT_POSTPROCESSORS.register_module('wildbench_bradleyterry')
def wildbench_bradleyterry_postprocess(
    output: dict,
    output_path: str,
) -> dict:

    judged_answers, references = get_judgeanswer_and_reference(
        result=output,
        filename=output_path,
        post_process=post_process_wildbench_pair,
    )

    if 'prediction1' not in references[0]:
        raise ValueError(
            'prediction1 not in references. Set `keep_predictions=True` for LMEvaluator in dataset config and retry.'
        )

    if 'prediction2' not in references[0]:
        raise ValueError(
            'prediction2 not in references. Set `keep_predictions=True` for LMEvaluator in dataset config and retry.'
        )

    score_mapping = {
        'A++': 'model_a',
        'A+': 'model_a',
        'A=B': 'tie',
        'B+': 'model_b',
        'B++': 'model_b',
    }

    results = {}
    matches = []
    for judged_answer, reference in zip(judged_answers, references):
        cur_dict = {}

        if judged_answer in score_mapping:
            cur_dict['winner'] = score_mapping[judged_answer]
        else:
            # cur_dict["winner"] = (
            #     "tie"  # Count match as tie if judge answer cannot be parsed.
            # )

            # Skip if judge answer cannot be parsed
            print('Judge answer cannot be parsed. Skipping record...')
            continue

        cur_dict['primary_tag'] = reference['primary_tag']
        # Extract first tag from list and set as categorical level.
        # Can be used as categorical variable in Bradley-Terry model
        cur_dict['secondary_tag'] = (reference['secondary_tag'][0]
                                     if len(reference['secondary_tag']) > 0
                                     else 'Others')
        # Keep original secondary tag list for reference
        cur_dict['secondary_tags'] = reference['secondary_tag']
        cur_dict['model_a'] = reference['answer1']
        cur_dict['model_b'] = reference['answer2']
        cur_dict['prediction1'] = reference['prediction1']
        cur_dict['prediction2'] = reference['prediction2']

        matches.append(cur_dict)

    ### ---------- Add Style Metadata ---------- ###
    matches = get_element_counts(
        data=matches,
        column='prediction1',
        suffix='_a',
    )
    matches = get_element_counts(
        data=matches,
        column='prediction2',
        suffix='_b',
    )

    results['matches'] = matches
    # results["details"] = output

    return results