OpenCompass/opencompass/datasets/OpenHuEval/HuSimpleQA.py

import json
import os
import re

from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.utils.prompt import PromptList

from ..base import BaseDataset


class HuSimpleQADataset(BaseDataset):

    @staticmethod
    def load(filepath):
        """Load HuSimpleQA records from a JSONL file into a HF Dataset."""
        assert os.path.isfile(filepath)
        assert filepath.endswith('.jsonl')
        with open(filepath, 'r', encoding='utf-8') as f:
            objs = [json.loads(line) for line in f]
        out_dict_list = []
        for obj in objs:
            # Keep the fields used for prompting and carry the full record
            # along as the reference consumed by the evaluator.
            out_dict_list.append(
                dict(question=obj['question'],
                     hu_specific_dim=obj['hu_specific_dim'],
                     reference=obj))
        dataset = Dataset.from_list(out_dict_list)
        return dataset


class HuSimpleQAEvaluator(BaseEvaluator):
    """Judge-based evaluator that grades predictions with a GPT-4o judge."""

    def __init__(self,
                 judge_prompt_template,
                 openai_key='ENV',
                 openai_proxy_url='ENV',
                 **kwargs):
        super().__init__(**kwargs)
        self.judge_prompt_template = judge_prompt_template
        self.openai_key = openai_key
        self.openai_proxy_url = openai_proxy_url

    def score(self, predictions, references, origin_prompt) -> dict:
        if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different lengths.'
            }
        details = {}
        total, correct, wrong, not_attempted, failed_to_parse = 0, 0, 0, 0, 0

        # GPT-4o judge model used to grade every prediction.
        from opencompass.models import OpenAI
        model = OpenAI(path='gpt-4o-2024-08-06',
                       key=self.openai_key,
                       openai_proxy_url=self.openai_proxy_url,
                       max_seq_len=8192,
                       retry=2,
                       temperature=0,
                       verbose=True)

        confidence_scores = []
        for raw_pred, detail in zip(predictions, references):
            total += 1
            qid = detail['qid']
            details[qid] = {
                'question': detail['question'],
                'answer': detail['answer'],
                'raw_pred': raw_pred,
                'correctness': False,
                'failed_to_parse': False,
            }

            # Parse the raw prediction: strip an optional ```json fence and
            # read the model's self-reported confidence score, if any.
            try:
                raw_pred = re.sub(r'^```json\n|\n```$', '', raw_pred)
                raw_pred_json = json.loads(raw_pred)
                confidence_score = raw_pred_json.get('confidence_score', None)
            except json.JSONDecodeError:
                confidence_score = None
            details[qid]['confidence_score'] = confidence_score

            # Query the GPT-4o judge with the configured prompt template.
            user_prompt = self.judge_prompt_template['user_prompt'].format(
                question=detail['question'],
                answer=detail['answer'],
                pred_answer=raw_pred)
            system_prompt = self.judge_prompt_template['system_prompt']
            details[qid]['judge_user_prompt'] = user_prompt
            messages = PromptList([{
                'role': 'SYSTEM',
                'prompt': system_prompt,
            }, {
                'role': 'HUMAN',
                'prompt': user_prompt,
            }])
            response = model._generate(input=messages,
                                       max_out_len=8192,
                                       temperature=0.1)
            details[qid]['judge_resp'] = response

            # Parse the judge's verdict: 'correct', 'incorrect' or
            # 'not_attempted'; anything else counts as a parse failure.
            try:
                response = re.sub(r'^```json\n|\n```$', '', response)
                evaluation_result = json.loads(response)
                evaluation = evaluation_result.get('evaluation', '').lower()
                details[qid]['correctness'] = (evaluation == 'correct')
                details[qid]['failed_to_parse'] = False
                if evaluation == 'correct':
                    correct += 1
                elif evaluation == 'incorrect':
                    wrong += 1
                elif evaluation == 'not_attempted':
                    not_attempted += 1
                else:
                    failed_to_parse += 1
            except json.JSONDecodeError:
                details[qid]['failed_to_parse'] = True
                failed_to_parse += 1

            confidence_scores.append(
                (confidence_score, details[qid]['correctness']))

        accuracy = correct / total if total > 0 else 0
        results = {
            'accuracy': accuracy,
            'total': total,
            'correct': correct,
            'wrong': wrong,
            'not_attempted': not_attempted,
            'failed_to_parse': failed_to_parse,
            'details': details,
            'confidence_scores': confidence_scores,
        }
        return results
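

if __name__ == '__main__':
    # Minimal usage sketch, illustrative only and not part of the original
    # module. The prompt text, API key and reference record below are all
    # hypothetical; a real run needs a valid OpenAI key, since score() calls
    # the GPT-4o judge for every prediction, and must be started as a module
    # (python -m ...) so the relative import above resolves.
    judge_prompt_template = {
        'system_prompt': 'You grade answers to Hungarian trivia questions.',
        'user_prompt': ('Question: {question}\n'
                        'Gold answer: {answer}\n'
                        'Model answer: {pred_answer}\n'
                        'Reply with JSON such as {{"evaluation": "correct"}}, '
                        'using "correct", "incorrect" or "not_attempted".'),
    }
    evaluator = HuSimpleQAEvaluator(judge_prompt_template,
                                    openai_key='sk-...')  # hypothetical key
    predictions = [
        '```json\n{"answer": "Budapest", "confidence_score": 90}\n```',
    ]
    references = [{
        'qid': 'hu_0001',  # hypothetical record mirroring the JSONL fields
        'question': 'Mi Magyarország fővárosa?',  # "What is Hungary's capital?"
        'answer': 'Budapest',
    }]
    print(evaluator.score(predictions, references, origin_prompt=None))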