OpenCompass/opencompass/datasets/subjective/followbench.py
bittersweet1999 fa54aa62f6
[Feature] Add Judgerbench and reorg subeval (#1593)
2024-10-15 16:36:05 +08:00


# flake8: noqa
import ast
import json
import os.path as osp
import re
import statistics
from typing import Optional
from datasets import Dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path
from ..base import BaseDataset
from .utils import get_judgeanswer_and_reference
@LOAD_DATASET.register_module()
class FollowBenchDataset(BaseDataset):
def load(self, path: str, name: str, cate: str, *args, **kwargs):
path = get_data_path(path, local_mode=True)
filename = osp.join(path, f'{name}.json')
        raw_data = []
with open(filename, 'r', encoding='utf-8') as f:
data = json.load(f)
for item in data:
if cate == 'llm':
raw_data.append({
'instruction': item['instruction'],
'judge_prompt': item['judge_prompt'],
'judge': item
})
# elif cate == 'rule':
# raw_data.append({
# 'instruction': item['instruction'],
# 'judge': item
# })
else:
raise NotImplementedError(
f"Category '{cate}' is not implemented.")
dataset = Dataset.from_list(raw_data)
return dataset
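
# Illustrative note (not part of the pipeline): with cate='llm', each row of
# the returned `Dataset` has the three keys built above. The values shown
# here are placeholders, not taken from a real FollowBench data file:
#
#   {
#       'instruction': 'Write a poem that ... [constraints omitted]',
#       'judge_prompt': 'Judge whether the response satisfies ...',
#       'judge': {...the original JSON item, including its "level" field...},
#   }
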
def post_process_followbench(item):
    """Parse one judge response into (hard_score, soft_score).

    Level-1 samples return (1, 1) or (0, 0); higher levels return
    (all_constraints_satisfied, fraction_satisfied). (-1, -1) marks a
    judgment that could not be parsed.
    """
    generation, level = item['prediction'], item['gold']['level']
try:
satisfy = generation.strip('```').strip().split('\n')[-1]
if level == 1:
if 'YES' in satisfy:
return 1, 1
elif 'NO' in satisfy:
return 0, 0
else:
raise Exception('Invalid evaluation for level 1.')
else:
satisfy_list = re.search(r'\[.*\]', satisfy)
if satisfy_list:
                # literal_eval is safer than eval() for parsing the judge's
                # list of verdicts, e.g. "['YES', 'NO', 'YES']".
                satisfy_list = ast.literal_eval(satisfy_list.group())
if len(satisfy_list) == level:
num_true = 0
for i in satisfy_list:
if i == 'YES' or i == 'True':
num_true += 1
elif i in [
'NO', 'False', 'PARTIAL', 'MAYBE', 'UNKNOWN',
'N/A'
]:
num_true += 0
else:
raise Exception('Invalid element in the list.')
return int(num_true == level), num_true / level
else:
raise Exception('Invalid number of elements in the list.')
else:
raise Exception('Invalid list that cannot be parsed.')
    except Exception:
        # Any parse failure is reported as (-1, -1); get_scores later
        # excludes such samples from the per-level averages.
        return -1, -1
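
# Worked example for post_process_followbench (judge outputs are assumed,
# for illustration only):
#   level 1, last line 'YES'                    -> returns (1, 1)
#   level 3, last line "['YES', 'NO', 'YES']"   -> 2 of 3 satisfied,
#                                                  returns (0, 2 / 3)
#   anything unparsable                         -> returns (-1, -1)
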
def get_scores(judged_answers, references):
    """Aggregate per-sample judgments into per-level HSR and SSR.

    HSR (hard satisfaction rate) credits a sample only when every
    constraint is satisfied; SSR (soft satisfaction rate) averages the
    fraction of satisfied constraints. Samples whose judgment could not
    be parsed, i.e. (-1, -1), are excluded from their level's denominator.
    """
    results = [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]
n_group = len(judged_answers) // 5
n_groups = [n_group] * 5
for judged_answer, reference in zip(judged_answers, references):
if judged_answer[0] == -1:
n_groups[reference['level'] - 1] -= 1
else:
results[0][reference['level'] - 1] += judged_answer[0]
results[1][reference['level'] - 1] += judged_answer[1]
for i in range(len(results)):
for j in range(len(results[i])):
if n_groups[j] != 0:
results[i][j] = results[i][j] / n_groups[j]
else:
results[i][j] = 0
temp_dict = {
'HSR_AVG': statistics.mean(results[0]),
'SSR_AVG': statistics.mean(results[1])
}
for idx, s in enumerate(results[0]):
temp_dict[f'HSR_L{idx+1}'] = s
for idx, s in enumerate(results[1]):
temp_dict[f'SSR_L{idx+1}'] = s
return temp_dict
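
# Worked example for get_scores (toy inputs, one sample per level 1-5):
#   judged_answers = [(1, 1.0), (0, 0.5), (1, 1.0), (-1, -1), (0, 0.25)]
#   references     = [{'level': i} for i in range(1, 6)]
# The level-4 judgment is invalid (-1) and leaves its level with an empty
# denominator, so
#   HSR per level = [1, 0, 1, 0, 0]
#   SSR per level = [1.0, 0.5, 1.0, 0, 0.25]
# giving HSR_AVG = 0.4 and SSR_AVG = 0.55.
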
@DICT_POSTPROCESSORS.register_module('followbench')
def followbench_postprocess(
output: dict,
output_path: str,
) -> dict:
judged_answers, references = get_judgeanswer_and_reference(
output, output_path, post_process_followbench)
results = get_scores(judged_answers, references)
results['details'] = output
return results
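
# Sketch of how a config typically wires this up (field names follow the
# common OpenCompass subjective-eval pattern and are an assumption here,
# not copied from a shipped config):
#
#   eval_cfg = dict(
#       evaluator=dict(
#           type=LMEvaluator,
#           ...,
#           dict_postprocessor=dict(type=followbench_postprocess),
#       ))
#
# The 'followbench' name registered above lets DICT_POSTPROCESSORS resolve
# back to this function when judge results are summarized.
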
# def check_match(target, generation):
# pattern_str = target.replace('{{',
# '{').replace('}}',
# '}').replace('{answer}', '.*')
# pattern_str = re.escape(pattern_str).replace('\\.\\*', '.*')
# match = re.fullmatch(pattern_str, generation)
# return bool(match)
# class FollowBenchEvaluator(BaseEvaluator):
# """Evaluator for followbench rule-based eval."""
# def __init__(self, num_workers=16) -> None:
# self.num_workers = num_workers
# def score_single(self, pred, refer):
# generated_code = pred
# # get current dir because we will enter a temp dir to
# # execute generated code
# cwd = os.getcwd()
# def chdir_return(cwd, return_value):
# os.chdir(cwd)
# return return_value
# # we create a tempdir to execute each generated program
# with tempfile.TemporaryDirectory() as tempdir_name:
# tempdir_name = Path(tempdir_name)
# # copy all files and data dependencies from
# shutil.copytree(refer['problem_path'],
# tempdir_name,
# dirs_exist_ok=True)
# # generated outputs will be put into `result`
# os.mkdir(tempdir_name / 'result')
# program = refer['code_context'].replace('[insert]', generated_code)
# with open(tempdir_name / 'program.py', 'w', encoding='UTF-8') as f:
# f.write(program)
# # enter into the tempdir to execute
# os.chdir(tempdir_name)
# execution_status = []
# # a question may not have test case but we can still execute and
# # see if there is error
# test_cnt = max(1, int(refer['test_case_cnt']))
# for i in range(1, test_cnt + 1):
# # notice this command, e.g., you may need to
# # replace `python` with `python3`
# cmd_text = f'python program.py --test_case {i}'
# time_limit = 60 # should not change the official time_limit
# cmd = Command(cmd_text, )
# exit_code = cmd.run(
# timeout=time_limit) # 0 if there is no error
# execution_status.append(exit_code)
# # return if execution error
# if sum(execution_status) > 0:
# return chdir_return(cwd, False)
# # loading testing code as a module
# test_module = import_source_file(tempdir_name / 'test_code.py',
# 'test_code')
# pass_flag = True
# if int(refer['test_type']) == 3:
# # stringTest parses the generated code into AST
# # and check AST components
# # if there is static error, stringTest may raise an exception
# generated_code = generated_code.split('\n')
# for line in generated_code:
# if 'print' in line and '#' not in line.split('print'):
# generated_code.remove(line)
# generated_code = '\n'.join(generated_code)
# try:
# pass_flag = test_module.stringTest(generated_code)
# except Exception:
# # return False if stringTest error
# return chdir_return(cwd, False)
# test_cnt = max(int(refer['test_case_cnt']), 1)
# for i in range(1, test_cnt + 1):
# try:
# ans = pickle.load(open(f'ans/ans{i}.pkl', 'rb'))
# # loading the generated output might still raise Exception
# # if the generated code is not correct
# result = pickle.load(open(f'result/result_{i}.pkl', 'rb'))
# pass_flag = test_module.test(result, ans) == 1
# except Exception:
# # return False if stringTest error
# return chdir_return(cwd, False)
# return chdir_return(cwd, pass_flag)
# def score(self, predictions, references):
# results = {'example': {'accuracy': [0, 0, 0, 0, 0], 'num': 0}}
# for prediction, reference in zip(predictions, references):
# if reference['category'] == 'example':
# results['example']['num'] += 1
# template = reference['target'].replace('{instruction}\n', '')
# match_result = check_match(template, prediction)
# print(match_result)
# results['example']['accuracy'][reference['level'] -
# 1] += match_result
# results['example']['accuracy'] = [
# round(acc / (results['example']['num'] // 5), 2)
# for acc in results['example']['accuracy']
# ]
# ######## Still not finished for rule-based evaluation
# # Each process changes cwd, need to use multi-processing
# with ProcessPoolExecutor(self.num_workers) as executor:
# passed = sum(
# list(executor.map(self.score_single, predictions, references)))
# return {'accuracy': passed / total}