# flake8: noqa
import json
import os.path as osp
import re
import statistics
from typing import Optional

from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset
from .utils import get_judgeanswer_and_reference


@LOAD_DATASET.register_module()
class FollowBenchDataset(BaseDataset):
    """Load FollowBench samples from ``{name}.json``.

    Only the LLM-judged category (``cate='llm'``) is currently supported;
    the rule-based category is left commented out.
    """

    def load(self, path: str, name: str, cate: str, *args, **kwargs):
        path = get_data_path(path, local_mode=True)
        filename = osp.join(path, f'{name}.json')
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for item in data:
                if cate == 'llm':
                    raw_data.append({
                        'instruction': item['instruction'],
                        'judge_prompt': item['judge_prompt'],
                        'judge': item
                    })
                # elif cate == 'rule':
                #     raw_data.append({
                #         'instruction': item['instruction'],
                #         'judge': item
                #     })
                else:
                    raise NotImplementedError(
                        f"Category '{cate}' is not implemented.")

        dataset = Dataset.from_list(raw_data)
        return dataset


def post_process_followbench(item):
    """Parse a judge response into ``(hard_score, soft_score)``.

    For level-1 samples the last line of the judgement must contain YES or
    NO. For higher levels it must end with a list holding one verdict per
    constraint. Returns ``(-1, -1)`` when the judgement cannot be parsed.
    """
    generation, level = item['prediction'], item['gold']['level']
    try:
        satisfy = generation.strip('```').strip().split('\n')[-1]

        if level == 1:
            if 'YES' in satisfy:
                return 1, 1
            elif 'NO' in satisfy:
                return 0, 0
            else:
                raise Exception('Invalid evaluation for level 1.')
        else:
            satisfy_list = re.search(r'\[.*\]', satisfy)
            if satisfy_list:
                # The judge is expected to emit a Python-style list such as
                # ['YES', 'NO', ...] with one entry per constraint.
                satisfy_list = eval(satisfy_list.group())
                if len(satisfy_list) == level:
                    num_true = 0
                    for i in satisfy_list:
                        if i == 'YES' or i == 'True':
                            num_true += 1
                        elif i in [
                                'NO', 'False', 'PARTIAL', 'MAYBE', 'UNKNOWN',
                                'N/A'
                        ]:
                            num_true += 0  # counted as not satisfied
                        else:
                            raise Exception('Invalid element in the list.')
                    return int(num_true == level), num_true / level
                else:
                    raise Exception('Invalid number of elements in the list.')
            else:
                raise Exception('Invalid list that cannot be parsed.')

    except Exception:
        return -1, -1


def get_scores(judged_answers, references):
    """Aggregate hard/soft satisfaction rates (HSR/SSR) per constraint level.

    Judgements that failed to parse (``-1``) are removed from their level's
    denominator rather than counted as failures.
    """
    results = [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]
    n_group = len(judged_answers) // 5
    n_groups = [n_group] * 5

    for judged_answer, reference in zip(judged_answers, references):
        if judged_answer[0] == -1:
            n_groups[reference['level'] - 1] -= 1
        else:
            results[0][reference['level'] - 1] += judged_answer[0]
            results[1][reference['level'] - 1] += judged_answer[1]

    for i in range(len(results)):
        for j in range(len(results[i])):
            if n_groups[j] != 0:
                results[i][j] = results[i][j] / n_groups[j]
            else:
                results[i][j] = 0

    temp_dict = {
        'HSR_AVG': statistics.mean(results[0]),
        'SSR_AVG': statistics.mean(results[1])
    }
    for idx, s in enumerate(results[0]):
        temp_dict[f'HSR_L{idx+1}'] = s
    for idx, s in enumerate(results[1]):
        temp_dict[f'SSR_L{idx+1}'] = s

    return temp_dict


@DICT_POSTPROCESSORS.register_module('followbench')
def followbench_postprocess(
    output: dict,
    output_path: str,
) -> dict:
    """Turn raw judge outputs into the aggregated FollowBench score dict."""
    judged_answers, references = get_judgeanswer_and_reference(
        output, output_path, post_process_followbench)

    results = get_scores(judged_answers, references)

    results['details'] = output
    return results


# def check_match(target, generation):
#     pattern_str = target.replace('{{',
#                                  '{').replace('}}',
#                                               '}').replace('{answer}', '.*')
#     pattern_str = re.escape(pattern_str).replace('\\.\\*', '.*')
#     match = re.fullmatch(pattern_str, generation)
#     return bool(match)


# class FollowBenchEvaluator(BaseEvaluator):
#     """Evaluator for followbench rule-based eval."""

#     def __init__(self, num_workers=16) -> None:
#         self.num_workers = num_workers

#     def score_single(self, pred, refer):
#         generated_code = pred
#         # get current dir because we will enter a temp dir to
#         # execute generated code
#         cwd = os.getcwd()

#         def chdir_return(cwd, return_value):
#             os.chdir(cwd)
#             return return_value

#         # we create a tempdir to execute each generated program
#         with tempfile.TemporaryDirectory() as tempdir_name:
#             tempdir_name = Path(tempdir_name)
#             # copy all files and data dependencies from
#             shutil.copytree(refer['problem_path'],
#                             tempdir_name,
#                             dirs_exist_ok=True)
#             # generated outputs will be put into `result`
#             os.mkdir(tempdir_name / 'result')
#             program = refer['code_context'].replace('[insert]', generated_code)
#             with open(tempdir_name / 'program.py', 'w', encoding='UTF-8') as f:
#                 f.write(program)
#             # enter into the tempdir to execute
#             os.chdir(tempdir_name)

#             execution_status = []
#             # a question may not have test case but we can still execute and
#             # see if there is error
#             test_cnt = max(1, int(refer['test_case_cnt']))
#             for i in range(1, test_cnt + 1):
#                 # notice this command, e.g., you may need to
#                 # replace `python` with `python3`
#                 cmd_text = f'python program.py --test_case {i}'
#                 time_limit = 60  # should not change the official time_limit
#                 cmd = Command(cmd_text, )
#                 exit_code = cmd.run(
#                     timeout=time_limit)  # 0 if there is no error
#                 execution_status.append(exit_code)

#             # return if execution error
#             if sum(execution_status) > 0:
#                 return chdir_return(cwd, False)

#             # loading testing code as a module
#             test_module = import_source_file(tempdir_name / 'test_code.py',
#                                              'test_code')
#             pass_flag = True

#             if int(refer['test_type']) == 3:
#                 # stringTest parses the generated code into AST
#                 # and check AST components
#                 # if there is static error, stringTest may raise an exception
#                 generated_code = generated_code.split('\n')
#                 for line in generated_code:
#                     if 'print' in line and '#' not in line.split('print'):
#                         generated_code.remove(line)
#                 generated_code = '\n'.join(generated_code)
#                 try:
#                     pass_flag = test_module.stringTest(generated_code)
#                 except Exception:
#                     # return False if stringTest error
#                     return chdir_return(cwd, False)

#             test_cnt = max(int(refer['test_case_cnt']), 1)
#             for i in range(1, test_cnt + 1):
#                 try:
#                     ans = pickle.load(open(f'ans/ans{i}.pkl', 'rb'))
#                     # loading the generated output might still raise Exception
#                     # if the generated code is not correct
#                     result = pickle.load(open(f'result/result_{i}.pkl', 'rb'))
#                     pass_flag = test_module.test(result, ans) == 1
#                 except Exception:
#                     # return False if stringTest error
#                     return chdir_return(cwd, False)

#             return chdir_return(cwd, pass_flag)

#     def score(self, predictions, references):
#         results = {'example': {'accuracy': [0, 0, 0, 0, 0], 'num': 0}}
#         for prediction, reference in zip(predictions, references):
#             if reference['category'] == 'example':
#                 results['example']['num'] += 1
#                 template = reference['target'].replace('{instruction}\n', '')
#                 match_result = check_match(template, prediction)
#                 print(match_result)
#                 results['example']['accuracy'][reference['level'] -
#                                                1] += match_result
#         results['example']['accuracy'] = [
#             round(acc / (results['example']['num'] // 5), 2)
#             for acc in results['example']['accuracy']
#         ]
#         ######## Still not finished for rule-based evaluation

#         # Each process changes cwd, need to use multi-processing
#         with ProcessPoolExecutor(self.num_workers) as executor:
#             passed = sum(
#                 list(executor.map(self.score_single, predictions, references)))
#         return {'accuracy': passed / total}
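
# Minimal smoke-test sketch for the scoring helpers above. It is not part of
# the evaluation pipeline; the judge outputs and level assignments below are
# made-up toy values, and because of the relative imports this module only
# runs inside its package (e.g. via `python -m`).
if __name__ == '__main__':
    # Level-1 judgement: the judge's final line is a single YES/NO verdict.
    print(post_process_followbench({
        'prediction': 'Reasoning about the single constraint...\nYES',
        'gold': {'level': 1},
    }))  # -> (1, 1)

    # Level-3 judgement: the final line is a list with one verdict per
    # constraint; two of three satisfied gives a soft score of 2/3.
    print(post_process_followbench({
        'prediction': "Checking each constraint...\n['YES', 'NO', 'YES']",
        'gold': {'level': 3},
    }))  # -> (0, 0.666...)

    # One toy sample per difficulty level; the level-5 judgement failed to
    # parse (-1, -1) and is dropped from its level's denominator.
    judged = [(1, 1.0), (1, 1.0), (0, 2 / 3), (0, 0.5), (-1, -1)]
    refs = [{'level': lvl} for lvl in range(1, 6)]
    print(get_scores(judged, refs))  # HSR_AVG, SSR_AVG and per-level scores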