[Feature] Support Math evaluation via judgemodel (#1094)

* support openai math evaluation

* support openai math evaluation

* support openai math evaluation

* support math llm judge

* support math llm judge
bittersweet1999 2024-04-26 14:56:23 +08:00 committed by GitHub
parent 41196c48ae
commit 6ba1c4937d
8 changed files with 311 additions and 8 deletions

View File

@@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess
QUERY_TEMPLATE = """
Solve the following math problem step by step. The last line of your response should be of the form ANSWER: $ANSWER (without quotes) where $ANSWER is the answer to the problem.
{problem}
Remember to put your answer on its own line after "ANSWER:", and you do not need to use a \\boxed command.
""".strip()
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(role="HUMAN", prompt=QUERY_TEMPLATE),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess))
math_datasets = [
dict(
type=MATHDataset,
abbr='math',
path='./data/math/math.json',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg)
]
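Because the judgement preprocessor added later in this PR extracts whatever follows the final `ANSWER:` line, it helps to see what a rendered prompt and a compliant reply look like. A minimal sketch, using plain `str.format` purely for illustration (at run time `PromptTemplate` fills the `{problem}` placeholder from the `problem` column):

# Illustration only: str.format stands in for PromptTemplate, assuming
# QUERY_TEMPLATE from the config above is in scope.
prompt = QUERY_TEMPLATE.format(problem='Simplify $(x+1)^2 - (x^2 + 2x)$.')
print(prompt)
# A compliant model reply ends with a line such as:
#   ANSWER: 1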

View File

@@ -0,0 +1,111 @@
# Most of the code in this file is copied from https://github.com/openai/simple-evals/blob/main/math_eval.py
from mmengine.config import read_base
with read_base():
from .models.hf_llama.hf_llama3_8b_instruct import models as hf_llama3_8b_instruct_model # noqa: F401, F403
from .models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model # noqa: F401, F403
from .models.hf_llama.hf_llama3_70b_instruct import models as hf_llama3_70b_instruct_model # noqa: F401, F403
from .datasets.math.math_llm_judge import math_datasets # noqa: F401, F403
from opencompass.models.openai_api import OpenAIAllesAPIN
from opencompass.datasets import math_judement_preprocess
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import AllObjSummarizer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
# ------------- Prompt Settings ----------------------------------------
eng_obj_prompt = """
Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications
Examples:
Expression 1: $2x+3$
Expression 2: $3+2x$
Result: [[Correct]]
Expression 1: 3/2
Expression 2: 1.5
Result: [[Correct]]
Expression 1: $x^2+2x+1$
Expression 2: $y^2+2y+1$
Result: [[Incorrect]]
Expression 1: $x^2+2x+1$
Expression 2: $(x+1)^2$
Result: [[Correct]]
Expression 1: 3245/5
Expression 2: 649
Result: [[Incorrect]]
(these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications)
Expression 1: 2/(-3)
Expression 2: -2/3
Result: [[Correct]]
(trivial simplifications are allowed)
Expression 1: 72 degrees
Expression 2: 72
Result: [[Correct]]
(give benefit of the doubt to units)
Expression 1: 64
Expression 2: 64 square feet
Result: [[Correct]]
(give benefit of the doubt to units)
---
YOUR TASK
Respond with only "Result: [[Correct]]" or "Result: [[Incorrect]]" (without quotes). Do not include a rationale.
Expression 1: {obj_gold}
Expression 2: {prediction}
""".strip()
# ------------- Inference Stage ----------------------------------------
# eval models
models = [*hf_llama3_8b_instruct_model]
# judge models
judge_models = hf_llama3_70b_instruct_model
eng_datasets = [*math_datasets]
chn_datasets = []
datasets = eng_datasets + chn_datasets
work_dir = 'outputs/obj_all/'
for d in eng_datasets:
d['eval_cfg']= dict(
evaluator=dict(
type=LMEvaluator,
# If you need to preprocess the prediction before judging,
# you can specify the pred_postprocessor function here
pred_postprocessor=dict(type=math_judement_preprocess),
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = eng_obj_prompt
),
]),
),
),
pred_role="BOT",
)
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=40000),
runner=dict(
type=LocalRunner,
max_num_workers=256,
task=dict(type=OpenICLInferTask)),
)
# ------------- Evaluation Configuration --------------------------------
eval = dict(
partitioner=dict(
type=SubjectiveSizePartitioner, max_task_size=80000, mode='singlescore', models=models, judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16, task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(
type=AllObjSummarizer
)
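To make the judge stage concrete: each prediction is first reduced to its final answer by `math_judement_preprocess`, bound to `{prediction}`, and the dataset reference is bound to `{obj_gold}` (see the `LMEvaluator` change below); the filled template is sent to the judge model, whose reply is later parsed for `[[Correct]]`/`[[Incorrect]]`. A rough sketch, assuming `eng_obj_prompt` from the config above is in scope:

# Illustration only: plain str.format stands in for the LMEvaluator machinery.
reference = 'x^2 + 2x + 1'   # reference taken from the dataset (illustrative value)
prediction = '(x+1)^2'       # output of math_judement_preprocess
request = eng_obj_prompt.format(obj_gold=reference, prediction=prediction)
print(request)
# Expected judge reply for this pair: "Result: [[Correct]]"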

View File

@@ -125,6 +125,15 @@ def normalize_final_answer(final_answer: str) -> str:
    return final_answer
+ANSWER_PATTERN = r'(?i)ANSWER\s*:\s*([^\n]+)'
+def extract_answer(response_text: str):
+    # We suggest returning an empty string rather than None when extraction fails
+    match = re.search(ANSWER_PATTERN, response_text)
+    return match.group(1) if match else ''
@LOAD_DATASET.register_module()
class MATHDataset(BaseDataset):
@@ -156,6 +165,12 @@ def math_postprocess(text: str) -> str:
    #     text.split('Final Answer: ', 1)[-1].split('\n\n')[0])
+@TEXT_POSTPROCESSORS.register_module('math_judement_preprocess')
+def math_judement_preprocess(text: str) -> str:
+    """Preprocess prediction before judgement."""
+    return extract_answer(text)
@TEXT_POSTPROCESSORS.register_module('math_postprocess_v2')
def math_postprocess_v2(text: str) -> str:
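As a quick check of the new helper, the pattern and function below are copied from the hunk above and run on two sample completions (a standalone sketch for illustration only):

import re

ANSWER_PATTERN = r'(?i)ANSWER\s*:\s*([^\n]+)'

def extract_answer(response_text: str):
    # Return '' rather than None when extraction fails, as noted above.
    match = re.search(ANSWER_PATTERN, response_text)
    return match.group(1) if match else ''

print(extract_answer('We expand and simplify.\nanswer: (x+1)^2'))  # -> '(x+1)^2'
print(extract_answer('No final line was produced'))                # -> ''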

View File

@@ -12,8 +12,6 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.registry import ICL_PROMPT_TEMPLATES
from opencompass.utils import build_dataset_from_cfg, build_model_from_cfg
from opencompass.utils.logging import get_logger
-from opencompass.utils.text_postprocessors import first_number_postprocess
-from opencompass.utils.types import get_type_from_cfg
def extract_dicts(data):
@@ -80,7 +78,7 @@ class LMEvaluator:
        dataset_cfg (ConfigDict, optional): The config of the dataset to be
            evaluated.
        pack_all_predictions (bool, optional): For multi-round evaluation, judge all rounds at once or judge each round separately.
-        postprocessor (ConfigDict): The model prediction's postprocessor
+        pred_postprocessor (ConfigDict): The model prediction's postprocessor
            config.
    """
@@ -92,7 +90,7 @@
        meta_review_prompt_template: Optional[ConfigDict] = None,
        pack_all_predictions: Optional[bool] = False,
        dataset_cfg: Optional[ConfigDict] = None,
-        postprocessor: ConfigDict = dict(type=first_number_postprocess)
+        pred_postprocessor: Optional[ConfigDict] = None,
    ) -> None:
        self.output_path = output_path
        out_dir, out_name = osp.split(output_path)
@@ -112,7 +110,6 @@
            batch_size=batch_size,
            output_json_filepath=out_dir,
            output_json_filename=out_name)
-        self.postprocessor = get_type_from_cfg(postprocessor)
        self.logger = get_logger()
        self.dataset_cfg = dataset_cfg
        self.pack_all_predictions = pack_all_predictions
@@ -163,7 +160,9 @@
        ):  # single chat for format like [['xxx', 'xxxx'], ['xxx', 'xxxx']]
            for i in range(len(predictions)):
                key = 'prediction' if i == 0 else f'prediction{i + 1}'
+                gold_key = 'obj_gold'
                pred_dict[key] = predictions[i]
+                pred_dict[gold_key] = references
            if judgements:
                for i in range(len(judgements)):
                    key = 'judgement' if i == 0 else f'judgement{i + 1}'
@@ -189,6 +188,10 @@
            if judgements:
                raise NotImplementedError(
                    'Meta-review judging is not supported on multi-round datasets')
+        else:
+            raise NotImplementedError(
+                f'Unsupported prediction {predictions[0][0]} of type {type(predictions[0][0])}; please check that the postprocessor applied to the prediction string is correct. We suggest returning an empty string rather than None.'
+            )
        if self.dataset_cfg:
            dataset = build_dataset_from_cfg(self.dataset_cfg)

View File

@@ -1,5 +1,6 @@
# flake8: noqa: F401, E501
from .alignmentbench import AlignmentBenchSummarizer
+from .all_obj import AllObjSummarizer
from .alpacaeval import AlpacaSummarizer
from .compass_arena import CompassArenaSummarizer
from .corev2 import Corev2Summarizer

View File

@@ -0,0 +1,122 @@
# flake8: noqa: E501
import csv
import os
import os.path as osp
import re
from collections import defaultdict
from datetime import datetime
import numpy as np
from mmengine import ConfigDict
from prettytable import from_csv
from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
from .utils import get_judgeanswer_and_reference, get_outdir
def post_process_allobj(judgement: str):
"""Input a string like below:
    xxx[[correct]]xxx, and extract the judgement.
"""
pattern = r'(?i)\[(incorrect|correct|正确|错误)\]'
matched_result = re.findall(pattern, judgement)
if matched_result:
content = matched_result[0].lower()
if content in ['correct', '正确']:
return {'score': 1}
elif content in ['incorrect', '错误']:
return {'score': 0}
else:
return None
def get_capability_results(
judged_answers,
references,
fout,
fout_flag,
model,
):
capability_ratings = defaultdict(int)
capability_counts = defaultdict(int)
for ans, ref in zip(judged_answers, references):
capability_ratings['total'] += ans['score']
capability_counts['total'] += 1
capability_avg_ratings = defaultdict(float)
for capability, total_score in capability_ratings.items():
capability_avg_ratings[
capability] = total_score / capability_counts[capability]
columns = list(capability_avg_ratings.keys())
columns.insert(0, columns.pop(columns.index('total')))
with open(fout, 'a+', newline='') as csvfile:
writer = csv.writer(csvfile)
if fout_flag == 0:
writer.writerow(['model'] + columns)
writer.writerow([model] +
[capability_avg_ratings[column] for column in columns])
class AllObjSummarizer:
    """Do the subjectivity analysis based on evaluation results.
Args:
config (ConfigDict): The configuration object of the evaluation task.
It's expected to be filled out at runtime.
"""
def __init__(self, config: ConfigDict, judge_type='single') -> None:
self.judge_type = judge_type
self.tasks = []
self.cfg = config
if self.judge_type == 'single':
self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
self.eval_model_abbrs = [
model_abbr_from_cfg(model) for model in self.eval_model_cfgs
]
elif self.judge_type == 'pair':
self.base_models = self.cfg['eval']['partitioner']['base_models']
self.compare_models = self.cfg['eval']['partitioner'][
'compare_models']
self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
self.judge_map = {'single': post_process_allobj}
self.judge_function = self.judge_map[self.judge_type]
def summarize(self,
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
"""Summarize the subjectivity analysis based on evaluation results.
Args:
time_str (str): Timestamp for file naming.
Returns:
pd.DataFrame: The summary results.
"""
if self.judge_type == 'single':
dataset_cfgs = self.cfg['datasets']
judge_model = self.judge_abbr
output_dir, results_folder = get_outdir(self.cfg, time_str)
for dataset in dataset_cfgs:
dataset_abbr = dataset_abbr_from_cfg(dataset)
fout = osp.join(
output_dir,
'judged-by--' + judge_model + '-' + dataset_abbr + '.csv')
fout_flag = 0
for eval_model_abbr in self.eval_model_abbrs:
subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
subdir_path = os.path.join(results_folder, subdir)
if os.path.isdir(subdir_path):
model = eval_model_abbr
judged_answers, references = get_judgeanswer_and_reference(
dataset, subdir_path, self.judge_function)
get_capability_results(judged_answers, references,
fout, fout_flag, model)
fout_flag += 1
else:
                        print(subdir_path + ' does not exist, please check!')
with open(fout, 'r') as f:
x = from_csv(f)
print(x)
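For reference, a small self-contained sketch of how the verdict parser behaves on typical judge replies (the function body restates the logic above so the snippet runs on its own):

import re

def post_process_allobj(judgement: str):
    # Same logic as above: pull the first [[Correct]]/[[Incorrect]] verdict.
    pattern = r'(?i)\[(incorrect|correct|正确|错误)\]'
    matched_result = re.findall(pattern, judgement)
    if matched_result:
        content = matched_result[0].lower()
        return {'score': 1} if content in ['correct', '正确'] else {'score': 0}
    return None

print(post_process_allobj('Result: [[Correct]]'))          # {'score': 1}
print(post_process_allobj('Result: [[Incorrect]]'))        # {'score': 0}
print(post_process_allobj('The judge produced no verdict'))  # None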

View File

@@ -139,7 +139,8 @@ class SubjectiveEvalTask(BaseTask):
            # If no predictions are found in the predictions dir
            assert osp.exists(filename) or osp.exists(
                osp.realpath(partial_filename)
-            ), 'No predictions found for {filename}.'.format(filename=filename)
+            ), 'No predictions found for {filename} and {partial_filename}'.format(
+                filename=filename, partial_filename=partial_filename)
            # If the Naive partitioner was used in the infer stage
            if osp.exists(osp.realpath(filename)):
@@ -188,10 +189,14 @@
            if fnmatch.fnmatch(ds_abbr, pattern):
                pred_postprocessor = model_postprocessors[pattern]
                break
-        if 'pred_postprocessor' in eval_cfg or pred_postprocessor:
-            kwargs = pred_postprocessor or eval_cfg['pred_postprocessor']
+        if 'pred_postprocessor' in eval_cfg['evaluator'] or pred_postprocessor:
+            kwargs = pred_postprocessor or eval_cfg['evaluator'][
+                'pred_postprocessor']
            proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
+            self.logger.info(f'Get postprocessor {proc}.')
            pred_strs = [proc(s, **kwargs) for s in pred_strs]
+        else:
+            self.logger.info('No postprocessor found.')
        return {
            'model_name': model_abbr_from_cfg(model_cfg),

View File

@@ -77,6 +77,17 @@ def get_config_from_arg(args) -> Config:
        if args.accelerator in ['vllm', 'lmdeploy']:
            config['models'] = change_accelerator(config['models'],
                                                  args.accelerator)
+            if 'eval' in config and 'partitioner' in config['eval']:
+                if 'models' in config['eval']['partitioner']:
+                    config['eval']['partitioner'][
+                        'models'] = change_accelerator(
+                            config['eval']['partitioner']['models'],
+                            args.accelerator)
+                if 'judge_models' in config['eval']['partitioner']:
+                    config['eval']['partitioner'][
+                        'judge_models'] = change_accelerator(
+                            config['eval']['partitioner']['judge_models'],
+                            args.accelerator)
        return config
    # parse dataset args
    if not args.datasets and not args.custom_dataset_path:
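The effect of this change is easiest to see on a toy config: when an accelerator is requested, the model lists under eval.partitioner (including judge_models) are now converted as well. A sketch with a hypothetical stand-in for change_accelerator, shown only to illustrate which config entries the new branch visits (the backend field is illustrative, not the real helper's output):

# Hypothetical stand-in for change_accelerator, illustration only.
def change_accelerator(models, accelerator):
    return [dict(m, backend=accelerator) for m in models]

config = {
    'models': [{'abbr': 'llama-3-8b-instruct-hf'}],
    'eval': {'partitioner': {
        'models': [{'abbr': 'llama-3-8b-instruct-hf'}],
        'judge_models': [{'abbr': 'llama-3-70b-instruct-hf'}],
    }},
}

accelerator = 'vllm'
config['models'] = change_accelerator(config['models'], accelerator)
if 'eval' in config and 'partitioner' in config['eval']:
    for key in ('models', 'judge_models'):
        if key in config['eval']['partitioner']:
            config['eval']['partitioner'][key] = change_accelerator(
                config['eval']['partitioner'][key], accelerator)

print(config['eval']['partitioner']['judge_models'])
# [{'abbr': 'llama-3-70b-instruct-hf', 'backend': 'vllm'}]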