2024-04-26 14:56:23 +08:00
|
|
|
# Most of the code in this file is copied from https://github.com/openai/simple-evals/blob/main/math_eval.py
|
|
|
|
from mmengine.config import read_base
|
2025-01-20 19:17:38 +08:00
|
|
|
|
2024-04-26 14:56:23 +08:00
|
|
|
# Model and dataset definitions are pulled in inside the `read_base()` context.
# NOTE(review): presumably this is what lets MMEngine treat the imported
# modules as base configs -- confirm against the MMEngine config docs.
with read_base():
    from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import models as hf_llama3_8b_instruct_model # noqa: F401, F403
    from opencompass.configs.models.hf_llama.hf_llama3_70b_instruct import models as hf_llama3_70b_instruct_model # noqa: F401, F403
    from opencompass.configs.datasets.math.math_llm_judge import math_datasets # noqa: F401, F403
|
2025-01-20 19:17:38 +08:00
|
|
|
|
2024-04-26 14:56:23 +08:00
|
|
|
from opencompass.datasets import math_judement_preprocess
|
2025-01-20 19:17:38 +08:00
|
|
|
from opencompass.openicl.icl_evaluator import LMEvaluator
|
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2024-04-26 14:56:23 +08:00
|
|
|
from opencompass.partitioners import NaivePartitioner, SizePartitioner
|
|
|
|
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
|
|
|
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
2025-01-20 19:17:38 +08:00
|
|
|
from opencompass.runners import LocalRunner, SlurmSequentialRunner
|
|
|
|
from opencompass.summarizers import AllObjSummarizer
|
2024-04-26 14:56:23 +08:00
|
|
|
from opencompass.tasks import OpenICLInferTask
|
|
|
|
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
|
|
|
|
|
|
|
# -------------Prompt Settings ----------------------------------------
|
|
|
|
# Judge prompt: few-shot examples of equivalent / non-equivalent answer pairs,
# followed by the pair the judge model must label with "[Yes]" or "[No]".
# `{obj_gold}` and `{prediction}` are format placeholders (presumably filled in
# by the evaluator with the reference answer and the model output -- confirm
# against LMEvaluator).
# NOTE(review): the four-space indent on the "Expression" lines follows the
# upstream simple-evals template; leading whitespace was not recoverable from
# this copy of the file -- confirm against the repository version.
eng_obj_prompt = """
Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications

Examples:

    Expression 1: $2x+3$
    Expression 2: $3+2x$

[Yes]

    Expression 1: 3/2
    Expression 2: 1.5

[Yes]

    Expression 1: $x^2+2x+1$
    Expression 2: $y^2+2y+1$

[No]

    Expression 1: $x^2+2x+1$
    Expression 2: $(x+1)^2$

[Yes]

    Expression 1: 3245/5
    Expression 2: 649

[No]
(these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications)

    Expression 1: 2/(-3)
    Expression 2: -2/3

[Yes]
(trivial simplifications are allowed)

    Expression 1: 72 degrees
    Expression 2: 72

[Yes]
(give benefit of the doubt to units)

    Expression 1: 64
    Expression 2: 64 square feet

[Yes]
(give benefit of the doubt to units)

    Expression 1: 64
    Expression 2:

[No]
(only mark as equivalent if both expressions are nonempty)

---

YOUR TASK


Respond with only "[Yes]" or "[No]" (without quotes). Do not include a rationale.
    Expression 1: {obj_gold}
    Expression 2: {prediction}

"""
|
2024-04-26 14:56:23 +08:00
|
|
|
|
|
|
|
# -------------Inference Stage ----------------------------------------
|
|
|
|
# Models under evaluation: a fresh list built from the imported 8B config.
models = [*hf_llama3_8b_instruct_model]
# Judge models: the imported 70B config list, used as-is (not copied).
judge_models = hf_llama3_70b_instruct_model

# English datasets get the LLM-judge eval_cfg in the loop further down;
# no Chinese datasets are configured here.
eng_datasets = [*math_datasets]
chn_datasets = []
# Concatenation builds a new list but shares the dataset dicts, so in-place
# changes made to entries of `eng_datasets` are visible through `datasets`.
datasets = eng_datasets + chn_datasets

# Output directory for this run.
work_dir = 'outputs/obj_all/'
|
|
|
|
|
|
|
|
# Replace the eval_cfg of every English dataset so that predictions are
# scored by an LLM judge (LMEvaluator) driven by `eng_obj_prompt`.
# The dataset dicts are mutated in place.
for d in eng_datasets:
    d['eval_cfg'] = dict(
        evaluator=dict(
            type=LMEvaluator,
            # If you need to preprocess the prediction before judging,
            # you can specify the pred_postprocessor function here
            pred_postprocessor=dict(type=math_judement_preprocess),
            prompt_template=dict(
                type=PromptTemplate,
                # Single-turn template: the whole judge prompt is sent as
                # one HUMAN message.
                template=dict(round=[
                    dict(role='HUMAN', prompt=eng_obj_prompt),
                ]),
            ),
        ),
        pred_role='BOT',
    )
|
|
|
|
|
|
|
|
# Inference configuration: tasks are partitioned with SizePartitioner
# (max_task_size=40000) and run through LocalRunner as OpenICLInferTask,
# with at most 256 workers.
infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=40000),
    runner=dict(
        type=LocalRunner,
        max_num_workers=256,
        task=dict(type=OpenICLInferTask),
    ),
)
|
|
|
|
|
|
|
|
# ------------- Evaluation Configuration --------------------------------
|
|
|
|
# Judge-stage configuration: SubjectiveSizePartitioner pairs the evaluated
# `models` with the `judge_models` in 'singlescore' mode, and LocalRunner
# runs the resulting SubjectiveEvalTask jobs with at most 16 workers.
# NOTE(review): `eval` shadows the Python builtin; presumably the framework
# looks this variable up by name -- do not rename without confirming.
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        max_task_size=80000,
        mode='singlescore',
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=SubjectiveEvalTask),
    ),
)
|
|
|
|
|
2025-01-20 19:17:38 +08:00
|
|
|
# Result summarizer for this config: AllObjSummarizer.
summarizer = dict(
    type=AllObjSummarizer,
)
|