zhangsongyang 2025-04-28 15:43:05 +00:00
parent 8fc6343119
commit 7605cc2ca4
2 changed files with 55 additions and 43 deletions

View File

@ -1,14 +1,24 @@
"""
Summary: A config for KoR-Bench Evaluation.
Setting:
Shot: 0-shot
Evaluator:
- CascadeEvaluator
- korbenchEvaluator
- GenericLLMEvaluator
Repeat: 1
Available Models:
- Instruct/Chat Models
"""
from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.evaluator import GenericLLMEvaluator, CascadeEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.utils import xml_tag_postprocessor
categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle']
GRADER_TEMPLATE = """
As a grading expert, please judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
@ -50,7 +60,7 @@ for category in categories:
round=[
dict(
role='HUMAN',
prompt='{prompt}' # f-string
prompt='{prompt}' # f-string
)
]
)
@ -66,41 +76,45 @@ for category in categories:
infer_cfg = dict(
prompt_template=prompt_template,
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
type=CascadeEvaluator,
rule_evaluator=dict(
type=korbenchEvaluator,
),
dataset_cfg=dict(
type=korbenchDataset,
path='opencompass/korbench',
prompt_mode='0_shot',
category=category,
reader_cfg=reader_cfg,
llm_evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=korbenchDataset,
path='opencompass/korbench',
prompt_mode='0_shot',
category=category,
reader_cfg=reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
pred_postprocessor=dict(type=xml_tag_postprocessor, tag='<conclude>'),
),
pred_role='BOT',
)
)
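For orientation, the CascadeEvaluator configured above is meant to let the cheap rule-based korbenchEvaluator score every prediction first and hand only the samples it cannot verify to the GenericLLMEvaluator judge. The standalone sketch below illustrates that control flow under that assumption; the helper names (cascade_evaluate, rule_check, llm_judge) are hypothetical and do not mirror OpenCompass internals.

    from typing import Callable, List

    def cascade_evaluate(predictions: List[str],
                         references: List[str],
                         rule_check: Callable[[str, str], bool],
                         llm_judge: Callable[[str, str], bool]) -> dict:
        """Score with the rule first; call the LLM judge only when the rule says wrong."""
        correct = 0
        llm_calls = 0
        for pred, ref in zip(predictions, references):
            if rule_check(pred, ref):      # cheap rule-based / exact match
                correct += 1
            else:                          # unresolved sample -> ask the LLM judge
                llm_calls += 1
                if llm_judge(pred, ref):
                    correct += 1
        return {'accuracy': 100.0 * correct / max(len(predictions), 1),
                'llm_judge_calls': llm_calls}

    # Toy usage: stand-ins for korbenchEvaluator and the LLM judge.
    stats = cascade_evaluate(['A', 'B'], ['A', 'C'],
                             rule_check=lambda p, r: p == r,
                             llm_judge=lambda p, r: False)
    print(stats)  # {'accuracy': 50.0, 'llm_judge_calls': 1}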
# Dataset
@ -113,7 +127,7 @@ for category in categories:
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
mode='singlescore',
n=1,
)
korbench_0shot_single_datasets.append(korbench_dataset)
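The pred_postprocessor added to the llm_evaluator above trims each prediction down to the content of its <conclude> tag before grading. As a rough, minimal sketch of that idea (not the actual xml_tag_postprocessor implementation, whose behavior may differ):

    import re

    def extract_tag(prediction: str, tag: str = 'conclude') -> str:
        """Return the text inside the last <tag>...</tag> pair, or the raw prediction."""
        matches = re.findall(rf'<{tag}>(.*?)</{tag}>', prediction, flags=re.DOTALL)
        return matches[-1].strip() if matches else prediction.strip()

    print(extract_tag('step-by-step reasoning ... <conclude>42</conclude>'))  # -> 42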

View File

@ -4,7 +4,6 @@ from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import LiveReasonBenchDataset, livereasonbench_postprocess
from opencompass.utils import xml_tag_postprocessor
GRADER_TEMPLATE = """
@ -97,7 +96,7 @@ livereasonbench_infer_cfg = dict(
],
)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=16384))
inferencer=dict(type=GenInferencer))
livereasonbench_eval_cfg = dict(
evaluator=dict(
@ -122,23 +121,22 @@ livereasonbench_eval_cfg = dict(
type=LiveReasonBenchDataset,
path='opencompass/LiveReasonBench',
reader_cfg=livereasonbench_reader_cfg,
version='livereasonbench-20250428',
),
judge_cfg=dict(),
dict_postprocessor=dict(type=livereasonbench_postprocess),
pred_postprocessor=dict(type=xml_tag_postprocessor, tag='<conclude>'),
),
pred_role='BOT',
)
livereasonbench_datasets = [
dict(
abbr='LiveReasonBench-20241202',
abbr='LiveReasonBench-20250428',
type=LiveReasonBenchDataset,
path='opencompass/LiveReasonBench',
reader_cfg=livereasonbench_reader_cfg,
infer_cfg=livereasonbench_infer_cfg,
eval_cfg=livereasonbench_eval_cfg,
version='livereasonbench-20241202',
mode='singlescore',
version='livereasonbench-20250428',
n=1
)
]
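Both files end by exporting a dataset list, which a top-level OpenCompass config would normally pull in via read_base and concatenate before attaching models. A minimal sketch, assuming hypothetical module names for the two config files changed in this commit:

    from mmengine.config import read_base

    with read_base():
        # Placeholder module paths; substitute the real config file names.
        from .korbench_0shot_llmjudge_gen import korbench_0shot_single_datasets
        from .livereasonbench_llmjudge_gen import livereasonbench_datasets

    datasets = [*korbench_0shot_single_datasets, *livereasonbench_datasets]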