zhangsongyang 2025-04-28 15:43:05 +00:00
parent 8fc6343119
commit 7605cc2ca4
2 changed files with 55 additions and 43 deletions

View File

@@ -1,14 +1,24 @@
"""
Summary: A config for KoR-Bench Evaluation.
Setting:
Shot: 0-shot
Evaluator:
- CascadeEvaluator
- korbenchEvaluator
- GenericLLMEvaluator
Repeat: 1
Avaliable Models:
- Instruct/Chat Models
"""
 from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
-from opencompass.evaluator import GenericLLMEvaluator
+from opencompass.evaluator import GenericLLMEvaluator, CascadeEvaluator
 from opencompass.datasets import generic_llmjudge_postprocess
-from opencompass.utils import xml_tag_postprocessor
 
 categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle']
 
 GRADER_TEMPLATE = """
 Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
@@ -50,7 +60,7 @@ for category in categories:
             round=[
                 dict(
                     role='HUMAN',
                     prompt='{prompt}' # f-string
                 )
             ]
         )
@@ -66,41 +76,45 @@ for category in categories:
     infer_cfg = dict(
         prompt_template=prompt_template,
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=1024),
+        inferencer=dict(type=GenInferencer),
     )
 
     # Evaluation configuration
     eval_cfg = dict(
         evaluator=dict(
-            type=GenericLLMEvaluator,
-            prompt_template=dict(
-                type=PromptTemplate,
-                template=dict(
-                    begin=[
-                        dict(
-                            role='SYSTEM',
-                            fallback_role='HUMAN',
-                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
-                    ],
-                    round=[
-                        dict(
-                            role='HUMAN',
-                            prompt = GRADER_TEMPLATE
-                        ),
-                    ]),
+            type=CascadeEvaluator,
+            rule_evaluator=dict(
+                type=korbenchEvaluator,
             ),
-            dataset_cfg=dict(
-                type=korbenchDataset,
-                path='opencompass/korbench',
-                prompt_mode='0_shot',
-                category=category,
-                reader_cfg=reader_cfg,
+            llm_evaluator=dict(
+                type=GenericLLMEvaluator,
+                prompt_template=dict(
+                    type=PromptTemplate,
+                    template=dict(
+                        begin=[
+                            dict(
+                                role='SYSTEM',
+                                fallback_role='HUMAN',
+                                prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
+                        ],
+                        round=[
+                            dict(
+                                role='HUMAN',
+                                prompt=GRADER_TEMPLATE
+                            ),
+                        ]),
+                ),
+                dataset_cfg=dict(
+                    type=korbenchDataset,
+                    path='opencompass/korbench',
+                    prompt_mode='0_shot',
+                    category=category,
+                    reader_cfg=reader_cfg,
+                ),
+                judge_cfg=dict(),
+                dict_postprocessor=dict(type=generic_llmjudge_postprocess),
             ),
-            judge_cfg=dict(),
-            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
-            pred_postprocessor=dict(type=xml_tag_postprocessor, tag='<conclude>'),
-        ),
-        pred_role='BOT',
+        )
     )
 
     # Dataset
@@ -113,7 +127,7 @@ for category in categories:
         reader_cfg=reader_cfg,
         infer_cfg=infer_cfg,
         eval_cfg=eval_cfg,
-        mode='singlescore',
+        n=1,
     )
     korbench_0shot_single_datasets.append(korbench_dataset)
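
The first file switches the KoR-Bench judge from a plain GenericLLMEvaluator to a CascadeEvaluator that places the rule-based korbenchEvaluator in front of the LLM judge. The sketch below illustrates the cascade idea under the usual interpretation (cheap rule check first, LLM judge only for answers the rule check rejects); the function name and signature are hypothetical, not OpenCompass internals. Note also that mode='singlescore' is replaced by n=1, matching the "Repeat: 1" line in the new docstring.

# Illustrative sketch only; cascade_score and its arguments are hypothetical,
# not OpenCompass's CascadeEvaluator API.
from typing import Callable, Dict, List

def cascade_score(
    predictions: List[str],
    references: List[str],
    rule_check: Callable[[str, str], bool],   # cheap check, e.g. exact/format match
    llm_judge: Callable[[str, str], bool],    # expensive check, e.g. a GRADER_TEMPLATE call
) -> Dict[str, float]:
    correct = 0
    llm_calls = 0
    for pred, ref in zip(predictions, references):
        if rule_check(pred, ref):
            correct += 1          # accepted by the rule evaluator, no LLM call needed
        else:
            llm_calls += 1        # escalate only the rejected samples to the LLM judge
            if llm_judge(pred, ref):
                correct += 1
    return {
        'accuracy': 100.0 * correct / max(len(predictions), 1),
        'llm_judge_calls': float(llm_calls),
    }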

View File

@@ -4,7 +4,6 @@ from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.evaluator import GenericLLMEvaluator
 from opencompass.datasets import LiveReasonBenchDataset, livereasonbench_postprocess
-from opencompass.utils import xml_tag_postprocessor
 
 GRADER_TEMPLATE = """
@@ -97,7 +96,7 @@ livereasonbench_infer_cfg = dict(
             ],
         )),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer, max_out_len=16384))
+    inferencer=dict(type=GenInferencer))
 
 livereasonbench_eval_cfg = dict(
     evaluator=dict(
@@ -122,23 +121,22 @@ livereasonbench_eval_cfg = dict(
             type=LiveReasonBenchDataset,
             path='opencompass/LiveReasonBench',
             reader_cfg=livereasonbench_reader_cfg,
+            version='livereasonbench-20250428',
         ),
         judge_cfg=dict(),
         dict_postprocessor=dict(type=livereasonbench_postprocess),
-        pred_postprocessor=dict(type=xml_tag_postprocessor, tag='<conclude>'),
     ),
-    pred_role='BOT',
 )
 
 livereasonbench_datasets = [
     dict(
-        abbr='LiveReasonBench-20241202',
+        abbr='LiveReasonBench-20250428',
         type=LiveReasonBenchDataset,
         path='opencompass/LiveReasonBench',
         reader_cfg=livereasonbench_reader_cfg,
         infer_cfg=livereasonbench_infer_cfg,
         eval_cfg=livereasonbench_eval_cfg,
-        version='livereasonbench-20241202',
-        mode='singlescore',
+        version='livereasonbench-20250428',
+        n=1
     )
 ]
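
Both dataset lists (korbench_0shot_single_datasets and livereasonbench_datasets) are meant to be pulled into a top-level OpenCompass run config. Below is a minimal sketch assuming the standard read_base pattern; the relative module paths are placeholders and must be adjusted to where these two config files actually live in the configs tree.

# Minimal run-config sketch; the import paths below are assumptions, adjust them
# to the real locations of the two files changed in this commit.
from mmengine.config import read_base

with read_base():
    from .datasets.korbench.korbench_single_0_shot_gen import \
        korbench_0shot_single_datasets  # hypothetical module path
    from .datasets.livereasonbench.livereasonbench_gen import \
        livereasonbench_datasets        # hypothetical module path

datasets = [*korbench_0shot_single_datasets, *livereasonbench_datasets]
# models = [...]  # add model configs here, then launch the evaluation with OpenCompass's run.py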