mirror of https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
Update

commit 7605cc2ca4
parent 8fc6343119
@@ -1,14 +1,24 @@
+"""
+Summary: A config for KoR-Bench Evaluation.
+Setting:
+  Shot: 0-shot
+  Evaluator:
+    - CascadeEvaluator
+    - korbenchEvaluator
+    - GenericLLMEvaluator
+  Repeat: 1
+Available Models:
+  - Instruct/Chat Models
+"""
 from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
-from opencompass.evaluator import GenericLLMEvaluator
+from opencompass.evaluator import GenericLLMEvaluator, CascadeEvaluator
 from opencompass.datasets import generic_llmjudge_postprocess
 from opencompass.utils import xml_tag_postprocessor
 
 categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle']
 
 
 GRADER_TEMPLATE = """
 As a grading expert, please judge whether the final answers given by the candidates below are consistent with the standard answers, i.e., whether the candidates answered correctly.
 
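Note: this hunk swaps the single LLM judge for a cascade. The intent is that the rule-based korbenchEvaluator scores every prediction first, and only samples it cannot verify are escalated to the GenericLLMEvaluator judge, saving judge-model calls. A minimal, self-contained sketch of that pattern (illustrative only; OpenCompass's CascadeEvaluator has its own interface, and exact_match/llm_judge below are hypothetical stand-ins):

from typing import Callable, Optional

def cascade_score(
    prediction: str,
    reference: str,
    rule_eval: Callable[[str, str], Optional[bool]],
    llm_eval: Callable[[str, str], bool],
) -> bool:
    # Try the cheap rule-based check first; fall back to the LLM judge
    # only when the rule evaluator cannot reach a verdict.
    verdict = rule_eval(prediction, reference)
    if verdict is not None:
        return verdict
    return llm_eval(prediction, reference)

def exact_match(pred: str, ref: str) -> Optional[bool]:
    # Confident only on an exact hit; a miss may still be a correct
    # free-form answer, so return None to escalate instead of False.
    if pred.strip().lower() == ref.strip().lower():
        return True
    return None

def llm_judge(pred: str, ref: str) -> bool:
    # Stand-in for the judge model prompted with GRADER_TEMPLATE.
    raise NotImplementedError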
@@ -50,7 +60,7 @@ for category in categories:
             round=[
                 dict(
                     role='HUMAN',
-                    prompt='{prompt}'  # f-string
+                    prompt='{prompt}'  # f-string
                 )
             ]
         )
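Note: despite the `# f-string` comment carried through this hunk, '{prompt}' is not a Python f-string; it is a placeholder that the prompt template fills per sample at inference time, str.format-style. A tiny illustration (the real substitution happens inside OpenCompass's PromptTemplate):

template = 'Question: {prompt}\nAnswer:'
sample = {'prompt': 'Decrypt "KHOOR" with a Caesar shift of 3.'}
print(template.format(**sample))
# Question: Decrypt "KHOOR" with a Caesar shift of 3.
# Answer: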
@@ -66,41 +76,45 @@ for category in categories:
     infer_cfg = dict(
         prompt_template=prompt_template,
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=1024),
+        inferencer=dict(type=GenInferencer),
     )
 
     # Evaluation configuration
     eval_cfg = dict(
         evaluator=dict(
-            type=GenericLLMEvaluator,
-            prompt_template=dict(
-                type=PromptTemplate,
-                template=dict(
-                    begin=[
-                        dict(
-                            role='SYSTEM',
-                            fallback_role='HUMAN',
-                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
-                    ],
-                    round=[
-                        dict(
-                            role='HUMAN',
-                            prompt = GRADER_TEMPLATE
-                        ),
-                    ]),
-            ),
-            dataset_cfg=dict(
-                type=korbenchDataset,
-                path='opencompass/korbench',
-                prompt_mode='0_shot',
-                category=category,
-                reader_cfg=reader_cfg,
-            ),
-            judge_cfg=dict(),
-            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+            type=CascadeEvaluator,
+            rule_evaluator=dict(
+                type=korbenchEvaluator,
+            ),
+            llm_evaluator=dict(
+                type=GenericLLMEvaluator,
+                prompt_template=dict(
+                    type=PromptTemplate,
+                    template=dict(
+                        begin=[
+                            dict(
+                                role='SYSTEM',
+                                fallback_role='HUMAN',
+                                prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
+                        ],
+                        round=[
+                            dict(
+                                role='HUMAN',
+                                prompt=GRADER_TEMPLATE
+                            ),
+                        ]),
+                ),
+                dataset_cfg=dict(
+                    type=korbenchDataset,
+                    path='opencompass/korbench',
+                    prompt_mode='0_shot',
+                    category=category,
+                    reader_cfg=reader_cfg,
+                ),
+                judge_cfg=dict(),
+                dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+                pred_postprocessor=dict(type=xml_tag_postprocessor, tag='<conclude>'),
+            ),
         ),
         pred_role='BOT',
     )
 
     # Dataset
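Note: these configs are declarative; every dict(type=SomeClass, ...) is instantiated later by a registry-backed builder, which is why the new rule_evaluator and llm_evaluator can nest as plain dicts. A toy version of that pattern (OpenCompass builds on mmengine-style registries; this sketch only shows the idea, not the actual builder):

def build_from_cfg(cfg: dict):
    cfg = dict(cfg)                  # copy so the caller's config survives
    cls = cfg.pop('type')            # the class to instantiate
    kwargs = {
        k: build_from_cfg(v) if isinstance(v, dict) and 'type' in v else v
        for k, v in cfg.items()      # recursively build nested configs
    }
    return cls(**kwargs)

class RuleEval:
    pass

class Cascade:
    def __init__(self, rule_evaluator):
        self.rule_evaluator = rule_evaluator

cascade = build_from_cfg(dict(type=Cascade, rule_evaluator=dict(type=RuleEval)))
assert isinstance(cascade.rule_evaluator, RuleEval)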
@@ -113,7 +127,7 @@ for category in categories:
         reader_cfg=reader_cfg,
         infer_cfg=infer_cfg,
         eval_cfg=eval_cfg,
         mode='singlescore',
         n=1,
     )
 
     korbench_0shot_single_datasets.append(korbench_dataset)
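Note: n=1 here matches "Repeat: 1" in the new docstring; presumably each sample is generated once and mode='singlescore' reports a single aggregate score per dataset. For intuition, with n > 1 an aggregation over repeats would look roughly like this (hypothetical aggregation, not OpenCompass code):

from statistics import mean

def aggregate(scores_per_run: list[list[bool]]) -> float:
    # One accuracy per repeated run, then the mean over the n repeats.
    return mean(mean(run) for run in scores_per_run)

print(aggregate([[True, False, True]]))  # n=1 -> 0.666...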
@@ -4,7 +4,6 @@ from opencompass.openicl.icl_inferencer import GenInferencer
 
 from opencompass.evaluator import GenericLLMEvaluator
 from opencompass.datasets import LiveReasonBenchDataset, livereasonbench_postprocess
-from opencompass.utils import xml_tag_postprocessor
 
 
 GRADER_TEMPLATE = """
@@ -97,7 +96,7 @@ livereasonbench_infer_cfg = dict(
         ],
     )),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer, max_out_len=16384))
+    inferencer=dict(type=GenInferencer))
 
 livereasonbench_eval_cfg = dict(
     evaluator=dict(
@@ -122,23 +121,22 @@ livereasonbench_eval_cfg = dict(
             type=LiveReasonBenchDataset,
             path='opencompass/LiveReasonBench',
             reader_cfg=livereasonbench_reader_cfg,
             version='livereasonbench-20250428',
         ),
         judge_cfg=dict(),
         dict_postprocessor=dict(type=livereasonbench_postprocess),
-        pred_postprocessor=dict(type=xml_tag_postprocessor, tag='<conclude>'),
     ),
     pred_role='BOT',
 )
 
 livereasonbench_datasets = [
     dict(
-        abbr='LiveReasonBench-20241202',
+        abbr='LiveReasonBench-20250428',
         type=LiveReasonBenchDataset,
         path='opencompass/LiveReasonBench',
         reader_cfg=livereasonbench_reader_cfg,
         infer_cfg=livereasonbench_infer_cfg,
         eval_cfg=livereasonbench_eval_cfg,
-        version='livereasonbench-20241202',
         mode='singlescore',
+        version='livereasonbench-20250428',
         n=1
     )
 ]
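Note: the KoR-Bench config above gains pred_postprocessor=dict(type=xml_tag_postprocessor, tag='<conclude>') while this file drops it. Judging by its use, the postprocessor keeps only the text inside the named tag, so reasoning that precedes the conclusion does not confuse answer matching. A guess at the behavior (the real xml_tag_postprocessor lives in opencompass.utils and its signature may differ):

import re

def extract_tag(text: str, tag: str = 'conclude') -> str:
    # Keep the content of the last <tag>...</tag> block, if any;
    # otherwise fall back to the raw text.
    matches = re.findall(rf'<{tag}>(.*?)</{tag}>', text, flags=re.DOTALL)
    return matches[-1].strip() if matches else text.strip()

print(extract_tag('step 1 ... step 2 ... <conclude>42</conclude>'))  # -> 42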