fix and doc

Myhs-phz 2025-03-19 02:03:45 +00:00
parent cc9761e882
commit 716c02785c
10 changed files with 49 additions and 164 deletions

View File

@@ -2,7 +2,7 @@
name: IFEval
category: Instruction Following
paper: https://arxiv.org/pdf/2311.07911
configpath: opencompass/configs/datasets/IFEval/IFEval
configpath: opencompass/configs/datasets/IFEval/IFEval_gen.py
configpath_llmjudge: ''
- nphard:
name: NPHardEval
@@ -38,7 +38,7 @@
name: BigCodeBench
category: Code
paper: https://arxiv.org/pdf/2406.15877
configpath: opencompass/configs/datasets/bigcodebench
configpath: opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py
configpath_llmjudge: ''
- calm:
name: CaLM
@@ -56,8 +56,8 @@
name: KOR-Bench
category: Reasoning
paper: https://arxiv.org/pdf/2410.06526v1
configpath: opencompass/configs/datasets/korbench
configpath_llmjudge: ''
configpath: opencompass/configs/datasets/korbench/korbench_gen.py
configpath_llmjudge: opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py
- lawbench:
name: LawBench
category: Knowledge / Law
@@ -74,7 +74,7 @@
name: LiveCodeBench
category: Code
paper: https://arxiv.org/pdf/2403.07974
configpath: opencompass/configs/datasets/livecodebench
configpath: opencompass/configs/datasets/livecodebench/livecodebench_gen.py
configpath_llmjudge: ''
- livemathbench:
name: LiveMathBench
@@ -104,8 +104,8 @@
name: MuSR
category: Reasoning
paper: https://arxiv.org/pdf/2310.16049
configpath: opencompass/configs/datasets/musr
configpath_llmjudge: ''
configpath: opencompass/configs/datasets/musr/musr_gen.py
configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py
- needlebench:
name: NeedleBench
category: Long Context
@@ -236,8 +236,8 @@
name: AIME2024
category: Examination
paper: https://huggingface.co/datasets/Maxwell-Jia/AIME_2024
configpath: opencompass/configs/datasets/aime2024
configpath_llmjudge: ''
configpath: opencompass/configs/datasets/aime2024/aime2024_gen.py
configpath_llmjudge: opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py
- anli:
name: Adversarial NLI
category: Reasoning
@@ -282,8 +282,8 @@
name: BIG-Bench Hard
category: Reasoning
paper: https://arxiv.org/pdf/2210.09261
configpath: opencompass/configs/datasets/bbh
configpath_llmjudge: ''
configpath: opencompass/configs/datasets/bbh/bbh_gen.py
configpath_llmjudge: opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py
- BoolQ:
name: SuperGLUE / BoolQ
category: Knowledge
@@ -366,8 +366,8 @@
name: CMMLU
category: Understanding
paper: https://arxiv.org/pdf/2306.09212
configpath: opencompass/configs/datasets/cmmlu
configpath_llmjudge: ''
configpath: opencompass/configs/datasets/cmmlu/cmmlu_gen.py
configpath_llmjudge: opencompass/configs/datasets/cmmlu/cmmlu_llm_judge_gen.py
- cmnli:
name: CLUE / CMNLI
category: Reasoning
@@ -432,8 +432,8 @@
name: DROP (DROP Simple Eval)
category: Understanding
paper: https://arxiv.org/pdf/1903.00161
configpath: opencompass/configs/datasets/drop
configpath_llmjudge: ''
configpath: opencompass/configs/datasets/drop/drop_gen.py
configpath_llmjudge: opencompass/configs/datasets/drop/drop_llm_judge_gen.py
- ds1000:
name: DS-1000
category: Code
@@ -468,8 +468,8 @@
name: GPQA
category: Knowledge
paper: https://arxiv.org/pdf/2311.12022v1
configpath: opencompass/configs/datasets/gpqa
configpath_llmjudge: ''
configpath: opencompass/configs/datasets/gpqa/gpqa_gen.py
configpath_llmjudge: opencompass/configs/datasets/gpqa/gpqa_llm_judge_gen.py
- gsm8k:
name: GSM8K
category: Math
@@ -492,13 +492,13 @@
name: HellaSwag
category: Reasoning
paper: https://arxiv.org/pdf/1905.07830
configpath: opencompass/configs/datasets/hellaswag
configpath_llmjudge: ''
configpath: opencompass/configs/datasets/hellaswag/hellaswag_gen.py
configpath_llmjudge: opencompass/configs/datasets/hellaswag/hellaswag_llm_judge_gen.py
- humaneval:
name: HumanEval
category: Code
paper: https://arxiv.org/pdf/2107.03374v2
configpath: opencompass/configs/datasets/humaneval
configpath: opencompass/configs/datasets/humaneval/humaneval_gen.py
configpath_llmjudge: ''
- humaneval_cn:
name: HumanEval-CN
@@ -566,6 +566,12 @@
paper: https://arxiv.org/pdf/2103.03874
configpath: opencompass/configs/datasets/math
configpath_llmjudge: ''
- math500:
name: MATH500
category: Math
paper: https://github.com/openai/prm800k
configpath: opencompass/configs/datasets/math/math_prm800k_500_gen.py
configpath_llmjudge: opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py
- math401:
name: MATH 401
category: Math
@@ -606,8 +612,8 @@
name: MMLU
category: Understanding
paper: https://arxiv.org/pdf/2009.03300
configpath: opencompass/configs/datasets/mmlu
configpath_llmjudge: ''
configpath: opencompass/configs/datasets/mmlu/mmlu_gen.py
configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py
- mmlu_cf:
name: MMLU-CF
category: Understanding
@@ -618,8 +624,8 @@
name: MMLU-Pro
category: Understanding
paper: https://arxiv.org/pdf/2406.01574
configpath: opencompass/configs/datasets/mmlu_pro
configpath_llmjudge: ''
configpath: opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen.py
configpath_llmjudge: opencompass/configs/datasets/mmlu_pro/mmlu_pro_llm_judge_gen.py
- mmmlu:
name: MMMLU
category: Language / Understanding
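Each index entry now points at a concrete, importable config module (a *_gen.py or *_llm_judge_gen.py file) instead of a bare directory. A minimal sketch of how such a path is consumed from a run config, assuming the modules follow the usual convention of exporting a *_datasets list (the MMLU names below are only an illustration):

from mmengine.config import read_base

with read_base():
    # `configpath` entry: opencompass/configs/datasets/mmlu/mmlu_gen.py
    from opencompass.configs.datasets.mmlu.mmlu_gen import mmlu_datasets
    # the `configpath_llmjudge` entry (mmlu_llm_judge_gen.py) is imported the
    # same way when an LLM judge should score the predictions

datasets = [*mmlu_datasets]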

View File

@@ -1,55 +1,4 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BoolQDatasetV2
from opencompass.utils.text_postprocessors import (
    first_option_postprocess,
)
from mmengine.config import read_base
QUERY_TEMPLATE = """
Answer the following question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of AB. Think step by step before answering.
Passage: {passage}
Question: {question}
A. Yes
B. NO
""".strip()
BoolQ_reader_cfg = dict(
    input_columns=['question', 'passage'],
    output_column='label',
)
BoolQ_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)
BoolQ_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
)
BoolQ_datasets = [
    dict(
        abbr='BoolQ',
        type=BoolQDatasetV2,
        path='opencompass/boolq',
        reader_cfg=BoolQ_reader_cfg,
        infer_cfg=BoolQ_infer_cfg,
        eval_cfg=BoolQ_eval_cfg,
    )
]
with read_base():
    from .SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets  # noqa: F401, F403

View File

@@ -1,55 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BoolQDatasetV2
from opencompass.utils.text_postprocessors import (
    first_option_postprocess,
)
QUERY_TEMPLATE = """
Answer the following question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of AB. Think step by step before answering.
Passage: {passage}
Question: {question}
A. Yes
B. NO
""".strip()
BoolQ_reader_cfg = dict(
    input_columns=['question', 'passage'],
    output_column='label',
)
BoolQ_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)
BoolQ_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
)
BoolQ_datasets = [
    dict(
        abbr='BoolQ',
        type=BoolQDatasetV2,
        path='opencompass/boolq',
        reader_cfg=BoolQ_reader_cfg,
        infer_cfg=BoolQ_infer_cfg,
        eval_cfg=BoolQ_eval_cfg,
    )
]

View File

@@ -164,7 +164,7 @@ for _name in bbh_free_form_sets:
name=_name,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess, metric_name='score'),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)

View File

@@ -94,7 +94,7 @@ for category in categories:
category=category,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess, metric_name='score'),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)

View File

@@ -72,7 +72,7 @@ math_eval_cfg = dict(
reader_cfg=math_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess, metric_name='score'),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
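All three judge-based eval configs above drop the metric_name override from dict_postprocessor in the same way. A minimal sketch of the resulting shape, using string placeholders for the evaluator class and for the judge prompt/dataset fields these hunks do not show:

# Illustrative only: in the real configs `type` points at the imported judge
# evaluator class and postprocess function, not at strings.
math_eval_cfg = dict(
    evaluator=dict(
        type='GenericLLMEvaluator',  # placeholder for the judge evaluator used above
        judge_cfg=dict(),
        dict_postprocessor=dict(type='generic_llmjudge_postprocess'),  # no metric_name kwarg
    ),
    pred_role='BOT',
)
print(math_eval_cfg['evaluator']['dict_postprocessor'])  # {'type': 'generic_llmjudge_postprocess'}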

View File

@@ -61,7 +61,6 @@ def _generic_llmjudge_postprocess(judgement: str):
def generic_llmjudge_postprocess(
    output: dict,
    output_path: str,
    metric_name='accuracy',
) -> dict:
    judged_answers = []
    origial_responses = []
@@ -78,8 +77,8 @@ def generic_llmjudge_postprocess(
                get_logger().warning(
                    f'No gold answer for {k}, use empty string as reference!')
                references.append('')
    results = get_final_results(judged_answers, references, origial_responses,
                                metric_name)
    results = get_final_results(judged_answers, references, origial_responses)
    results['details'] = output
    return results
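With the metric_name hook removed, the judge score always comes back under the postprocessor's single built-in key (previously the default value, 'accuracy'). A self-contained toy, not the real get_final_results, illustrating that contract under the usual convention that an 'A' verdict from the judge marks a prediction as correct:

def toy_final_results(judged_answers, references):
    # Count 'A' (judge says correct) verdicts and report them under one fixed
    # key, since callers can no longer relabel the metric.
    correct = sum(1 for ans in judged_answers if ans == 'A')
    return {'accuracy': 100 * correct / max(len(references), 1)}

print(toy_final_results(['A', 'A', 'B'], ['ref1', 'ref2', 'ref3']))  # {'accuracy': 66.66...}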

View File

@@ -186,29 +186,17 @@ class DefaultSummarizer:
eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
else:
group_metrics = list(functools.reduce(lambda a, b: a & b, [set(dataset_metrics[dataset_abbr]) for dataset_abbr in sg['subsets']]))
group_metrics.append(default_metric)
for metric in group_metrics:
for dataset_abbr in sg['subsets']:
if metric == default_metric:
metric_default = dataset_metrics[dataset_abbr][0]
scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric_default] = \
parsed_results[model_abbr][dataset_abbr][metric_default]
eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
else:
scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = \
parsed_results[model_abbr][dataset_abbr][metric]
if need_smart_metric and len(group_metrics) > 1:
for metric in group_metrics:
for dataset_abbr in sg['subsets']:
scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
# if need_smart_metric and len(group_metrics) > 1:
# for metric in group_metrics:
# for dataset_abbr in sg['subsets']:
# scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
# eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
# else:
# group_metrics = [default_metric]
# for dataset_abbr in sg['subsets']:
# metric = dataset_metrics[dataset_abbr][0]
# scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
# eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
else:
group_metrics = [default_metric]
for dataset_abbr in sg['subsets']:
metric = dataset_metrics[dataset_abbr][0]
scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
result = {}
for metric in scores:
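The reinstated else branch keys each subset score under the group's default metric as '<dataset>@<metric>'. A small worked illustration of that loop with made-up names and numbers:

# Made-up stand-ins for the summarizer's state.
default_metric = 'naive_average'
parsed_results = {'model_a': {'sub1': {'score': 61.2}, 'sub2': {'score': 58.7}}}
dataset_metrics = {'sub1': ['score'], 'sub2': ['score']}
subsets = ['sub1', 'sub2']

scores = {}
for dataset_abbr in subsets:
    metric = dataset_metrics[dataset_abbr][0]
    scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = \
        parsed_results['model_a'][dataset_abbr][metric]

print(scores)  # {'naive_average': {'sub1@score': 61.2, 'sub2@score': 58.7}}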

View File

@@ -314,8 +314,6 @@ class OpenICLEvalTask(BaseTask):
'Model Postprocess Task: ' +
f'{task_abbr_from_cfg(self.cfg)}:{model_result_wo_details}')
# save evaluator config
# Save result
out_path = get_infer_output_path(
self.model_cfg,