fix and doc

Myhs-phz 2025-03-19 02:03:45 +00:00
parent cc9761e882
commit 716c02785c
10 changed files with 49 additions and 164 deletions

View File

@@ -2,7 +2,7 @@
     name: IFEval
     category: Instruction Following
     paper: https://arxiv.org/pdf/2311.07911
-    configpath: opencompass/configs/datasets/IFEval/IFEval
+    configpath: opencompass/configs/datasets/IFEval/IFEval_gen.py
     configpath_llmjudge: ''
 - nphard:
     name: NPHardEval
@@ -38,7 +38,7 @@
     name: BigCodeBench
     category: Code
     paper: https://arxiv.org/pdf/2406.15877
-    configpath: opencompass/configs/datasets/bigcodebench
+    configpath: opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py
     configpath_llmjudge: ''
 - calm:
     name: CaLM
@@ -56,8 +56,8 @@
     name: KOR-Bench
     category: Reasoning
     paper: https://arxiv.org/pdf/2410.06526v1
-    configpath: opencompass/configs/datasets/korbench
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/korbench/korbench_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py
 - lawbench:
     name: LawBench
     category: Knowledge / Law
@@ -74,7 +74,7 @@
     name: LiveCodeBench
     category: Code
     paper: https://arxiv.org/pdf/2403.07974
-    configpath: opencompass/configs/datasets/livecodebench
+    configpath: opencompass/configs/datasets/livecodebench/livecodebench_gen.py
     configpath_llmjudge: ''
 - livemathbench:
     name: LiveMathBench
@@ -104,8 +104,8 @@
     name: MuSR
     category: Reasoning
     paper: https://arxiv.org/pdf/2310.16049
-    configpath: opencompass/configs/datasets/musr
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/musr/musr_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py
 - needlebench:
     name: NeedleBench
     category: Long Context
@@ -236,8 +236,8 @@
     name: AIME2024
     category: Examination
     paper: https://huggingface.co/datasets/Maxwell-Jia/AIME_2024
-    configpath: opencompass/configs/datasets/aime2024
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/aime2024/aime2024_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py
 - anli:
     name: Adversarial NLI
     category: Reasoning
@@ -282,8 +282,8 @@
     name: BIG-Bench Hard
     category: Reasoning
     paper: https://arxiv.org/pdf/2210.09261
-    configpath: opencompass/configs/datasets/bbh
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/bbh/bbh_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py
 - BoolQ:
     name: SuperGLUE / BoolQ
     category: Knowledge
@@ -366,8 +366,8 @@
     name: CMMLU
     category: Understanding
     paper: https://arxiv.org/pdf/2306.09212
-    configpath: opencompass/configs/datasets/cmmlu
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/cmmlu/cmmlu_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/cmmlu/cmmlu_llm_judge_gen.py
 - cmnli:
     name: CLUE / CMNLI
     category: Reasoning
@@ -432,8 +432,8 @@
     name: DROP (DROP Simple Eval)
     category: Understanding
     paper: https://arxiv.org/pdf/1903.00161
-    configpath: opencompass/configs/datasets/drop
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/drop/drop_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/drop/drop_llm_judge_gen.py
 - ds1000:
     name: DS-1000
     category: Code
@@ -468,8 +468,8 @@
     name: GPQA
     category: Knowledge
     paper: https://arxiv.org/pdf/2311.12022v1
-    configpath: opencompass/configs/datasets/gpqa
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/gpqa/gpqa_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/gpqa/gpqa_llm_judge_gen.py
 - gsm8k:
     name: GSM8K
     category: Math
@@ -492,13 +492,13 @@
     name: HellaSwag
     category: Reasoning
     paper: https://arxiv.org/pdf/1905.07830
-    configpath: opencompass/configs/datasets/hellaswag
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/hellaswag/hellaswag_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/hellaswag/hellaswag_llm_judge_gen.py
 - humaneval:
     name: HumanEval
     category: Code
     paper: https://arxiv.org/pdf/2107.03374v2
-    configpath: opencompass/configs/datasets/humaneval
+    configpath: opencompass/configs/datasets/humaneval/humaneval_gen.py
     configpath_llmjudge: ''
 - humaneval_cn:
     name: HumanEval-CN
@@ -566,6 +566,12 @@
     paper: https://arxiv.org/pdf/2103.03874
     configpath: opencompass/configs/datasets/math
     configpath_llmjudge: ''
+- math500:
+    name: MATH500
+    category: Math
+    paper: https://github.com/openai/prm800k
+    configpath: opencompass/configs/datasets/math/math_prm800k_500_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py
 - math401:
     name: MATH 401
     category: Math
@@ -606,8 +612,8 @@
     name: MMLU
     category: Understanding
     paper: https://arxiv.org/pdf/2009.03300
-    configpath: opencompass/configs/datasets/mmlu
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/mmlu/mmlu_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py
 - mmlu_cf:
     name: MMLU-CF
     category: Understanding
@@ -618,8 +624,8 @@
     name: MMLU-Pro
     category: Understanding
     paper: https://arxiv.org/pdf/2406.01574
-    configpath: opencompass/configs/datasets/mmlu_pro
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/mmlu_pro/mmlu_pro_llm_judge_gen.py
 - mmmlu:
     name: MMMLU
     category: Language / Understanding
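
The index now points each benchmark at a concrete *_gen.py entry file (and, where one exists, a *_llm_judge_gen.py judge config) instead of a bare dataset directory. As a rough sketch of how such an entry is consumed, assuming the usual OpenCompass read_base pattern (the MMLU import below is only an illustration, not part of this commit):

from mmengine.config import read_base

with read_base():
    # `configpath` entry for MMLU from the index above; the *_llm_judge_gen.py
    # file listed under `configpath_llmjudge` can be swapped in the same way
    # when predictions should be graded by an LLM judge instead of rules.
    from opencompass.configs.datasets.mmlu.mmlu_gen import mmlu_datasets

datasets = [*mmlu_datasets]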

View File

@@ -1,55 +1,4 @@
-from opencompass.openicl.icl_prompt_template import PromptTemplate
-from opencompass.openicl.icl_retriever import ZeroRetriever
-from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import BoolQDatasetV2
-from opencompass.utils.text_postprocessors import (
-    first_option_postprocess,
-)
-
-QUERY_TEMPLATE = """
-Answer the following question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of AB. Think step by step before answering.
-Passage: {passage}
-Question: {question}
-A. Yes
-B. NO
-""".strip()
-
-BoolQ_reader_cfg = dict(
-    input_columns=['question', 'passage'],
-    output_column='label',
-)
-
-BoolQ_infer_cfg = dict(
-    prompt_template=dict(
-        type=PromptTemplate,
-        template=dict(
-            round=[
-                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
-            ]
-        ),
-    ),
-    retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer),
-)
-
-BoolQ_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
-    pred_role='BOT',
-    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
-)
-
-BoolQ_datasets = [
-    dict(
-        abbr='BoolQ',
-        type=BoolQDatasetV2,
-        path='opencompass/boolq',
-        reader_cfg=BoolQ_reader_cfg,
-        infer_cfg=BoolQ_infer_cfg,
-        eval_cfg=BoolQ_eval_cfg,
-    )
-]
+from mmengine.config import read_base
+
+with read_base():
+    from .SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets  # noqa: F401, F403

View File

@@ -1,55 +0,0 @@
-from opencompass.openicl.icl_prompt_template import PromptTemplate
-from opencompass.openicl.icl_retriever import ZeroRetriever
-from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import BoolQDatasetV2
-from opencompass.utils.text_postprocessors import (
-    first_option_postprocess,
-)
-
-QUERY_TEMPLATE = """
-Answer the following question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of AB. Think step by step before answering.
-Passage: {passage}
-Question: {question}
-A. Yes
-B. NO
-""".strip()
-
-BoolQ_reader_cfg = dict(
-    input_columns=['question', 'passage'],
-    output_column='label',
-)
-
-BoolQ_infer_cfg = dict(
-    prompt_template=dict(
-        type=PromptTemplate,
-        template=dict(
-            round=[
-                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
-            ]
-        ),
-    ),
-    retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer),
-)
-
-BoolQ_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
-    pred_role='BOT',
-    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
-)
-
-BoolQ_datasets = [
-    dict(
-        abbr='BoolQ',
-        type=BoolQDatasetV2,
-        path='opencompass/boolq',
-        reader_cfg=BoolQ_reader_cfg,
-        infer_cfg=BoolQ_infer_cfg,
-        eval_cfg=BoolQ_eval_cfg,
-    )
-]

View File

@@ -164,7 +164,7 @@ for _name in bbh_free_form_sets:
                 name=_name,
             ),
             judge_cfg=dict(),
-            dict_postprocessor=dict(type=generic_llmjudge_postprocess, metric_name='score'),
+            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
         ),
         pred_role='BOT',
     )

View File

@@ -94,7 +94,7 @@ for category in categories:
                 category=category,
             ),
             judge_cfg=dict(),
-            dict_postprocessor=dict(type=generic_llmjudge_postprocess, metric_name='score'),
+            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
         ),
         pred_role='BOT',
     )

View File

@@ -72,7 +72,7 @@ math_eval_cfg = dict(
            reader_cfg=math_reader_cfg,
        ),
        judge_cfg=dict(),
-       dict_postprocessor=dict(type=generic_llmjudge_postprocess, metric_name='score'),
+       dict_postprocessor=dict(type=generic_llmjudge_postprocess),
    ),
    pred_role='BOT',
)

View File

@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
     from .race_gen_69ee4f import race_datasets  # noqa: F401, F403

View File

@@ -61,7 +61,6 @@ def _generic_llmjudge_postprocess(judgement: str):
 def generic_llmjudge_postprocess(
     output: dict,
     output_path: str,
-    metric_name='accuracy',
 ) -> dict:
     judged_answers = []
     origial_responses = []
@@ -78,8 +77,8 @@ def generic_llmjudge_postprocess(
             get_logger().warning(
                 f'No gold answer for {k}, use empty string as reference!')
             references.append('')
-    results = get_final_results(judged_answers, references, origial_responses,
-                                metric_name)
+    results = get_final_results(judged_answers, references, origial_responses)
     results['details'] = output
     return results
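
With the metric_name parameter gone, the judge postprocessor always reports under a fixed key. A minimal, self-contained sketch of the reduction this implies, assuming the judge answers with the usual 'A' (correct) / 'B' (incorrect) verdicts and that get_final_results boils them down to a single accuracy figure; the helper below is illustrative, not the library's exact internals:

import re


def sketch_llmjudge_accuracy(judgements: list[str]) -> dict:
    """Toy stand-in for get_final_results: count 'A' verdicts, report accuracy."""
    correct = sum(
        1 for j in judgements
        if (m := re.search(r'\b([AB])\b', j)) and m.group(1) == 'A')
    accuracy = 100 * correct / len(judgements) if judgements else 0
    return {'accuracy': accuracy}


# Two 'A' verdicts out of three -> {'accuracy': 66.66...}
print(sketch_llmjudge_accuracy(['A', 'B', 'Answer: A']))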

View File

@@ -186,29 +186,17 @@ class DefaultSummarizer:
                     eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
             else:
                 group_metrics = list(functools.reduce(lambda a, b: a & b, [set(dataset_metrics[dataset_abbr]) for dataset_abbr in sg['subsets']]))
-                group_metrics.append(default_metric)
-                for metric in group_metrics:
-                    for dataset_abbr in sg['subsets']:
-                        if metric == default_metric:
-                            metric_default = dataset_metrics[dataset_abbr][0]
-                            scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric_default] = \
-                                parsed_results[model_abbr][dataset_abbr][metric_default]
-                            eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
-                        else:
-                            scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = \
-                                parsed_results[model_abbr][dataset_abbr][metric]
-                    eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
-                # if need_smart_metric and len(group_metrics) > 1:
-                #     for metric in group_metrics:
-                #         for dataset_abbr in sg['subsets']:
-                #             scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
-                #         eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
-                # else:
-                #     group_metrics = [default_metric]
-                #     for dataset_abbr in sg['subsets']:
-                #         metric = dataset_metrics[dataset_abbr][0]
-                #         scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
-                #         eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
+                if need_smart_metric and len(group_metrics) > 1:
+                    for metric in group_metrics:
+                        for dataset_abbr in sg['subsets']:
+                            scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
+                        eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
+                else:
+                    group_metrics = [default_metric]
+                    for dataset_abbr in sg['subsets']:
+                        metric = dataset_metrics[dataset_abbr][0]
+                        scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
+                        eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))

             result = {}
             for metric in scores:
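
The summarizer change above simply restores the original smart-metric branch in place of the metric == default_metric special-casing. A standalone toy run of the restored logic, with invented subset scores, shows the two paths (all shared metrics reported per subset, or each subset's first metric collected under the default metric); variable names mirror the snippet, the data is made up:

import functools

model_abbr = 'model-x'
default_metric = 'naive_average'
need_smart_metric = True
parsed_results = {'model-x': {'sub_a': {'accuracy': 80.0, 'score': 80.0},
                              'sub_b': {'accuracy': 60.0, 'score': 60.0}}}
dataset_metrics = {'sub_a': ['accuracy', 'score'], 'sub_b': ['accuracy', 'score']}
sg = {'subsets': ['sub_a', 'sub_b']}

scores = {}
group_metrics = list(functools.reduce(
    lambda a, b: a & b, [set(dataset_metrics[d]) for d in sg['subsets']]))
if need_smart_metric and len(group_metrics) > 1:
    # Every subset exposes the same metrics: report each metric separately.
    for metric in group_metrics:
        for dataset_abbr in sg['subsets']:
            scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = \
                parsed_results[model_abbr][dataset_abbr][metric]
else:
    # Fall back to each subset's first metric, grouped under the default metric.
    group_metrics = [default_metric]
    for dataset_abbr in sg['subsets']:
        metric = dataset_metrics[dataset_abbr][0]
        scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = \
            parsed_results[model_abbr][dataset_abbr][metric]

print(scores)  # e.g. scores['accuracy']['sub_a@accuracy'] == 80.0 (key order may vary)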

View File

@@ -314,8 +314,6 @@ class OpenICLEvalTask(BaseTask):
                 'Model Postprocess Task: ' +
                 f'{task_abbr_from_cfg(self.cfg)}:{model_result_wo_details}')

-        # save evaluator config
-
         # Save result
         out_path = get_infer_output_path(
             self.model_cfg,