Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

Commit 716c02785c ("fix and doc"), parent cc9761e882.
@@ -2,7 +2,7 @@
     name: IFEval
     category: Instruction Following
    paper: https://arxiv.org/pdf/2311.07911
-    configpath: opencompass/configs/datasets/IFEval/IFEval
+    configpath: opencompass/configs/datasets/IFEval/IFEval_gen.py
     configpath_llmjudge: ''
 - nphard:
     name: NPHardEval
@@ -38,7 +38,7 @@
     name: BigCodeBench
     category: Code
     paper: https://arxiv.org/pdf/2406.15877
-    configpath: opencompass/configs/datasets/bigcodebench
+    configpath: opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py
     configpath_llmjudge: ''
 - calm:
     name: CaLM
@@ -56,8 +56,8 @@
     name: KOR-Bench
     category: Reasoning
     paper: https://arxiv.org/pdf/2410.06526v1
-    configpath: opencompass/configs/datasets/korbench
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/korbench/korbench_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py
 - lawbench:
     name: LawBench
     category: Knowledge / Law
@@ -74,7 +74,7 @@
     name: LiveCodeBench
     category: Code
     paper: https://arxiv.org/pdf/2403.07974
-    configpath: opencompass/configs/datasets/livecodebench
+    configpath: opencompass/configs/datasets/livecodebench/livecodebench_gen.py
     configpath_llmjudge: ''
 - livemathbench:
     name: LiveMathBench
@@ -104,8 +104,8 @@
     name: MuSR
     category: Reasoning
     paper: https://arxiv.org/pdf/2310.16049
-    configpath: opencompass/configs/datasets/musr
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/musr/musr_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py
 - needlebench:
     name: NeedleBench
     category: Long Context
@@ -236,8 +236,8 @@
     name: AIME2024
     category: Examination
     paper: https://huggingface.co/datasets/Maxwell-Jia/AIME_2024
-    configpath: opencompass/configs/datasets/aime2024
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/aime2024/aime2024_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py
 - anli:
     name: Adversarial NLI
     category: Reasoning
@@ -282,8 +282,8 @@
     name: BIG-Bench Hard
     category: Reasoning
     paper: https://arxiv.org/pdf/2210.09261
-    configpath: opencompass/configs/datasets/bbh
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/bbh/bbh_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py
 - BoolQ:
     name: SuperGLUE / BoolQ
     category: Knowledge
@@ -366,8 +366,8 @@
     name: CMMLU
     category: Understanding
     paper: https://arxiv.org/pdf/2306.09212
-    configpath: opencompass/configs/datasets/cmmlu
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/cmmlu/cmmlu_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/cmmlu/cmmlu_llm_judge_gen.py
 - cmnli:
     name: CLUE / CMNLI
     category: Reasoning
@@ -432,8 +432,8 @@
     name: DROP (DROP Simple Eval)
     category: Understanding
     paper: https://arxiv.org/pdf/1903.00161
-    configpath: opencompass/configs/datasets/drop
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/drop/drop_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/drop/drop_llm_judge_gen.py
 - ds1000:
     name: DS-1000
     category: Code
@@ -468,8 +468,8 @@
     name: GPQA
     category: Knowledge
     paper: https://arxiv.org/pdf/2311.12022v1
-    configpath: opencompass/configs/datasets/gpqa
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/gpqa/gpqa_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/gpqa/gpqa_llm_judge_gen.py
 - gsm8k:
     name: GSM8K
     category: Math
@@ -492,13 +492,13 @@
     name: HellaSwag
     category: Reasoning
     paper: https://arxiv.org/pdf/1905.07830
-    configpath: opencompass/configs/datasets/hellaswag
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/hellaswag/hellaswag_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/hellaswag/hellaswag_llm_judge_gen.py
 - humaneval:
     name: HumanEval
     category: Code
     paper: https://arxiv.org/pdf/2107.03374v2
-    configpath: opencompass/configs/datasets/humaneval
+    configpath: opencompass/configs/datasets/humaneval/humaneval_gen.py
     configpath_llmjudge: ''
 - humaneval_cn:
     name: HumanEval-CN
@@ -566,6 +566,12 @@
     paper: https://arxiv.org/pdf/2103.03874
     configpath: opencompass/configs/datasets/math
     configpath_llmjudge: ''
+- math500:
+    name: MATH500
+    category: Math
+    paper: https://github.com/openai/prm800k
+    configpath: opencompass/configs/datasets/math/math_prm800k_500_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py
 - math401:
     name: MATH 401
     category: Math
@@ -606,8 +612,8 @@
     name: MMLU
     category: Understanding
     paper: https://arxiv.org/pdf/2009.03300
-    configpath: opencompass/configs/datasets/mmlu
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/mmlu/mmlu_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py
 - mmlu_cf:
     name: MMLU-CF
     category: Understanding
@@ -618,8 +624,8 @@
     name: MMLU-Pro
     category: Understanding
     paper: https://arxiv.org/pdf/2406.01574
-    configpath: opencompass/configs/datasets/mmlu_pro
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/mmlu_pro/mmlu_pro_llm_judge_gen.py
 - mmmlu:
     name: MMMLU
     category: Language / Understanding
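
Note: every configpath updated above now points at a concrete, directly loadable *_gen.py config instead of a bare directory. A minimal sketch of pulling one of these configs into a user eval config via mmengine's read_base(); the mmlu_datasets variable name follows the MMLU config's usual convention and is an assumption, not shown in this diff.

# Minimal sketch (not part of this commit): load a dataset config listed
# in the index above from a user eval config.
from mmengine.config import read_base

with read_base():
    # Path mirrors the updated configpath entry for MMLU.
    from opencompass.configs.datasets.mmlu.mmlu_gen import mmlu_datasets

datasets = [*mmlu_datasets]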
@@ -1,55 +1,4 @@
-from opencompass.openicl.icl_prompt_template import PromptTemplate
-from opencompass.openicl.icl_retriever import ZeroRetriever
-from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import BoolQDatasetV2
-from opencompass.utils.text_postprocessors import (
-    first_option_postprocess,
-)
+from mmengine.config import read_base
 
-QUERY_TEMPLATE = """
-Answer the following question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of AB. Think step by step before answering.
-
-Passage: {passage}
-
-Question: {question}
-
-A. Yes
-B. NO
-
-""".strip()
-
-BoolQ_reader_cfg = dict(
-    input_columns=['question', 'passage'],
-    output_column='label',
-)
-
-BoolQ_infer_cfg = dict(
-    prompt_template=dict(
-        type=PromptTemplate,
-        template=dict(
-            round=[
-                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
-            ]
-        ),
-    ),
-    retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer),
-)
-
-BoolQ_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
-    pred_role='BOT',
-    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
-)
-
-BoolQ_datasets = [
-    dict(
-        abbr='BoolQ',
-        type=BoolQDatasetV2,
-        path='opencompass/boolq',
-        reader_cfg=BoolQ_reader_cfg,
-        infer_cfg=BoolQ_infer_cfg,
-        eval_cfg=BoolQ_eval_cfg,
-    )
-]
+with read_base():
+    from .SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets  # noqa: F401, F403
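
After this change SuperGLUE_BoolQ_gen.py is only a thin alias, so existing configs that import BoolQ_datasets from it keep working and now resolve to the pinned SuperGLUE_BoolQ_gen_883d50 variant. A usage sketch; the SuperGLUE_BoolQ directory name is inferred from the relative import above, not spelled out in this diff.

from mmengine.config import read_base

with read_base():
    # Resolves through the alias file rewritten above.
    from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen import \
        BoolQ_datasets

datasets = [*BoolQ_datasets]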
@@ -1,55 +0,0 @@
-from opencompass.openicl.icl_prompt_template import PromptTemplate
-from opencompass.openicl.icl_retriever import ZeroRetriever
-from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import BoolQDatasetV2
-from opencompass.utils.text_postprocessors import (
-    first_option_postprocess,
-)
-
-QUERY_TEMPLATE = """
-Answer the following question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of AB. Think step by step before answering.
-
-Passage: {passage}
-
-Question: {question}
-
-A. Yes
-B. NO
-
-""".strip()
-
-BoolQ_reader_cfg = dict(
-    input_columns=['question', 'passage'],
-    output_column='label',
-)
-
-BoolQ_infer_cfg = dict(
-    prompt_template=dict(
-        type=PromptTemplate,
-        template=dict(
-            round=[
-                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
-            ]
-        ),
-    ),
-    retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer),
-)
-
-BoolQ_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
-    pred_role='BOT',
-    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
-)
-
-BoolQ_datasets = [
-    dict(
-        abbr='BoolQ',
-        type=BoolQDatasetV2,
-        path='opencompass/boolq',
-        reader_cfg=BoolQ_reader_cfg,
-        infer_cfg=BoolQ_infer_cfg,
-        eval_cfg=BoolQ_eval_cfg,
-    )
-]
@@ -164,7 +164,7 @@ for _name in bbh_free_form_sets:
                 name=_name,
             ),
             judge_cfg=dict(),
-            dict_postprocessor=dict(type=generic_llmjudge_postprocess, metric_name='score'),
+            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
         ),
         pred_role='BOT',
     )
@@ -94,7 +94,7 @@ for category in categories:
                 category=category,
             ),
             judge_cfg=dict(),
-            dict_postprocessor=dict(type=generic_llmjudge_postprocess, metric_name='score'),
+            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
        ),
        pred_role='BOT',
    )
@@ -72,7 +72,7 @@ math_eval_cfg = dict(
             reader_cfg=math_reader_cfg,
         ),
         judge_cfg=dict(),
-        dict_postprocessor=dict(type=generic_llmjudge_postprocess, metric_name='score'),
+        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
     ),
     pred_role='BOT',
 )
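
All three hunks above drop metric_name='score' from dict_postprocessor so the configs match the simplified postprocessor signature changed in the next hunk. A hedged sketch of the surrounding evaluator block; the GenericLLMEvaluator type and both import paths are assumptions, not shown in this diff.

# Hedged sketch of the evaluator block these hunks edit. prompt_template and
# dataset_cfg are omitted for brevity.
from opencompass.datasets import generic_llmjudge_postprocess  # assumed import path
from opencompass.evaluator import GenericLLMEvaluator  # assumed import path

eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        judge_cfg=dict(),  # filled with the judge model at runtime
        # metric_name is gone: the postprocessor now always reports its default metric.
        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
    ),
    pred_role='BOT',
)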
@@ -61,7 +61,6 @@ def _generic_llmjudge_postprocess(judgement: str):
 def generic_llmjudge_postprocess(
     output: dict,
     output_path: str,
-    metric_name='accuracy',
 ) -> dict:
     judged_answers = []
     origial_responses = []
@@ -78,8 +77,8 @@ def generic_llmjudge_postprocess(
             get_logger().warning(
                 f'No gold answer for {k}, use empty string as reference!')
             references.append('')
-    results = get_final_results(judged_answers, references, origial_responses,
-                                metric_name)
+    results = get_final_results(judged_answers, references, origial_responses)
 
     results['details'] = output
     return results
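
Dropping metric_name means get_final_results is always called the same way: the judge pipeline reports one default metric and keeps the raw judge outputs under results['details']. A self-contained toy of that scoring step; the A/B verdict convention and the result key are assumptions about get_final_results, which is not shown in this diff.

# Toy re-implementation, not the library function.
def toy_final_results(judged_answers, references, original_responses):
    # Assumed convention: the judge answers 'A' for correct, 'B' for incorrect.
    correct = sum(1 for ans in judged_answers if ans == 'A')
    accuracy = 100.0 * correct / len(judged_answers) if judged_answers else 0.0
    return {'accuracy': accuracy}


print(toy_final_results(['A', 'B', 'A'], ['yes', 'no', 'yes'], ['...'] * 3))
# -> {'accuracy': 66.66666666666667}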
@@ -186,29 +186,17 @@ class DefaultSummarizer:
                     eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
             else:
                 group_metrics = list(functools.reduce(lambda a, b: a & b, [set(dataset_metrics[dataset_abbr]) for dataset_abbr in sg['subsets']]))
                 group_metrics.append(default_metric)
+                for metric in group_metrics:
+                    for dataset_abbr in sg['subsets']:
+                        if metric == default_metric:
+                            metric_default = dataset_metrics[dataset_abbr][0]
+                            scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric_default] = \
+                                parsed_results[model_abbr][dataset_abbr][metric_default]
+                            eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
+                        else:
+                            scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = \
+                                parsed_results[model_abbr][dataset_abbr][metric]
-                if need_smart_metric and len(group_metrics) > 1:
-                    for metric in group_metrics:
-                        for dataset_abbr in sg['subsets']:
-                            scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
-                            eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
-                # if need_smart_metric and len(group_metrics) > 1:
-                #     for metric in group_metrics:
-                #         for dataset_abbr in sg['subsets']:
-                #             scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
-                #             eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
-                # else:
-                #     group_metrics = [default_metric]
-                #     for dataset_abbr in sg['subsets']:
-                #         metric = dataset_metrics[dataset_abbr][0]
-                #         scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
-                #         eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
-                else:
-                    group_metrics = [default_metric]
-                    for dataset_abbr in sg['subsets']:
-                        metric = dataset_metrics[dataset_abbr][0]
-                        scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
-                        eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
 
         result = {}
         for metric in scores:
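
The rewritten block above aggregates each metric shared by all subsets under its own name, while the default-metric column falls back to each subset's first reported metric (metric_default). A self-contained toy run of that logic; the variable names mirror the diff, the numbers and the 'naive_average' default are made up, and the eval_modes bookkeeping is omitted.

import functools

default_metric = 'naive_average'
model_abbr = 'my_model'
sg = {'subsets': ['mmlu_stem', 'mmlu_social']}
dataset_metrics = {'mmlu_stem': ['accuracy'], 'mmlu_social': ['accuracy']}
parsed_results = {'my_model': {'mmlu_stem': {'accuracy': 61.2},
                               'mmlu_social': {'accuracy': 73.4}}}
scores = {}

group_metrics = list(functools.reduce(
    lambda a, b: a & b,
    [set(dataset_metrics[dataset_abbr]) for dataset_abbr in sg['subsets']]))
group_metrics.append(default_metric)
for metric in group_metrics:
    for dataset_abbr in sg['subsets']:
        if metric == default_metric:
            # Default metric falls back to each subset's own first metric.
            metric_default = dataset_metrics[dataset_abbr][0]
            scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric_default] = \
                parsed_results[model_abbr][dataset_abbr][metric_default]
        else:
            scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = \
                parsed_results[model_abbr][dataset_abbr][metric]

print(scores)
# {'accuracy': {'mmlu_stem@accuracy': 61.2, 'mmlu_social@accuracy': 73.4},
#  'naive_average': {'mmlu_stem@accuracy': 61.2, 'mmlu_social@accuracy': 73.4}}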
@@ -314,8 +314,6 @@ class OpenICLEvalTask(BaseTask):
                 'Model Postprocess Task: ' +
                 f'{task_abbr_from_cfg(self.cfg)}:{model_result_wo_details}')
 
-        # save evaluator config
-
         # Save result
         out_path = get_infer_output_path(
             self.model_cfg,