diff --git a/opencompass/configs/datasets/TheoremQA/ThroremQA_0shot_cot_gen_8acdf7.py b/opencompass/configs/datasets/TheoremQA/ThroremQA_0shot_cot_gen_8acdf7.py
new file mode 100644
index 00000000..0f12be57
--- /dev/null
+++ b/opencompass/configs/datasets/TheoremQA/ThroremQA_0shot_cot_gen_8acdf7.py
@@ -0,0 +1,57 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import (
+    TheoremQADataset,
+    TheoremQA_postprocess_v3,
+    TheoremQA_postprocess_v4,
+    TheoremQAEvaluatorV3,
+)
+
+TheoremQA_reader_cfg = dict(
+    input_columns=['Question', 'Answer_type'],
+    output_column='Answer',
+    train_split='test',
+)
+
+TheoremQA_prompt1 = """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:
+1. a numerical value like 0.1, no symbol and no unit at all.
+2. a list of number like [2, 3, 4].
+3. True/False.
+4. an option like (a), (b), (c), (d)
+"""
+TheoremQA_prompt2 = "Question: {Question}\nLet's think step by step."
+
+TheoremQA_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=TheoremQA_prompt1 + TheoremQA_prompt2,
+                ),
+            ]
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+# A fully correct evaluator would need an LLM to extract the answer; this evaluation logic will also produce a fair number of false negatives (FN).
+TheoremQA_eval_cfg = dict(
+    evaluator=dict(type=TheoremQAEvaluatorV3),
+    pred_postprocessor=dict(type=TheoremQA_postprocess_v4),
+)
+
+TheoremQA_datasets = [
+    dict(
+        abbr='TheoremQA',
+        type=TheoremQADataset,
+        path='./data/TheoremQA/test.csv',
+        reader_cfg=TheoremQA_reader_cfg,
+        infer_cfg=TheoremQA_infer_cfg,
+        eval_cfg=TheoremQA_eval_cfg,
+    )
+]
diff --git a/opencompass/datasets/TheoremQA/__init__.py b/opencompass/datasets/TheoremQA/__init__.py
index 6682cabd..a24a0ea1 100644
--- a/opencompass/datasets/TheoremQA/__init__.py
+++ b/opencompass/datasets/TheoremQA/__init__.py
@@ -1,4 +1,4 @@
 from .legacy import (TheoremQA_postprocess, TheoremQA_postprocess_v2,
                      TheoremQADataset)
 from .main import (TheoremQA_postprocess_v3, TheoremQADatasetV3,
-                   TheoremQAEvaluatorV3)
+                   TheoremQAEvaluatorV3, TheoremQA_postprocess_v4)
diff --git a/opencompass/datasets/TheoremQA/main.py b/opencompass/datasets/TheoremQA/main.py
index 4500d09d..7e06792e 100644
--- a/opencompass/datasets/TheoremQA/main.py
+++ b/opencompass/datasets/TheoremQA/main.py
@@ -30,6 +30,31 @@ def TheoremQA_postprocess_v3(text: str) -> str:
     answer = utils.answer_clean(["The answer is:", "The answer is", "the answer is"], text)
     return answer
 
+def TheoremQA_postprocess_v4(text: str) -> str:
+    """Extract the final answer and unwrap LaTeX inline-math delimiters.
+
+    The answer is first extracted with ``utils.answer_clean`` using the same
+    trigger phrases as ``TheoremQA_postprocess_v3``. A leading backslash +
+    open-paren and a trailing backslash + close-paren are then removed as
+    exact two-character tokens, so option answers such as ``(a)`` and a
+    LaTeX command at the start of the answer are left intact.
+
+    Args:
+        text: Raw model output.
+
+    Returns:
+        The cleaned answer with surrounding whitespace removed.
+    """
+    answer = utils.answer_clean(["The answer is:", "The answer is", "the answer is"], text)
+    answer = answer.strip()
+    # str.strip(chars) removes any of the given characters, not a literal
+    # prefix/suffix, so match the two-character delimiters explicitly.
+    if answer.startswith('\\('):
+        answer = answer[2:]
+    if answer.endswith('\\)'):
+        answer = answer[:-2]
+    return answer.strip()
+
 
 @ICL_EVALUATORS.register_module()
 class TheoremQAEvaluatorV3(BaseEvaluator):