From 5faee929dbb84baa4d32754d55312c9d9f9f3bb9 Mon Sep 17 00:00:00 2001 From: liushz Date: Sat, 12 Oct 2024 19:13:06 +0800 Subject: [PATCH] [Feature] Add GaoKaoMath Dataset for Evaluation & MATH Model Eval Config (#1589) * Add GaoKaoMath Dataset * Add MATH LLM Eval * Update GAOKAO Math Eval Dataset * Update GAOKAO Math Eval Dataset --- configs/datasets/gaokao_math/README.md | 108 +++++++++++++ .../gaokao_math/gaokao_math_gen_f5fd28.py | 48 ++++++ .../math/math_0shot_llm_judge_gen_393424.py | 78 ++++++++++ .../configs/datasets/gaokao_math/README.md | 108 +++++++++++++ .../gaokao_math/gaokao_math_gen_f5fd28.py | 48 ++++++ .../math/math_0shot_llm_judge_gen_393424.py | 78 ++++++++++ opencompass/datasets/__init__.py | 1 + opencompass/datasets/gaokao_math.py | 144 ++++++++++++++++++ opencompass/utils/model_postprocessors.py | 13 +- 9 files changed, 622 insertions(+), 4 deletions(-) create mode 100644 configs/datasets/gaokao_math/README.md create mode 100644 configs/datasets/gaokao_math/gaokao_math_gen_f5fd28.py create mode 100644 configs/datasets/math/math_0shot_llm_judge_gen_393424.py create mode 100644 opencompass/configs/datasets/gaokao_math/README.md create mode 100644 opencompass/configs/datasets/gaokao_math/gaokao_math_gen_f5fd28.py create mode 100644 opencompass/configs/datasets/math/math_0shot_llm_judge_gen_393424.py create mode 100644 opencompass/datasets/gaokao_math.py diff --git a/configs/datasets/gaokao_math/README.md b/configs/datasets/gaokao_math/README.md new file mode 100644 index 00000000..08253add --- /dev/null +++ b/configs/datasets/gaokao_math/README.md @@ -0,0 +1,108 @@ +# GaoKao MATH Answer Evaluation Dataset +A dataset for testing the performance of the model in the GaoKao MATH Answer Extraction task. +Now support the following format of GAOKAO math questions: +1. '单选题':Single choice question +2. '多选题':Multiple choice question +3. '填空题':Fill in the blank question, can be multiple blanks +4. 
'解答题':Answer question, can be multiple answers + +Sample data: +```json +[ + { + "id": "3b270bc4-570a-4d77-b122-a2fc372f7d6a", + "question": "过椭圆${x^2\\over {16}} +{ y^2 \\over {4}}=1$ %内一点$M(2,1)$ %引一条弦,使该弦被点$M$ %平分,则这条弦所在直线的方程为( ).\nA. $x+2y-4=0$ %\nB. $x-2y-4=0$ %\nC. $x+2y+4=0$ %\nD. $x-2y+4=0$ %\n\n", + "response": "本题主要考查直线与圆锥曲线.设所求直线与椭圆的一个交点为$A(x,y)$ %,由于中点$M(2,1)$ %,所以另一个交点$B$ %为$(4-x,2-y)$ %.因为$A$ %,$B$ %两点都在椭圆上,所以$x^2+4y^2=16$ %,$(4-x)^2+4(2-y)^2=16$ %,两式相减,整理可得$x+2y-4=0$ %.由于过$A$ %,$B$ %两点的直线只有一条,所以这条弦所在直线的方程为$x+2y-4=0$ %.故本题正确答案为A.\n答案是:A", + "extract_answer": "A", + "question_type": "单选题" + }, + { + "id": "d60e42d7-30ee-44f9-a94d-aff6a8127750", + "question": "若函数$f(x)$ 具有下列性质:1.定义域为$(-1,1)$ ;2.对于任意的$x,y\\in(-1,1)$ ,都有$f(x)+f(y)=f\\left({\\dfrac{x+y}{1+xy}}\\right)$ ;3.当$-1< x< 0$ 时,$f(x)>0$ ,则称函数$f(x)$ 为$δ$ 的函数$.$ 若函数$f(x)$ 为$δ$ 的函数,则以下结论正确的是$(\\quad)$\nA. $\nB. x)$ 为奇函数\nC. $\nD. x)$ 为偶函数\nE. $\nF. x)$ 为单调递减函数\nG. $\nH. x)$ 为单调递增函数\n\n", + "response": "函数$f(x)$ 为$δ$ 的函数,令$x=y=0$ ,则$f(0)+f(0)=f(0)$ ,即$f(0)=0$ ,令$y=-x$ ,则$f(x)+f(-x)=f\\left(\\dfrac{x-x}{1-{x}^{2}}\\right)=f(0)=0$ ,则$f(-x)=-f(x)$ ,即函数$f(x)$ 是奇函数,设$-1< x< y< 1$ ,则$f(x)-f(y)=f(x)+f(-y)=f\\left(\\dfrac{x-y}{1-xy}\\right)$ ,$∵-1< x< y< 1$ ,$∴-1< \\dfrac{x-y}{1-xy}< 0$ ,则$f\\left(\\dfrac{x-y}{1-xy}\\right)>0$ ,即$f(x)-f(y)>0$ ,则$f(x)>f(y)$ ,即$f(x)$ 在$(-1,1)$ 上是减函数.故选$AC.$ 本题考查函数的奇偶性和单调性的判断,注意运用定义法,考查运算能力和推理能力,属于中档题.可令$x=y=0$ ,求得$f(0)=0$ ,再令$y=-x$ 可得$f(-x)=-f(x)$ ,可得$f(x)$ 的奇偶性;再令$-1< x< y< 1$ ,运用单调性的定义,结合其偶性的定义可得其单调性.\n答案是:A; C", + "extract_answer": "A, C", + "question_type": "多选题" + }, + { + "id": "31b3f702-e60c-4a20-9a40-73bd72b92d1e", + "question": "请完成以下题目(1)曲线$$y=-5\\text{e}^{x}+3$$在点$$(0,-2)$$处的切线方程为___.(2)若曲线$$f(x)=x \\sin x+1$$在$$x=\\dfrac{ \\pi }{2}$$处的切线与直线$$ax+2y+1=0$$相互垂直,则实数$$a=$$___.\n\n", + "response": "(1)由$$y=-5\\text{e}^{x}+3$$,得$$y'=-5\\text{e}^{x}$$,所以切线的斜率$$k=y'|_{x=0}=-5$$,所以切线方程为$$y+2=-5(x-0)$$,即$$5x+y+2=0$$.(2)因为$$f'(x)= \\sin x+x \\cos x$$,所以$$f'\\left(\\dfrac{ \\pi 
}{2}\\right)= \\sin \\dfrac{ \\pi }{2}+\\dfrac{ \\pi }{2}\\cdot \\cos \\dfrac{ \\pi }{2}=1$$.又直线$$ax+2y+1=0$$的斜率为$$-\\dfrac{a}{2}$$,所以根据题意得$$1\\times \\left(-\\dfrac{a}{2}\\right)=-1$$,解得$$a=2$$.\n答案是:(1)$$5x+y+2=0$$ (2)$$2$$", + "extract_answer": "['(1)$$5x+y+2=0$$ (2)$$2$$']", + "question_type": "填空题" + }, + { + "id": "16878941-1772-4290-bc61-00b193d5cf70", + "question": "已知函数$f\\left( x \\right)=\\left| 2x-1 \\right|$.(1)若不等式$f\\left( x+\\frac{1}{2} \\right)\\ge 2m+1\\left( m > 0 \\right)$的解集为$\\left( -\\infty ,-2 \\right]\\bigcup \\left[ 2,+\\infty \\right)$,求实数$m$的值;(2)若不等式$f\\left( x \\right)\\le {{2}^{y}}+\\frac{a}{{{2}^{y}}}+\\left| 2x+3 \\right|$对任意的实数$x,y\\in R$恒成立,求实数$a$的最小值.\n\n", + "response": "(1)直接写出不等式,解含有绝对值的函数不等式即可;(2)这是恒成立求参的问题,根据绝对值三角不等式得到左侧函数的最值,再结合均值不等式得最值.(1)由条件得$\\left| 2x \\right|\\le 2m+1$得$-m-\\frac{1}{2}\\le x\\le m+\\frac{1}{2}$,所以$m=\\frac{3}{2}$.(2)原不等式等价于$\\left| 2x-1 \\right|-\\left| 2x+3 \\right|\\le {{2}^{y}}+\\frac{a}{{{2}^{y}}}$,而$\\left| 2x-1 \\right|-\\left| 2x+3 \\right|\\le \\left| \\left( 2x-1 \\right)-\\left( 2x+3 \\right) \\right|=4$,所以${{2}^{y}}+\\frac{a}{{{2}^{y}}}\\ge 4$,则$a\\ge {{\\left[ {{2}^{y}}\\left( 4-{{2}^{y}} \\right) \\right]}_{\\text{max}}}=4$,当且仅当$y=1$时取得.\n答案是:(1) $m=\\frac{3}{2}$;(2) 最小值为$a=4$.", + "extract_answer": [ + "(1) $m=\\frac{3}{2}$;(2) 最小值为$a=4$." + ], + "question_type": "解答题" + } +] +``` +## How to use + +### 1. Prepare the dataset +```bash +cd opencompass +cp -rf /cpfs01/shared/public/liuhongwei/data/gaokao_math_dataset/gaokao_math ./data +``` +📢:If you want to evaluate your own gaokao math data, replace `test_2k.json` (the file the dataset config loads) with your own data, following the format above. + +### 2. Set the evaluation model + +Open the dataset config `gaokao_math_gen_f5fd28.py` (in this directory) and set the model name and API URL for evaluation; multiple URLs are supported for acceleration. + +```python +... 
+ +gaokao_math_eval_cfg = dict( + evaluator=dict(type=GaoKaoMATHEvaluator, model_name='EVALUATE_MODEL_NAME', url=['http://0.0.0.0:23333/v1', 'http://...'])) + +... + +``` +We recommend the `Qwen2.5-72B-Instruct` model for evaluation. + + +### 3. Set Extractor model and run the evaluation + +```python +from mmengine.config import read_base +from opencompass.models import HuggingFacewithChatTemplate + + +with read_base(): + from opencompass.configs.datasets.gaokao_math.gaokao_math_gen_f5fd28 import gaokao_math_datasets + + +trained_qwen2_1_5b_model = [ # trained extractor model + dict( + type=HuggingFacewithChatTemplate, + abbr='gaokao_math_extractor_1_5b_v02', + path='/cpfs01/shared/public/liuhongwei/models/gaokao_math_trained/gaokao_math_extractor_1_5b_v02', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] + +datasets = sum([v for k, v in locals().items() if k.endswith("_datasets")], []) +models = sum([v for k, v in locals().items() if k.endswith("_model")], []) + +... +``` + +### 4. Run the evaluation + +```bash +python run.py eval.py --dump-eval-details # eval and dump the evaluation details to `results` folder +``` + + +### 5. 
Evaluation results + +| Evaluator / Extractor | Qwen2.5-72B-Instruct | gaokao_math_extractor_1.5b_v0.2 | +|-----------------------|-----------------------|----------------------------------| +| Qwen2.5-72B-Instruct (ACC) | 95.85 | 95.2 | diff --git a/configs/datasets/gaokao_math/gaokao_math_gen_f5fd28.py b/configs/datasets/gaokao_math/gaokao_math_gen_f5fd28.py new file mode 100644 index 00000000..80ae4264 --- /dev/null +++ b/configs/datasets/gaokao_math/gaokao_math_gen_f5fd28.py @@ -0,0 +1,48 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import GaoKaoMATHDataset, GaoKaoMATHEvaluator + + +MATH_CN_PROMPT=""" +你是一个数学阅卷专家,任务是从给定的回答句子中提取精确的关键答案。你必须只提供提取的关键答案,不包括任何额外的文字。 +— +我将为你提供一个问题、回答句子和问题类型。回答句子是对所提供问题的回应。利用提供的信息,你必须准确而精确地确定并从回答句子中提取预期的关键答案。请不要对问题发表主观看法。 + +对于单选题,答案应该是选项字母,例如 "A"; +对于多选题,答案应该是一个选项字母的列表,例如 ["A"] 或 ["A", "B", "C"]; +对于填空题,答案应该是一个填入空白处的答案列表,列表的数量应该与问题中的空白数量相同,例如 ["$$\\frac{{1}}{{2}}$$"] 或 ["$$\\frac{{1}}{{2}}$$", "2"]。 +对于问答题,类似填空题,为每个小问抽出相应答案,例如 ["$$\\frac{{1}}{{2}}$$"] 或 ["$$\\frac{{1}}{{2}}$$", "2"]。 + +如果回答句子提供了多个不同的答案,请仔细判断后面提供的答案是否是对前面答案的修正或修改。如果是这样,提取这个修正或修改后的答案作为最终答案。相反,如果回答句子在多个答案之间波动而没有明确的最终答案,你应该输出 [No valid answer]。 +— +问题类型: {question_type} +原始问题: {question} +回答: {response} +提取的关键答案: +""" + +gaokao_math_reader_cfg = dict(input_columns=['question', 'response', 'question_type'], output_column='extract_answer') + + +gaokao_math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt=MATH_CN_PROMPT), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +gaokao_math_eval_cfg = dict( + evaluator=dict(type=GaoKaoMATHEvaluator, model_name='Qwen/Qwen2.5-72B-Instruct', url=['http://22.8.73.119:23333/v1', 'http://22.8.4.97:23333/v1', 'http://22.8.22.254:23333/v1', 
'http://22.8.17.14:23333/v1'])) + +gaokao_math_datasets = [ + dict( + type=GaoKaoMATHDataset, + abbr='GaoKaoMATH', + path='./data/gaokao_math/test_2k.json', + reader_cfg=gaokao_math_reader_cfg, + infer_cfg=gaokao_math_infer_cfg, + eval_cfg=gaokao_math_eval_cfg) +] diff --git a/configs/datasets/math/math_0shot_llm_judge_gen_393424.py b/configs/datasets/math/math_0shot_llm_judge_gen_393424.py new file mode 100644 index 00000000..eb302c85 --- /dev/null +++ b/configs/datasets/math/math_0shot_llm_judge_gen_393424.py @@ -0,0 +1,78 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, GaoKaoMATHEvaluator +from opencompass.utils.model_postprocessors import naive_model_postprocess, xfinder_postprocess +from opencompass.utils.postprocessors.naive import MATH_NAVIE_PROMPT_TEMPLATE + +# ----------------------------- Eval Parameters ----------------------------- +## Postprocess function +post_func = 're' # 're', 'xfinder_model', 'naive_model' + +## Evaluate function +eval_func = 'naive_model' # 're', 'naive_model' + +## Model api url +xfinder_url = 'http://0.0.0.0:23333/v1' # for 'xFinder-qwen1505' if post_func is 'xfinder_model' +naive_model_name = 'Qwen/Qwen2.5-72B-Instruct' # replace with your model name +naive_model_url = ['http://22.8.6.22:23333/v1', 'http://22.8.67.84:23333/v1', 'http://22.8.72.81:23333/v1', 'http://22.9.42.143:23333/v1'] # Multi-apis for acceleration + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ] + ), + ), + 
retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +if post_func == 're': + pred_postprocessor = dict(type=math_postprocess_v2) +elif post_func == 'xfinder_model': + pred_postprocessor = dict( + type=xfinder_postprocess, + question_type='math', + model_name='xFinder-qwen1505', + num_processes=128, + api_url=xfinder_url, + ) +elif post_func == 'naive_model': + pred_postprocessor = dict( + type=naive_model_postprocess, + custom_instruction=MATH_NAVIE_PROMPT_TEMPLATE, + model_name=naive_model_name, + num_processes=64, + api_url=naive_model_url, + ) + +if eval_func == 're': + evaluator = dict(type=MATHEvaluator, version='v2') +elif eval_func == 'naive_model': + evaluator = dict( + type=GaoKaoMATHEvaluator, + model_name=naive_model_name, + url=naive_model_url, + ) + +math_eval_cfg = dict( + evaluator=evaluator, pred_postprocessor=pred_postprocessor, +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/gaokao_math/README.md b/opencompass/configs/datasets/gaokao_math/README.md new file mode 100644 index 00000000..08253add --- /dev/null +++ b/opencompass/configs/datasets/gaokao_math/README.md @@ -0,0 +1,108 @@ +# GaoKao MATH Answer Evaluation Dataset +A dataset for testing the performance of the model in the GaoKao MATH Answer Extraction task. +Now support the following format of GAOKAO math questions: +1. '单选题':Single choice question +2. '多选题':Multiple choice question +3. '填空题':Fill in the blank question, can be multiple blanks +4. '解答题':Answer question, can be multiple answers + +Sample data: +```json +[ + { + "id": "3b270bc4-570a-4d77-b122-a2fc372f7d6a", + "question": "过椭圆${x^2\\over {16}} +{ y^2 \\over {4}}=1$ %内一点$M(2,1)$ %引一条弦,使该弦被点$M$ %平分,则这条弦所在直线的方程为( ).\nA. $x+2y-4=0$ %\nB. $x-2y-4=0$ %\nC. $x+2y+4=0$ %\nD. 
$x-2y+4=0$ %\n\n", + "response": "本题主要考查直线与圆锥曲线.设所求直线与椭圆的一个交点为$A(x,y)$ %,由于中点$M(2,1)$ %,所以另一个交点$B$ %为$(4-x,2-y)$ %.因为$A$ %,$B$ %两点都在椭圆上,所以$x^2+4y^2=16$ %,$(4-x)^2+4(2-y)^2=16$ %,两式相减,整理可得$x+2y-4=0$ %.由于过$A$ %,$B$ %两点的直线只有一条,所以这条弦所在直线的方程为$x+2y-4=0$ %.故本题正确答案为A.\n答案是:A", + "extract_answer": "A", + "question_type": "单选题" + }, + { + "id": "d60e42d7-30ee-44f9-a94d-aff6a8127750", + "question": "若函数$f(x)$ 具有下列性质:1.定义域为$(-1,1)$ ;2.对于任意的$x,y\\in(-1,1)$ ,都有$f(x)+f(y)=f\\left({\\dfrac{x+y}{1+xy}}\\right)$ ;3.当$-1< x< 0$ 时,$f(x)>0$ ,则称函数$f(x)$ 为$δ$ 的函数$.$ 若函数$f(x)$ 为$δ$ 的函数,则以下结论正确的是$(\\quad)$\nA. $\nB. x)$ 为奇函数\nC. $\nD. x)$ 为偶函数\nE. $\nF. x)$ 为单调递减函数\nG. $\nH. x)$ 为单调递增函数\n\n", + "response": "函数$f(x)$ 为$δ$ 的函数,令$x=y=0$ ,则$f(0)+f(0)=f(0)$ ,即$f(0)=0$ ,令$y=-x$ ,则$f(x)+f(-x)=f\\left(\\dfrac{x-x}{1-{x}^{2}}\\right)=f(0)=0$ ,则$f(-x)=-f(x)$ ,即函数$f(x)$ 是奇函数,设$-1< x< y< 1$ ,则$f(x)-f(y)=f(x)+f(-y)=f\\left(\\dfrac{x-y}{1-xy}\\right)$ ,$∵-1< x< y< 1$ ,$∴-1< \\dfrac{x-y}{1-xy}< 0$ ,则$f\\left(\\dfrac{x-y}{1-xy}\\right)>0$ ,即$f(x)-f(y)>0$ ,则$f(x)>f(y)$ ,即$f(x)$ 在$(-1,1)$ 上是减函数.故选$AC.$ 本题考查函数的奇偶性和单调性的判断,注意运用定义法,考查运算能力和推理能力,属于中档题.可令$x=y=0$ ,求得$f(0)=0$ ,再令$y=-x$ 可得$f(-x)=-f(x)$ ,可得$f(x)$ 的奇偶性;再令$-1< x< y< 1$ ,运用单调性的定义,结合其偶性的定义可得其单调性.\n答案是:A; C", + "extract_answer": "A, C", + "question_type": "多选题" + }, + { + "id": "31b3f702-e60c-4a20-9a40-73bd72b92d1e", + "question": "请完成以下题目(1)曲线$$y=-5\\text{e}^{x}+3$$在点$$(0,-2)$$处的切线方程为___.(2)若曲线$$f(x)=x \\sin x+1$$在$$x=\\dfrac{ \\pi }{2}$$处的切线与直线$$ax+2y+1=0$$相互垂直,则实数$$a=$$___.\n\n", + "response": "(1)由$$y=-5\\text{e}^{x}+3$$,得$$y'=-5\\text{e}^{x}$$,所以切线的斜率$$k=y'|_{x=0}=-5$$,所以切线方程为$$y+2=-5(x-0)$$,即$$5x+y+2=0$$.(2)因为$$f'(x)= \\sin x+x \\cos x$$,所以$$f'\\left(\\dfrac{ \\pi }{2}\\right)= \\sin \\dfrac{ \\pi }{2}+\\dfrac{ \\pi }{2}\\cdot \\cos \\dfrac{ \\pi }{2}=1$$.又直线$$ax+2y+1=0$$的斜率为$$-\\dfrac{a}{2}$$,所以根据题意得$$1\\times \\left(-\\dfrac{a}{2}\\right)=-1$$,解得$$a=2$$.\n答案是:(1)$$5x+y+2=0$$ (2)$$2$$", + "extract_answer": "['(1)$$5x+y+2=0$$ (2)$$2$$']", + 
"question_type": "填空题" + }, + { + "id": "16878941-1772-4290-bc61-00b193d5cf70", + "question": "已知函数$f\\left( x \\right)=\\left| 2x-1 \\right|$.(1)若不等式$f\\left( x+\\frac{1}{2} \\right)\\ge 2m+1\\left( m > 0 \\right)$的解集为$\\left( -\\infty ,-2 \\right]\\bigcup \\left[ 2,+\\infty \\right)$,求实数$m$的值;(2)若不等式$f\\left( x \\right)\\le {{2}^{y}}+\\frac{a}{{{2}^{y}}}+\\left| 2x+3 \\right|$对任意的实数$x,y\\in R$恒成立,求实数$a$的最小值.\n\n", + "response": "(1)直接写出不等式,解含有绝对值的函数不等式即可;(2)这是恒成立求参的问题,根据绝对值三角不等式得到左侧函数的最值,再结合均值不等式得最值.(1)由条件得$\\left| 2x \\right|\\le 2m+1$得$-m-\\frac{1}{2}\\le x\\le m+\\frac{1}{2}$,所以$m=\\frac{3}{2}$.(2)原不等式等价于$\\left| 2x-1 \\right|-\\left| 2x+3 \\right|\\le {{2}^{y}}+\\frac{a}{{{2}^{y}}}$,而$\\left| 2x-1 \\right|-\\left| 2x+3 \\right|\\le \\left| \\left( 2x-1 \\right)-\\left( 2x+3 \\right) \\right|=4$,所以${{2}^{y}}+\\frac{a}{{{2}^{y}}}\\ge 4$,则$a\\ge {{\\left[ {{2}^{y}}\\left( 4-{{2}^{y}} \\right) \\right]}_{\\text{max}}}=4$,当且仅当$y=1$时取得.\n答案是:(1) $m=\\frac{3}{2}$;(2) 最小值为$a=4$.", + "extract_answer": [ + "(1) $m=\\frac{3}{2}$;(2) 最小值为$a=4$." + ], + "question_type": "解答题" + } +] +``` +## How to use + +### 1. Prepare the dataset +```bash +cd opencompass +cp -rf /cpfs01/shared/public/liuhongwei/data/gaokao_math_dataset/gaokao_math ./data +``` +📢:If you want to evaluate your own gaokao math data, replace `test_2k.json` (the file the dataset config loads) with your own data, following the format above. + +### 2. Set the evaluation model + +Open the dataset config `gaokao_math_gen_f5fd28.py` (in this directory) and set the model name and API URL for evaluation; multiple URLs are supported for acceleration. + +```python +... + +gaokao_math_eval_cfg = dict( + evaluator=dict(type=GaoKaoMATHEvaluator, model_name='EVALUATE_MODEL_NAME', url=['http://0.0.0.0:23333/v1', 'http://...'])) + +... + +``` +We recommend the `Qwen2.5-72B-Instruct` model for evaluation. + + +### 3. 
Set Extractor model and run the evaluation + +```python +from mmengine.config import read_base +from opencompass.models import HuggingFacewithChatTemplate + + +with read_base(): + from opencompass.configs.datasets.gaokao_math.gaokao_math_gen_f5fd28 import gaokao_math_datasets + + +trained_qwen2_1_5b_model = [ # trained extractor model + dict( + type=HuggingFacewithChatTemplate, + abbr='gaokao_math_extractor_1_5b_v02', + path='/cpfs01/shared/public/liuhongwei/models/gaokao_math_trained/gaokao_math_extractor_1_5b_v02', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] + +datasets = sum([v for k, v in locals().items() if k.endswith("_datasets")], []) +models = sum([v for k, v in locals().items() if k.endswith("_model")], []) + +... +``` + +### 4. Run the evaluation + +```bash +python run.py eval.py --dump-eval-details # eval and dump the evaluation details to `results` folder +``` + + +### 5. Evaluation results + +| Evaluator / Extractor | Qwen2.5-72B-Instruct | gaokao_math_extractor_1.5b_v0.2 | +|-----------------------|-----------------------|----------------------------------| +| Qwen2.5-72B-Instruct (ACC) | 95.85 | 95.2 | diff --git a/opencompass/configs/datasets/gaokao_math/gaokao_math_gen_f5fd28.py b/opencompass/configs/datasets/gaokao_math/gaokao_math_gen_f5fd28.py new file mode 100644 index 00000000..80ae4264 --- /dev/null +++ b/opencompass/configs/datasets/gaokao_math/gaokao_math_gen_f5fd28.py @@ -0,0 +1,48 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import GaoKaoMATHDataset, GaoKaoMATHEvaluator + + +MATH_CN_PROMPT=""" +你是一个数学阅卷专家,任务是从给定的回答句子中提取精确的关键答案。你必须只提供提取的关键答案,不包括任何额外的文字。 +— +我将为你提供一个问题、回答句子和问题类型。回答句子是对所提供问题的回应。利用提供的信息,你必须准确而精确地确定并从回答句子中提取预期的关键答案。请不要对问题发表主观看法。 + +对于单选题,答案应该是选项字母,例如 "A"; +对于多选题,答案应该是一个选项字母的列表,例如 ["A"] 或 ["A", "B", "C"]; 
+对于填空题,答案应该是一个填入空白处的答案列表,列表的数量应该与问题中的空白数量相同,例如 ["$$\\frac{{1}}{{2}}$$"] 或 ["$$\\frac{{1}}{{2}}$$", "2"]。 +对于问答题,类似填空题,为每个小问抽出相应答案,例如 ["$$\\frac{{1}}{{2}}$$"] 或 ["$$\\frac{{1}}{{2}}$$", "2"]。 + +如果回答句子提供了多个不同的答案,请仔细判断后面提供的答案是否是对前面答案的修正或修改。如果是这样,提取这个修正或修改后的答案作为最终答案。相反,如果回答句子在多个答案之间波动而没有明确的最终答案,你应该输出 [No valid answer]。 +— +问题类型: {question_type} +原始问题: {question} +回答: {response} +提取的关键答案: +""" + +gaokao_math_reader_cfg = dict(input_columns=['question', 'response', 'question_type'], output_column='extract_answer') + + +gaokao_math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt=MATH_CN_PROMPT), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +gaokao_math_eval_cfg = dict( + evaluator=dict(type=GaoKaoMATHEvaluator, model_name='Qwen/Qwen2.5-72B-Instruct', url=['http://22.8.73.119:23333/v1', 'http://22.8.4.97:23333/v1', 'http://22.8.22.254:23333/v1', 'http://22.8.17.14:23333/v1'])) + +gaokao_math_datasets = [ + dict( + type=GaoKaoMATHDataset, + abbr='GaoKaoMATH', + path='./data/gaokao_math/test_2k.json', + reader_cfg=gaokao_math_reader_cfg, + infer_cfg=gaokao_math_infer_cfg, + eval_cfg=gaokao_math_eval_cfg) +] diff --git a/opencompass/configs/datasets/math/math_0shot_llm_judge_gen_393424.py b/opencompass/configs/datasets/math/math_0shot_llm_judge_gen_393424.py new file mode 100644 index 00000000..eb302c85 --- /dev/null +++ b/opencompass/configs/datasets/math/math_0shot_llm_judge_gen_393424.py @@ -0,0 +1,78 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, GaoKaoMATHEvaluator +from opencompass.utils.model_postprocessors import naive_model_postprocess, xfinder_postprocess +from opencompass.utils.postprocessors.naive import 
MATH_NAVIE_PROMPT_TEMPLATE + +# ----------------------------- Eval Parameters ----------------------------- +## Postprocess function +post_func = 're' # 're', 'xfinder_model', 'naive_model' + +## Evaluate function +eval_func = 'naive_model' # 're', 'naive_model' + +## Model api url +xfinder_url = 'http://0.0.0.0:23333/v1' # for 'xFinder-qwen1505' if post_func is 'xfinder_model' +naive_model_name = 'Qwen/Qwen2.5-72B-Instruct' # replace with your model name +naive_model_url = ['http://22.8.6.22:23333/v1', 'http://22.8.67.84:23333/v1', 'http://22.8.72.81:23333/v1', 'http://22.9.42.143:23333/v1'] # Multi-apis for acceleration + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +if post_func == 're': + pred_postprocessor = dict(type=math_postprocess_v2) +elif post_func == 'xfinder_model': + pred_postprocessor = dict( + type=xfinder_postprocess, + question_type='math', + model_name='xFinder-qwen1505', + num_processes=128, + api_url=xfinder_url, + ) +elif post_func == 'naive_model': + pred_postprocessor = dict( + type=naive_model_postprocess, + custom_instruction=MATH_NAVIE_PROMPT_TEMPLATE, + model_name=naive_model_name, + num_processes=64, + api_url=naive_model_url, + ) + +if eval_func == 're': + evaluator = dict(type=MATHEvaluator, version='v2') +elif eval_func == 'naive_model': + evaluator = dict( + type=GaoKaoMATHEvaluator, + model_name=naive_model_name, + url=naive_model_url, + ) + +math_eval_cfg = dict( + evaluator=evaluator, pred_postprocessor=pred_postprocessor, +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + 
path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 357a6143..dbeaca4a 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -43,6 +43,7 @@ from .eprstmt import * # noqa: F401, F403 from .FinanceIQ import * # noqa: F401, F403 from .flores import * # noqa: F401, F403 from .game24 import * # noqa: F401, F403 +from .gaokao_math import * # noqa: F401, F403 from .GaokaoBench import * # noqa: F401, F403 from .govrepcrs import * # noqa: F401, F403 from .gpqa import * # noqa: F401, F403 diff --git a/opencompass/datasets/gaokao_math.py b/opencompass/datasets/gaokao_math.py new file mode 100644 index 00000000..87840b71 --- /dev/null +++ b/opencompass/datasets/gaokao_math.py @@ -0,0 +1,144 @@ +import concurrent.futures +import json +import re + +from datasets import Dataset + +from opencompass.models import OpenAISDK +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET, MODELS + +from .base import BaseDataset + +# from opencompass.utils import get_data_path + + +EVAL_PROMPT = """ +请你作为一个数学高考阅卷专家,判断下面的答案是否与标准答案一致,即考生是否回答正确。下面是一些评判标准: +1. 有些答案可能包含多项内容,可能有单选题,多选题,填空题等,只要答案与标准答案一致即可, 对于多选题和多个空的填空题,需要考生对应的选项或空都回答正确才算正确。 +2. 有些答案可能通过不同的方式表达,比如有些答案可能是一个数学表达式,有些答案可能是一个文字描述,只要表达的意思一致即可。且有些公式通过不同的方式表达,但等价,也是正确的。 +3. 你不需要重新计算问题答案,因为标准答案已经给出,只需要根据问题形式来判断考生的答案是否与标准答案一致,是否正确即可。 + +请你根据上述标准,判断下面的答案是否与标准答案一致,如果一致,请在最后输出\\boxed{{yes}}, 否则输出\\boxed{{no}}, 如果难以判断,请输出\\boxed{{no}}. 
+原问题:{question} +标准答案:{gold_answer} +考生答案:{answer} + +分析: +""" # noqa E501 + + +def extract_boxed_answer(text): + match = re.findall(r'\\boxed{(.+?)}', text) + if match: + return match[-1] + return None + + +@LOAD_DATASET.register_module() +class GaoKaoMATHDataset(BaseDataset): + + @staticmethod + def load(path: str): + # path = get_data_path(path, local_mode=True) + data = json.load(open(path)) + for i in range(len(data)): + data[i]['extract_answer'] = str(data[i]['extract_answer']) + dataset = Dataset.from_list(data) + return dataset + + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +]) + + +@ICL_EVALUATORS.register_module() +class GaoKaoMATHEvaluator(BaseEvaluator): + + def __init__(self, model_name, url, **kwargs): + if isinstance(url, str): + url = [url] + + self.model = [ + MODELS.build( + dict( + type=OpenAISDK, + path=model_name, + openai_api_base=url, + key='EMPTY', + query_per_second=1, + meta_template=api_meta_template, + temperature=kwargs.get('temperature', 0.01), + max_seq_len=kwargs.get('max_tokens', 8192), + )) for url in url + ] + + def batch_response(self, inputs): + batch_num = len(self.model) + batch_size = (len(inputs) + batch_num - 1) // batch_num + result_responses = [] + + with concurrent.futures.ThreadPoolExecutor( + max_workers=batch_num) as executor: + futures = [ + executor.submit(self.model[i].generate, + inputs[i * batch_size:(i + 1) * batch_size]) + for i in range(batch_num) + ] + for response in executor.map(lambda f: f.result(), futures): + result_responses.extend(response) + + return result_responses + + def score(self, predictions, references, origin_prompt): + if len(predictions) != len(references): + return {'error': 'preds and refrs have different length'} + questions = [item[0]['prompt'] for item in origin_prompt] + count = 0 + correct = 0 + details = [] + results = [] + inputs = [] + for pred, ref, ques in zip(predictions, references, questions): + 
inputs.append( + EVAL_PROMPT.format(answer=pred, gold_answer=ref, + question=ques)) + + result_responses = self.batch_response(inputs) + results = [ + extract_boxed_answer(result) == 'yes' + for result in result_responses + ] + for pred, ref, result, result_response in zip(predictions, references, + results, + result_responses): + detail = { + 'pred': pred, + 'answer': ref, + 'correct': False, + 'eval_model_response': result_response + } + count += 1 + if result: + correct += 1 + detail['correct'] = True + details.append(detail) + + detailed_result = { + 'accuracy': 100 * correct / count, + 'details': details + } + + return detailed_result + + +if __name__ == '__main__': + evaluator = GaoKaoMATHEvaluator('http://0.0.0.0:23333/v1', + temperature=0.01, + max_tokens=2048, + procs=8) + predictions = ['1', '2', '3'] + references = ['1', '2', '3'] + evaluator.score(predictions, references) diff --git a/opencompass/utils/model_postprocessors.py b/opencompass/utils/model_postprocessors.py index 760fcb7a..13690ad0 100644 --- a/opencompass/utils/model_postprocessors.py +++ b/opencompass/utils/model_postprocessors.py @@ -24,8 +24,11 @@ def gen_output_naive(ori_data, extractor): @TEXT_POSTPROCESSORS.register_module('naive') -def navie_model_postprocess(preds: list, model_name: str, - custom_instruction: str, api_url: Union[str, list], +def navie_model_postprocess(preds: list, + model_name: str, + custom_instruction: str, + api_url: Union[str, list], + num_processes: int = 8, **kwargs) -> list: """Postprocess the text extracted by custom model. Args: @@ -38,7 +41,7 @@ def navie_model_postprocess(preds: list, model_name: str, list: The postprocessed answers. 
""" - def _eval_pred(texts, extractor, num_processes=8): + def _eval_pred(texts, extractor, num_processes): ori_data = texts extracted_answers = [] batched_ori_data = [] @@ -60,7 +63,9 @@ def navie_model_postprocess(preds: list, model_name: str, model_name=model_name, custom_instruction=custom_instruction, url=api_url.split(',') if ',' in api_url else api_url) - calc_acc_func = partial(_eval_pred, extractor=extractor) + calc_acc_func = partial(_eval_pred, + extractor=extractor, + num_processes=num_processes) extracted_answers = calc_acc_func(format_data) return extracted_answers