diff --git a/.pre-commit-config-zh-cn.yaml b/.pre-commit-config-zh-cn.yaml index a1a7cfd4..46b698db 100644 --- a/.pre-commit-config-zh-cn.yaml +++ b/.pre-commit-config-zh-cn.yaml @@ -9,6 +9,7 @@ exclude: | opencompass/datasets/medbench/| opencompass/datasets/teval/| opencompass/datasets/NPHardEval/| + opencompass/datasets/TheoremQA| docs/zh_cn/advanced_guides/compassbench_intro.md ) repos: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 37352846..733e4a22 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,6 +9,7 @@ exclude: | opencompass/datasets/medbench/| opencompass/datasets/teval/| opencompass/datasets/NPHardEval/| + opencompass/datasets/TheoremQA| docs/zh_cn/advanced_guides/compassbench_intro.md ) repos: diff --git a/configs/datasets/TheoremQA/TheoremQA_5shot_gen_6f0af8.py b/configs/datasets/TheoremQA/TheoremQA_5shot_gen_6f0af8.py new file mode 100644 index 00000000..bc7cae9f --- /dev/null +++ b/configs/datasets/TheoremQA/TheoremQA_5shot_gen_6f0af8.py @@ -0,0 +1,45 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import TheoremQADatasetV3, TheoremQA_postprocess_v3, TheoremQAEvaluatorV3 + +with read_base(): + from .TheoremQA_few_shot_examples import examples + +num_shot = 5 +rounds = [] +for index, (query, response) in enumerate(examples[:num_shot]): + if index == 0: + desc = "You are supposed to provide a solution to a given problem.\n\n" + else: + desc = "" + rounds += [ + dict(role="HUMAN", prompt=f"{desc}Problem:\n{query}\nSolution:"), + dict(role="BOT", prompt=f"{response}") + ] +rounds += [dict(role="HUMAN", prompt="Problem:\n{Question}\nSolution:")] + +TheoremQA_reader_cfg = dict(input_columns=["Question", "Answer_type"], output_column="Answer", train_split="test", test_split="test") + +TheoremQA_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=dict(round=rounds)), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024, stopping_criteria=["Problem:", "Problem"]), +) + +TheoremQA_eval_cfg = dict( + evaluator=dict(type=TheoremQAEvaluatorV3), + pred_postprocessor=dict(type=TheoremQA_postprocess_v3) +) + +TheoremQA_datasets = [ + dict( + abbr="TheoremQA", + type=TheoremQADatasetV3, + path="data/TheoremQA/theoremqa_test.json", + reader_cfg=TheoremQA_reader_cfg, + infer_cfg=TheoremQA_infer_cfg, + eval_cfg=TheoremQA_eval_cfg, + ) +] diff --git a/configs/datasets/TheoremQA/TheoremQA_5shot_gen_a4f581.py b/configs/datasets/TheoremQA/TheoremQA_5shot_gen_a4f581.py new file mode 100644 index 00000000..00740094 --- /dev/null +++ b/configs/datasets/TheoremQA/TheoremQA_5shot_gen_a4f581.py @@ -0,0 +1,46 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HFDataset, TheoremQA_postprocess_v3, TheoremQAEvaluatorV3 + +TheoremQA_reader_cfg = dict(input_columns=["Question", "Answer_type"], output_column="Answer", train_split="test", test_split="test") + +TheoremQA_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are supposed to provide a solution to a given problem.\n\n\nProblem:\nIn a 10 Gigabit Ethernet network, the average size of a frame is 1500 bytes. If a burst of noise lasting 1ms interrupts the network, how many frames are lost?'), + dict(role='BOT', prompt='Solution:\nFirst, calculate the data rate in bytes/s:\n\n10 Gigabit/s * (1 Byte / 8 bits) = 1.25 * 10^9 Bytes/s\n\nNext, calculate the data loss in bytes due to the noise:\n\n1 ms * 1.25 * 10^9 Bytes/s = 1.25 * 10^6 Bytes\n\nFinally, divide the data loss by the average frame size to get the number of frames lost:\n\n1.25 * 10^6 Bytes / 1500 Bytes/frame ≈ 833.33 frames\nThe answer is 833.33'), + dict(role='HUMAN', prompt='\nProblem:\nGiven x = 0.157, what is the value of x \\times \\frac{\\prod_{n=1}^\\infty (1 - \\frac{x^2}{n^2 \\pi^2})}{\\sin(x)}?'), + dict(role='BOT', prompt="Solution:\nTo evaluate the expression $x \\times \\frac{\\prod_{n=1}^{\\infty} (1 - \\frac{x^2}{n^2 \\pi^2})}{\\sin(x)}$ given x = 0.157, we first recognize that the product in the numerator is related to the sine function through the Euler's reflection formula for the sine function, which can be expressed as:\n\n$$\\sin(x) = x \\prod_{n=1}^{\\infty} \\left(1 - \\frac{x^2}{n^2 \\pi^2}\\right)$$\n\nTherefore, the given expression simplifies to: $x \\times \\frac{\\sin(x)}{\\sin(x)}$\n\nBecause sin(x) in the numerator and denominator cancels out, the expression simplifies further to just x.\n\nSo, given x = 0.157, the value of the expression is 0.157. This result is derived from the properties of the sine function and does not require computational evaluation.\nThe answer is 0.157"), + dict(role='HUMAN', prompt='\nProblem:\nConsider the basis C of \\mathbb{R}^2 consisting of vectors u_1 = [2, 4] and u_2 = [1, -1]. If y = [8, 12], find the C-coordinate vector of y.'), + dict(role='BOT', prompt="Solution:\nThe goal is to express y as a linear comPbination of the basis vectors of C, i.e., $y = a\\cdot u_1 + b\\cdot u_2$, where a and b are the scalar coefficients that we want to find. These coefficients will form the C-coordinate vector of y, which we'll denote as $[a, b]_C$.\n\nGiven:\n- $u_1 = [2, 4]$,\n- $u_2 = [1, -1]$,\n- $y = [8, 12]$.\n\nWe need to solve the system of linear equations:\n2a + 1b = 8\n4a - 1b = 12\n\nLet's solve this system of equations to find a and b.\n\nThe solution to the system of equations is $a = \\frac{10}{3} and b = \\frac{4}{3}$. Therefore, the C-coordinate vector of y in the basis consisting of vectors u_1 = [2, 4] and u_2 = [1, -1] is $\\left[\\frac{10}{3}, \\frac{4}{3}\\right]_C$. \nLet's calculate the numerical value of $\\left[\x0crac{10}{3}, \x0crac{4}{3}\right]_C$ as [3.33, 1.33].\nThe answer is [3.33, 1.33]"), + dict(role='HUMAN', prompt='\nProblem:\nOne can draw a simple, connected planar graph with 200 vertices and 397 edges. Is this statement Trur or False?'), + dict(role='BOT', prompt="Solution:\nTo determine the answer, we can use Euler's formula for planar graphs, which states that for any finite, connected, planar graph, $V - E + F = 2$, where V is the number of vertices, E is the number of edges, and F is the number of faces.\n\nGiven the modified question, we have V = 200 vertices and E = 397 edges. We want to find if we can have a graph that satisfies these conditions, adhering to Euler's formula.\n\nFirst, let's rearrange Euler's formula to solve for F: F = E - V + 2\n\nSubstituting the given values: F = 397 - 200 + 2, F = 199\n\nThis means a graph with 200 vertices and 397 edges would have 199 faces. However, to determine the truth of this possibility, we should check if this graph doesn't violate any other planar graph constraints, particularly regarding the number of edges.\n\nFor a simple, connected planar graph, there's also a relationship between vertices, edges, and faces given by the inequality: $E \\leq 3V - 6$\n\nSubstituting V = 200 gives: $E \\leq 3*200 - 6 = 594$\n\nWith E = 397, the condition $E \\leq 594$ is satisfied, meaning it's theoretically possible in terms of the edge condition for a planar graph.\n\nTherefore, one can draw a simple, connected planar graph with 200 vertices and 397 edges, resulting in 199 faces, without violating the conditions for it to be planar according to both Euler's formula and the constraint on the maximum number of edges.\nThe answer is True"), + dict(role='HUMAN', prompt='\nProblem:\nGiven a finite group G, and a collection of permutations H on a set. Then (a) there always exists H such that G is isomorphic to H; (b) for any H, G is isomorphic to H; (c) G can never be isomorphic to H; (d) none of the above. Which option is correct?'), + dict(role='BOT', prompt="Solution:\nThis is based on Cayley's theorem, which states that every group G is isomorphic to a subgroup of the symmetric group acting on G. \nIn other words, for every finite group G, there exists a collection of permutations H (which in this context, can be thought of as the set of permutations representing the action of G on itself) such that G is isomorphic to H.\n\nTherefore, there always exists H such that G is isomorphic to H.\nThe answer is (a)"), + dict(role='HUMAN', prompt='\nProblem:\n{Question}'), + dict(role='BOT', prompt='Solution:\n{Answer}'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024, stopping_criteria=["USER:", "ASSISTANT:", "### Instruction:", "Response:", "", "[INST]", "Problem:"]), +) + +TheoremQA_eval_cfg = dict( + evaluator=dict(type=TheoremQAEvaluatorV3), + pred_postprocessor=dict(type=TheoremQA_postprocess_v3) +) + +TheoremQA_datasets = [ + dict( + abbr="TheoremQA", + type=HFDataset, + path="TIGER-Lab/TheoremQA", + reader_cfg=TheoremQA_reader_cfg, + infer_cfg=TheoremQA_infer_cfg, + eval_cfg=TheoremQA_eval_cfg, + ) +] diff --git a/configs/datasets/TheoremQA/TheoremQA_few_shot_examples.py b/configs/datasets/TheoremQA/TheoremQA_few_shot_examples.py new file mode 100644 index 00000000..578513d2 --- /dev/null +++ b/configs/datasets/TheoremQA/TheoremQA_few_shot_examples.py @@ -0,0 +1,22 @@ +examples = [ + ( + "In a 10 Gigabit Ethernet network, the average size of a frame is 1500 bytes. If a burst of noise lasting 1ms interrupts the network, how many frames are lost?", + "First, calculate the data rate in bytes/s:\n$$10 Gigabit/s * (1 Byte / 8 bits) = 1.25 * 10^9 Bytes/s$$\nNext, calculate the data loss in bytes due to the noise:\n$$1 ms * 1.25 * 10^9 Bytes/s = 1.25 * 10^6 Bytes$$\nFinally, divide the data loss by the average frame size to get the number of frames lost:\n$$1.25 * 10^6 Bytes / 1500 Bytes/frame \\approx 833.33 frames$$\nThe answer is 833.33", + ), + ( + "Given x = 0.157, what is the value of $x \\times \\frac{\\prod_{n=1}^\\infty (1 - \\frac{x^2}{n^2 \\pi^2})}{\\sin(x)}$?", + "To evaluate the expression $x \\times \\frac{\\prod_{n=1}^{\\infty} (1 - \\frac{x^2}{n^2 \\pi^2})}{\\sin(x)}$ given x = 0.157, we first recognize that the product in the numerator is related to the sine function through the Euler's reflection formula for the sine function, which can be expressed as:\n$$\\sin(x) = x \\prod_{n=1}^{\\infty} \\left(1 - \\frac{x^2}{n^2 \\pi^2}\\right)$$\nTherefore, the given expression simplifies to: $x \\times \\frac{\\sin(x)}{\\sin(x)}$\nBecause sin(x) in the numerator and denominator cancels out, the expression simplifies further to just x.\nSo, given x = 0.157, the value of the expression is 0.157. This result is derived from the properties of the sine function and does not require computational evaluation.\nThe answer is 0.157", + ), + ( + "Consider the basis C of $\\mathbb{R}^2$ consisting of vectors $u_1 = [2, 4]$ and $u_2 = [1, -1]$. If $y = [8, 12]$, find the C-coordinate vector of y.", + "The goal is to express y as a linear combination of the basis vectors of C, i.e., $y = a\\cdot u_1 + b\\cdot u_2$, where a and b are the scalar coefficients that we want to find. These coefficients will form the C-coordinate vector of y, which we'll denote as $[a, b]_C$.\nGiven:\n- $u_1 = [2, 4]$,\n- $u_2 = [1, -1]$,\n- $y = [8, 12]$.\nWe need to solve the system of linear equations:\n2a + 1b = 8\n4a - 1b = 12\nLet's solve this system of equations to find a and b.\nThe solution to the system of equations is $a = \\frac{10}{3} and b = \\frac{4}{3}$. Therefore, the C-coordinate vector of y in the basis consisting of vectors $u_1 = [2, 4]$ and $u_2 = [1, -1]$ is $\\left[\\frac{10}{3}, \\frac{4}{3}\\right]_C$.\nLet's calculate the numerical value of $\\left[\\frac{10}{3}, \\frac{4}{3}\\right]_C$ as [3.33, 1.33].\nThe answer is [3.33, 1.33]", + ), + ( + "One can draw a simple, connected planar graph with 200 vertices and 397 edges. Is this statement True or False?", + "To determine the answer, we can use Euler's formula for planar graphs, which states that for any finite, connected, planar graph, $V - E + F = 2$, where V is the number of vertices, E is the number of edges, and F is the number of faces.\nGiven the modified question, we have V = 200 vertices and E = 397 edges. We want to find if we can have a graph that satisfies these conditions, adhering to Euler's formula.\nFirst, let's rearrange Euler's formula to solve for F: F = E - V + 2\nSubstituting the given values: F = 397 - 200 + 2, F = 199\nThis means a graph with 200 vertices and 397 edges would have 199 faces. However, to determine the truth of this possibility, we should check if this graph doesn't violate any other planar graph constraints, particularly regarding the number of edges.\nFor a simple, connected planar graph, there's also a relationship between vertices, edges, and faces given by the inequality: $E \\leq 3V - 6$\nSubstituting V = 200 gives: $E \\leq 3*200 - 6 = 594$\nWith E = 397, the condition $E \\leq 594$ is satisfied, meaning it's theoretically possible in terms of the edge condition for a planar graph.\nTherefore, one can draw a simple, connected planar graph with 200 vertices and 397 edges, resulting in 199 faces, without violating the conditions for it to be planar according to both Euler's formula and the constraint on the maximum number of edges.\nThe answer is True", + ), + ( + "Given a finite group G, and a collection of permutations H on a set. Then (a) there always exists H such that G is isomorphic to H; (b) for any H, G is isomorphic to H; (c) G can never be isomorphic to H; (d) none of the above. Which option is correct?", + "This is based on Cayley's theorem, which states that every group G is isomorphic to a subgroup of the symmetric group acting on G.\nIn other words, for every finite group G, there exists a collection of permutations H (which in this context, can be thought of as the set of permutations representing the action of G on itself) such that G is isomorphic to H.\nTherefore, there always exists H such that G is isomorphic to H.\nThe answer is (a)", + ), +] diff --git a/configs/datasets/TheoremQA/TheoremQA_few_shot_examples_official.py b/configs/datasets/TheoremQA/TheoremQA_few_shot_examples_official.py new file mode 100644 index 00000000..618a6512 --- /dev/null +++ b/configs/datasets/TheoremQA/TheoremQA_few_shot_examples_official.py @@ -0,0 +1,22 @@ +examples = [ + ( + 'In a 10 Gigabit Ethernet network, the average size of a frame is 1500 bytes. If a burst of noise lasting 1ms interrupts the network, how many frames are lost?', + 'First, calculate the data rate in bytes/s:\n\n10 Gigabit/s * (1 Byte / 8 bits) = 1.25 * 10^9 Bytes/s\n\nNext, calculate the data loss in bytes due to the noise:\n\n1 ms * 1.25 * 10^9 Bytes/s = 1.25 * 10^6 Bytes\n\nFinally, divide the data loss by the average frame size to get the number of frames lost:\n\n1.25 * 10^6 Bytes / 1500 Bytes/frame ≈ 833.33 frames\nThe answer is 833.33' + ), + ( + 'Given x = 0.157, what is the value of x \\times \\frac{\\prod_{n=1}^\\infty (1 - \\frac{x^2}{n^2 \\pi^2})}{\\sin(x)}?', + "To evaluate the expression $x \\times \\frac{\\prod_{n=1}^{\\infty} (1 - \\frac{x^2}{n^2 \\pi^2})}{\\sin(x)}$ given x = 0.157, we first recognize that the product in the numerator is related to the sine function through the Euler's reflection formula for the sine function, which can be expressed as:\n\n$$\\sin(x) = x \\prod_{n=1}^{\\infty} \\left(1 - \\frac{x^2}{n^2 \\pi^2}\\right)$$\n\nTherefore, the given expression simplifies to: $x \\times \\frac{\\sin(x)}{\\sin(x)}$\n\nBecause sin(x) in the numerator and denominator cancels out, the expression simplifies further to just x.\n\nSo, given x = 0.157, the value of the expression is 0.157. This result is derived from the properties of the sine function and does not require computational evaluation.\nThe answer is 0.157" + ), + ( + 'Consider the basis C of \\mathbb{R}^2 consisting of vectors u_1 = [2, 4] and u_2 = [1, -1]. If y = [8, 12], find the C-coordinate vector of y.', + "The goal is to express y as a linear combination of the basis vectors of C, i.e., $y = a\\cdot u_1 + b\\cdot u_2$, where a and b are the scalar coefficients that we want to find. These coefficients will form the C-coordinate vector of y, which we'll denote as $[a, b]_C$.\n\nGiven:\n- $u_1 = [2, 4]$,\n- $u_2 = [1, -1]$,\n- $y = [8, 12]$.\n\nWe need to solve the system of linear equations:\n2a + 1b = 8\n4a - 1b = 12\n\nLet's solve this system of equations to find a and b.\n\nThe solution to the system of equations is $a = \\frac{10}{3} and b = \\frac{4}{3}$. Therefore, the C-coordinate vector of y in the basis consisting of vectors u_1 = [2, 4] and u_2 = [1, -1] is $\\left[\\frac{10}{3}, \\frac{4}{3}\\right]_C$. \nLet's calculate the numerical value of $\\left[\x0crac{10}{3}, \x0crac{4}{3}\right]_C$ as [3.33, 1.33].\nThe answer is [3.33, 1.33]" + ), + ( + 'One can draw a simple, connected planar graph with 200 vertices and 397 edges. Is this statement Trur or False?', + "To determine the answer, we can use Euler's formula for planar graphs, which states that for any finite, connected, planar graph, $V - E + F = 2$, where V is the number of vertices, E is the number of edges, and F is the number of faces.\n\nGiven the modified question, we have V = 200 vertices and E = 397 edges. We want to find if we can have a graph that satisfies these conditions, adhering to Euler's formula.\n\nFirst, let's rearrange Euler's formula to solve for F: F = E - V + 2\n\nSubstituting the given values: F = 397 - 200 + 2, F = 199\n\nThis means a graph with 200 vertices and 397 edges would have 199 faces. However, to determine the truth of this possibility, we should check if this graph doesn't violate any other planar graph constraints, particularly regarding the number of edges.\n\nFor a simple, connected planar graph, there's also a relationship between vertices, edges, and faces given by the inequality: $E \\leq 3V - 6$\n\nSubstituting V = 200 gives: $E \\leq 3*200 - 6 = 594$\n\nWith E = 397, the condition $E \\leq 594$ is satisfied, meaning it's theoretically possible in terms of the edge condition for a planar graph.\n\nTherefore, one can draw a simple, connected planar graph with 200 vertices and 397 edges, resulting in 199 faces, without violating the conditions for it to be planar according to both Euler's formula and the constraint on the maximum number of edges.\nThe answer is True" + ), + ( + 'Given a finite group G, and a collection of permutations H on a set. Then (a) there always exists H such that G is isomorphic to H; (b) for any H, G is isomorphic to H; (c) G can never be isomorphic to H; (d) none of the above. Which option is correct?', + "This is based on Cayley's theorem, which states that every group G is isomorphic to a subgroup of the symmetric group acting on G. \nIn other words, for every finite group G, there exists a collection of permutations H (which in this context, can be thought of as the set of permutations representing the action of G on itself) such that G is isomorphic to H.\n\nTherefore, there always exists H such that G is isomorphic to H.\nThe answer is (a)" + ) +] diff --git a/configs/datasets/TheoremQA/TheoremQA_gen.py b/configs/datasets/TheoremQA/TheoremQA_gen.py index 0824a9c2..acf8a3b0 100644 --- a/configs/datasets/TheoremQA/TheoremQA_gen.py +++ b/configs/datasets/TheoremQA/TheoremQA_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .TheoremQA_gen_7009de import TheoremQA_datasets # noqa: F401, F403 + from .TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets # noqa: F401, F403 diff --git a/configs/datasets/TheoremQA/TheoremQA_gen_424e0a.py b/configs/datasets/TheoremQA/deprecated_TheoremQA_gen_424e0a.py similarity index 100% rename from configs/datasets/TheoremQA/TheoremQA_gen_424e0a.py rename to configs/datasets/TheoremQA/deprecated_TheoremQA_gen_424e0a.py diff --git a/configs/datasets/TheoremQA/TheoremQA_gen_7009de.py b/configs/datasets/TheoremQA/deprecated_TheoremQA_gen_7009de.py similarity index 100% rename from configs/datasets/TheoremQA/TheoremQA_gen_7009de.py rename to configs/datasets/TheoremQA/deprecated_TheoremQA_gen_7009de.py diff --git a/configs/datasets/TheoremQA/TheoremQA_gen_ef26ca.py b/configs/datasets/TheoremQA/deprecated_TheoremQA_gen_ef26ca.py similarity index 100% rename from configs/datasets/TheoremQA/TheoremQA_gen_ef26ca.py rename to configs/datasets/TheoremQA/deprecated_TheoremQA_gen_ef26ca.py diff --git a/configs/datasets/TheoremQA/TheoremQA_post_v2_gen_2c2583.py b/configs/datasets/TheoremQA/deprecated_TheoremQA_post_v2_gen_2c2583.py similarity index 100% rename from configs/datasets/TheoremQA/TheoremQA_post_v2_gen_2c2583.py rename to configs/datasets/TheoremQA/deprecated_TheoremQA_post_v2_gen_2c2583.py diff --git a/configs/datasets/TheoremQA/TheoremQA_post_v2_gen_ef26ca.py b/configs/datasets/TheoremQA/deprecated_TheoremQA_post_v2_gen_ef26ca.py similarity index 100% rename from configs/datasets/TheoremQA/TheoremQA_post_v2_gen_ef26ca.py rename to configs/datasets/TheoremQA/deprecated_TheoremQA_post_v2_gen_ef26ca.py diff --git a/configs/eval_TheoremQA.py b/configs/eval_TheoremQA.py new file mode 100644 index 00000000..818eaacd --- /dev/null +++ b/configs/eval_TheoremQA.py @@ -0,0 +1,18 @@ +from mmengine.config import read_base + +with read_base(): + from .models.mistral.hf_mistral_7b_v0_1 import models as hf_mistral_7b_v0_1_model + from .models.mistral.hf_mistral_7b_v0_2 import models as hf_mistral_7b_v0_2_model + from .models.hf_internlm.hf_internlm2_20b import models as hf_internlm2_20b_model + from .models.hf_internlm.hf_internlm2_math_20b import models as hf_internlm2_math_20b_model + + from .datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets as datasets + +models = sum([v for k, v in locals().items() if k.endswith("_model")], []) + +work_dir = 'outputs/TheoremQA-5shot' + + +# dataset version metric mode mistral-7b-v0.1-hf mistral-7b-v0.2-hf internlm2-20b-hf internlm2-math-20b-hf +# --------- --------- -------- ------ -------------------- -------------------- ------------------ ----------------------- +# TheoremQA 6f0af8 score gen 18.00 16.75 25.87 30.88 diff --git a/opencompass/datasets/TheoremQA/__init__.py b/opencompass/datasets/TheoremQA/__init__.py new file mode 100644 index 00000000..6682cabd --- /dev/null +++ b/opencompass/datasets/TheoremQA/__init__.py @@ -0,0 +1,4 @@ +from .legacy import (TheoremQA_postprocess, TheoremQA_postprocess_v2, + TheoremQADataset) +from .main import (TheoremQA_postprocess_v3, TheoremQADatasetV3, + TheoremQAEvaluatorV3) diff --git a/opencompass/datasets/TheoremQA.py b/opencompass/datasets/TheoremQA/legacy.py similarity index 96% rename from opencompass/datasets/TheoremQA.py rename to opencompass/datasets/TheoremQA/legacy.py index 0f5d3e60..4d30b779 100644 --- a/opencompass/datasets/TheoremQA.py +++ b/opencompass/datasets/TheoremQA/legacy.py @@ -4,7 +4,7 @@ from datasets import load_dataset from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS -from .base import BaseDataset +from ..base import BaseDataset @LOAD_DATASET.register_module() diff --git a/opencompass/datasets/TheoremQA/main.py b/opencompass/datasets/TheoremQA/main.py new file mode 100644 index 00000000..9e574ddd --- /dev/null +++ b/opencompass/datasets/TheoremQA/main.py @@ -0,0 +1,66 @@ +import re +import json + +from datasets import Dataset, DatasetDict + +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS, ICL_EVALUATORS + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from ..base import BaseDataset +from . import utils +from tqdm import tqdm + + +@LOAD_DATASET.register_module() +class TheoremQADatasetV3(BaseDataset): + + @staticmethod + def load(path: str): + with open(path, 'r') as f: + data = json.load(f) + for item in data: + item['Answer'] = str(item['Answer']) + dataset = Dataset.from_list(data) + return dataset + + +def TheoremQA_postprocess_v3(text: str) -> str: + answer = utils.answer_clean(["The answer is:", "The answer is", "the answer is"], text) + return answer + + +@ICL_EVALUATORS.register_module() +class TheoremQAEvaluatorV3(BaseEvaluator): + def score(self, predictions, references, test_set): + if len(predictions) != len(references): + return {"error": "preds and refrs have different length"} + + details = [] + correct, wrong = 0, 0 + for index in tqdm(range(len(predictions))): + answer = predictions[index] + groundtruth = references[index] + answer_type = test_set[index]['Answer_type'] + if answer_type in ['float', 'integer', 'bool']: + groundtruth = [groundtruth, eval(groundtruth)] + else: + groundtruth = [groundtruth, None] + if utils.compare_answer_with_groundtruth(answer, *groundtruth): + correct += 1 + is_correct = True + else: + wrong += 1 + is_correct = False + + details.append( + { + # "question": question, + # "solution": output, + "correct": groundtruth, + "pred": answer, + "is_correct": is_correct, + } + ) + + score = correct / (correct + wrong) * 100 + return {'score': score, 'details': details} diff --git a/opencompass/datasets/TheoremQA/number_utils.py b/opencompass/datasets/TheoremQA/number_utils.py new file mode 100644 index 00000000..12f6e6dc --- /dev/null +++ b/opencompass/datasets/TheoremQA/number_utils.py @@ -0,0 +1,98 @@ +import re +import math +from math import sqrt, sin, cos, log, pi, factorial, exp, e +E = 2.718 + + +def floatify(num: str): + try: + num = float(num) + if num.is_integer(): + return round(num) + else: + return num + except Exception: + return None + + +def within_eps(pred: float, gt: float): + eps = abs(gt) * 0.04 + if pred >= gt - eps and pred <= gt + eps: + return True + else: + return False + + +def clean_units(pred_str: str): + """Clean the units in the number.""" + def convert_pi_to_number(code_string): + code_string = code_string.replace('\\pi', 'π') + # Replace \pi or π not preceded by a digit or } with 3.14 + code_string = re.sub(r'(? "3*3.14" + code_string = re.sub(r'(\d)(\\?π)', r'\1*3.14', code_string) + # Handle cases where π is within braces or followed by a multiplication symbol + # This replaces "{π}" with "3.14" directly and "3*π" with "3*3.14" + code_string = re.sub(r'\{(\\?π)\}', '3.14', code_string) + code_string = re.sub(r'\*(\\?π)', '*3.14', code_string) + return code_string + + pred_str = convert_pi_to_number(pred_str) + pred_str = pred_str.replace('%', '/100') + pred_str = pred_str.replace('$', '') + pred_str = pred_str.replace('¥', '') + pred_str = pred_str.replace('°C', '') + pred_str = pred_str.replace(' C', '') + pred_str = pred_str.replace('°', '') + return pred_str + + +def number_it(num): + from latex2sympy2 import latex2sympy + if isinstance(num, (int, float)): + return num + + num = clean_units(num) + try: + num = str(latex2sympy(num)) + except Exception: + pass + + if floatify(num) is not None: + return floatify(num) + else: + try: + num = eval(num) + if isinstance(num, list) or isinstance(num, tuple): + num = num[0] + if floatify(num) is not None: + return floatify(num) + else: + return None + except Exception: + return None + + +def compare_two_numbers(p, gt): + try: + if math.isnan(p): + return False + if isinstance(gt, int): + return round(p) == gt + else: + return within_eps(pred=p, gt=gt) + except Exception: + return False + + +def compare_two_list(pred, gt): + if not isinstance(pred, list): + return False + elif len(pred) != len(gt): + return False + elif any([not isinstance(x, (int, float)) for x in pred]): + return False + else: + pred = sorted(pred) + gt = sorted(gt) + return all([compare_two_numbers(p, g) for p, g in zip(pred, gt)]) diff --git a/opencompass/datasets/TheoremQA/utils.py b/opencompass/datasets/TheoremQA/utils.py new file mode 100644 index 00000000..a4f32b2b --- /dev/null +++ b/opencompass/datasets/TheoremQA/utils.py @@ -0,0 +1,110 @@ +import re +from .number_utils import clean_units, compare_two_numbers, compare_two_list, number_it +import contextlib +import signal + +@contextlib.contextmanager +def time_limit(seconds: float): + def signal_handler(signum, frame): + raise ValueError + + signal.setitimer(signal.ITIMER_REAL, seconds) + signal.signal(signal.SIGALRM, signal_handler) + try: + yield + finally: + signal.setitimer(signal.ITIMER_REAL, 0) + + +def extract_theoremqa_answer(pred: str, answer_flag: bool = True): + from latex2sympy2 import latex2sympy + + if any([option in pred.lower() for option in ['yes', 'true']]): + pred = 'True' + elif any([option in pred.lower() for option in ['no', 'false']]): + pred = 'False' + elif any([option in pred.lower() for option in ['(a)', '(b)', '(c)', '(d)', '(e)', '(f)']]): + pass + else: + if answer_flag: + # Extract the numbers out of the string + pred = pred.split('=')[-1].strip() + pred = clean_units(pred) + try: + with time_limit(1): + tmp = str(latex2sympy(pred)) + pred = str(eval(tmp)) + except Exception: + if re.match(r'-?[\d\.]+\s\D+$', pred): + pred = pred.split(' ')[0] + elif re.match(r'-?[\d\.]+\s[^\s]+$', pred): + pred = pred.split(' ')[0] + else: + # desparate search over the last number + preds = re.findall(r'-?\d*\.?\d+', pred) + if(len(preds) >= 1): + pred = preds[-1] + else: + pred = '' + return pred + +def answer_clean(direct_answer_trigger_for_fewshot: tuple, pred: str): + pred = pred.strip('\n') + + # Determine if this is ICL, if so, use \n\n to split the first chunk. + ICL = False + for trigger in direct_answer_trigger_for_fewshot: + if pred.count(trigger) > 1: + ICL = True + if ICL: + pred = pred.split('\n\n')[0] + + # Split the trigger to find the answer. + preds = re.split('|'.join(direct_answer_trigger_for_fewshot), pred) + if len(preds) > 1: + answer_flag = True + pred = preds[-1] + else: + answer_flag = False + + pred = pred.strip('\n').rstrip('.').rstrip('/').strip(' ') + + pred = [extract_theoremqa_answer(pred, answer_flag)] + + # If there is no candidate in list, null is set. + if len(pred) == 0: + pred = "" + else: + if answer_flag: + # choose the first element in list ... + pred = pred[0] + else: + # choose the last e + pred = pred[-1] + + # Remove the period at the end, again! + pred = pred.rstrip('.').rstrip('/') + return pred + + + +def compare_answer_with_groundtruth(answer: str, groundtruth_str: str, groundtruth_num = None): + if groundtruth_str.lower() in ['(a)', '(b)', '(c)', '(d)', '(e)', '(f)']: + return groundtruth_str.lower() in answer.lower() + elif answer.lower() == groundtruth_str.lower(): + return True + elif groundtruth_num is not None: + if isinstance(groundtruth_num, (int, float)): + return compare_two_numbers(number_it(answer), groundtruth_num) + else: + if answer.startswith('(') and answer.endswith(')'): + try: + answer = list(eval(answer)) + answer = [number_it(a) for a in answer] + except Exception as e: + return False + return compare_two_list(answer, groundtruth_num) + else: + return False + else: + return False