diff --git a/configs/datasets/math/math_agent_gen_861b4f.py b/configs/datasets/math/math_agent_gen_861b4f.py
new file mode 100644
index 00000000..7e6a0e91
--- /dev/null
+++ b/configs/datasets/math/math_agent_gen_861b4f.py
@@ -0,0 +1,89 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import AgentInferencer
+from opencompass.datasets import (
+    MATHDataset, MATHAgentEvaluator, math_postprocess
+)
+# uses the PAL format, but does not perform well
+math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
+
+math_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                # ################################### NEW SHOT ###################################
+                dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplified.'),
+                dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:from sympy import symbols, simplify
+
+def solution():
+    x = symbols('x')
+    expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2)
+    simplified_expr = simplify(expr)
+
+    x3_coefficient = simplified_expr.as_coefficients_dict()[x**3]
+    result = x3_coefficient
+    return result"""),
+                dict(role='SYSTEM', prompt='Response:26'),
+                dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'),
+                dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'),
+                dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:import math
+
+def solution():
+    radius = 6
+
+    # Surface area of the hemisphere
+    hemisphere_area = 2 * math.pi * radius**2
+
+    # Area of the circular base
+    base_area = math.pi * radius**2
+
+    # Total surface area
+    total_surface_area = hemisphere_area + base_area
+
+    # Formatting the result in LaTeX
+    result = r'{}\pi'.format(total_surface_area / math.pi)
+    return result"""),
+                dict(role='SYSTEM', prompt='Response:108.0\\pi'),
+                dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'),
+                dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'),
+                dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution():
+    # Probabilities of each outcome
+    prime_prob = 1 / 6
+    composite_prob = 1 / 3
+    otherwise_prob = 1 / 6
+
+    # Expected value of each outcome
+    prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob)
+    composite_expected_value = 0 * composite_prob
+    otherwise_expected_value = -3 * otherwise_prob
+
+    # Total expected value
+    total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value
+
+    # Dollar value to the nearest cent
+    result = "{:.2f}".format(total_expected_value)
+    return result"""),
+                dict(role='SYSTEM', prompt='Response:1.17'),
+                dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'),
+                dict(role='HUMAN', prompt='{problem}'),
+            ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=AgentInferencer),
+)
+
+math_eval_cfg = dict(
+    evaluator=dict(type=MATHAgentEvaluator),
+    pred_postprocessor=dict(type=math_postprocess),
+)
+
+math_datasets = [
+    dict(
+        abbr='math',
+        type=MATHDataset,
+        path='./data/math/math.json',
+        reader_cfg=math_reader_cfg,
+        infer_cfg=math_infer_cfg,
+        eval_cfg=math_eval_cfg,
+    )
+]
diff --git a/opencompass/datasets/gsm8k.py b/opencompass/datasets/gsm8k.py
index d7655e82..d0ca8365 100644
--- a/opencompass/datasets/gsm8k.py
+++ b/opencompass/datasets/gsm8k.py
@@ -49,9 +49,10 @@ def gsm8k_postprocess(text: str) -> str:
             break
     ret1 = ''
     for i in range(len(ret)):
-        if ret[i].isdigit():
+        # deal with potential float numbers
+        if ret[i].isdigit() or ret[i] == '.':
             ret1 += ret[i]
-    return ret1
+    return ret1.strip('.')
 
 
 class Gsm8kEvaluator(BaseEvaluator):
@@ -87,15 +88,23 @@ class Gsm8kAgentEvaluator(BaseEvaluator):
     def __init__(self, action: str = 'PythonInterpreter'):
         self.action = action
 
+    def is_equal(self, pred, refer):
+        try:
+            if pred == refer or abs(float(pred) - int(refer)) < 1e-6:
+                return True
+        except Exception:
+            pass
+        return False
+
     def soft_equal(self, pred, refer, step):
         try:
             soft_pred = step['result']['text']
-            if str(int(float(soft_pred))) == refer:
+            if abs(float(soft_pred) - int(refer)) < 1e-6:
                 return True
         except Exception:
             # result might not exists
             # text cannot convert to float
-            print(pred, soft_pred, refer)
+            pass
         return False
 
     def get_action(self, step):
@@ -114,7 +123,7 @@ class Gsm8kAgentEvaluator(BaseEvaluator):
         total = len(references)
         for pred, refer, step in zip(predictions, references, steps):
             # if final answer right
-            if pred == refer:
+            if self.is_equal(pred, refer):
                 if self.get_action(step):
                     final_scope += 1
                 else:
diff --git a/opencompass/models/huggingface.py b/opencompass/models/huggingface.py
index 3b939db9..ad3dedad 100644
--- a/opencompass/models/huggingface.py
+++ b/opencompass/models/huggingface.py
@@ -49,6 +49,9 @@ class HuggingFace(BaseModel):
         use_fastchat_template (str, optional): Whether to use fastchat to get
             the conversation template. If True, fastchat needs to be
             implemented first. Defaults to False.
+        end_str (str, optional): If given, truncate each generated string at
+            the first occurrence of ``end_str``. Useful for models with
+            special ending strings that are not handled well. Defaults to None.
 
     Note: About ``extract_pred_after_decode``: Commonly, we should extract the
@@ -72,7 +75,8 @@ class HuggingFace(BaseModel):
                  batch_padding: bool = False,
                  pad_token_id: Optional[int] = None,
                  mode: str = 'none',
-                 use_fastchat_template: bool = False):
+                 use_fastchat_template: bool = False,
+                 end_str: Optional[str] = None):
         super().__init__(path=path,
                          max_seq_len=max_seq_len,
                          tokenizer_only=tokenizer_only,
@@ -96,6 +100,7 @@ class HuggingFace(BaseModel):
                          peft_path=peft_path)
         self.generation_kwargs = generation_kwargs
         self.use_fastchat_template = use_fastchat_template
+        self.end_str = end_str
 
     def _load_tokenizer(self, path: str, tokenizer_path: Optional[str],
                         tokenizer_kwargs: dict):
@@ -266,6 +271,8 @@ class HuggingFace(BaseModel):
                 token[len_:] for token, len_ in zip(decodeds, prompt_lens)
             ]
 
+        if self.end_str:
+            decodeds = [token.split(self.end_str)[0] for token in decodeds]
         return decodeds
 
     def _single_generate(self, inputs: List[str], max_out_len: int,
@@ -329,6 +336,8 @@ class HuggingFace(BaseModel):
                 token[len_:] for token, len_ in zip(decodeds, prompt_lens)
             ]
 
+        if self.end_str:
+            decodeds = [token.split(self.end_str)[0] for token in decodeds]
         return decodeds
 
     def get_logits(self, inputs: List[str]):
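
For context, the new dataset config plugs into an evaluation the same way as existing ones. Below is a minimal sketch of a top-level config that consumes it; the `read_base` import pattern is standard OpenCompass usage, and the commented-out model list is an assumption, not part of this patch:

# Minimal sketch: pull the new MATH agent dataset into an eval config.
from mmengine.config import read_base

with read_base():
    from .datasets.math.math_agent_gen_861b4f import math_datasets

datasets = math_datasets
# models = [...]  # an agent-capable model config would go here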
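The `gsm8k_postprocess` change keeps '.' while collecting digits, so decimal answers survive extraction, and the final `strip('.')` drops periods that came from sentence punctuation rather than from the number itself. A standalone sketch of just that extraction step (the sample tokens are illustrative):

# Sketch of the patched extraction loop applied to a single token.
def keep_number(token: str) -> str:
    ret1 = ''
    for ch in token:
        # keep digits and '.' so float answers survive extraction
        if ch.isdigit() or ch == '.':
            ret1 += ch
    # strip periods that were sentence punctuation, not part of the number
    return ret1.strip('.')

print(keep_number('3.75.'))  # '3.75' (trailing sentence period removed)
print(keep_number('$18'))    # '18'   (currency symbol dropped)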
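On the evaluator side, `is_equal` replaces exact string comparison with a numeric check, so a prediction such as '18.0' now matches the integer reference '18'. A quick illustration of the same logic, shown standalone with made-up values:

# The comparison added to Gsm8kAgentEvaluator, extracted for illustration.
def is_equal(pred, refer):
    try:
        # Exact match first; otherwise compare numerically with a small
        # tolerance. GSM8K references are integers, hence int(refer).
        if pred == refer or abs(float(pred) - int(refer)) < 1e-6:
            return True
    except Exception:
        pass  # pred may not parse as a number at all
    return False

print(is_equal('18', '18'))    # True  (exact string match)
print(is_equal('18.0', '18'))  # True  (numeric match within 1e-6)
print(is_equal('17.5', '18'))  # False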
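Finally, `end_str` gives HuggingFace-based models a way to trim generations at a model-specific stop marker; internally the output is cut with `token.split(self.end_str)[0]`, so the marker itself is removed as well. A hedged sketch of a model config using it; the abbreviation, model path, and the '<|im_end|>' marker below are placeholders, not values taken from this patch:

# Hypothetical model config exercising the new argument; every concrete
# value here (abbr, path, end marker) is a placeholder.
from opencompass.models import HuggingFaceCausalLM

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='demo-chat-7b-hf',   # placeholder
        path='org/demo-chat-7b',  # placeholder HF hub path
        max_out_len=512,
        max_seq_len=2048,
        batch_size=8,
        end_str='<|im_end|>',     # generations are truncated at this marker
        run_cfg=dict(num_gpus=1),
    )
]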