diff --git a/opencompass/configs/datasets/srbench/srbench_gen.py b/opencompass/configs/datasets/srbench/srbench_gen.py
index fe7fdf8e..fcdb2dfc 100644
--- a/opencompass/configs/datasets/srbench/srbench_gen.py
+++ b/opencompass/configs/datasets/srbench/srbench_gen.py
@@ -2,12 +2,9 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.datasets import (
-    SRbenchDataset,SRbenchDatasetEvaluator
+    SRbenchDataset,SRbenchDatasetEvaluator,mydataset_postprocess
 )
-from opencompass.evaluator import GenericLLMEvaluator
-
-
 INFER_TEMPLATE = f'''
     You will be provided with a set of input-output pairs.
     Based on these data, infer the mathematical relationship between y and multiple input variables.
     Please note that the possible mathematical operations include: +, -, *, /, exp, sqrt, sin, arcsin, and constant terms.
@@ -16,7 +13,7 @@ INFER_TEMPLATE = f'''
     Based on the above data, please infer the possible formula. Ensure that your inference applies to all the provided data points, and consider both linear and nonlinear combinations.
     Verify whether your formula applies to the following new data point and adjust it to ensure accuracy:
     {{prompt2}}
-    Finally, please output only the formula string you inferred (e.g. y=x_0 * x_1), without any additional information.
+    Finally, please output only the formula string you inferred (e.g. z=x_0 * x_1), without any additional information.
 '''
 
 srbench_reader_cfg = dict(input_columns=["prompt1","prompt2"], output_column='Formula')
@@ -41,6 +38,7 @@ srbench_infer_cfg = dict(
 
 srbench_eval_cfg = dict(
     evaluator=dict(type=SRbenchDatasetEvaluator),
+    pred_postprocessor=dict(type=mydataset_postprocess),
     path="opencompass/srbench",
     pred_role='BOT',
 )
diff --git a/opencompass/datasets/srbench.py b/opencompass/datasets/srbench.py
index f2441df7..b0e8629b 100644
--- a/opencompass/datasets/srbench.py
+++ b/opencompass/datasets/srbench.py
@@ -10,6 +10,7 @@ import os
 import numpy as np
 import pandas as pd
 import json
+import re
 import requests
 import sympy as sp
 
@@ -17,18 +18,18 @@ class SRbenchDataset(BaseDataset):
     @staticmethod
     def load(path: str,local_mode=True):
-        path="path_to_dataset"
-        base_path = get_data_path(path,local_mode=local_mode)
+        base_path = get_data_path(path,local_mode=local_mode)  # Resolve base path if necessary
         formula_csv_path = os.path.join(base_path, f'FeynmanEquation_23.csv')
         data_files_base_dir = os.path.join(base_path, 'Feynman_with_units')
-        processed_formulas_df = load_dataset('csv', data_files=formula_csv_path)['train']
+        dataset = load_dataset('csv', data_files=formula_csv_path)['train']
         sample_data=[]
         prompt_1_out=[]
         prompt_2_out=[]
-        for row in processed_formulas_df:
+        for row in dataset:
             true_formula = str(row["Formula"])
             n_var=int(row["n_variables"])
             data_filename = str(row['Filename'])
+            data_file_path = os.path.join(data_files_base_dir, data_filename)
             full_dataset = np.loadtxt(data_file_path)
             rand_idx = np.random.choice(full_dataset.shape[0], 100, replace=False)
@@ -37,7 +38,9 @@ class SRbenchDataset(BaseDataset):
                 sample_data.append(sampled_data_i.tolist())
             else:
                 sample_data.append(sampled_data_i)
-            if n_var == 2:
+            # x = dataset[:, :n_var]
+            # y_true = dataset[:, -1]
+            if n_var==2:
                 prompt_1 = '\n'.join([f'x0={x1:.4f}, x1={x2:.4f}, y={y:.4f}' for x1, x2, y in sampled_data_i[:-1]])
                 prompt_2=f'x0={sampled_data_i[-1, 0]:.4f}, x1={sampled_data_i[-1, 1]:.4f}, y={sampled_data_i[-1, 2]:.4f}'
             else:
@@ -45,46 +48,29 @@ class SRbenchDataset(BaseDataset):
                 prompt_2=f'x0={sampled_data_i[-1, 0]:.4f}, x1={sampled_data_i[-1, 1]:.4f},x3={sampled_data_i[-1, 2]:.4f}, y={sampled_data_i[-1, 3]:.4f}'
             prompt_1_out.append(prompt_1)
             prompt_2_out.append(prompt_2)
-        processed_formulas_df=processed_formulas_df.add_column(name="prompt1",column=prompt_1_out)
-        processed_formulas_df=processed_formulas_df.add_column(name="prompt2",column=prompt_2_out)
-        processed_formulas_df=processed_formulas_df.add_column(name="data_samples_list",column=sample_data)
-        processed_formulas_df = processed_formulas_df.rename_column('n_variables', 'n_var')
-        return processed_formulas_df
+        dataset=dataset.add_column(name="prompt1",column=prompt_1_out)
+        dataset=dataset.add_column(name="prompt2",column=prompt_2_out)
+        dataset=dataset.add_column(name="data_samples_list",column=sample_data)
+        dataset = dataset.rename_column('n_variables', 'n_var')
+        return dataset
+
+def mydataset_postprocess(formula_str):
+
+    formula_str = formula_str.replace('×', '*').replace('·', '*').replace('÷', '/')
+    formula_str = formula_str.replace('−', '-').replace('^', '**')
+    formula_str = formula_str.replace('“', '"').replace('”', '"').replace('’', "'")
+    formula_str = formula_str.replace('`', '').replace('$', '').strip()
+
+    formula_str = formula_str.split('\n')[0].strip()
+    formula_str = re.sub(r'[^\w\s\+\-\*/\^\=\.\(\)]', '', formula_str)
+
+    # Ensure leading and trailing whitespace is stripped
+    return formula_str.strip()
 
 class SRbenchDatasetEvaluator(BaseEvaluator):
     def __init__(self, local_mode: bool = True,path=""):
         self.dataset=SRbenchDataset.load(path="",local_mode=local_mode)
-    def _send_request(self,messages, mllm='4o'):
-        URL = f"your_api_url"
-        API_KEY = "your_api_key"
-        HEADERS = {
-            'Accept': 'application/json',
-            'Authorization': f'Bearer {API_KEY}',
-            'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
-            'Content-Type': 'application/json'
-        }
-        model = mllm
-        count = 0
-        while True and count < 20:
-            count += 1
-            payload = json.dumps({
-                "model": model,
-                "messages": messages,
-                "temperature": 0.6,
-                "max_tokens": 50
-            })
-            session = requests.Session()
-            session.keep_alive = False
-            response = session.post(URL, headers=HEADERS, data=payload, verify=True)
-            try:
-                content = response.json()['choices'][0]['message']['content']
-                break
-            except:
-                content=None
-                pass
-
-        return content
     def parse_formula(self,formula_str, n_var=2):
         try:
             if '=' in formula_str:
@@ -108,56 +94,6 @@ class SRbenchDatasetEvaluator(BaseEvaluator):
             return sp.simplify(expr1 - expr2) == 0
         except Exception:
             return False
-    def llm_evaluate(self,inferred_formula, true_formula, mllm='gpt-4o'):
-        content = f'''
-        You are given two mathematical formulas. Your task is to evaluate how structurally similar they are, and return a similarity score between 0 and 1.
-
-        The score should reflect how closely the formulas match in terms of:
-        - Mathematical operations and structure (e.g., same use of +, *, sin, etc.)
-        - Term arrangement and complexity
-        - Overall symbolic expression and intent
-
-        A score of:
-        - 1 means the formulas are structurally identical or mathematically equivalent
-        - Around 0.8-0.9 means they are very similar but not identical
-        - Around 0.5 means moderately similar (e.g., same overall shape but different terms)
-        - Near 0 means structurally unrelated formulas
-
-        Do not consider numerical evaluation or specific input values — only the symbolic structure and mathematical form.
-
-        Formulas:
-        Inferred Formula: {inferred_formula}
-        True Formula: {true_formula}
-
-        ONLY RETURN [THE SIMILARITY SCORE]
-        '''
-        messages = [{"role": "user", "content": content}]
-        similarity_score = self._send_request(messages, mllm=mllm)
-        #print(similarity_score)
-        specific_emoji = "😊"
-        if similarity_score.endswith(specific_emoji):
-            similarity_score = similarity_score[:-len(specific_emoji)].rstrip()
-        if similarity_score.startswith("["):
-            similarity_score = similarity_score[1:]
-        if similarity_score.endswith("]"):
-            similarity_score = similarity_score[:-1]
-        if similarity_score == ".":
-            similarity_score= 0.0
-        if similarity_score.endswith(specific_emoji):
-            similarity_score = similarity_score[:-len(specific_emoji)].rstrip()
-        return similarity_score
-
-    def llm_translate(self,dirty_formula, mllm='gpt-4o'):
-        content = f'''
-        This is a language model's judgment on a mathematical formula. Please help me extract the mathematical formula from this judgment and return it:
-        {dirty_formula}
-        Please serve pi as pi and use x0, x1, x2,... to represent the variable names.
-        ONLY RETURN THE FORMULA STRING (Not LATEX).
-        '''
-        messages = [{"role": "user", "content": content}]
-        clean_formula = _send_request(messages, mllm=mllm)
-        return clean_formula
-
     def score(self, predictions, references) -> dict:
         metrics = {
@@ -179,9 +115,9 @@ class SRbenchDatasetEvaluator(BaseEvaluator):
             'R2': pd.Series(dtype=float),
             'SymbolicMatch': pd.Series(dtype=bool)
         })
-
         for row in range(len(references)):
-            metrics['LLM_Score'] = float(self.llm_evaluate(predictions[row], references[row], mllm='gpt-4o'))
+            #metrics['LLM_Score'] = float(self.llm_evaluate(predictions[row], references[row], mllm='gpt-4o'))
+            print(self.dataset[row]["n_var"])
             n_var=self.dataset[row]["n_var"]
             y_true=references[row]
             func = self.parse_formula(predictions[row], n_var=n_var)
@@ -197,12 +133,11 @@ class SRbenchDatasetEvaluator(BaseEvaluator):
                 pass
             else:
                 metrics["R2"]=0
-                metrics["RMSE"]= root_mean_squared_error(y_true, y_pred)
+                metrics["RMSE"]= np.inf
             metrics['SymbolicMatch'] = self.is_symbolically_equivalent(predictions[row], references[row], n_var)
             result = result._append({
                 'GT': references[row],
                 'Pred': predictions[row],
-                'Score': metrics['LLM_Score'],
                 'RMSE': metrics['RMSE'],
                 'R2': metrics['R2'],
                 'SymbolicMatch': bool(metrics['SymbolicMatch'])
@@ -211,12 +146,15 @@ class SRbenchDatasetEvaluator(BaseEvaluator):
         if not result.empty:
             symbolic_accuracy = result['SymbolicMatch'].sum() / len(result)
             R2_out = result['R2'].sum() / len(result)
-            Score_out = result['Score'].sum() / len(result)
             RMSE_out = result['RMSE'].sum() / len(result)
+
         metrics_out={
-            'LLM_Score': Score_out,
            'RMSE': RMSE_out,
            'R2': R2_out,
            "Accuracy":symbolic_accuracy
        }
+        return metrics_out
+
+
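
As a quick sanity check of the new post-processor, here is a minimal sketch; it assumes an OpenCompass checkout with this patch applied, and the raw completion string is invented for illustration:

    from opencompass.datasets import mydataset_postprocess

    # A raw model completion with unicode operators, a caret power, an emoji and extra prose.
    raw = 'y = x_0 × x_1 − sin(x_2)^2 😊\nTherefore the relationship is multiplicative.'
    cleaned = mydataset_postprocess(raw)
    print(cleaned)
    # With the replacement rules above this yields: y = x_0 * x_1 - sin(x_2)**2

Only the first line of the completion is kept, unicode operators are mapped to their Python equivalents, and characters outside the whitelist regex (including the emoji) are dropped before the string reaches SRbenchDatasetEvaluator.score.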