mirror of https://github.com/open-compass/opencompass.git, synced 2025-05-30 16:03:24 +08:00
0530
This commit is contained in:
parent be78346781
commit e227acc1a8
@@ -2,12 +2,9 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    SRbenchDataset,SRbenchDatasetEvaluator
    SRbenchDataset,SRbenchDatasetEvaluator,mydataset_postprocess
)

from opencompass.evaluator import GenericLLMEvaluator



INFER_TEMPLATE = f'''
You will be provided with a set of input-output pairs. Based on these data, infer the mathematical relationship between y and multiple input variables. Please note that the possible mathematical operations include: +, -, *, /, exp, sqrt, sin, arcsin, and constant terms.
@@ -16,7 +13,7 @@ INFER_TEMPLATE = f'''
Based on the above data, please infer the possible formula. Ensure that your inference applies to all the provided data points, and consider both linear and nonlinear combinations.
Verify whether your formula applies to the following new data point and adjust it to ensure accuracy:
{{prompt2}}
Finally, please output only the formula string you inferred (e.g. y=x_0 * x_1), without any additional information.
Finally, please output only the formula string you inferred (e.g. z=x_0 * x_1), without any additional information.
'''

srbench_reader_cfg = dict(input_columns=["prompt1","prompt2"], output_column='Formula')
@@ -41,6 +38,7 @@ srbench_infer_cfg = dict(

srbench_eval_cfg = dict(
    evaluator=dict(type=SRbenchDatasetEvaluator),
    pred_postprocessor=dict(type=mydataset_postprocess),
    path="opencompass/srbench",
    pred_role='BOT',
)
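For context, OpenCompass dataset configs usually tie the reader, inferencer, and evaluator cfgs together in a datasets list; a minimal sketch under that assumption (the abbr value below is a placeholder, not part of this commit):

srbench_datasets = [
    dict(
        abbr='srbench',                  # placeholder abbreviation, not from this commit
        type=SRbenchDataset,
        path='opencompass/srbench',      # same path string as in srbench_eval_cfg above
        reader_cfg=srbench_reader_cfg,
        infer_cfg=srbench_infer_cfg,
        eval_cfg=srbench_eval_cfg,
    )
]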
@@ -10,6 +10,7 @@ import os
import numpy as np
import pandas as pd
import json
import re
import requests
import sympy as sp

@@ -17,18 +18,18 @@ import sympy as sp
class SRbenchDataset(BaseDataset):
    @staticmethod
    def load(path: str,local_mode=True):
        path="path_to_dataset"
        base_path = get_data_path(path,local_mode=local_mode)
        base_path = get_data_path(path,local_mode=local_mode)  # Resolve base path if necessary
        formula_csv_path = os.path.join(base_path, f'FeynmanEquation_23.csv')
        data_files_base_dir = os.path.join(base_path, 'Feynman_with_units')
        processed_formulas_df = load_dataset('csv', data_files=formula_csv_path)['train']
        dataset = load_dataset('csv', data_files=formula_csv_path)['train']
        sample_data=[]
        prompt_1_out=[]
        prompt_2_out=[]
        for row in processed_formulas_df:
        for row in dataset:
            true_formula = str(row["Formula"])
            n_var=int(row["n_variables"])
            data_filename = str(row['Filename'])

            data_file_path = os.path.join(data_files_base_dir, data_filename)
            full_dataset = np.loadtxt(data_file_path)
            rand_idx = np.random.choice(full_dataset.shape[0], 100, replace=False)
@@ -37,7 +38,9 @@ class SRbenchDataset(BaseDataset):
                sample_data.append(sampled_data_i.tolist())
            else:
                sample_data.append(sampled_data_i)
            if n_var == 2:
            # x = dataset[:, :n_var]
            # y_true = dataset[:, -1]
            if n_var==2:
                prompt_1 = '\n'.join([f'x0={x1:.4f}, x1={x2:.4f}, y={y:.4f}' for x1, x2, y in sampled_data_i[:-1]])
                prompt_2=f'x0={sampled_data_i[-1, 0]:.4f}, x1={sampled_data_i[-1, 1]:.4f}, y={sampled_data_i[-1, 2]:.4f}'
            else:
@@ -45,46 +48,29 @@ class SRbenchDataset(BaseDataset):
                prompt_2=f'x0={sampled_data_i[-1, 0]:.4f}, x1={sampled_data_i[-1, 1]:.4f},x3={sampled_data_i[-1, 2]:.4f}, y={sampled_data_i[-1, 3]:.4f}'
            prompt_1_out.append(prompt_1)
            prompt_2_out.append(prompt_2)
        processed_formulas_df=processed_formulas_df.add_column(name="prompt1",column=prompt_1_out)
        processed_formulas_df=processed_formulas_df.add_column(name="prompt2",column=prompt_2_out)
        processed_formulas_df=processed_formulas_df.add_column(name="data_samples_list",column=sample_data)
        processed_formulas_df = processed_formulas_df.rename_column('n_variables', 'n_var')
        return processed_formulas_df
        dataset=dataset.add_column(name="prompt1",column=prompt_1_out)
        dataset=dataset.add_column(name="prompt2",column=prompt_2_out)
        dataset=dataset.add_column(name="data_samples_list",column=sample_data)
        dataset = dataset.rename_column('n_variables', 'n_var')
        return dataset

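For orientation, the prompt1/prompt2 columns built in load() are plain text blocks of sampled points; a 2-variable row would look roughly like the sketch below (the numbers are illustrative, not from the dataset):

# Illustrative only: shape of the prompt1/prompt2 strings produced by load()
# for a 2-variable equation, following the f-string format above.
# prompt1:
#   x0=1.2345, x1=0.5000, y=0.6172
#   x0=2.0000, x1=0.2500, y=0.5000
#   ...
# prompt2:
#   x0=0.7500, x1=4.0000, y=3.0000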
def mydataset_postprocess(formula_str):

    formula_str = formula_str.replace('×', '*').replace('·', '*').replace('÷', '/')
    formula_str = formula_str.replace('−', '-').replace('^', '**')
    formula_str = formula_str.replace('“', '"').replace('”', '"').replace('’', "'")
    formula_str = formula_str.replace('`', '').replace('$', '').strip()

    formula_str = formula_str.split('\n')[0].strip()
    formula_str = re.sub(r'[^\w\s\+\-\*/\^\=\.\(\)]', '', formula_str)

    # 5. Ensure leading and trailing whitespace is stripped
    return formula_str.strip()

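As a quick illustration of what this postprocessor does to a typical decorated model reply (the input string below is made up):

# Illustrative only: mydataset_postprocess applied to a made-up raw reply.
raw_reply = 'y = x_0 × x_1 − sin(x_2)^2 😊\nHere is my reasoning ...'
print(mydataset_postprocess(raw_reply))
# -> 'y = x_0 * x_1 - sin(x_2)**2'
# (keeps only the first line, maps unicode operators to Python ones,
#  and drops characters outside the allowed formula alphabet)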
class SRbenchDatasetEvaluator(BaseEvaluator):
    def __init__(self,
                 local_mode: bool = True,path=""):
        self.dataset=SRbenchDataset.load(path="",local_mode=local_mode)
    def _send_request(self,messages, mllm='4o'):
        URL = f"your_api_url"
        API_KEY = "your_api_key"
        HEADERS = {
            'Accept': 'application/json',
            'Authorization': f'Bearer {API_KEY}',
            'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
            'Content-Type': 'application/json'
        }
        model = mllm
        count = 0
        while True and count < 20:
            count += 1
            payload = json.dumps({
                "model": model,
                "messages": messages,
                "temperature": 0.6,
                "max_tokens": 50
            })
            session = requests.Session()
            session.keep_alive = False
            response = session.post(URL, headers=HEADERS, data=payload, verify=True)
            try:
                content = response.json()['choices'][0]['message']['content']
                break
            except:
                content=None
                pass

        return content
    def parse_formula(self,formula_str, n_var=2):
        try:
            if '=' in formula_str:
@@ -108,56 +94,6 @@ class SRbenchDatasetEvaluator(BaseEvaluator):
            return sp.simplify(expr1 - expr2) == 0
        except Exception:
            return False
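The tail of is_symbolically_equivalent shown above boils down to a sympy zero-difference test; a self-contained sketch of that idea (the function name and parsing here are illustrative, not the evaluator's API):

import sympy as sp

def formulas_equivalent(f1: str, f2: str) -> bool:
    # Compare only the right-hand sides and test whether
    # their difference simplifies to zero.
    rhs1 = f1.split('=', 1)[-1]
    rhs2 = f2.split('=', 1)[-1]
    expr1, expr2 = sp.sympify(rhs1), sp.sympify(rhs2)
    return sp.simplify(expr1 - expr2) == 0

# e.g. formulas_equivalent('y=x0*x1 + x0', 'y=x0*(x1 + 1)')  -> True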
    def llm_evaluate(self,inferred_formula, true_formula, mllm='gpt-4o'):
        content = f'''
You are given two mathematical formulas. Your task is to evaluate how structurally similar they are, and return a similarity score between 0 and 1.

The score should reflect how closely the formulas match in terms of:
- Mathematical operations and structure (e.g., same use of +, *, sin, etc.)
- Term arrangement and complexity
- Overall symbolic expression and intent

A score of:
- 1 means the formulas are structurally identical or mathematically equivalent
- Around 0.8-0.9 means they are very similar but not identical
- Around 0.5 means moderately similar (e.g., same overall shape but different terms)
- Near 0 means structurally unrelated formulas

Do not consider numerical evaluation or specific input values, only the symbolic structure and mathematical form.

Formulas:
Inferred Formula: {inferred_formula}
True Formula: {true_formula}

ONLY RETURN [THE SIMILARITY SCORE]
'''
        messages = [{"role": "user", "content": content}]
        similarity_score = self._send_request(messages, mllm=mllm)
        #print(similarity_score)
        specific_emoji = "😊"
        if similarity_score.endswith(specific_emoji):
            similarity_score = similarity_score[:-len(specific_emoji)].rstrip()
        if similarity_score.startswith("["):
            similarity_score = similarity_score[1:]
        if similarity_score.endswith("]"):
            similarity_score = similarity_score[:-1]
        if similarity_score == ".":
            similarity_score = "0.0"
        if similarity_score.endswith(specific_emoji):
            similarity_score = similarity_score[:-len(specific_emoji)].rstrip()
        return similarity_score

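The string clean-up above is meant to tolerate decorated judge replies; for instance (input made up for illustration):

# Illustrative only: how the clean-up above normalises a decorated reply.
# A reply of '[0.85]😊' is reduced step by step:
#   strip trailing emoji  -> '[0.85]'
#   strip leading '['     -> '0.85]'
#   strip trailing ']'    -> '0.85'
# so float('0.85') succeeds when score() converts the result.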
    def llm_translate(self,dirty_formula, mllm='gpt-4o'):
        content = f'''
This is a language model's judgment on a mathematical formula. Please help me extract the mathematical formula from this judgment and return it:
{dirty_formula}
Please keep pi as pi and use x0, x1, x2,... to represent the variable names.
ONLY RETURN THE FORMULA STRING (Not LATEX).
'''
        messages = [{"role": "user", "content": content}]
        clean_formula = self._send_request(messages, mllm=mllm)
        return clean_formula


    def score(self, predictions, references) -> dict:
        metrics = {
@@ -179,9 +115,9 @@ class SRbenchDatasetEvaluator(BaseEvaluator):
            'R2': pd.Series(dtype=float),
            'SymbolicMatch': pd.Series(dtype=bool)
        })

        for row in range(len(references)):
            metrics['LLM_Score'] = float(self.llm_evaluate(predictions[row], references[row], mllm='gpt-4o'))
            #metrics['LLM_Score'] = float(self.llm_evaluate(predictions[row], references[row], mllm='gpt-4o'))
            print(self.dataset[row]["n_var"])
            n_var=self.dataset[row]["n_var"]
            y_true=references[row]
            func = self.parse_formula(predictions[row], n_var=n_var)
@@ -197,12 +133,11 @@ class SRbenchDatasetEvaluator(BaseEvaluator):
                pass
            else:
                metrics["R2"]=0
                metrics["RMSE"]= root_mean_squared_error(y_true, y_pred)
                metrics["RMSE"]= np.inf
            metrics['SymbolicMatch'] = self.is_symbolically_equivalent(predictions[row], references[row], n_var)
            result = result._append({
                'GT': references[row],
                'Pred': predictions[row],
                'Score': metrics['LLM_Score'],
                'RMSE': metrics['RMSE'],
                'R2': metrics['R2'],
                'SymbolicMatch': bool(metrics['SymbolicMatch'])
@@ -211,12 +146,15 @@ class SRbenchDatasetEvaluator(BaseEvaluator):
        if not result.empty:
            symbolic_accuracy = result['SymbolicMatch'].sum() / len(result)
            R2_out = result['R2'].sum() / len(result)
            Score_out = result['Score'].sum() / len(result)
            RMSE_out = result['RMSE'].sum() / len(result)

        metrics_out={
            'LLM_Score': Score_out,
            'RMSE': RMSE_out,
            'R2': R2_out,
            "Accuracy":symbolic_accuracy
        }

        return metrics_out
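For reference, the aggregate dict returned by score() after this change has the shape sketched below; the keys come from metrics_out above, while the numbers are invented for illustration:

# Illustrative only: shape of the dict returned by score(); values are made up.
example_out = {
    'LLM_Score': 0.78,   # mean per-sample LLM similarity score
    'RMSE': 1.42,        # mean RMSE (a single np.inf entry makes this inf)
    'R2': 0.63,          # mean R2 over the evaluated formulas
    'Accuracy': 0.35,    # fraction of predictions judged symbolically equivalent
}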