mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
145 lines
5.1 KiB
Python
145 lines
5.1 KiB
Python
![]() |
import concurrent.futures
|
|||
|
import json
|
|||
|
import re
|
|||
|
|
|||
|
from datasets import Dataset
|
|||
|
|
|||
|
from opencompass.models import OpenAISDK
|
|||
|
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
|||
|
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET, MODELS
|
|||
|
|
|||
|
from .base import BaseDataset
|
|||
|
|
|||
|
# from opencompass.utils import get_data_path
|
|||
|
|
|||
|
|
|||
|
EVAL_PROMPT = """
|
|||
|
请你作为一个数学高考阅卷专家,判断下面的答案是否与标准答案一致,即考生是否回答正确。下面是一些评判标准:
|
|||
|
1. 有些答案可能包含多项内容,可能有单选题,多选题,填空题等,只要答案与标准答案一致即可, 对于多选题和多个空的填空题,需要考生对应的选项或空都回答正确才算正确。
|
|||
|
2. 有些答案可能通过不同的方式表达,比如有些答案可能是一个数学表达式,有些答案可能是一个文字描述,只要表达的意思一致即可。且有些公式通过不同的方式表达,但等价,也是正确的。
|
|||
|
3. 你不需要重新计算问题答案,因为标准答案已经给出,只需要根据问题形式来判断考生的答案是否与标准答案一致,是否正确即可。
|
|||
|
|
|||
|
请你根据上述标准,判断下面的答案是否与标准答案一致,如果一致,请在最后输出\\boxed{{yes}}, 否则输出\\boxed{{no}}, 如果难以判断,请输出\\boxed{{no}}.
|
|||
|
原问题:{question}
|
|||
|
标准答案:{gold_answer}
|
|||
|
考生答案:{answer}
|
|||
|
|
|||
|
分析:
|
|||
|
""" # noqa E501
|
|||
|
|
|||
|
|
|||
|
def extract_boxed_answer(text):
|
|||
|
match = re.findall(r'\\boxed{(.+?)}', text)
|
|||
|
if match:
|
|||
|
return match[-1]
|
|||
|
return None
|
|||
|
|
|||
|
|
|||
|
@LOAD_DATASET.register_module()
|
|||
|
class GaoKaoMATHDataset(BaseDataset):
|
|||
|
|
|||
|
@staticmethod
|
|||
|
def load(path: str):
|
|||
|
# path = get_data_path(path, local_mode=True)
|
|||
|
data = json.load(open(path))
|
|||
|
for i in range(len(data)):
|
|||
|
data[i]['extract_answer'] = str(data[i]['extract_answer'])
|
|||
|
dataset = Dataset.from_list(data)
|
|||
|
return dataset
|
|||
|
|
|||
|
|
|||
|
api_meta_template = dict(round=[
|
|||
|
dict(role='HUMAN', api_role='HUMAN'),
|
|||
|
dict(role='BOT', api_role='BOT', generate=True),
|
|||
|
])
|
|||
|
|
|||
|
|
|||
|
@ICL_EVALUATORS.register_module()
|
|||
|
class GaoKaoMATHEvaluator(BaseEvaluator):
|
|||
|
|
|||
|
def __init__(self, model_name, url, **kwargs):
|
|||
|
if isinstance(url, str):
|
|||
|
url = [url]
|
|||
|
|
|||
|
self.model = [
|
|||
|
MODELS.build(
|
|||
|
dict(
|
|||
|
type=OpenAISDK,
|
|||
|
path=model_name,
|
|||
|
openai_api_base=url,
|
|||
|
key='EMPTY',
|
|||
|
query_per_second=1,
|
|||
|
meta_template=api_meta_template,
|
|||
|
temperature=kwargs.get('temperature', 0.01),
|
|||
|
max_seq_len=kwargs.get('max_tokens', 8192),
|
|||
|
)) for url in url
|
|||
|
]
|
|||
|
|
|||
|
def batch_response(self, inputs):
|
|||
|
batch_num = len(self.model)
|
|||
|
batch_size = (len(inputs) + batch_num - 1) // batch_num
|
|||
|
result_responses = []
|
|||
|
|
|||
|
with concurrent.futures.ThreadPoolExecutor(
|
|||
|
max_workers=batch_num) as executor:
|
|||
|
futures = [
|
|||
|
executor.submit(self.model[i].generate,
|
|||
|
inputs[i * batch_size:(i + 1) * batch_size])
|
|||
|
for i in range(batch_num)
|
|||
|
]
|
|||
|
for response in executor.map(lambda f: f.result(), futures):
|
|||
|
result_responses.extend(response)
|
|||
|
|
|||
|
return result_responses
|
|||
|
|
|||
|
def score(self, predictions, references, origin_prompt):
|
|||
|
if len(predictions) != len(references):
|
|||
|
return {'error': 'preds and refrs have different length'}
|
|||
|
questions = [item[0]['prompt'] for item in origin_prompt]
|
|||
|
count = 0
|
|||
|
correct = 0
|
|||
|
details = []
|
|||
|
results = []
|
|||
|
inputs = []
|
|||
|
for pred, ref, ques in zip(predictions, references, questions):
|
|||
|
inputs.append(
|
|||
|
EVAL_PROMPT.format(answer=pred, gold_answer=ref,
|
|||
|
question=ques))
|
|||
|
|
|||
|
result_responses = self.batch_response(inputs)
|
|||
|
results = [
|
|||
|
extract_boxed_answer(result) == 'yes'
|
|||
|
for result in result_responses
|
|||
|
]
|
|||
|
for pred, ref, result, result_response in zip(predictions, references,
|
|||
|
results,
|
|||
|
result_responses):
|
|||
|
detail = {
|
|||
|
'pred': pred,
|
|||
|
'answer': ref,
|
|||
|
'correct': False,
|
|||
|
'eval_model_response': result_response
|
|||
|
}
|
|||
|
count += 1
|
|||
|
if result:
|
|||
|
correct += 1
|
|||
|
detail['correct'] = True
|
|||
|
details.append(detail)
|
|||
|
|
|||
|
detailed_result = {
|
|||
|
'accuracy': 100 * correct / count,
|
|||
|
'details': details
|
|||
|
}
|
|||
|
|
|||
|
return detailed_result
|
|||
|
|
|||
|
|
|||
|
if __name__ == '__main__':
|
|||
|
evaluator = GaoKaoMATHEvaluator('http://0.0.0.0:23333/v1',
|
|||
|
temperature=0.01,
|
|||
|
max_tokens=2048,
|
|||
|
procs=8)
|
|||
|
predictions = ['1', '2', '3']
|
|||
|
references = ['1', '2', '3']
|
|||
|
evaluator.score(predictions, references)
|