OpenCompass/opencompass/datasets/gsm8k.py

from opencompass.openicl import BaseEvaluator
from opencompass.registry import TEXT_POSTPROCESSORS


@TEXT_POSTPROCESSORS.register_module('gsm8k_dataset')
def gsm8k_dataset_postprocess(text: str) -> str:
    # GSM8K reference answers end with '#### <number>'; keep what follows
    # the marker and drop thousands separators.
    return text.split('#### ')[1].replace(',', '')
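
# Illustrative call (input string assumed, not taken from the dataset):
#   gsm8k_dataset_postprocess('Half of 14 is 7.\n#### 7,000') -> '7000'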


@TEXT_POSTPROCESSORS.register_module('gsm8k')
def gsm8k_postprocess(text: str) -> str:
    # Keep only the first paragraph of the completion, then scan its
    # whitespace-separated tokens from the end for the last one that
    # contains a digit.
    text = text.split('\n\n')[0]
    candidate = ''
    for token in text.split(' ')[::-1]:
        if any(ch.isdigit() for ch in token):
            candidate = token
            break
    # Strip everything but the digits, e.g. '$1,234.' -> '1234'.
    return ''.join(ch for ch in candidate if ch.isdigit())
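
# Illustrative call (completion text assumed): for
#   'So the total is $1,234.\n\nLet me double-check...'
# only the first paragraph is kept, the last digit-bearing token is
# '$1,234.', and the extracted answer is '1234'.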


class Gsm8kEvaluator(BaseEvaluator):

    def score(self, predictions, references):
        if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different '
                'lengths'
            }
        correct = 0
        count = 0
        details = []
        for pred, ref in zip(predictions, references):
            detail = {'pred': pred, 'answers': ref, 'correct': False}
            count += 1
            if pred == ref:
                correct += 1
                detail['correct'] = True
            details.append(detail)
        return {'accuracy': 100 * correct / count, 'details': details}
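
# Usage sketch (hypothetical values): with predictions ['7', '8'] and
# references ['7', '9'], score() returns {'accuracy': 50.0, 'details': [...]}.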


class Gsm8kAgentEvaluator(BaseEvaluator):
    """Gsm8k agent evaluator for soft conditions.

    Args:
        action (str): Action for catching internal prediction.
            Defaults to `PythonInterpreter`.
    """

    def __init__(self, action: str = 'PythonInterpreter'):
        self.action = action

    def soft_equal(self, pred, refer, step):
        soft_pred = None
        try:
            soft_pred = step['result']['text']
            if str(int(float(soft_pred))) == refer:
                return True
        except Exception:
            # 'result' might not exist, or its text might not be
            # convertible to a float; initialising soft_pred above keeps
            # this log line from raising a NameError in that case.
            print(pred, soft_pred, refer)
        return False

    def get_action(self, step):
        # Return the last step produced by the configured action, if any.
        for s in step[::-1]:
            if s['type'] == self.action:
                return s

    def score(self, predictions, references, steps):
        """Calculate accuracy."""
        raw_reasoning_scope = 0  # correct answer with no action step
        action_scope = 0  # wrong answer, but the action was invoked
        code_scope = 0  # wrong answer, but the code ran without errors
        reasoning_scope = 0  # wrong answer, but the action result matched
        final_scope = 0  # correct answer with an action step
        total = len(references)
        for pred, refer, step in zip(predictions, references, steps):
            # The final answer is right.
            if pred == refer:
                if self.get_action(step):
                    final_scope += 1
                else:
                    raw_reasoning_scope += 1
            else:
                s = self.get_action(step)
                if s:
                    action_scope += 1
                    if not s['errmsg']:
                        code_scope += 1
                        # Whether the action result itself is correct.
                        reasoning_scope += self.soft_equal(pred, refer, s)
        result = dict(
            follow_acc=100 * (raw_reasoning_scope + final_scope) / total,
            reasoning_acc=100 *
            (reasoning_scope + final_scope + raw_reasoning_scope) / total,
            code_acc=100 * (code_scope + final_scope) / total,
            action_acc=100 * (action_scope + final_scope) / total,
        )
        return result
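

# A minimal, self-contained sketch of how the agent evaluator is driven.
# The step schema (a list of dicts carrying 'type', 'errmsg' and
# 'result': {'text': ...}) is inferred from the accesses in get_action()
# and soft_equal(); the concrete values below are assumptions.
if __name__ == '__main__':
    evaluator = Gsm8kAgentEvaluator()
    steps = [[{
        'type': 'PythonInterpreter',
        'errmsg': '',
        'result': {'text': '7.0'},
    }]]
    # One sample whose final answer is correct and backed by an action
    # step, so every accuracy below comes out at 100.0.
    print(evaluator.score(['7'], ['7'], steps))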