mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

commit f152ccf127 (parent 794ab7c372)
add HuProverbRea dataset (20250203)
examples/eval_OpenHuEval_HuProverbRea_2CQ.py (new file, 16 lines)
@@ -0,0 +1,16 @@
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.OpenHuEval.HuProverbRea.HuProverbRea_2CQ import HuProverbRea_datasets

    # from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
    # from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model

    # from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
    # from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
    # from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model

datasets = HuProverbRea_datasets
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
work_dir = './outputs/' + __file__.split('/')[-1].split('.')[0] + '/'  # do NOT modify this line, yapf: disable, pylint: disable
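For reference, OpenCompass configs like this are normally launched through the run.py entry point from the repository root; a usage sketch, assuming a working OpenCompass installation:

    python run.py examples/eval_OpenHuEval_HuProverbRea_2CQ.py

Outputs are written under the work_dir set on the last line of the config.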
examples/eval_OpenHuEval_HuProverbRea_OE.py (new file, 16 lines)
@@ -0,0 +1,16 @@
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.OpenHuEval.HuProverbRea.HuProverbRea_OE import HuProverbRea_datasets
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import models as lmdeploy_qwen2_7b_instruct_model
    # from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
    # from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
    # from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model

    # from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
    # from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
    # from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model

datasets = HuProverbRea_datasets
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
work_dir = './outputs/' + __file__.split('/')[-1].split('.')[0] + '/'  # do NOT modify this line, yapf: disable, pylint: disable
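The two example configs differ only in which dataset variant they pull in: 2CQ (two-choice question) asks the model to pick option '1' or '2' and is scored by string matching over a flipped option pair, while OE (open-ended) asks for a free-form explanation that is graded by an LLM judge. Both datasets and evaluators are defined in opencompass/datasets/OpenHuEval/HuProverbRea.py below.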
opencompass/configs/datasets/OpenHuEval/HuProverbRea/HuProverbRea_2CQ.py (new file, 49 lines)
@@ -0,0 +1,49 @@
from mmengine.config import read_base

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.OpenHuEval.HuProverbRea import HuProverbDataset2CQ, HuProverb_Evaluator_2CQ

with read_base():
    from .prompts import INSTRUCTIONS_DIRECT_QA

# currently we use English prompts with hu proverbs inserted
prompt_template_language = 'en'
dataset_path = '/mnt/hwfile/opendatalab/gaojunyuan/shared_data/OpenHuEval/data/HuProverbRea/HuProverbRea_250127'

HuProverbRea_reader_cfg = dict(input_columns=['hu_text', 'context', 'en_expl', 'hu_expl', 'option1', 'option2'],
                               output_column='out')

HuProverbRea_datasets = []
instruction = INSTRUCTIONS_DIRECT_QA[prompt_template_language]
HuProverbRea_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin='</E>',
            round=[
                dict(
                    role='HUMAN',
                    prompt=instruction
                ),
            ],
        ),
        ice_token='</E>',
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

HuProverbRea_eval_cfg = dict(evaluator=dict(type=HuProverb_Evaluator_2CQ))

HuProverbRea_datasets.append(
    dict(
        abbr=f'HuProverbRea_2CQ_{prompt_template_language}',
        type=HuProverbDataset2CQ,
        path=dataset_path,
        reader_cfg=HuProverbRea_reader_cfg,
        infer_cfg=HuProverbRea_infer_cfg,
        eval_cfg=HuProverbRea_eval_cfg,
    )
)
opencompass/configs/datasets/OpenHuEval/HuProverbRea/HuProverbRea_OE.py (new file, 49 lines)
@@ -0,0 +1,49 @@
from mmengine.config import read_base

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.OpenHuEval.HuProverbRea import HuProverbDatasetOE, HuProverb_Evaluator_OE

with read_base():
    from .prompts import INSTRUCTIONS_OE_DIR_QA

# currently we use English prompts with hu proverbs inserted
prompt_template_language = 'en'
dataset_path = '/mnt/hwfile/opendatalab/gaojunyuan/shared_data/OpenHuEval/data/HuProverbRea/HuProverbRea_250127'

HuProverbRea_reader_cfg = dict(input_columns=['hu_text', 'context', 'en_expl', 'hu_expl', 'option1', 'option2'],
                               output_column='out')

HuProverbRea_datasets = []
instruction = INSTRUCTIONS_OE_DIR_QA[prompt_template_language]
HuProverbRea_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin='</E>',
            round=[
                dict(
                    role='HUMAN',
                    prompt=instruction
                ),
            ],
        ),
        ice_token='</E>',
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

HuProverbRea_eval_cfg = dict(evaluator=dict(type=HuProverb_Evaluator_OE))

HuProverbRea_datasets.append(
    dict(
        abbr=f'HuProverbRea_OE_{prompt_template_language}',
        type=HuProverbDatasetOE,
        path=dataset_path,
        reader_cfg=HuProverbRea_reader_cfg,
        infer_cfg=HuProverbRea_infer_cfg,
        eval_cfg=HuProverbRea_eval_cfg,
    )
)
opencompass/configs/datasets/OpenHuEval/HuProverbRea/prompts.py (new file, 70 lines)
@@ -0,0 +1,70 @@
INSTRUCTIONS_DIRECT_QA = {
    'en': 'You are a language expert specialized in Hungarian. Given a Hungarian phrase:\n\n' +
          '######################\n' +
          'Hungarian Phrase:\n' +
          '----------------------\n' +
          "'{hu_text}'\n" +
          '######################\n\n' +
          'and a context using this phrase:\n\n' +
          '######################\n' +
          'Hungarian Context:\n' +
          '----------------------\n' +
          '{context}\n' +
          '######################\n\n' +
          'What does the person mean by using this phrase? Please select one correct answer from the following two options:\n\n' +
          '######################\n' +
          'Options:\n' +
          '----------------------\n' +
          'Option 1: {option1}\n' +
          'Option 2: {option2}\n' +
          '######################\n\n' +
          "You should only answer the option number, '1' or '2'. Do not output any content other than the option number. Your answer:"
}

INSTRUCTIONS_OE_DIR_QA = {
    'en': 'You are a language expert specialized in Hungarian. Given a Hungarian phrase:\n\n' +
          '######################\n' +
          'Hungarian Phrase:\n' +
          '----------------------\n' +
          "'{hu_text}'\n" +
          '######################\n\n' +
          'and a context using this phrase:\n\n' +
          '######################\n' +
          'Hungarian Context:\n' +
          '----------------------\n' +
          '{context}\n' +
          '######################\n\n' +
          'What does the person mean by using this phrase? Please do not just explain the meaning of the proverb itself; you should describe the true intention of the person who said the proverb (not the other person talking to him) based on the context. Please answer concisely in one sentence:',
    'hu': 'Ön magyar nyelvi szakértő. Adott egy magyar kifejezés:\n\n' +
          '######################\n' +
          'Magyar kifejezés:\n' +
          '----------------------\n' +
          "'{hu_text}'\n" +
          '######################\n\n' +
          'és egy szövegkörnyezet, amely ezt a kifejezést használja:\n\n' +
          '######################\n' +
          'Magyar kontextus:\n' +
          '----------------------\n' +
          '{context}\n' +
          '######################\n\n' +
          'Mire gondol az illető, amikor ezt a kifejezést használja? Kérjük, ne csak magának a közmondásnak a jelentését magyarázza meg, hanem a szövegkörnyezet alapján írja le a közmondást kimondó személy (nem a vele beszélgető másik személy) valódi szándékát. Kérjük, válaszoljon tömören, egy mondatban:'
}

judge_prompt_template = {
    'en_system':
    "Please act as an impartial judge specialized in Hungarian language and culture. Given a Hungarian saying, a context using that saying, and two analyses explaining 'what does the person mean by using that saying in the context?', please decide whether the given two analyses express the same meaning. If they reflect the same understanding of the saying's meaning, you should answer YES. If they are based on different interpretations of the saying, you should answer NO. Do not output anything other than 'YES' or 'NO'. Avoid any position biases and ensure that the order in which the analyses were presented does not influence your decision. Do not allow the length of the analyses to influence your judgment; focus on their core meanings and their understandings of the Hungarian saying.",
    'en_user':
    '[The start of Hungarian saying]\n' +
    '{proverb}\n' +
    '[The end of Hungarian saying]\n\n' +
    '[The start of the context]\n' +
    '{conversation}\n' +
    '[The end of the context]\n\n' +
    '[The start of the first analysis]\n' +
    '{answer}\n' +
    '[The end of the first analysis]\n\n' +
    '[The start of the second analysis]\n' +
    '{raw_pred}\n' +
    '[The end of the second analysis]\n\n' +
    'Your decision:'
}
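For illustration, OpenCompass's PromptTemplate fills the braced placeholders above from the dataset reader columns; for the 2CQ instruction the effect is roughly a plain str.format call. A minimal sketch with invented sample values (nothing below comes from the real dataset):

filled = INSTRUCTIONS_DIRECT_QA['en'].format(
    hu_text='Aki mer, az nyer.',  # hypothetical proverb ('Who dares, wins.')
    context='A: Should I apply for the job? B: Aki mer, az nyer.',  # hypothetical dialogue
    option1='B encourages A to take the chance.',  # hypothetical correct reading
    option2='B warns A against applying.')  # hypothetical wrong reading
print(filled)  # approximately the HUMAN-turn prompt the model receives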
opencompass/datasets/OpenHuEval/HuProverbRea.py (new file, 295 lines)
@@ -0,0 +1,295 @@
import json
import os

from datasets import Dataset, DatasetDict

from opencompass.configs.datasets.OpenHuEval.HuProverbRea.prompts import \
    judge_prompt_template
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.utils.prompt import PromptList

from ..base import BaseDataset
class HuProverbDataset2CQ(BaseDataset):

    @staticmethod
    def load(**kwargs):
        path = kwargs.get('path', None)
        dataset = DatasetDict()
        sub_dataset_file = os.path.join(path,
                                        '{}.jsonl'.format('HuProverbRea'))
        f = open(sub_dataset_file, 'r', encoding='utf-8')
        lines = f.readlines()
        out_dict_list = []
        for line in lines:
            obj = json.loads(line)
            if len(obj['context']) > 1:
                obj['context'] = '\n'.join(
                    [x.strip() for x in obj['context'] if x])
            else:
                obj['context'] = obj['context'][0]

            if obj['answer'] == 0:
                cor_ops = obj['options'][0]
                w_ops = obj['options'][1]
            else:
                cor_ops = obj['options'][1]
                w_ops = obj['options'][0]

            new_obj_1 = {
                'hu_text': obj['source_info']['proverb'],
                'context': obj['context'],
                'en_expl': obj['source_info']['en_expl'],
                'hu_expl': obj['source_info']['hu_expl'],
                'option1': cor_ops,
                'option2': w_ops,
                'out': {
                    'true_ans': '1',
                    'id': obj['qid'],
                    'source_id': obj['source_info']['source_id'],
                    'en_expl': obj['source_info']['en_expl'],
                    'en_trans': obj['source_info']['en_trans'],
                    'hu_expl': obj['source_info']['hu_expl'],
                    'hu_text': obj['source_info']['proverb'],
                    'context': obj['context'],
                    'option1': cor_ops,
                    'option2': w_ops,
                    'correct': cor_ops,
                    'incorrect': w_ops
                }
            }

            new_obj_2 = {
                'hu_text': obj['source_info']['proverb'],
                'context': obj['context'],
                'en_expl': obj['source_info']['en_expl'],
                'hu_expl': obj['source_info']['hu_expl'],
                'option1': w_ops,
                'option2': cor_ops,
                'out': {
                    'true_ans': '2',
                    'id': obj['qid'],
                    'source_id': obj['source_info']['source_id'],
                    'en_expl': obj['source_info']['en_expl'],
                    'en_trans': obj['source_info']['en_trans'],
                    'hu_expl': obj['source_info']['hu_expl'],
                    'hu_text': obj['source_info']['proverb'],
                    'context': obj['context'],
                    'option1': w_ops,
                    'option2': cor_ops,
                    'correct': cor_ops,
                    'incorrect': w_ops
                }
            }

            out_dict_list.append(new_obj_1)
            out_dict_list.append(new_obj_2)
        dataset = Dataset.from_list(out_dict_list)

        return dataset
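HuProverbDataset2CQ expands every source record into two rows with the option order flipped (true_ans '1' vs. '2'); the 2CQ evaluator later re-pairs the rows by qid, so a question only counts as solved when the model answers correctly under both orderings. For reference, a hypothetical input record sketched purely from the keys load() accesses (all values are placeholders, not real data):

sample_line = {
    'qid': 'q1',
    'context': ['first turn ...', 'second turn ...'],  # one or more strings
    'options': ['interpretation A', 'interpretation B'],
    'answer': 0,  # index of the correct option
    'source_info': {
        'proverb': '...',  # the Hungarian saying, surfaced as hu_text
        'source_id': '...',
        'en_expl': '...',
        'en_trans': '...',
        'hu_expl': '...',
    },
}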
class HuProverbDatasetOE(BaseDataset):

    @staticmethod
    def load(**kwargs):
        path = kwargs.get('path', None)
        dataset = DatasetDict()
        sub_dataset_file = os.path.join(path,
                                        '{}.jsonl'.format('HuProverbRea'))
        f = open(sub_dataset_file, 'r', encoding='utf-8')
        lines = f.readlines()
        out_dict_list = []
        for line in lines:
            obj = json.loads(line)
            if len(obj['context']) > 1:
                obj['context'] = '\n'.join(
                    [x.strip() for x in obj['context'] if x])
            else:
                obj['context'] = obj['context'][0]

            if obj['answer'] == 0:
                cor_ops = obj['options'][0]
                w_ops = obj['options'][1]
            else:
                cor_ops = obj['options'][1]
                w_ops = obj['options'][0]
            new_obj = {
                'hu_text': obj['source_info']['proverb'],
                'context': obj['context'],
                'en_expl': obj['source_info']['en_expl'],
                'hu_expl': obj['source_info']['hu_expl'],
                'out': {
                    'id': obj['qid'],
                    'source_id': obj['source_info']['source_id'],
                    'en_expl': obj['source_info']['en_expl'],
                    'en_trans': obj['source_info']['en_trans'],
                    'hu_expl': obj['source_info']['hu_expl'],
                    'hu_text': obj['source_info']['proverb'],
                    'context': obj['context'],
                    'correct': cor_ops,
                    'incorrect': w_ops
                }
            }
            out_dict_list.append(new_obj)
        dataset = Dataset.from_list(out_dict_list)

        return dataset
class HuProverb_Evaluator_2CQ(BaseEvaluator):
    """
    ref: opencompass.openicl.icl_evaluator.AccwithDetailsEvaluator
    """

    def score(self, predictions, references, origin_prompt) -> dict:

        if len(predictions) != len(references):
            return {'error': 'preds and refs have different lengths.'}

        details = {}
        total, correct, incorrect, fail_parse = 0, 0, 0, 0
        for raw_pred, detail, ori_prompt in zip(predictions, references,
                                                origin_prompt):
            idx = detail['id']
            option1 = detail['option1']
            option2 = detail['option2']
            true_ans = detail['true_ans']
            res_of_this_round = {
                'origin_prompt': ori_prompt,
                'is_correct': False,
                'is_incorrect': False,
                'is_fail_parse': False,
                'option1': option1,
                'option2': option2,
                'true_ans': true_ans,
                'raw_pred': raw_pred
            }
            # parse ans from raw pred
            if '1' in raw_pred and '2' not in raw_pred:
                ans = '1'
            elif '2' in raw_pred and '1' not in raw_pred:
                ans = '2'
            else:
                ans = ''
            res_of_this_round['parsed_pred'] = ans
            if ans == true_ans:
                res_of_this_round['is_correct'] = True
            elif ans == '':
                res_of_this_round['is_fail_parse'] = True
            else:
                res_of_this_round['is_incorrect'] = True

            if idx not in details:
                total += 1
                details[idx] = {
                    'detail': {
                        'hu_text': detail['hu_text'],
                        'en_trans': detail['en_trans'],
                        'en_expl': detail['en_expl'],
                        'hu_expl': detail['hu_expl'],
                        'context': detail['context'],
                        'correct': detail['correct'],
                        'incorrect': detail['incorrect']
                    },
                    'flipped_variance': [res_of_this_round],
                    'is_correct': False,
                    'is_incorrect': False,
                    'is_fail_parse': False
                }
            else:
                details[idx]['flipped_variance'].append(res_of_this_round)
                # judge the results
                if details[idx]['flipped_variance'][0][
                        'is_correct'] and details[idx]['flipped_variance'][1][
                            'is_correct']:
                    correct += 1
                    details[idx]['is_correct'] = True
                elif details[idx]['flipped_variance'][0][
                        'is_fail_parse'] or details[idx]['flipped_variance'][
                            1]['is_fail_parse']:
                    fail_parse += 1
                    details[idx]['is_fail_parse'] = True
                else:
                    incorrect += 1
                    details[idx]['is_incorrect'] = True

        assert total == correct + incorrect + fail_parse
        results = {
            'correct_ratio': correct / total * 100,
            'incorrect_ratio': incorrect / total * 100,
            'fail_parse_ratio': fail_parse / total * 100,
            'details': details
        }

        return results
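As a quick sanity check of the flipped-pair aggregation, a minimal sketch with hypothetical toy values (it assumes the class above is importable from an installed OpenCompass):

# One source question asked twice with the options swapped; it scores as
# correct only if the model is right under both orderings.
from opencompass.datasets.OpenHuEval.HuProverbRea import HuProverb_Evaluator_2CQ

common = {'hu_text': '...', 'en_trans': '...', 'en_expl': '...',
          'hu_expl': '...', 'context': '...', 'correct': 'A', 'incorrect': 'B'}
refs = [
    dict(common, id='q1', option1='A', option2='B', true_ans='1'),
    dict(common, id='q1', option1='B', option2='A', true_ans='2'),
]
res = HuProverb_Evaluator_2CQ().score(
    predictions=['1', '2'],  # the correct option number in both orderings
    references=refs,
    origin_prompt=['prompt (order 1)', 'prompt (order 2)'])
assert res['correct_ratio'] == 100.0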
class HuProverb_Evaluator_OE(BaseEvaluator):
    """
    ref: opencompass.openicl.icl_evaluator.AccwithDetailsEvaluator
    """

    def score(self, predictions, references, origin_prompt) -> dict:

        if len(predictions) != len(references):
            return {'error': 'preds and refs have different lengths.'}

        details = {}
        total, correct, wrong, unclear = 0, 0, 0, 0
        from opencompass.models import OpenAI
        model = OpenAI(path='gpt-4o',
                       max_seq_len=8192,
                       retry=2,
                       temperature=0.1)
        for raw_pred, detail in zip(predictions, references):
            total += 1
            qid = detail['id']
            details[qid] = {
                'proverb': detail['hu_text'],
                'conversation': detail['context'],
                'answer': detail['correct'],
                'raw_pred': raw_pred,
                'correctness': False,
                'ans_fail_parse': False
            }

            # ------------------------------------------- openai judge
            user_prompt = judge_prompt_template['en_user'].format(
                proverb=detail['hu_text'],
                conversation=detail['context'],
                answer=detail['correct'],
                raw_pred=raw_pred)
            system_prompt = judge_prompt_template['en_system']
            details[qid]['judge_user_prompt'] = user_prompt

            messages = PromptList([{
                'role': 'SYSTEM',
                'prompt': system_prompt,
            }, {
                'role': 'HUMAN',
                'prompt': user_prompt,
            }])
            response = model._generate(input=messages,
                                       max_out_len=8192,
                                       temperature=0.1)
            details[qid]['judge_resp'] = response

            if 'yes' in response.lower() and 'no' not in response.lower():
                correct += 1
                details[qid]['correctness'] = True
            elif 'no' in response.lower() and 'yes' not in response.lower():
                wrong += 1
            else:
                unclear += 1
                details[qid]['ans_fail_parse'] = True

        assert total == correct + wrong + unclear
        results = {
            'correct_ratio': correct / total * 100,
            'incorrect_ratio': wrong / total * 100,
            'ans_fail_parse_ratio': unclear / total * 100,
            'details': details
        }
        return results
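Note that HuProverb_Evaluator_OE instantiates a GPT-4o judge inside score(), so OE scoring makes live OpenAI API calls; the OpenCompass OpenAI wrapper reads credentials from the OPENAI_API_KEY environment variable by default (worth verifying against your OpenCompass version), so that must be set when evaluating the OE variant. 2CQ scoring, by contrast, is pure offline string matching.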
opencompass/datasets/OpenHuEval/__init__.py (modified)
@@ -1,2 +1,3 @@
 from .HuMatchingFIB import *  # noqa: F401, F403
+from .HuProverbRea import *  # noqa: F401, F403
 from .HuStandardFIB import *  # noqa: F401, F403