add HuProverbRea dataset (20250203)

This commit is contained in:
gaojunyuan 2025-02-03 21:36:08 +08:00 committed by jxd
parent 794ab7c372
commit f152ccf127
7 changed files with 497 additions and 1 deletion

View File

@@ -0,0 +1,16 @@
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.OpenHuEval.HuProverbRea.HuProverbRea_2CQ import HuProverbRea_datasets
# from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model
# from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
# from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
# from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model
datasets = HuProverbRea_datasets
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
work_dir = './outputs/' + __file__.split('/')[-1].split('.')[0] + '/' # do NOT modify this line, yapf: disable, pylint: disable
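A note on the `models` line above: it scans the config's namespace and flattens every list bound to a name ending in `_model`, so switching models is just a matter of commenting imports in or out. A minimal, self-contained sketch of the same idiom (the `*_model` names here are invented placeholders, not real OpenCompass imports):

# Sketch of the model-collection idiom used by these example configs.
# The *_model variables are invented stand-ins for imported model lists.
demo_a_model = [dict(abbr='demo-a')]
demo_b_model = [dict(abbr='demo-b')]
helper_list = [dict(abbr='ignored')]  # not collected: name lacks '_model'

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
print([m['abbr'] for m in models])  # -> ['demo-a', 'demo-b']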

View File

@@ -0,0 +1,16 @@
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.OpenHuEval.HuProverbRea.HuProverbRea_OE import HuProverbRea_datasets
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import models as lmdeploy_qwen2_7b_instruct_model
# from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
# from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model
# from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
# from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
# from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model
datasets = HuProverbRea_datasets
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
work_dir = './outputs/' + __file__.split('/')[-1].split('.')[0] + '/' # do NOT modify this line, yapf: disable, pylint: disable

View File

@@ -0,0 +1,49 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.OpenHuEval.HuProverbRea import HuProverbDataset2CQ, HuProverb_Evaluator_2CQ
with read_base():
from .prompts import INSTRUCTIONS_DIRECT_QA
# Currently we use English prompts with Hungarian proverbs inserted.
prompt_template_language = 'en'
dataset_path = '/mnt/hwfile/opendatalab/gaojunyuan/shared_data/OpenHuEval/data/HuProverbRea/HuProverbRea_250127'
HuProverbRea_reader_cfg = dict(input_columns=['hu_text', 'context', 'en_expl', 'hu_expl', 'option1', 'option2'],
output_column='out')
HuProverbRea_datasets = []
instruction = INSTRUCTIONS_DIRECT_QA[prompt_template_language]
HuProverbRea_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt=instruction
),
],
),
ice_token='</E>',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
HuProverbRea_eval_cfg = dict(evaluator=dict(type=HuProverb_Evaluator_2CQ))
HuProverbRea_datasets.append(
dict(
abbr=f'HuProverbRea_2CQ_{prompt_template_language}',
type=HuProverbDataset2CQ,
path=dataset_path,
reader_cfg=HuProverbRea_reader_cfg,
infer_cfg=HuProverbRea_infer_cfg,
eval_cfg=HuProverbRea_eval_cfg,
)
)
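For reference, the {hu_text}, {context}, {option1} and {option2} placeholders in the instruction are filled from the reader columns declared above. A hedged sketch of that substitution, using an abridged stand-in template and invented sample values:

# Illustration only: abridged stand-in for INSTRUCTIONS_DIRECT_QA['en'],
# rendered with invented sample values.
template = ('Hungarian Phrase:\n{hu_text}\n\n'
            'Hungarian Context:\n{context}\n\n'
            'Option 1: {option1}\nOption 2: {option2}\nYour answer:')
prompt = template.format(
    hu_text='Aki mer, az nyer.',
    context='"Just apply for the job," she said. "Aki mer, az nyer."',
    option1='She thinks taking the risk can pay off.',
    option2='She thinks one should never gamble.')
print(prompt)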

View File

@@ -0,0 +1,49 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.OpenHuEval.HuProverbRea import HuProverbDatasetOE, HuProverb_Evaluator_OE
with read_base():
from .prompts import INSTRUCTIONS_OE_DIR_QA
# Currently we use English prompts with Hungarian proverbs inserted.
prompt_template_language = 'en'
dataset_path = '/mnt/hwfile/opendatalab/gaojunyuan/shared_data/OpenHuEval/data/HuProverbRea/HuProverbRea_250127'
# The OE rows carry no option columns (see HuProverbDatasetOE.load).
HuProverbRea_reader_cfg = dict(input_columns=['hu_text', 'context', 'en_expl', 'hu_expl'],
                               output_column='out')
HuProverbRea_datasets = []
instruction = INSTRUCTIONS_OE_DIR_QA[prompt_template_language]
HuProverbRea_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt=instruction
),
],
),
ice_token='</E>',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
HuProverbRea_eval_cfg = dict(evaluator=dict(type=HuProverb_Evaluator_OE))
HuProverbRea_datasets.append(
dict(
abbr=f'HuProverbRea_OE_{prompt_template_language}',
type=HuProverbDatasetOE,
path=dataset_path,
reader_cfg=HuProverbRea_reader_cfg,
infer_cfg=HuProverbRea_infer_cfg,
eval_cfg=HuProverbRea_eval_cfg,
)
)
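Both configs point dataset_path at a directory expected to contain a HuProverbRea.jsonl file. Judging from the loaders in HuProverbRea.py further down in this commit, each line is a JSON object shaped roughly as follows (every value below is invented for illustration):

# Inferred record shape for one HuProverbRea.jsonl line; values invented.
example_record = {
    'qid': 'huproverb_0001',
    'context': ['A: I am afraid to ask for a raise.',
                'B: Aki mer, az nyer.'],
    'options': ['Taking the risk can pay off.',
                'One should never gamble.'],
    'answer': 0,  # index of the correct option
    'source_info': {
        'source_id': 'src_0001',
        'proverb': 'Aki mer, az nyer.',
        'en_trans': 'Who dares, wins.',
        'en_expl': 'Courage tends to be rewarded.',
        'hu_expl': 'A bátorság elnyeri a jutalmát.',
    },
}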

View File

@@ -0,0 +1,70 @@
INSTRUCTIONS_DIRECT_QA = {
'en': 'You are a language expert specialized in Hungarian. Given a Hungarian phrase:\n\n' +
'######################\n' +
'Hungarian Phrase:\n' +
'----------------------\n' +
"'{hu_text}'\n" +
'######################\n\n' +
'and a context using this phrase:\n\n' +
'######################\n' +
'Hungarian Context:\n' +
'----------------------\n' +
'{context}\n' +
'######################\n\n' +
'What does the person mean by using this phrase? Please select one correct answer from the following two options:\n\n' +
'######################\n' +
'Options:\n' +
'----------------------\n' +
'Option 1: {option1}\n' +
'Option 2: {option2}\n' +
'######################\n\n' +
"You should only answer the option number, '1' or '2'. Do not output any other content other than the option number. Your answer:"
}
INSTRUCTIONS_OE_DIR_QA = {
'en': 'You are a language expert specialized in Hungarian. Given a Hungarian phrase:\n\n' +
'######################\n' +
'Hungarian Phrase:\n' +
'----------------------\n' +
"'{hu_text}'\n" +
'######################\n\n' +
'and a context using this phrase:\n\n' +
'######################\n' +
'Hungarian Context:\n' +
'----------------------\n' +
'{context}\n' +
'######################\n\n' +
'What does the person mean by using this phrase? Please do not just explain the meaning of the proverb itself; instead, describe the true intention of the person who said the proverb (not the other person talking to them), based on the context. Please answer concisely in one sentence:',
'hu': 'Ön magyar nyelvi szakértő. Adott egy magyar kifejezés:\n\n' +
'######################\n' +
'Magyar kifejezés:\n' +
'----------------------\n' +
"'{hu_text}'\n" +
'######################\n\n' +
'és egy szövegkörnyezet, amely ezt a kifejezést használja:\n\n' +
'######################\n' +
'Magyar kontextus:\n' +
'----------------------\n' +
'{context}\n' +
'######################\n\n' +
'Mire gondol az illető, amikor ezt a kifejezést használja? Kérjük, ne csak magának a közmondásnak a jelentését magyarázza meg, hanem a szövegkörnyezet alapján írja le a közmondást kimondó személy (nem a vele beszélgető másik személy) valódi szándékát. Kérjük, válaszoljon tömören, egy mondatban:'
}
judge_prompt_template = {
'en_system':
"Please act as an impartial judge specialized in Hungarian language and culture. Given a Hungarian saying, a context using that saying, and two analyses explaining 'what does the person mean by using that saying in the context?', please decide whether the given two analyses express the same meaning. If they reflect the same understanding of the saying's meaning, you should answer YES. If they are based on different interpretations of the saying, you should answer NO. Do not output anything other than 'YES' or 'NO'. Avoid any position biases and ensure that the order in which the analyses were presented does not influence your decision. Do not allow the length of the analyses to influence your judge, focus on their core meanings and their understandings of the Hungarian saying.",
'en_user':
'[The start of Hungarian saying]\n' +
'{proverb}\n' +
'[The end of Hungarian saying]\n\n' +
'[The start of the context]\n' +
'{conversation}\n' +
'[The end of the context]\n\n' +
'[The start of the first analysis]\n' +
'{answer}\n' +
'[The end of the first analysis]\n\n' +
'[The start of the second analysis]\n' +
'{raw_pred}\n'+
'[The end of the second analysis]\n\n' +
'Your decision:'
}
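The judge template above is consumed by HuProverb_Evaluator_OE in HuProverbRea.py below; the user message is built with a plain str.format call. A minimal sketch with invented values:

# How the OE evaluator fills the judge template (values invented).
user_prompt = judge_prompt_template['en_user'].format(
    proverb='Aki mer, az nyer.',
    conversation='"Just apply for the job," she said. "Aki mer, az nyer."',
    answer='She encourages him to take the risk.',
    raw_pred='The speaker urges the listener to be brave and try.')
system_prompt = judge_prompt_template['en_system']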

View File

@@ -0,0 +1,295 @@
import json
import os
from datasets import Dataset
from opencompass.configs.datasets.OpenHuEval.HuProverbRea.prompts import \
judge_prompt_template
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.utils.prompt import PromptList
from ..base import BaseDataset
class HuProverbDataset2CQ(BaseDataset):
@staticmethod
def load(**kwargs):
path = kwargs.get('path', None)
        sub_dataset_file = os.path.join(path, 'HuProverbRea.jsonl')
        with open(sub_dataset_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()
out_dict_list = []
for line in lines:
obj = json.loads(line)
if len(obj['context']) > 1:
obj['context'] = '\n'.join(
[x.strip() for x in obj['context'] if x])
else:
obj['context'] = obj['context'][0]
if obj['answer'] == 0:
cor_ops = obj['options'][0]
w_ops = obj['options'][1]
else:
cor_ops = obj['options'][1]
w_ops = obj['options'][0]
new_obj_1 = {
'hu_text': obj['source_info']['proverb'],
'context': obj['context'],
'en_expl': obj['source_info']['en_expl'],
'hu_expl': obj['source_info']['hu_expl'],
'option1': cor_ops,
'option2': w_ops,
'out': {
'true_ans': '1',
'id': obj['qid'],
'source_id': obj['source_info']['source_id'],
'en_expl': obj['source_info']['en_expl'],
'en_trans': obj['source_info']['en_trans'],
'hu_expl': obj['source_info']['hu_expl'],
'hu_text': obj['source_info']['proverb'],
'context': obj['context'],
'option1': cor_ops,
'option2': w_ops,
'correct': cor_ops,
'incorrect': w_ops
}
}
new_obj_2 = {
'hu_text': obj['source_info']['proverb'],
'context': obj['context'],
'en_expl': obj['source_info']['en_expl'],
'hu_expl': obj['source_info']['hu_expl'],
'option1': w_ops,
'option2': cor_ops,
'out': {
'true_ans': '2',
'id': obj['qid'],
'source_id': obj['source_info']['source_id'],
'en_expl': obj['source_info']['en_expl'],
'en_trans': obj['source_info']['en_trans'],
'hu_expl': obj['source_info']['hu_expl'],
'hu_text': obj['source_info']['proverb'],
'context': obj['context'],
'option1': w_ops,
'option2': cor_ops,
'correct': cor_ops,
'incorrect': w_ops
}
}
out_dict_list.append(new_obj_1)
out_dict_list.append(new_obj_2)
dataset = Dataset.from_list(out_dict_list)
return dataset
class HuProverbDatasetOE(BaseDataset):
@staticmethod
def load(**kwargs):
path = kwargs.get('path', None)
        sub_dataset_file = os.path.join(path, 'HuProverbRea.jsonl')
        with open(sub_dataset_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()
out_dict_list = []
for line in lines:
obj = json.loads(line)
if len(obj['context']) > 1:
obj['context'] = '\n'.join(
[x.strip() for x in obj['context'] if x])
else:
obj['context'] = obj['context'][0]
if obj['answer'] == 0:
cor_ops = obj['options'][0]
w_ops = obj['options'][1]
else:
cor_ops = obj['options'][1]
w_ops = obj['options'][0]
new_obj = {
'hu_text': obj['source_info']['proverb'],
'context': obj['context'],
'en_expl': obj['source_info']['en_expl'],
'hu_expl': obj['source_info']['hu_expl'],
'out': {
'id': obj['qid'],
'source_id': obj['source_info']['source_id'],
'en_expl': obj['source_info']['en_expl'],
'en_trans': obj['source_info']['en_trans'],
'hu_expl': obj['source_info']['hu_expl'],
'hu_text': obj['source_info']['proverb'],
'context': obj['context'],
'correct': cor_ops,
'incorrect': w_ops
}
}
out_dict_list.append(new_obj)
dataset = Dataset.from_list(out_dict_list)
return dataset
class HuProverb_Evaluator_2CQ(BaseEvaluator):
"""
ref: opencompass.openicl.icl_evaluator.AccwithDetailsEvaluator
"""
def score(self, predictions, references, origin_prompt) -> dict:
if len(predictions) != len(references):
            return {'error': 'preds and references have different lengths.'}
details = {}
total, correct, incorrect, fail_parse = 0, 0, 0, 0
for raw_pred, detail, ori_prompt in zip(predictions, references,
origin_prompt):
idx = detail['id']
option1 = detail['option1']
option2 = detail['option2']
true_ans = detail['true_ans']
res_of_this_round = {
'origin_prompt': ori_prompt,
'is_correct': False,
'is_incorrect': False,
'is_fail_parse': False,
'option1': option1,
'option2': option2,
'true_ans': true_ans,
'raw_pred': raw_pred
}
# parse ans from raw pred
if '1' in raw_pred and '2' not in raw_pred:
ans = '1'
elif '2' in raw_pred and '1' not in raw_pred:
ans = '2'
else:
ans = ''
res_of_this_round['parsed_pred'] = ans
if ans == true_ans:
res_of_this_round['is_correct'] = True
elif ans == '':
res_of_this_round['is_fail_parse'] = True
else:
res_of_this_round['is_incorrect'] = True
if idx not in details:
total += 1
details[idx] = {
'detail': {
'hu_text': detail['hu_text'],
'en_trans': detail['en_trans'],
'en_expl': detail['en_expl'],
'hu_expl': detail['hu_expl'],
'context': detail['context'],
'correct': detail['correct'],
'incorrect': detail['incorrect']
},
'flipped_variance': [res_of_this_round],
'is_correct': False,
'is_incorrect': False,
'is_fail_parse': False
}
else:
details[idx]['flipped_variance'].append(res_of_this_round)
# judge the results
if details[idx]['flipped_variance'][0][
'is_correct'] and details[idx]['flipped_variance'][1][
'is_correct']:
correct += 1
details[idx]['is_correct'] = True
elif details[idx]['flipped_variance'][0][
'is_fail_parse'] or details[idx]['flipped_variance'][
1]['is_fail_parse']:
fail_parse += 1
details[idx]['is_fail_parse'] = True
else:
incorrect += 1
details[idx]['is_incorrect'] = True
assert total == correct + incorrect + fail_parse
results = {
'correct_ratio': correct / total * 100,
'incorrect_ratio': incorrect / total * 100,
'fail_parse_ratio': fail_parse / total * 100,
'details': details
}
return results
class HuProverb_Evaluator_OE(BaseEvaluator):
"""
ref: opencompass.openicl.icl_evaluator.AccwithDetailsEvaluator
"""
def score(self, predictions, references, origin_prompt) -> dict:
if len(predictions) != len(references):
            return {'error': 'preds and references have different lengths.'}
details = {}
total, correct, wrong, unclear = 0, 0, 0, 0
        # Imported lazily so the OpenAI client is only required when judging.
        from opencompass.models import OpenAI
        model = OpenAI(path='gpt-4o',
                       max_seq_len=8192,
                       retry=2,
                       temperature=0.1)
for raw_pred, detail in zip(predictions, references):
total += 1
qid = detail['id']
details[qid] = {
'proverb': detail['hu_text'],
'conversation': detail['context'],
'answer': detail['correct'],
'raw_pred': raw_pred,
'correctness': False,
'ans_fail_parse': False
}
# ------------------------------------------- openai judge
user_prompt = judge_prompt_template['en_user'].format(
proverb=detail['hu_text'],
conversation=detail['context'],
answer=detail['correct'],
raw_pred=raw_pred)
system_prompt = judge_prompt_template['en_system']
details[qid]['judge_user_prompt'] = user_prompt
messages = PromptList([{
'role': 'SYSTEM',
'prompt': system_prompt,
}, {
'role': 'HUMAN',
'prompt': user_prompt,
}])
response = model._generate(input=messages,
max_out_len=8192,
temperature=0.1)
details[qid]['judge_resp'] = response
if 'yes' in response.lower() and 'no' not in response.lower():
correct += 1
details[qid]['correctness'] = True
elif 'no' in response.lower() and 'yes' not in response.lower():
wrong += 1
else:
unclear += 1
details[qid]['ans_fail_parse'] = True
assert total == correct + wrong + unclear
results = {
'correct_ratio': correct / total * 100,
'incorrect_ratio': wrong / total * 100,
'ans_fail_parse_ratio': unclear / total * 100,
'details': details
}
return results
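Note the scoring rule the 2CQ evaluator implements: each source question appears twice with the option order flipped (see HuProverbDataset2CQ.load), and it only counts as correct when both variants are answered correctly; a failed parse on either variant marks the whole item as fail_parse. A tiny worked example of the same rule, with invented predictions:

# Worked example of the flipped-pair rule in HuProverb_Evaluator_2CQ.
# Each entry holds (parsed_pred, true_ans) for the two option orderings.
pairs = {
    'q1': [('1', '1'), ('2', '2')],  # both right        -> correct
    'q2': [('1', '1'), ('1', '2')],  # one wrong         -> incorrect
    'q3': [('1', '1'), ('', '2')],   # one failed parse  -> fail_parse
}
for qid, variants in pairs.items():
    if any(pred == '' for pred, _ in variants):
        verdict = 'fail_parse'
    elif all(pred == gold for pred, gold in variants):
        verdict = 'correct'
    else:
        verdict = 'incorrect'
    print(qid, verdict)  # q1 correct / q2 incorrect / q3 fail_parse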

View File

@@ -1,2 +1,3 @@
from .HuMatchingFIB import * # noqa: F401, F403
from .HuProverbRea import * # noqa: F401, F403
from .HuStandardFIB import * # noqa: F401, F403