update HuProverb config and eval

This commit is contained in:
wujiang 2025-02-04 15:18:40 +08:00 committed by jxd
parent 7586186897
commit 08712f49f2
6 changed files with 80 additions and 66 deletions

View File

@ -2,13 +2,13 @@ from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.OpenHuEval.HuProverbRea.HuProverbRea_2CQ import HuProverbRea_datasets
# from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
# from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model
# from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
# from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
# from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model
datasets = HuProverbRea_datasets

View File

@ -2,13 +2,13 @@ from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.OpenHuEval.HuProverbRea.HuProverbRea_OE import HuProverbRea_datasets
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import models as lmdeploy_qwen2_7b_instruct_model
# from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
# from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model
# from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
# from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
# from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model
datasets = HuProverbRea_datasets

View File

@ -6,14 +6,17 @@ from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.OpenHuEval.HuProverbRea import HuProverbDataset2CQ, HuProverb_Evaluator_2CQ
with read_base():
from .prompts import INSTRUCTIONS_DIRECT_QA
from .HuProverbRea_setting import INSTRUCTIONS_DIRECT_QA, DATA_PATH, DATA_VERSION
# Currently we use English prompt templates with the Hungarian proverbs inserted.
prompt_template_language = 'en'
dataset_path = '/mnt/hwfile/opendatalab/gaojunyuan/shared_data/OpenHuEval/data/HuProverbRea/HuProverbRea_250127'
HuProverbRea_reader_cfg = dict(input_columns=['hu_text', 'context', 'en_expl', 'hu_expl', 'option1', 'option2'],
output_column='out')
HuProverbRea_reader_cfg = dict(
input_columns=[
'hu_text', 'context', 'en_expl', 'hu_expl', 'option1', 'option2'
],
output_column='out',
)
HuProverbRea_datasets = []
instruction = INSTRUCTIONS_DIRECT_QA[prompt_template_language]
@ -23,10 +26,7 @@ HuProverbRea_infer_cfg = dict(
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt=instruction
),
dict(role='HUMAN', prompt=instruction),
],
),
ice_token='</E>',
@ -39,11 +39,11 @@ HuProverbRea_eval_cfg = dict(evaluator=dict(type=HuProverb_Evaluator_2CQ))
HuProverbRea_datasets.append(
dict(
abbr=f'HuProverbRea_2CQ_{prompt_template_language}',
abbr=
f'HuProverbRea_{DATA_VERSION}_2CQ-prompt_{prompt_template_language}',
type=HuProverbDataset2CQ,
path=dataset_path,
filepath=DATA_PATH,
reader_cfg=HuProverbRea_reader_cfg,
infer_cfg=HuProverbRea_infer_cfg,
eval_cfg=HuProverbRea_eval_cfg,
)
)
))

View File

@ -6,14 +6,17 @@ from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.OpenHuEval.HuProverbRea import HuProverbDatasetOE, HuProverb_Evaluator_OE
with read_base():
from .prompts import INSTRUCTIONS_OE_DIR_QA
from .HuProverbRea_setting import INSTRUCTIONS_OE_DIR_QA, DATA_PATH, DATA_VERSION, judge_prompt_template
# Currently we use English prompt templates with the Hungarian proverbs inserted.
prompt_template_language = 'en'
dataset_path = '/mnt/hwfile/opendatalab/gaojunyuan/shared_data/OpenHuEval/data/HuProverbRea/HuProverbRea_250127'
HuProverbRea_reader_cfg = dict(input_columns=['hu_text', 'context', 'en_expl', 'hu_expl', 'option1', 'option2'],
output_column='out')
HuProverbRea_reader_cfg = dict(
input_columns=[
'hu_text', 'context', 'en_expl', 'hu_expl', 'option1', 'option2'
],
output_column='out',
)
HuProverbRea_datasets = []
instruction = INSTRUCTIONS_OE_DIR_QA[prompt_template_language]
@ -23,10 +26,7 @@ HuProverbRea_infer_cfg = dict(
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt=instruction
),
dict(role='HUMAN', prompt=instruction),
],
),
ice_token='</E>',
@ -35,15 +35,18 @@ HuProverbRea_infer_cfg = dict(
inferencer=dict(type=GenInferencer),
)
HuProverbRea_eval_cfg = dict(evaluator=dict(type=HuProverb_Evaluator_OE))
HuProverbRea_eval_cfg = dict(evaluator=dict(
type=HuProverb_Evaluator_OE,
judge_prompt_template=judge_prompt_template,
))
HuProverbRea_datasets.append(
dict(
abbr=f'HuProverbRea_OE_{prompt_template_language}',
abbr=
f'HuProverbRea_{DATA_VERSION}_OE-prompt_{prompt_template_language}',
type=HuProverbDatasetOE,
path=dataset_path,
filepath=DATA_PATH,
reader_cfg=HuProverbRea_reader_cfg,
infer_cfg=HuProverbRea_infer_cfg,
eval_cfg=HuProverbRea_eval_cfg,
)
)
))

View File

@ -1,3 +1,5 @@
# yapf: disable
INSTRUCTIONS_DIRECT_QA = {
'en': 'You are a language expert specialized in Hungarian. Given a Hungarian phrase:\n\n' +
'######################\n' +
@ -68,3 +70,8 @@ judge_prompt_template = {
'[The end of the second analysis]\n\n' +
'Your decision:'
}
OpenHuEval_Path = '/mnt/hwfile/opendatalab/wj/proj/polyglot_24July/OpenHuEval'
DATA_VERSION = '250127'
DATA_PATH = f'{OpenHuEval_Path}/data/HuProverbRea/HuProverbRea_{DATA_VERSION}/HuProverbRea.jsonl'

View File

@ -3,8 +3,6 @@ import os
from datasets import Dataset, DatasetDict
from opencompass.configs.datasets.OpenHuEval.HuProverbRea.prompts import \
judge_prompt_template
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.utils.prompt import PromptList
@ -14,12 +12,10 @@ from ..base import BaseDataset
class HuProverbDataset2CQ(BaseDataset):
@staticmethod
def load(**kwargs):
path = kwargs.get('path', None)
def load(filepath):
assert os.path.isfile(filepath)
dataset = DatasetDict()
sub_dataset_file = os.path.join(path,
'{}.jsonl'.format('HuProverbRea'))
f = open(sub_dataset_file, 'r', encoding='utf-8')
f = open(filepath, 'r', encoding='utf-8')
lines = f.readlines()
out_dict_list = []
for line in lines:
@ -46,7 +42,7 @@ class HuProverbDataset2CQ(BaseDataset):
'option2': w_ops,
'out': {
'true_ans': '1',
'id': obj['qid'],
'qid': obj['qid'],
'source_id': obj['source_info']['source_id'],
'en_expl': obj['source_info']['en_expl'],
'en_trans': obj['source_info']['en_trans'],
@ -69,7 +65,7 @@ class HuProverbDataset2CQ(BaseDataset):
'option2': cor_ops,
'out': {
'true_ans': '2',
'id': obj['qid'],
'qid': obj['qid'],
'source_id': obj['source_info']['source_id'],
'en_expl': obj['source_info']['en_expl'],
'en_trans': obj['source_info']['en_trans'],
@ -93,12 +89,10 @@ class HuProverbDataset2CQ(BaseDataset):
class HuProverbDatasetOE(BaseDataset):
@staticmethod
def load(**kwargs):
path = kwargs.get('path', None)
def load(filepath):
assert os.path.isfile(filepath)
dataset = DatasetDict()
sub_dataset_file = os.path.join(path,
'{}.jsonl'.format('HuProverbRea'))
f = open(sub_dataset_file, 'r', encoding='utf-8')
f = open(filepath, 'r', encoding='utf-8')
lines = f.readlines()
out_dict_list = []
for line in lines:
@ -121,7 +115,7 @@ class HuProverbDatasetOE(BaseDataset):
'en_expl': obj['source_info']['en_expl'],
'hu_expl': obj['source_info']['hu_expl'],
'out': {
'id': obj['qid'],
'qid': obj['qid'],
'source_id': obj['source_info']['source_id'],
'en_expl': obj['source_info']['en_expl'],
'en_trans': obj['source_info']['en_trans'],
@ -152,7 +146,7 @@ class HuProverb_Evaluator_2CQ(BaseEvaluator):
total, correct, incorrect, fail_parse = 0, 0, 0, 0
for raw_pred, detail, ori_prompt in zip(predictions, references,
origin_prompt):
idx = detail['id']
qid = detail['qid']
option1 = detail['option1']
option2 = detail['option2']
true_ans = detail['true_ans']
@ -181,9 +175,9 @@ class HuProverb_Evaluator_2CQ(BaseEvaluator):
else:
res_of_this_round['is_incorrect'] = True
if idx not in details:
if qid not in details:
total += 1
details[idx] = {
details[qid] = {
'detail': {
'hu_text': detail['hu_text'],
'en_trans': detail['en_trans'],
@ -199,21 +193,21 @@ class HuProverb_Evaluator_2CQ(BaseEvaluator):
'is_fail_parse': False
}
else:
details[idx]['flipped_variance'].append(res_of_this_round)
details[qid]['flipped_variance'].append(res_of_this_round)
# judge the results
if details[idx]['flipped_variance'][0][
'is_correct'] and details[idx]['flipped_variance'][1][
if details[qid]['flipped_variance'][0][
'is_correct'] and details[qid]['flipped_variance'][1][
'is_correct']:
correct += 1
details[idx]['is_correct'] = True
elif details[idx]['flipped_variance'][0][
'is_fail_parse'] or details[idx]['flipped_variance'][
details[qid]['is_correct'] = True
elif details[qid]['flipped_variance'][0][
'is_fail_parse'] or details[qid]['flipped_variance'][
1]['is_fail_parse']:
fail_parse += 1
details[idx]['is_fail_parse'] = True
details[qid]['is_fail_parse'] = True
else:
incorrect += 1
details[idx]['is_incorrect'] = True
details[qid]['is_incorrect'] = True
assert total == correct + incorrect + fail_parse
results = {
@ -227,9 +221,16 @@ class HuProverb_Evaluator_2CQ(BaseEvaluator):
class HuProverb_Evaluator_OE(BaseEvaluator):
"""
ref: opencompass.openicl.icl_evaluator.AccwithDetailsEvaluator
"""
# Set up the open-ended evaluator: keep the judge prompt templates and the
# OpenAI key / proxy URL on the instance so that score() can read them.
def __init__(self,
judge_prompt_template,
openai_key='ENV',
openai_proxy_url='ENV',
**kwargs):
# Any remaining keyword arguments are forwarded unchanged to BaseEvaluator.
super().__init__(**kwargs)
# score() indexes this mapping with the 'en_user' and 'en_system' keys.
self.judge_prompt_template = judge_prompt_template
# score() passes these two values as key= and openai_proxy_url= when it
# builds the OpenAI judge model.
# NOTE(review): the 'ENV' default presumably means "take the value from the
# environment" — confirm against opencompass.models.OpenAI.
self.openai_key = openai_key
self.openai_proxy_url = openai_proxy_url
def score(self, predictions, references, origin_prompt) -> dict:
@ -239,13 +240,16 @@ class HuProverb_Evaluator_OE(BaseEvaluator):
details = {}
total, correct, wrong, unclear = 0, 0, 0, 0
from opencompass.models import OpenAI
model = OpenAI(path='gpt-4o',
model = OpenAI(path='gpt-4o-2024-08-06',
key=self.openai_key,
openai_proxy_url=self.openai_proxy_url,
max_seq_len=8192,
retry=2,
temperature=0.1)
temperature=0,
verbose=True)
for raw_pred, detail in zip(predictions, references):
total += 1
qid = detail['id']
qid = detail['qid']
details[qid] = {
'proverb': detail['hu_text'],
'conversation': detail['context'],
@ -256,12 +260,12 @@ class HuProverb_Evaluator_OE(BaseEvaluator):
}
# ------------------------------------------- openai judge
user_prompt = judge_prompt_template['en_user'].format(
user_prompt = self.judge_prompt_template['en_user'].format(
proverb=detail['hu_text'],
conversation=detail['context'],
answer=detail['correct'],
raw_pred=raw_pred)
system_prompt = judge_prompt_template['en_system']
system_prompt = self.judge_prompt_template['en_system']
details[qid]['judge_user_prompt'] = user_prompt
messages = PromptList([{