mirror of https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00

update HuProverb config and eval

commit 08712f49f2 (parent 7586186897)
@@ -3,12 +3,12 @@ from mmengine.config import read_base

 with read_base():
     from opencompass.configs.datasets.OpenHuEval.HuProverbRea.HuProverbRea_2CQ import HuProverbRea_datasets

-    # from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
-    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
+    from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
+    # from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
     # from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model

     # from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
-    # from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
+    from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
     # from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model

 datasets = HuProverbRea_datasets
@@ -2,13 +2,13 @@ from mmengine.config import read_base

 with read_base():
     from opencompass.configs.datasets.OpenHuEval.HuProverbRea.HuProverbRea_OE import HuProverbRea_datasets
-    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import models as lmdeploy_qwen2_7b_instruct_model
-    # from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
+
+    from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
+    # from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
     # from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model

     # from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
-    # from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
+    from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
     # from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model

 datasets = HuProverbRea_datasets
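Note: both entry configs presumably collect the imported model lists into `models` further down with the usual OpenCompass idiom; that line sits outside these hunks, so the sketch below is an assumption, not part of the commit:

# Assumed surrounding boilerplate (not visible in this diff): gather every
# imported `*_model` list into the `models` list OpenCompass expects.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])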
@@ -6,14 +6,17 @@ from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.datasets.OpenHuEval.HuProverbRea import HuProverbDataset2CQ, HuProverb_Evaluator_2CQ

 with read_base():
-    from .prompts import INSTRUCTIONS_DIRECT_QA
+    from .HuProverbRea_setting import INSTRUCTIONS_DIRECT_QA, DATA_PATH, DATA_VERSION

 # currently we use English prompts with hu proverbs inserted
 prompt_template_language = 'en'
-dataset_path = '/mnt/hwfile/opendatalab/gaojunyuan/shared_data/OpenHuEval/data/HuProverbRea/HuProverbRea_250127'

-HuProverbRea_reader_cfg = dict(input_columns=['hu_text', 'context', 'en_expl', 'hu_expl', 'option1', 'option2'],
-                               output_column='out')
+HuProverbRea_reader_cfg = dict(
+    input_columns=[
+        'hu_text', 'context', 'en_expl', 'hu_expl', 'option1', 'option2'
+    ],
+    output_column='out',
+)

 HuProverbRea_datasets = []
 instruction = INSTRUCTIONS_DIRECT_QA[prompt_template_language]
@@ -23,10 +26,7 @@ HuProverbRea_infer_cfg = dict(
         template=dict(
             begin='</E>',
             round=[
-                dict(
-                    role='HUMAN',
-                    prompt=instruction
-                ),
+                dict(role='HUMAN', prompt=instruction),
             ],
         ),
         ice_token='</E>',
@@ -39,11 +39,11 @@ HuProverbRea_eval_cfg = dict(evaluator=dict(type=HuProverb_Evaluator_2CQ))

 HuProverbRea_datasets.append(
     dict(
-        abbr=f'HuProverbRea_2CQ_{prompt_template_language}',
+        abbr=
+        f'HuProverbRea_{DATA_VERSION}_2CQ-prompt_{prompt_template_language}',
         type=HuProverbDataset2CQ,
-        path=dataset_path,
+        filepath=DATA_PATH,
         reader_cfg=HuProverbRea_reader_cfg,
         infer_cfg=HuProverbRea_infer_cfg,
         eval_cfg=HuProverbRea_eval_cfg,
-    )
-)
+    ))
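With the constants introduced by this commit, the new abbr embeds the data version, so result files are self-describing. A trivially runnable restatement, using only values shown in the diff:

# How the new abbr renders at runtime, with the values from this commit.
DATA_VERSION = '250127'
prompt_template_language = 'en'
abbr = f'HuProverbRea_{DATA_VERSION}_2CQ-prompt_{prompt_template_language}'
assert abbr == 'HuProverbRea_250127_2CQ-prompt_en'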
@@ -6,14 +6,17 @@ from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.datasets.OpenHuEval.HuProverbRea import HuProverbDatasetOE, HuProverb_Evaluator_OE

 with read_base():
-    from .prompts import INSTRUCTIONS_OE_DIR_QA
+    from .HuProverbRea_setting import INSTRUCTIONS_OE_DIR_QA, DATA_PATH, DATA_VERSION, judge_prompt_template

 # currently we use English prompts with hu proverbs inserted
 prompt_template_language = 'en'
-dataset_path = '/mnt/hwfile/opendatalab/gaojunyuan/shared_data/OpenHuEval/data/HuProverbRea/HuProverbRea_250127'

-HuProverbRea_reader_cfg = dict(input_columns=['hu_text', 'context', 'en_expl', 'hu_expl', 'option1', 'option2'],
-                               output_column='out')
+HuProverbRea_reader_cfg = dict(
+    input_columns=[
+        'hu_text', 'context', 'en_expl', 'hu_expl', 'option1', 'option2'
+    ],
+    output_column='out',
+)

 HuProverbRea_datasets = []
 instruction = INSTRUCTIONS_OE_DIR_QA[prompt_template_language]
@@ -23,10 +26,7 @@ HuProverbRea_infer_cfg = dict(
         template=dict(
             begin='</E>',
             round=[
-                dict(
-                    role='HUMAN',
-                    prompt=instruction
-                ),
+                dict(role='HUMAN', prompt=instruction),
             ],
         ),
         ice_token='</E>',
@@ -35,15 +35,18 @@ HuProverbRea_infer_cfg = dict(
     inferencer=dict(type=GenInferencer),
 )

-HuProverbRea_eval_cfg = dict(evaluator=dict(type=HuProverb_Evaluator_OE))
+HuProverbRea_eval_cfg = dict(evaluator=dict(
+    type=HuProverb_Evaluator_OE,
+    judge_prompt_template=judge_prompt_template,
+))

 HuProverbRea_datasets.append(
     dict(
-        abbr=f'HuProverbRea_OE_{prompt_template_language}',
+        abbr=
+        f'HuProverbRea_{DATA_VERSION}_OE-prompt_{prompt_template_language}',
         type=HuProverbDatasetOE,
-        path=dataset_path,
+        filepath=DATA_PATH,
         reader_cfg=HuProverbRea_reader_cfg,
         infer_cfg=HuProverbRea_infer_cfg,
         eval_cfg=HuProverbRea_eval_cfg,
-    )
-)
+    ))
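The OE evaluator now receives judge_prompt_template through eval_cfg rather than importing it at module scope, which keeps the prompt definition in one place. A minimal sketch of the kwargs flow, with a hypothetical stub standing in for HuProverb_Evaluator_OE (the real construction goes through OpenCompass's config machinery):

# Sketch only: how dict(type=..., judge_prompt_template=...) reaches the
# evaluator's __init__. StubEvaluator is a stand-in, not OpenCompass code.
class StubEvaluator:
    def __init__(self, judge_prompt_template):
        self.judge_prompt_template = judge_prompt_template

eval_cfg = dict(evaluator=dict(type=StubEvaluator,
                               judge_prompt_template={'en_system': '...'}))
cfg = dict(eval_cfg['evaluator'])    # copy so the config stays intact
evaluator = cfg.pop('type')(**cfg)   # remaining keys become kwargs
assert evaluator.judge_prompt_template['en_system'] == '...'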
@@ -1,3 +1,5 @@
+# yapf: disable
+
 INSTRUCTIONS_DIRECT_QA = {
     'en': 'You are a language expert specialized in Hungarian. Given a Hungarian phrase:\n\n' +
           '######################\n' +
@@ -68,3 +70,8 @@ judge_prompt_template = {
     '[The end of the second analysis]\n\n' +
     'Your decision:'
 }
+
+
+OpenHuEval_Path = '/mnt/hwfile/opendatalab/wj/proj/polyglot_24July/OpenHuEval'
+DATA_VERSION = '250127'
+DATA_PATH = f'{OpenHuEval_Path}/data/HuProverbRea/HuProverbRea_{DATA_VERSION}/HuProverbRea.jsonl'
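For reference, the new module-level constants resolve as follows (a runnable restatement of the added lines, nothing new):

OpenHuEval_Path = '/mnt/hwfile/opendatalab/wj/proj/polyglot_24July/OpenHuEval'
DATA_VERSION = '250127'
DATA_PATH = f'{OpenHuEval_Path}/data/HuProverbRea/HuProverbRea_{DATA_VERSION}/HuProverbRea.jsonl'
assert DATA_PATH.endswith('HuProverbRea_250127/HuProverbRea.jsonl')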
@@ -3,8 +3,6 @@ import os

 from datasets import Dataset, DatasetDict

-from opencompass.configs.datasets.OpenHuEval.HuProverbRea.prompts import \
-    judge_prompt_template
 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.utils.prompt import PromptList

@@ -14,12 +12,10 @@ from ..base import BaseDataset
 class HuProverbDataset2CQ(BaseDataset):

     @staticmethod
-    def load(**kwargs):
-        path = kwargs.get('path', None)
+    def load(filepath):
+        assert os.path.isfile(filepath)
         dataset = DatasetDict()
-        sub_dataset_file = os.path.join(path,
-                                        '{}.jsonl'.format('HuProverbRea'))
-        f = open(sub_dataset_file, 'r', encoding='utf-8')
+        f = open(filepath, 'r', encoding='utf-8')
         lines = f.readlines()
         out_dict_list = []
         for line in lines:
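The loader now takes the JSONL file directly instead of a directory plus a hard-coded 'HuProverbRea.jsonl' name. A self-contained stand-in for the new read path (assumption: one JSON object per line, as the loop above implies):

import json
import os

# Minimal stand-in for the new load(filepath) entry point: read one
# JSON record per line from the configured file.
def load_jsonl(filepath):
    assert os.path.isfile(filepath)
    with open(filepath, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]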
@@ -46,7 +42,7 @@ class HuProverbDataset2CQ(BaseDataset):
                 'option2': w_ops,
                 'out': {
                     'true_ans': '1',
-                    'id': obj['qid'],
+                    'qid': obj['qid'],
                     'source_id': obj['source_info']['source_id'],
                     'en_expl': obj['source_info']['en_expl'],
                     'en_trans': obj['source_info']['en_trans'],
@@ -69,7 +65,7 @@ class HuProverbDataset2CQ(BaseDataset):
                 'option2': cor_ops,
                 'out': {
                     'true_ans': '2',
-                    'id': obj['qid'],
+                    'qid': obj['qid'],
                     'source_id': obj['source_info']['source_id'],
                     'en_expl': obj['source_info']['en_expl'],
                     'en_trans': obj['source_info']['en_trans'],
@@ -93,12 +89,10 @@ class HuProverbDataset2CQ(BaseDataset):
 class HuProverbDatasetOE(BaseDataset):

     @staticmethod
-    def load(**kwargs):
-        path = kwargs.get('path', None)
+    def load(filepath):
+        assert os.path.isfile(filepath)
         dataset = DatasetDict()
-        sub_dataset_file = os.path.join(path,
-                                        '{}.jsonl'.format('HuProverbRea'))
-        f = open(sub_dataset_file, 'r', encoding='utf-8')
+        f = open(filepath, 'r', encoding='utf-8')
         lines = f.readlines()
         out_dict_list = []
         for line in lines:
@@ -121,7 +115,7 @@ class HuProverbDatasetOE(BaseDataset):
                 'en_expl': obj['source_info']['en_expl'],
                 'hu_expl': obj['source_info']['hu_expl'],
                 'out': {
-                    'id': obj['qid'],
+                    'qid': obj['qid'],
                     'source_id': obj['source_info']['source_id'],
                     'en_expl': obj['source_info']['en_expl'],
                     'en_trans': obj['source_info']['en_trans'],
@@ -152,7 +146,7 @@ class HuProverb_Evaluator_2CQ(BaseEvaluator):
         total, correct, incorrect, fail_parse = 0, 0, 0, 0
         for raw_pred, detail, ori_prompt in zip(predictions, references,
                                                 origin_prompt):
-            idx = detail['id']
+            qid = detail['qid']
             option1 = detail['option1']
             option2 = detail['option2']
             true_ans = detail['true_ans']
@@ -181,9 +175,9 @@ class HuProverb_Evaluator_2CQ(BaseEvaluator):
             else:
                 res_of_this_round['is_incorrect'] = True

-            if idx not in details:
+            if qid not in details:
                 total += 1
-                details[idx] = {
+                details[qid] = {
                     'detail': {
                         'hu_text': detail['hu_text'],
                         'en_trans': detail['en_trans'],
@@ -199,21 +193,21 @@ class HuProverb_Evaluator_2CQ(BaseEvaluator):
                     'is_fail_parse': False
                 }
             else:
-                details[idx]['flipped_variance'].append(res_of_this_round)
+                details[qid]['flipped_variance'].append(res_of_this_round)
                 # judge the results
-                if details[idx]['flipped_variance'][0][
-                        'is_correct'] and details[idx]['flipped_variance'][1][
+                if details[qid]['flipped_variance'][0][
+                        'is_correct'] and details[qid]['flipped_variance'][1][
                             'is_correct']:
                     correct += 1
-                    details[idx]['is_correct'] = True
-                elif details[idx]['flipped_variance'][0][
-                        'is_fail_parse'] or details[idx]['flipped_variance'][
+                    details[qid]['is_correct'] = True
+                elif details[qid]['flipped_variance'][0][
+                        'is_fail_parse'] or details[qid]['flipped_variance'][
                             1]['is_fail_parse']:
                     fail_parse += 1
-                    details[idx]['is_fail_parse'] = True
+                    details[qid]['is_fail_parse'] = True
                 else:
                     incorrect += 1
-                    details[idx]['is_incorrect'] = True
+                    details[qid]['is_incorrect'] = True

         assert total == correct + incorrect + fail_parse
         results = {
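Keying details by qid means the two flipped presentations of one question land in the same entry, and the item only counts as correct when both rounds are. A self-contained restatement of that decision rule:

# Decision rule over the two flipped rounds of one qid, mirroring the
# branches above (hand-made round results, not real predictions).
rounds = [{'is_correct': True, 'is_fail_parse': False},
          {'is_correct': True, 'is_fail_parse': False}]
if rounds[0]['is_correct'] and rounds[1]['is_correct']:
    verdict = 'correct'        # credit only if both flips are right
elif rounds[0]['is_fail_parse'] or rounds[1]['is_fail_parse']:
    verdict = 'fail_parse'     # either answer failed to parse
else:
    verdict = 'incorrect'
assert verdict == 'correct'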
@@ -227,9 +221,16 @@ class HuProverb_Evaluator_2CQ(BaseEvaluator):


 class HuProverb_Evaluator_OE(BaseEvaluator):
     """
     ref: opencompass.openicl.icl_evaluator.AccwithDetailsEvaluator
     """

+    def __init__(self,
+                 judge_prompt_template,
+                 openai_key='ENV',
+                 openai_proxy_url='ENV',
+                 **kwargs):
+        super().__init__(**kwargs)
+        self.judge_prompt_template = judge_prompt_template
+        self.openai_key = openai_key
+        self.openai_proxy_url = openai_proxy_url
+
     def score(self, predictions, references, origin_prompt) -> dict:
@@ -239,13 +240,16 @@ class HuProverb_Evaluator_OE(BaseEvaluator):
         details = {}
         total, correct, wrong, unclear = 0, 0, 0, 0
         from opencompass.models import OpenAI
-        model = OpenAI(path='gpt-4o',
+        model = OpenAI(path='gpt-4o-2024-08-06',
                        key=self.openai_key,
                        openai_proxy_url=self.openai_proxy_url,
                        max_seq_len=8192,
                        retry=2,
-                       temperature=0.1)
+                       temperature=0,
+                       verbose=True)
         for raw_pred, detail in zip(predictions, references):
             total += 1
-            qid = detail['id']
+            qid = detail['qid']
             details[qid] = {
                 'proverb': detail['hu_text'],
                 'conversation': detail['context'],
@@ -256,12 +260,12 @@ class HuProverb_Evaluator_OE(BaseEvaluator):
             }

             # ------------------------------------------- openai judge
-            user_prompt = judge_prompt_template['en_user'].format(
+            user_prompt = self.judge_prompt_template['en_user'].format(
                 proverb=detail['hu_text'],
                 conversation=detail['context'],
                 answer=detail['correct'],
                 raw_pred=raw_pred)
-            system_prompt = judge_prompt_template['en_system']
+            system_prompt = self.judge_prompt_template['en_system']
             details[qid]['judge_user_prompt'] = user_prompt

             messages = PromptList([{