diff --git a/examples/eval_OpenHuEval_HuProverbRea_2CQ.py b/examples/eval_OpenHuEval_HuProverbRea_2CQ.py index e6040d35..66069983 100644 --- a/examples/eval_OpenHuEval_HuProverbRea_2CQ.py +++ b/examples/eval_OpenHuEval_HuProverbRea_2CQ.py @@ -2,13 +2,13 @@ from mmengine.config import read_base with read_base(): from opencompass.configs.datasets.OpenHuEval.HuProverbRea.HuProverbRea_2CQ import HuProverbRea_datasets - - # from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model - from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model + + from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model + # from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model # from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model # from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model - # from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model + from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model # from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model datasets = HuProverbRea_datasets diff --git a/examples/eval_OpenHuEval_HuProverbRea_OE.py b/examples/eval_OpenHuEval_HuProverbRea_OE.py index 1b15a15c..792037b9 100644 --- a/examples/eval_OpenHuEval_HuProverbRea_OE.py +++ b/examples/eval_OpenHuEval_HuProverbRea_OE.py @@ -2,13 +2,13 @@ from mmengine.config import read_base with read_base(): from opencompass.configs.datasets.OpenHuEval.HuProverbRea.HuProverbRea_OE import HuProverbRea_datasets - from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import models as lmdeploy_qwen2_7b_instruct_model - # from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model + + from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model # from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model # from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model # from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model - # from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model + from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model # from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model datasets = HuProverbRea_datasets diff --git a/opencompass/configs/datasets/OpenHuEval/HuProverbRea/HuProverbRea_2CQ.py b/opencompass/configs/datasets/OpenHuEval/HuProverbRea/HuProverbRea_2CQ.py index 78d58565..d5192243 100644 --- a/opencompass/configs/datasets/OpenHuEval/HuProverbRea/HuProverbRea_2CQ.py +++ b/opencompass/configs/datasets/OpenHuEval/HuProverbRea/HuProverbRea_2CQ.py @@ -6,14 +6,17 @@ from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets.OpenHuEval.HuProverbRea import HuProverbDataset2CQ, HuProverb_Evaluator_2CQ with read_base(): - from .prompts import INSTRUCTIONS_DIRECT_QA + from .HuProverbRea_setting import INSTRUCTIONS_DIRECT_QA, DATA_PATH, DATA_VERSION # currently we use English prompts with hu proverbs inserted prompt_template_language = 'en' -dataset_path = '/mnt/hwfile/opendatalab/gaojunyuan/shared_data/OpenHuEval/data/HuProverbRea/HuProverbRea_250127' -HuProverbRea_reader_cfg = dict(input_columns=['hu_text', 'context', 'en_expl', 'hu_expl', 'option1', 'option2'], - output_column='out') +HuProverbRea_reader_cfg = dict( + input_columns=[ + 'hu_text', 'context', 'en_expl', 'hu_expl', 'option1', 'option2' + ], + output_column='out', +) HuProverbRea_datasets = [] instruction = INSTRUCTIONS_DIRECT_QA[prompt_template_language] @@ -23,10 +26,7 @@ HuProverbRea_infer_cfg = dict( template=dict( begin='', round=[ - dict( - role='HUMAN', - prompt=instruction - ), + dict(role='HUMAN', prompt=instruction), ], ), ice_token='', @@ -39,11 +39,11 @@ HuProverbRea_eval_cfg = dict(evaluator=dict(type=HuProverb_Evaluator_2CQ)) HuProverbRea_datasets.append( dict( - abbr=f'HuProverbRea_2CQ_{prompt_template_language}', + abbr= + f'HuProverbRea_{DATA_VERSION}_2CQ-prompt_{prompt_template_language}', type=HuProverbDataset2CQ, - path=dataset_path, + filepath=DATA_PATH, reader_cfg=HuProverbRea_reader_cfg, infer_cfg=HuProverbRea_infer_cfg, eval_cfg=HuProverbRea_eval_cfg, - ) -) + )) diff --git a/opencompass/configs/datasets/OpenHuEval/HuProverbRea/HuProverbRea_OE.py b/opencompass/configs/datasets/OpenHuEval/HuProverbRea/HuProverbRea_OE.py index b1cde6c1..9484d05f 100644 --- a/opencompass/configs/datasets/OpenHuEval/HuProverbRea/HuProverbRea_OE.py +++ b/opencompass/configs/datasets/OpenHuEval/HuProverbRea/HuProverbRea_OE.py @@ -6,14 +6,17 @@ from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets.OpenHuEval.HuProverbRea import HuProverbDatasetOE, HuProverb_Evaluator_OE with read_base(): - from .prompts import INSTRUCTIONS_OE_DIR_QA + from .HuProverbRea_setting import INSTRUCTIONS_OE_DIR_QA, DATA_PATH, DATA_VERSION, judge_prompt_template # currently we use English prompts with hu proverbs inserted prompt_template_language = 'en' -dataset_path = '/mnt/hwfile/opendatalab/gaojunyuan/shared_data/OpenHuEval/data/HuProverbRea/HuProverbRea_250127' -HuProverbRea_reader_cfg = dict(input_columns=['hu_text', 'context', 'en_expl', 'hu_expl', 'option1', 'option2'], - output_column='out') +HuProverbRea_reader_cfg = dict( + input_columns=[ + 'hu_text', 'context', 'en_expl', 'hu_expl', 'option1', 'option2' + ], + output_column='out', +) HuProverbRea_datasets = [] instruction = INSTRUCTIONS_OE_DIR_QA[prompt_template_language] @@ -23,10 +26,7 @@ HuProverbRea_infer_cfg = dict( template=dict( begin='', round=[ - dict( - role='HUMAN', - prompt=instruction - ), + dict(role='HUMAN', prompt=instruction), ], ), ice_token='', @@ -35,15 +35,18 @@ HuProverbRea_infer_cfg = dict( inferencer=dict(type=GenInferencer), ) -HuProverbRea_eval_cfg = dict(evaluator=dict(type=HuProverb_Evaluator_OE)) +HuProverbRea_eval_cfg = dict(evaluator=dict( + type=HuProverb_Evaluator_OE, + judge_prompt_template=judge_prompt_template, +)) HuProverbRea_datasets.append( dict( - abbr=f'HuProverbRea_OE_{prompt_template_language}', + abbr= + f'HuProverbRea_{DATA_VERSION}_OE-prompt_{prompt_template_language}', type=HuProverbDatasetOE, - path=dataset_path, + filepath=DATA_PATH, reader_cfg=HuProverbRea_reader_cfg, infer_cfg=HuProverbRea_infer_cfg, eval_cfg=HuProverbRea_eval_cfg, - ) -) + )) diff --git a/opencompass/configs/datasets/OpenHuEval/HuProverbRea/prompts.py b/opencompass/configs/datasets/OpenHuEval/HuProverbRea/HuProverbRea_setting.py similarity index 94% rename from opencompass/configs/datasets/OpenHuEval/HuProverbRea/prompts.py rename to opencompass/configs/datasets/OpenHuEval/HuProverbRea/HuProverbRea_setting.py index 43839eea..09d76d29 100644 --- a/opencompass/configs/datasets/OpenHuEval/HuProverbRea/prompts.py +++ b/opencompass/configs/datasets/OpenHuEval/HuProverbRea/HuProverbRea_setting.py @@ -1,3 +1,5 @@ +# yapf: disable + INSTRUCTIONS_DIRECT_QA = { 'en': 'You are a language expert specialized in Hungarian. Given a Hungarian phrase:\n\n' + '######################\n' + @@ -68,3 +70,8 @@ judge_prompt_template = { '[The end of the second analysis]\n\n' + 'Your decision:' } + + +OpenHuEval_Path = '/mnt/hwfile/opendatalab/wj/proj/polyglot_24July/OpenHuEval' +DATA_VERSION = '250127' +DATA_PATH = f'{OpenHuEval_Path}/data/HuProverbRea/HuProverbRea_{DATA_VERSION}/HuProverbRea.jsonl' diff --git a/opencompass/datasets/OpenHuEval/HuProverbRea.py b/opencompass/datasets/OpenHuEval/HuProverbRea.py index aa030c3b..8b9ff546 100644 --- a/opencompass/datasets/OpenHuEval/HuProverbRea.py +++ b/opencompass/datasets/OpenHuEval/HuProverbRea.py @@ -3,8 +3,6 @@ import os from datasets import Dataset, DatasetDict -from opencompass.configs.datasets.OpenHuEval.HuProverbRea.prompts import \ - judge_prompt_template from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.utils.prompt import PromptList @@ -14,12 +12,10 @@ from ..base import BaseDataset class HuProverbDataset2CQ(BaseDataset): @staticmethod - def load(**kwargs): - path = kwargs.get('path', None) + def load(filepath): + assert os.path.isfile(filepath) dataset = DatasetDict() - sub_dataset_file = os.path.join(path, - '{}.jsonl'.format('HuProverbRea')) - f = open(sub_dataset_file, 'r', encoding='utf-8') + f = open(filepath, 'r', encoding='utf-8') lines = f.readlines() out_dict_list = [] for line in lines: @@ -46,7 +42,7 @@ class HuProverbDataset2CQ(BaseDataset): 'option2': w_ops, 'out': { 'true_ans': '1', - 'id': obj['qid'], + 'qid': obj['qid'], 'source_id': obj['source_info']['source_id'], 'en_expl': obj['source_info']['en_expl'], 'en_trans': obj['source_info']['en_trans'], @@ -69,7 +65,7 @@ class HuProverbDataset2CQ(BaseDataset): 'option2': cor_ops, 'out': { 'true_ans': '2', - 'id': obj['qid'], + 'qid': obj['qid'], 'source_id': obj['source_info']['source_id'], 'en_expl': obj['source_info']['en_expl'], 'en_trans': obj['source_info']['en_trans'], @@ -93,12 +89,10 @@ class HuProverbDataset2CQ(BaseDataset): class HuProverbDatasetOE(BaseDataset): @staticmethod - def load(**kwargs): - path = kwargs.get('path', None) + def load(filepath): + assert os.path.isfile(filepath) dataset = DatasetDict() - sub_dataset_file = os.path.join(path, - '{}.jsonl'.format('HuProverbRea')) - f = open(sub_dataset_file, 'r', encoding='utf-8') + f = open(filepath, 'r', encoding='utf-8') lines = f.readlines() out_dict_list = [] for line in lines: @@ -121,7 +115,7 @@ class HuProverbDatasetOE(BaseDataset): 'en_expl': obj['source_info']['en_expl'], 'hu_expl': obj['source_info']['hu_expl'], 'out': { - 'id': obj['qid'], + 'qid': obj['qid'], 'source_id': obj['source_info']['source_id'], 'en_expl': obj['source_info']['en_expl'], 'en_trans': obj['source_info']['en_trans'], @@ -152,7 +146,7 @@ class HuProverb_Evaluator_2CQ(BaseEvaluator): total, correct, incorrect, fail_parse = 0, 0, 0, 0 for raw_pred, detail, ori_prompt in zip(predictions, references, origin_prompt): - idx = detail['id'] + qid = detail['qid'] option1 = detail['option1'] option2 = detail['option2'] true_ans = detail['true_ans'] @@ -181,9 +175,9 @@ class HuProverb_Evaluator_2CQ(BaseEvaluator): else: res_of_this_round['is_incorrect'] = True - if idx not in details: + if qid not in details: total += 1 - details[idx] = { + details[qid] = { 'detail': { 'hu_text': detail['hu_text'], 'en_trans': detail['en_trans'], @@ -199,21 +193,21 @@ class HuProverb_Evaluator_2CQ(BaseEvaluator): 'is_fail_parse': False } else: - details[idx]['flipped_variance'].append(res_of_this_round) + details[qid]['flipped_variance'].append(res_of_this_round) # judge the results - if details[idx]['flipped_variance'][0][ - 'is_correct'] and details[idx]['flipped_variance'][1][ + if details[qid]['flipped_variance'][0][ + 'is_correct'] and details[qid]['flipped_variance'][1][ 'is_correct']: correct += 1 - details[idx]['is_correct'] = True - elif details[idx]['flipped_variance'][0][ - 'is_fail_parse'] or details[idx]['flipped_variance'][ + details[qid]['is_correct'] = True + elif details[qid]['flipped_variance'][0][ + 'is_fail_parse'] or details[qid]['flipped_variance'][ 1]['is_fail_parse']: fail_parse += 1 - details[idx]['is_fail_parse'] = True + details[qid]['is_fail_parse'] = True else: incorrect += 1 - details[idx]['is_incorrect'] = True + details[qid]['is_incorrect'] = True assert total == correct + incorrect + fail_parse results = { @@ -227,9 +221,16 @@ class HuProverb_Evaluator_2CQ(BaseEvaluator): class HuProverb_Evaluator_OE(BaseEvaluator): - """ - ref: opencompass.openicl.icl_evaluator.AccwithDetailsEvaluator - """ + + def __init__(self, + judge_prompt_template, + openai_key='ENV', + openai_proxy_url='ENV', + **kwargs): + super().__init__(**kwargs) + self.judge_prompt_template = judge_prompt_template + self.openai_key = openai_key + self.openai_proxy_url = openai_proxy_url def score(self, predictions, references, origin_prompt) -> dict: @@ -239,13 +240,16 @@ class HuProverb_Evaluator_OE(BaseEvaluator): details = {} total, correct, wrong, unclear = 0, 0, 0, 0 from opencompass.models import OpenAI - model = OpenAI(path='gpt-4o', + model = OpenAI(path='gpt-4o-2024-08-06', + key=self.openai_key, + openai_proxy_url=self.openai_proxy_url, max_seq_len=8192, retry=2, - temperature=0.1) + temperature=0, + verbose=True) for raw_pred, detail in zip(predictions, references): total += 1 - qid = detail['id'] + qid = detail['qid'] details[qid] = { 'proverb': detail['hu_text'], 'conversation': detail['context'], @@ -256,12 +260,12 @@ class HuProverb_Evaluator_OE(BaseEvaluator): } # ------------------------------------------- openai judge - user_prompt = judge_prompt_template['en_user'].format( + user_prompt = self.judge_prompt_template['en_user'].format( proverb=detail['hu_text'], conversation=detail['context'], answer=detail['correct'], raw_pred=raw_pred) - system_prompt = judge_prompt_template['en_system'] + system_prompt = self.judge_prompt_template['en_system'] details[qid]['judge_user_prompt'] = user_prompt messages = PromptList([{