mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

commit f152ccf127 (parent 794ab7c372)
add HuProverbRea dataset (20250203)
examples/eval_OpenHuEval_HuProverbRea_2CQ.py (new file, 16 lines)
@@ -0,0 +1,16 @@
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.OpenHuEval.HuProverbRea.HuProverbRea_2CQ import HuProverbRea_datasets

    # from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
    # from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model

    # from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
    # from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
    # from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model

datasets = HuProverbRea_datasets
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
work_dir = './outputs/' + __file__.split('/')[-1].split('.')[0] + '/'  # do NOT modify this line, yapf: disable, pylint: disable
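For reference, OpenCompass configs like this are normally launched through the run.py entry point from the repository root; a usage sketch, assuming a working OpenCompass installation:

    python run.py examples/eval_OpenHuEval_HuProverbRea_2CQ.py

Outputs are written under the work_dir set on the last line of the config.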
examples/eval_OpenHuEval_HuProverbRea_OE.py (new file, 16 lines)
@@ -0,0 +1,16 @@
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.OpenHuEval.HuProverbRea.HuProverbRea_OE import HuProverbRea_datasets
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import models as lmdeploy_qwen2_7b_instruct_model
    # from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
    # from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
    # from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model

    # from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
    # from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
    # from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model

datasets = HuProverbRea_datasets
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
work_dir = './outputs/' + __file__.split('/')[-1].split('.')[0] + '/'  # do NOT modify this line, yapf: disable, pylint: disable
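The two example configs differ only in which dataset variant they pull in: 2CQ (two-choice question) asks the model to pick option '1' or '2' and is scored by string matching over a flipped option pair, while OE (open-ended) asks for a free-form explanation that is graded by an LLM judge. Both datasets and evaluators are defined in opencompass/datasets/OpenHuEval/HuProverbRea.py below.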
opencompass/configs/datasets/OpenHuEval/HuProverbRea/HuProverbRea_2CQ.py (new file, 49 lines)
@@ -0,0 +1,49 @@
from mmengine.config import read_base

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.OpenHuEval.HuProverbRea import HuProverbDataset2CQ, HuProverb_Evaluator_2CQ

with read_base():
    from .prompts import INSTRUCTIONS_DIRECT_QA

# currently we use English prompts with hu proverbs inserted
prompt_template_language = 'en'
dataset_path = '/mnt/hwfile/opendatalab/gaojunyuan/shared_data/OpenHuEval/data/HuProverbRea/HuProverbRea_250127'

HuProverbRea_reader_cfg = dict(input_columns=['hu_text', 'context', 'en_expl', 'hu_expl', 'option1', 'option2'],
                               output_column='out')

HuProverbRea_datasets = []
instruction = INSTRUCTIONS_DIRECT_QA[prompt_template_language]
HuProverbRea_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin='</E>',
            round=[
                dict(
                    role='HUMAN',
                    prompt=instruction
                ),
            ],
        ),
        ice_token='</E>',
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

HuProverbRea_eval_cfg = dict(evaluator=dict(type=HuProverb_Evaluator_2CQ))

HuProverbRea_datasets.append(
    dict(
        abbr=f'HuProverbRea_2CQ_{prompt_template_language}',
        type=HuProverbDataset2CQ,
        path=dataset_path,
        reader_cfg=HuProverbRea_reader_cfg,
        infer_cfg=HuProverbRea_infer_cfg,
        eval_cfg=HuProverbRea_eval_cfg,
    )
)
opencompass/configs/datasets/OpenHuEval/HuProverbRea/HuProverbRea_OE.py (new file, 49 lines)
@@ -0,0 +1,49 @@
from mmengine.config import read_base

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.OpenHuEval.HuProverbRea import HuProverbDatasetOE, HuProverb_Evaluator_OE

with read_base():
    from .prompts import INSTRUCTIONS_OE_DIR_QA

# currently we use English prompts with hu proverbs inserted
prompt_template_language = 'en'
dataset_path = '/mnt/hwfile/opendatalab/gaojunyuan/shared_data/OpenHuEval/data/HuProverbRea/HuProverbRea_250127'

HuProverbRea_reader_cfg = dict(input_columns=['hu_text', 'context', 'en_expl', 'hu_expl', 'option1', 'option2'],
                               output_column='out')

HuProverbRea_datasets = []
instruction = INSTRUCTIONS_OE_DIR_QA[prompt_template_language]
HuProverbRea_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin='</E>',
            round=[
                dict(
                    role='HUMAN',
                    prompt=instruction
                ),
            ],
        ),
        ice_token='</E>',
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

HuProverbRea_eval_cfg = dict(evaluator=dict(type=HuProverb_Evaluator_OE))

HuProverbRea_datasets.append(
    dict(
        abbr=f'HuProverbRea_OE_{prompt_template_language}',
        type=HuProverbDatasetOE,
        path=dataset_path,
        reader_cfg=HuProverbRea_reader_cfg,
        infer_cfg=HuProverbRea_infer_cfg,
        eval_cfg=HuProverbRea_eval_cfg,
    )
)
opencompass/configs/datasets/OpenHuEval/HuProverbRea/prompts.py (new file, 70 lines)
@@ -0,0 +1,70 @@
INSTRUCTIONS_DIRECT_QA = {
    'en': 'You are a language expert specialized in Hungarian. Given a Hungarian phrase:\n\n' +
          '######################\n' +
          'Hungarian Phrase:\n' +
          '----------------------\n' +
          "'{hu_text}'\n" +
          '######################\n\n' +
          'and a context using this phrase:\n\n' +
          '######################\n' +
          'Hungarian Context:\n' +
          '----------------------\n' +
          '{context}\n' +
          '######################\n\n' +
          'What does the person mean by using this phrase? Please select one correct answer from the following two options:\n\n' +
          '######################\n' +
          'Options:\n' +
          '----------------------\n' +
          'Option 1: {option1}\n' +
          'Option 2: {option2}\n' +
          '######################\n\n' +
          "You should only answer the option number, '1' or '2'. Do not output any content other than the option number. Your answer:"
}

INSTRUCTIONS_OE_DIR_QA = {
    'en': 'You are a language expert specialized in Hungarian. Given a Hungarian phrase:\n\n' +
          '######################\n' +
          'Hungarian Phrase:\n' +
          '----------------------\n' +
          "'{hu_text}'\n" +
          '######################\n\n' +
          'and a context using this phrase:\n\n' +
          '######################\n' +
          'Hungarian Context:\n' +
          '----------------------\n' +
          '{context}\n' +
          '######################\n\n' +
          'What does the person mean by using this phrase? Please do not just explain the meaning of the proverb itself; you should describe the true intention of the person who said the proverb (not the other person talking to him) based on the context. Please answer concisely in one sentence:',
    'hu': 'Ön magyar nyelvi szakértő. Adott egy magyar kifejezés:\n\n' +
          '######################\n' +
          'Magyar kifejezés:\n' +
          '----------------------\n' +
          "'{hu_text}'\n" +
          '######################\n\n' +
          'és egy szövegkörnyezet, amely ezt a kifejezést használja:\n\n' +
          '######################\n' +
          'Magyar kontextus:\n' +
          '----------------------\n' +
          '{context}\n' +
          '######################\n\n' +
          'Mire gondol az illető, amikor ezt a kifejezést használja? Kérjük, ne csak magának a közmondásnak a jelentését magyarázza meg, hanem a szövegkörnyezet alapján írja le a közmondást kimondó személy (nem a vele beszélgető másik személy) valódi szándékát. Kérjük, válaszoljon tömören, egy mondatban:'
}

judge_prompt_template = {
    'en_system':
    "Please act as an impartial judge specialized in Hungarian language and culture. Given a Hungarian saying, a context using that saying, and two analyses explaining 'what does the person mean by using that saying in the context?', please decide whether the given two analyses express the same meaning. If they reflect the same understanding of the saying's meaning, you should answer YES. If they are based on different interpretations of the saying, you should answer NO. Do not output anything other than 'YES' or 'NO'. Avoid any position biases and ensure that the order in which the analyses were presented does not influence your decision. Do not allow the length of the analyses to influence your judgment; focus on their core meanings and their understandings of the Hungarian saying.",
    'en_user':
    '[The start of Hungarian saying]\n' +
    '{proverb}\n' +
    '[The end of Hungarian saying]\n\n' +
    '[The start of the context]\n' +
    '{conversation}\n' +
    '[The end of the context]\n\n' +
    '[The start of the first analysis]\n' +
    '{answer}\n' +
    '[The end of the first analysis]\n\n' +
    '[The start of the second analysis]\n' +
    '{raw_pred}\n' +
    '[The end of the second analysis]\n\n' +
    'Your decision:'
}
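For illustration, OpenCompass's PromptTemplate fills the braced placeholders above from the dataset reader columns; for the 2CQ instruction the effect is roughly a plain str.format call. A minimal sketch with invented sample values (nothing below comes from the real dataset):

filled = INSTRUCTIONS_DIRECT_QA['en'].format(
    hu_text='Aki mer, az nyer.',  # hypothetical proverb ('Who dares, wins.')
    context='A: Should I apply for the job? B: Aki mer, az nyer.',  # hypothetical dialogue
    option1='B encourages A to take the chance.',  # hypothetical correct reading
    option2='B warns A against applying.')  # hypothetical wrong reading
print(filled)  # approximately the HUMAN-turn prompt the model receives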
opencompass/datasets/OpenHuEval/HuProverbRea.py (new file, 295 lines)
@@ -0,0 +1,295 @@
import json
import os

from datasets import Dataset, DatasetDict

from opencompass.configs.datasets.OpenHuEval.HuProverbRea.prompts import \
    judge_prompt_template
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.utils.prompt import PromptList

from ..base import BaseDataset
class HuProverbDataset2CQ(BaseDataset):

    @staticmethod
    def load(**kwargs):
        path = kwargs.get('path', None)
        dataset = DatasetDict()
        sub_dataset_file = os.path.join(path,
                                        '{}.jsonl'.format('HuProverbRea'))
        f = open(sub_dataset_file, 'r', encoding='utf-8')
        lines = f.readlines()
        out_dict_list = []
        for line in lines:
            obj = json.loads(line)
            if len(obj['context']) > 1:
                obj['context'] = '\n'.join(
                    [x.strip() for x in obj['context'] if x])
            else:
                obj['context'] = obj['context'][0]

            if obj['answer'] == 0:
                cor_ops = obj['options'][0]
                w_ops = obj['options'][1]
            else:
                cor_ops = obj['options'][1]
                w_ops = obj['options'][0]

            new_obj_1 = {
                'hu_text': obj['source_info']['proverb'],
                'context': obj['context'],
                'en_expl': obj['source_info']['en_expl'],
                'hu_expl': obj['source_info']['hu_expl'],
                'option1': cor_ops,
                'option2': w_ops,
                'out': {
                    'true_ans': '1',
                    'id': obj['qid'],
                    'source_id': obj['source_info']['source_id'],
                    'en_expl': obj['source_info']['en_expl'],
                    'en_trans': obj['source_info']['en_trans'],
                    'hu_expl': obj['source_info']['hu_expl'],
                    'hu_text': obj['source_info']['proverb'],
                    'context': obj['context'],
                    'option1': cor_ops,
                    'option2': w_ops,
                    'correct': cor_ops,
                    'incorrect': w_ops
                }
            }

            new_obj_2 = {
                'hu_text': obj['source_info']['proverb'],
                'context': obj['context'],
                'en_expl': obj['source_info']['en_expl'],
                'hu_expl': obj['source_info']['hu_expl'],
                'option1': w_ops,
                'option2': cor_ops,
                'out': {
                    'true_ans': '2',
                    'id': obj['qid'],
                    'source_id': obj['source_info']['source_id'],
                    'en_expl': obj['source_info']['en_expl'],
                    'en_trans': obj['source_info']['en_trans'],
                    'hu_expl': obj['source_info']['hu_expl'],
                    'hu_text': obj['source_info']['proverb'],
                    'context': obj['context'],
                    'option1': w_ops,
                    'option2': cor_ops,
                    'correct': cor_ops,
                    'incorrect': w_ops
                }
            }

            out_dict_list.append(new_obj_1)
            out_dict_list.append(new_obj_2)
        dataset = Dataset.from_list(out_dict_list)

        return dataset
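HuProverbDataset2CQ expands every source record into two rows with the option order flipped (true_ans '1' vs. '2'); the 2CQ evaluator later re-pairs the rows by qid, so a question only counts as solved when the model answers correctly under both orderings. For reference, a hypothetical input record sketched purely from the keys load() accesses (all values are placeholders, not real data):

sample_line = {
    'qid': 'q1',
    'context': ['first turn ...', 'second turn ...'],  # one or more strings
    'options': ['interpretation A', 'interpretation B'],
    'answer': 0,  # index of the correct option
    'source_info': {
        'proverb': '...',  # the Hungarian saying, surfaced as hu_text
        'source_id': '...',
        'en_expl': '...',
        'en_trans': '...',
        'hu_expl': '...',
    },
}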
class HuProverbDatasetOE(BaseDataset):

    @staticmethod
    def load(**kwargs):
        path = kwargs.get('path', None)
        dataset = DatasetDict()
        sub_dataset_file = os.path.join(path,
                                        '{}.jsonl'.format('HuProverbRea'))
        f = open(sub_dataset_file, 'r', encoding='utf-8')
        lines = f.readlines()
        out_dict_list = []
        for line in lines:
            obj = json.loads(line)
            if len(obj['context']) > 1:
                obj['context'] = '\n'.join(
                    [x.strip() for x in obj['context'] if x])
            else:
                obj['context'] = obj['context'][0]

            if obj['answer'] == 0:
                cor_ops = obj['options'][0]
                w_ops = obj['options'][1]
            else:
                cor_ops = obj['options'][1]
                w_ops = obj['options'][0]
            new_obj = {
                'hu_text': obj['source_info']['proverb'],
                'context': obj['context'],
                'en_expl': obj['source_info']['en_expl'],
                'hu_expl': obj['source_info']['hu_expl'],
                'out': {
                    'id': obj['qid'],
                    'source_id': obj['source_info']['source_id'],
                    'en_expl': obj['source_info']['en_expl'],
                    'en_trans': obj['source_info']['en_trans'],
                    'hu_expl': obj['source_info']['hu_expl'],
                    'hu_text': obj['source_info']['proverb'],
                    'context': obj['context'],
                    'correct': cor_ops,
                    'incorrect': w_ops
                }
            }
            out_dict_list.append(new_obj)
        dataset = Dataset.from_list(out_dict_list)

        return dataset
class HuProverb_Evaluator_2CQ(BaseEvaluator):
    """
    ref: opencompass.openicl.icl_evaluator.AccwithDetailsEvaluator
    """

    def score(self, predictions, references, origin_prompt) -> dict:

        if len(predictions) != len(references):
            return {'error': 'preds and refs have different lengths.'}

        details = {}
        total, correct, incorrect, fail_parse = 0, 0, 0, 0
        for raw_pred, detail, ori_prompt in zip(predictions, references,
                                                origin_prompt):
            idx = detail['id']
            option1 = detail['option1']
            option2 = detail['option2']
            true_ans = detail['true_ans']
            res_of_this_round = {
                'origin_prompt': ori_prompt,
                'is_correct': False,
                'is_incorrect': False,
                'is_fail_parse': False,
                'option1': option1,
                'option2': option2,
                'true_ans': true_ans,
                'raw_pred': raw_pred
            }
            # parse ans from raw pred
            if '1' in raw_pred and '2' not in raw_pred:
                ans = '1'
            elif '2' in raw_pred and '1' not in raw_pred:
                ans = '2'
            else:
                ans = ''
            res_of_this_round['parsed_pred'] = ans
            if ans == true_ans:
                res_of_this_round['is_correct'] = True
            elif ans == '':
                res_of_this_round['is_fail_parse'] = True
            else:
                res_of_this_round['is_incorrect'] = True

            if idx not in details:
                total += 1
                details[idx] = {
                    'detail': {
                        'hu_text': detail['hu_text'],
                        'en_trans': detail['en_trans'],
                        'en_expl': detail['en_expl'],
                        'hu_expl': detail['hu_expl'],
                        'context': detail['context'],
                        'correct': detail['correct'],
                        'incorrect': detail['incorrect']
                    },
                    'flipped_variance': [res_of_this_round],
                    'is_correct': False,
                    'is_incorrect': False,
                    'is_fail_parse': False
                }
            else:
                details[idx]['flipped_variance'].append(res_of_this_round)
                # judge the results
                if details[idx]['flipped_variance'][0][
                        'is_correct'] and details[idx]['flipped_variance'][1][
                            'is_correct']:
                    correct += 1
                    details[idx]['is_correct'] = True
                elif details[idx]['flipped_variance'][0][
                        'is_fail_parse'] or details[idx]['flipped_variance'][
                            1]['is_fail_parse']:
                    fail_parse += 1
                    details[idx]['is_fail_parse'] = True
                else:
                    incorrect += 1
                    details[idx]['is_incorrect'] = True

        assert total == correct + incorrect + fail_parse
        results = {
            'correct_ratio': correct / total * 100,
            'incorrect_ratio': incorrect / total * 100,
            'fail_parse_ratio': fail_parse / total * 100,
            'details': details
        }

        return results
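As a quick sanity check of the flipped-pair aggregation, a minimal sketch with hypothetical toy values (it assumes the class above is importable from an installed OpenCompass):

# One source question asked twice with the options swapped; it scores as
# correct only if the model is right under both orderings.
from opencompass.datasets.OpenHuEval.HuProverbRea import HuProverb_Evaluator_2CQ

common = {'hu_text': '...', 'en_trans': '...', 'en_expl': '...',
          'hu_expl': '...', 'context': '...', 'correct': 'A', 'incorrect': 'B'}
refs = [
    dict(common, id='q1', option1='A', option2='B', true_ans='1'),
    dict(common, id='q1', option1='B', option2='A', true_ans='2'),
]
res = HuProverb_Evaluator_2CQ().score(
    predictions=['1', '2'],  # the correct option number in both orderings
    references=refs,
    origin_prompt=['prompt (order 1)', 'prompt (order 2)'])
assert res['correct_ratio'] == 100.0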
class HuProverb_Evaluator_OE(BaseEvaluator):
    """
    ref: opencompass.openicl.icl_evaluator.AccwithDetailsEvaluator
    """

    def score(self, predictions, references, origin_prompt) -> dict:

        if len(predictions) != len(references):
            return {'error': 'preds and refs have different lengths.'}

        details = {}
        total, correct, wrong, unclear = 0, 0, 0, 0
        from opencompass.models import OpenAI
        model = OpenAI(path='gpt-4o',
                       max_seq_len=8192,
                       retry=2,
                       temperature=0.1)
        for raw_pred, detail in zip(predictions, references):
            total += 1
            qid = detail['id']
            details[qid] = {
                'proverb': detail['hu_text'],
                'conversation': detail['context'],
                'answer': detail['correct'],
                'raw_pred': raw_pred,
                'correctness': False,
                'ans_fail_parse': False
            }

            # ------------------------------------------- openai judge
            user_prompt = judge_prompt_template['en_user'].format(
                proverb=detail['hu_text'],
                conversation=detail['context'],
                answer=detail['correct'],
                raw_pred=raw_pred)
            system_prompt = judge_prompt_template['en_system']
            details[qid]['judge_user_prompt'] = user_prompt

            messages = PromptList([{
                'role': 'SYSTEM',
                'prompt': system_prompt,
            }, {
                'role': 'HUMAN',
                'prompt': user_prompt,
            }])
            response = model._generate(input=messages,
                                       max_out_len=8192,
                                       temperature=0.1)
            details[qid]['judge_resp'] = response

            if 'yes' in response.lower() and 'no' not in response.lower():
                correct += 1
                details[qid]['correctness'] = True
            elif 'no' in response.lower() and 'yes' not in response.lower():
                wrong += 1
            else:
                unclear += 1
                details[qid]['ans_fail_parse'] = True

        assert total == correct + wrong + unclear
        results = {
            'correct_ratio': correct / total * 100,
            'incorrect_ratio': wrong / total * 100,
            'ans_fail_parse_ratio': unclear / total * 100,
            'details': details
        }
        return results
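Note that HuProverb_Evaluator_OE instantiates a GPT-4o judge inside score(), so OE scoring makes live OpenAI API calls; the OpenCompass OpenAI wrapper reads credentials from the OPENAI_API_KEY environment variable by default (worth verifying against your OpenCompass version), so that must be set when evaluating the OE variant. 2CQ scoring, by contrast, is pure offline string matching.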
opencompass/datasets/OpenHuEval/__init__.py (modified)
@@ -1,2 +1,3 @@
 from .HuMatchingFIB import *  # noqa: F401, F403
+from .HuProverbRea import *  # noqa: F401, F403
 from .HuStandardFIB import *  # noqa: F401, F403