update HuMatchingFIB config and dataset

wujiang 2025-01-25 23:12:38 +08:00 committed by jxd
parent 116a24632c
commit d4df622e02
5 changed files with 70 additions and 71 deletions

View File

@@ -0,0 +1,13 @@
+from mmengine.config import read_base
+
+with read_base():
+    from opencompass.configs.datasets.OpenHuEval.HuMatchingFIB.HuMatchingFIB import hu_matching_fib_datasets
+    from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
+    # from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model
+    # from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
+
+datasets = hu_matching_fib_datasets
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
+
+work_dir = './outputs/' + __file__.split('/')[-1].split('.')[0] + '/'  # do NOT modify this line, yapf: disable, pylint: disable
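Note: the new entry config collects models by naming convention rather than by an explicit list. A minimal, self-contained sketch of that idiom, with stand-in model configs in place of the real imported ones:

    # Stand-in model config lists; in the real config these come from the
    # opencompass.configs.models imports above.
    gpt_4o_mini_20240718_model = [dict(abbr='gpt-4o-mini')]
    lmdeploy_qwen2_5_7b_instruct_model = [dict(abbr='qwen2.5-7b-instruct')]

    # Every local name ending in '_model' is a list of model configs;
    # sum(..., []) concatenates those lists into one flat `models` list.
    models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
    # models == [{'abbr': 'gpt-4o-mini'}, {'abbr': 'qwen2.5-7b-instruct'}]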

View File

@@ -1,11 +0,0 @@
-from mmengine.config import read_base
-
-with read_base():
-    from opencompass.configs.datasets.OpenHuEval.HuMatchingFIB.HuMatchingFIB import FIB1_datasets
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as lmdeploy_internlm2_5_7b_chat_model
-
-datasets = FIB1_datasets
-
-models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
-
-work_dir = './outputs/' + __file__.split('/')[-1].split('.')[0] + '/'  # do NOT modify this line, yapf: disable, pylint: disable

View File

@@ -6,45 +6,40 @@ from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.datasets.OpenHuEval.HuMatchingFIB import HuMatchingFIBDataset, HuMatchingFIBEvaluator
 
 with read_base():
-    from .HuMatchingFIB_setting import INSTRUCTIONS, DATASET_PATH
+    from .HuMatchingFIB_setting import INSTRUCTION, DATA_PATH, DATA_VERSION
 
-ALL_LANGUAGES = ['hu']
-PROMPT_VERSION = INSTRUCTIONS['version']
+instruction = INSTRUCTION['prompt_template']
+prompt_version = INSTRUCTION['version']
 
-FIB1_reader_cfg = dict(input_columns=['question', 'subject'],
-                       output_column='reference')
+hu_matching_fib_reader_cfg = dict(input_columns=['question', 'subject'],
+                                  output_column='reference')
 
-FIB1_datasets = []
-for lan in ALL_LANGUAGES:
-    instruction = INSTRUCTIONS[lan]
-    FIB1_infer_cfg = dict(
-        prompt_template=dict(
-            type=PromptTemplate,
-            template=dict(
-                begin='</E>',
-                round=[
-                    dict(
-                        role='HUMAN',
-                        prompt=instruction
-                    ),
-                ],
-            ),
-            ice_token='</E>',
-        ),
-        retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer),
-    )
+hu_matching_fib_datasets = []
+
+hu_matching_fib_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin='</E>',
+            round=[
+                dict(role='HUMAN', prompt=instruction),
+            ],
+        ),
+        ice_token='</E>',
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
 
-    FIB1_eval_cfg = dict(evaluator=dict(type=HuMatchingFIBEvaluator))
+hu_matching_fib_eval_cfg = dict(evaluator=dict(type=HuMatchingFIBEvaluator))
 
-    FIB1_datasets.append(
-        dict(
-            abbr=f'nkp_FIB1_humanities-{lan}-1shot-{PROMPT_VERSION}',
-            type=HuMatchingFIBDataset,
-            path=DATASET_PATH,
-            lan=lan,
-            reader_cfg=FIB1_reader_cfg,
-            infer_cfg=FIB1_infer_cfg,
-            eval_cfg=FIB1_eval_cfg,
-        )
-    )
+hu_matching_fib_datasets.append(
+    dict(
+        abbr=f'hu_matching_fib_{DATA_VERSION}-prompt_{prompt_version}',
+        type=HuMatchingFIBDataset,
+        filepath=DATA_PATH,
+        reader_cfg=hu_matching_fib_reader_cfg,
+        infer_cfg=hu_matching_fib_infer_cfg,
+        eval_cfg=hu_matching_fib_eval_cfg,
+    ))
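Note: for readers unfamiliar with OpenCompass configs, an illustrative sketch (not the library's internals) of how the pieces above fit together: the prompt template substitutes the reader columns named in input_columns into the {subject} and {question} placeholders, while the 'reference' column is withheld as the scoring target for HuMatchingFIBEvaluator. All names and values below are invented for illustration:

    # Illustrative only; the template is abridged and the row is invented.
    instruction = ('You are a native hungarian teacher. The following question '
                   'is in Hungarian language on {subject}. ... '
                   'The question is: {question}.')

    row = dict(
        question=dict(q_main='... #0# ...', options=['A.x', 'B.y']),
        subject='history',
        reference=dict(std_ans=['#0#A']),  # held back for the evaluator
    )
    prompt = instruction.format(question=row['question'], subject=row['subject'])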

View File

@@ -13,22 +13,23 @@
 #     'description': 'Initial version, using 1shot, incontext, #0# as place holder, output in JSON format',
 # }
-INSTRUCTIONS = {
-    'hu': """
-You are a native hungarian teacher. The following question is in hungarian language on {subject}. Please read the question, and You need to choose the appropriate option from the provided "option" list to fill in each blanks in the text based on the context. Read the entire text, then fill in the blanks. Some options can be selected repeatedly. Please organize the answer in a list. An example:
-{
-"q_main": "Egészítsd ki a Janus Pannonius életére vonatkozó rövid szöveget! Segítségként használd az internetet! Vigyázz, nem minden szót kell felhasználnod!\nJanus Pannonius nem csupán költőként volt jelentős személyisége kora Magyarországának. #0# unokaöccseként a politikából is hamar kivette a részét. #1# tanulmányai után pécsi #2# lett, majd a királyné mellett #3#. Főkincstartóként és a #4# báni cím elnyerésével komoly politikai karriert futott be Mátyás király udvarában. A királlyal megromló kapcsolata miatt részt vett a #5# elleni összeesküvésben, ezért menekülnie kellett. Ez, és az akkor már súlyosbodó betegsége okozta halálát #6#.",
-"options": ["A.érsek", "B.szlavón", "C.Vitéz János", "D.püspök", "E.főpohárnok", "F.Ulászló", "G.1474-ben", "H.főkancellár", "I.Itáliai", "J.Kinizsi Pál", "K.Kálmán", "L.1472-ben", "M.Prágai", "N.Mátyás"],
-},
-The answer is:
-{
-"std_ans": ["#0#C", "#1#I", "#2#D", "#3#H", "#4#B", "#5#N", "#6#L"]
-}
-Now try to answer the following question, your response should be in a JSON format. Contain the std_ans like the case given above.
-The question is: {question}.
-""",
+INSTRUCTION = {
+    'prompt_template': """You are a native hungarian teacher. The following question is in Hungarian language on {subject}. Please read the question, and You need to choose the appropriate option from the provided "option" list to fill in each blanks in the text based on the context. Read the entire text, then fill in the blanks. Some options can be selected repeatedly. Please organize the answer in a list. An example:
+{
+"q_main": "Egészítsd ki a Janus Pannonius életére vonatkozó rövid szöveget! Segítségként használd az internetet! Vigyázz, nem minden szót kell felhasználnod!\nJanus Pannonius nem csupán költőként volt jelentős személyisége kora Magyarországának. #0# unokaöccseként a politikából is hamar kivette a részét. #1# tanulmányai után pécsi #2# lett, majd a királyné mellett #3#. Főkincstartóként és a #4# báni cím elnyerésével komoly politikai karriert futott be Mátyás király udvarában. A királlyal megromló kapcsolata miatt részt vett a #5# elleni összeesküvésben, ezért menekülnie kellett. Ez, és az akkor már súlyosbodó betegsége okozta halálát #6#.",
+"options": ["A.érsek", "B.szlavón", "C.Vitéz János", "D.püspök", "E.főpohárnok", "F.Ulászló", "G.1474-ben", "H.főkancellár", "I.Itáliai", "J.Kinizsi Pál", "K.Kálmán", "L.1472-ben", "M.Prágai", "N.Mátyás"],
+},
+The answer is:
+{
+"std_ans": ["#0#C", "#1#I", "#2#D", "#3#H", "#4#B", "#5#N", "#6#L"]
+}
+Now try to answer the following question, your response should be in a JSON format. Contain the std_ans like the case given above.
+The question is: {question}.
+""",
+    'version':'V2',
+    'description': 'Version 2, using 1shot, more incontext, "#0#" as place holder, output in JSON format'
 }
 
-DATASET_PATH = "/mnt/hwfile/opendatalab/weixingjian/test/"
+OpenHuEval_Path = '/mnt/hwfile/opendatalab/wj/proj/polyglot_24July/OpenHuEval'
+DATA_VERSION = '250123'
+DATA_PATH = f'{OpenHuEval_Path}/data/HuMatchingFIB/HuMatchingFIB_{DATA_VERSION}/HuMatchingFIB.jsonl'
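Note: judging from the fields the dataset loader below reads (q_main, options, major, hu_specific_label_question) and the std_ans key the prompt asks models to emit, one line of HuMatchingFIB.jsonl presumably looks like the following; the values here are invented:

    import json

    # Hypothetical record, assembled from the keys the loader accesses.
    sample_line = json.dumps({
        'q_main': 'Janus Pannonius ... #0# unokaöccseként ... #1# ...',
        'options': ['A.érsek', 'B.szlavón', 'C.Vitéz János'],
        'major': 'history',
        'hu_specific_label_question': ['...'],
        'std_ans': ['#0#C', '#1#I'],
    }, ensure_ascii=False)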

View File

@@ -3,21 +3,20 @@ import os
 import re
 
 from datasets import Dataset, DatasetDict
 
 from opencompass.openicl.icl_evaluator import BaseEvaluator
 
 from ..base import BaseDataset
 
 
 class HuMatchingFIBDataset(BaseDataset):
 
     @staticmethod
-    def load(**kwargs):
-        path = kwargs.get('path', None)
-        # lan = kwargs.get('lan', None)
+    def load(filepath):
+        assert os.path.isfile(filepath)
+        assert filepath.endswith('.jsonl')
         dataset = DatasetDict()
-        file_list = [os.path.join(path, file) for file in os.listdir(path)
-                     ]  # TODO only work for a single split.
-        f_path = file_list[0]
-        f = open(f_path, 'r', encoding='utf-8')
+        f = open(filepath, 'r', encoding='utf-8')
         lines = f.readlines()
         objs = []
         for line in lines:
@@ -26,9 +25,11 @@ class HuMatchingFIBDataset(BaseDataset):
         out_dict_list = []
         for obj in objs:
             question = dict(q_main=obj['q_main'], options=obj['options'])
-            subject = obj['major']
+            hu_specific_dim = obj['hu_specific_label_question']
             tmp = obj
-            new_obj = dict(question=question, subject=subject, reference=tmp)
+            new_obj = dict(question=question,
+                           hu_specific_dim=hu_specific_dim,
+                           reference=tmp)
             out_dict_list.append(new_obj)
         dataset = Dataset.from_list(out_dict_list)
         return dataset
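Note: a hypothetical usage of the reworked loader; the path is a placeholder mirroring DATA_PATH from the settings file and exists only on the authors' cluster:

    ds = HuMatchingFIBDataset.load(
        filepath='/path/to/HuMatchingFIB_250123/HuMatchingFIB.jsonl')
    print(ds[0]['question']['q_main'])  # question text with '#k#' blanks
    print(ds[0]['hu_specific_dim'])     # Hungarian-specific label(s)
    print(ds[0]['reference'])           # full original record, consumed by the evaluator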