diff --git a/examples/eval_OpenHuEval_HuMatchingFIB.py b/examples/eval_OpenHuEval_HuMatchingFIB.py
index ab48c4da..b5c3347f 100644
--- a/examples/eval_OpenHuEval_HuMatchingFIB.py
+++ b/examples/eval_OpenHuEval_HuMatchingFIB.py
@@ -4,16 +4,17 @@ with read_base():
     from opencompass.configs.datasets.OpenHuEval.HuMatchingFIB.HuMatchingFIB import hu_matching_fib_datasets
 
     # from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
-    from opencompass.configs.models.deepseek.deepseek_v3_api_siliconflow import models as deepseek_v3_api_siliconflow_model
+    # from opencompass.configs.models.deepseek.deepseek_v3_api_siliconflow import models as deepseek_v3_api_siliconflow_model
     # from opencompass.configs.models.deepseek.deepseek_v3_api import models as deepseek_v3_api_model
 
     # from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
     # from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model
     # from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
 
-    from opencompass.configs.models.deepseek.deepseek_r1_siliconflow import models as deepseek_r1_siliconflow_model
+    # from opencompass.configs.models.deepseek.deepseek_r1_siliconflow import models as deepseek_r1_siliconflow_model
     # from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
     # from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as lmdeploy_internlm2_5_7b_chat_model
 
 datasets = hu_matching_fib_datasets
 models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
diff --git a/opencompass/configs/datasets/OpenHuEval/HuMatchingFIB/HuMatchingFIB_setting.py b/opencompass/configs/datasets/OpenHuEval/HuMatchingFIB/HuMatchingFIB_setting.py
index 481cc85c..39b2508d 100644
--- a/opencompass/configs/datasets/OpenHuEval/HuMatchingFIB/HuMatchingFIB_setting.py
+++ b/opencompass/configs/datasets/OpenHuEval/HuMatchingFIB/HuMatchingFIB_setting.py
@@ -34,6 +34,6 @@ The question and options are:
     'description': 'Version 2, using 1shot, more incontext, "#0#" as place holder, output in JSON format'
 }
 
-OpenHuEval_Path = '/mnt/hwfile/opendatalab/wj/proj/polyglot_24July/OpenHuEval'
-DATA_VERSION = '250126'
+OpenHuEval_Path = '/mnt/hwfile/opendatalab/weixingjian/OpenHuEval'
+DATA_VERSION = '250205'
 DATA_PATH = f'{OpenHuEval_Path}/data/HuMatchingFIB/HuMatchingFIB_{DATA_VERSION}/HuMatchingFIB.jsonl'
diff --git a/opencompass/datasets/OpenHuEval/HuMatchingFIB.py b/opencompass/datasets/OpenHuEval/HuMatchingFIB.py
index c8bf189d..f6ca6ad5 100644
--- a/opencompass/datasets/OpenHuEval/HuMatchingFIB.py
+++ b/opencompass/datasets/OpenHuEval/HuMatchingFIB.py
@@ -61,10 +61,13 @@ class HuMatchingFIBEvaluator(BaseEvaluator):
                 blank_total += len(std_ans)
                 question_total += 1
                 details[idx] = {
-                    'detail': refer,
+                    'reference': refer,
+                    'std_ans': std_ans,
                     'model_ans': model_ans,
                     'prompt': prompt,
                     'raw_pred': pred,
+                    'blank_wise_correct': [False] * len(std_ans),
+                    'question_wise_correct': False,
                 }
                 continue
             json_str = json_str.strip()
@@ -89,27 +92,34 @@
             question_total += 1
 
             model_ans = []
+            blank_wise_correct = []
+            is_question_correct = True
             if to_end_flag:
                 model_ans = data.get('answer', [])
-                is_question_correct = True
                 for index, ans in enumerate(std_ans):
                     if index >= len(model_ans):
                         is_question_correct = False
-                        break
+                        blank_wise_correct.append(False)
+                        continue
                     if ans == model_ans[index]:
                         blank_correct += 1
+                        blank_wise_correct.append(True)
                     else:
                         is_question_correct = False
+                        blank_wise_correct.append(False)
                 blank_total += len(std_ans)
                 question_total += 1
                 question_correct += 1 if is_question_correct else 0
 
             details[idx] = {
-                'detail': refer,
+                'reference': refer,
+                'std_ans': std_ans,
                 'model_ans': model_ans,
                 'prompt': prompt,
                 'raw_pred': pred,
+                'blank_wise_correct': blank_wise_correct,
+                'question_wise_correct': is_question_correct,
             }
 
         results = {
             'blank_level_correctness':
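The substantive change in `HuMatchingFIBEvaluator` above is that scoring no longer `break`s out of the loop at the first missing blank: each blank now gets its own verdict in `blank_wise_correct`, and the question-level verdict is true only if every blank matches. Below is a minimal sketch of that per-blank scoring, assuming the list-valued `std_ans`/`model_ans` shown in the diff; `score_question` is a hypothetical standalone wrapper for illustration, while the evaluator itself runs this logic inline for each prediction.

```python
def score_question(std_ans: list, model_ans: list) -> dict:
    """Sketch of the per-blank scoring the patch introduces (names follow the diff)."""
    blank_wise_correct = []
    is_question_correct = True
    for index, ans in enumerate(std_ans):
        if index >= len(model_ans):
            # The model emitted fewer answers than there are blanks:
            # mark this blank wrong and keep going (the old code broke out
            # of the loop here, leaving later blanks unscored).
            is_question_correct = False
            blank_wise_correct.append(False)
            continue
        if ans == model_ans[index]:
            blank_wise_correct.append(True)
        else:
            is_question_correct = False
            blank_wise_correct.append(False)
    return {
        'blank_wise_correct': blank_wise_correct,
        'question_wise_correct': is_question_correct,
        'blank_correct': sum(blank_wise_correct),
        'blank_total': len(std_ans),
    }

# Hypothetical example: three blanks, the model answers the first two
# correctly and omits the third.
print(score_question(['A', 'C', 'B'], ['A', 'C']))
# {'blank_wise_correct': [True, True, False], 'question_wise_correct': False,
#  'blank_correct': 2, 'blank_total': 3}
```

One consequence of `continue` replacing `break` is that blank-level accuracy now grants partial credit when the model returns a truncated answer list, and the new `blank_wise_correct` / `question_wise_correct` fields in `details` make both granularities inspectable per question.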