mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
update humatching and eval details using data version 250205
This commit is contained in:
parent
8ec47e2b93
commit
9395dc2b60
@ -4,16 +4,17 @@ with read_base():
|
||||
from opencompass.configs.datasets.OpenHuEval.HuMatchingFIB.HuMatchingFIB import hu_matching_fib_datasets
|
||||
|
||||
# from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
|
||||
from opencompass.configs.models.deepseek.deepseek_v3_api_siliconflow import models as deepseek_v3_api_siliconflow_model
|
||||
# from opencompass.configs.models.deepseek.deepseek_v3_api_siliconflow import models as deepseek_v3_api_siliconflow_model
|
||||
# from opencompass.configs.models.deepseek.deepseek_v3_api import models as deepseek_v3_api_model
|
||||
|
||||
# from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
|
||||
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model
|
||||
|
||||
# from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
|
||||
from opencompass.configs.models.deepseek.deepseek_r1_siliconflow import models as deepseek_r1_siliconflow_model
|
||||
# from opencompass.configs.models.deepseek.deepseek_r1_siliconflow import models as deepseek_r1_siliconflow_model
|
||||
# from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
|
||||
# from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as lmdeploy_internlm2_5_7b_chat_model
|
||||
|
||||
datasets = hu_matching_fib_datasets
|
||||
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
||||
|
@ -34,6 +34,6 @@ The question and options are:
|
||||
'description': 'Version 2, using 1shot, more incontext, "#0#" as place holder, output in JSON format'
|
||||
}
|
||||
|
||||
OpenHuEval_Path = '/mnt/hwfile/opendatalab/wj/proj/polyglot_24July/OpenHuEval'
|
||||
DATA_VERSION = '250126'
|
||||
OpenHuEval_Path = '/mnt/hwfile/opendatalab/weixingjian/OpenHuEval'
|
||||
DATA_VERSION = '250205'
|
||||
DATA_PATH = f'{OpenHuEval_Path}/data/HuMatchingFIB/HuMatchingFIB_{DATA_VERSION}/HuMatchingFIB.jsonl'
|
||||
|
@ -61,10 +61,13 @@ class HuMatchingFIBEvaluator(BaseEvaluator):
|
||||
blank_total += len(std_ans)
|
||||
question_total += 1
|
||||
details[idx] = {
|
||||
'detail': refer,
|
||||
'reference': refer,
|
||||
'std_ans': std_ans,
|
||||
'model_ans': model_ans,
|
||||
'prompt': prompt,
|
||||
'raw_pred': pred,
|
||||
'blank_wise_correct': [False] * len(std_ans),
|
||||
'question_wise_correct': False,
|
||||
}
|
||||
continue
|
||||
json_str = json_str.strip()
|
||||
@ -89,27 +92,34 @@ class HuMatchingFIBEvaluator(BaseEvaluator):
|
||||
question_total += 1
|
||||
|
||||
model_ans = []
|
||||
blank_wise_correct = []
|
||||
is_question_correct = True
|
||||
if to_end_flag:
|
||||
model_ans = data.get('answer', [])
|
||||
is_question_correct = True
|
||||
for index, ans in enumerate(std_ans):
|
||||
if index >= len(model_ans):
|
||||
is_question_correct = False
|
||||
break
|
||||
blank_wise_correct.append(False)
|
||||
continue
|
||||
if ans == model_ans[index]:
|
||||
blank_correct += 1
|
||||
blank_wise_correct.append(True)
|
||||
else:
|
||||
is_question_correct = False
|
||||
blank_wise_correct.append(False)
|
||||
|
||||
blank_total += len(std_ans)
|
||||
question_total += 1
|
||||
question_correct += 1 if is_question_correct else 0
|
||||
|
||||
details[idx] = {
|
||||
'detail': refer,
|
||||
'reference': refer,
|
||||
'std_ans': std_ans,
|
||||
'model_ans': model_ans,
|
||||
'prompt': prompt,
|
||||
'raw_pred': pred,
|
||||
'blank_wise_correct': blank_wise_correct,
|
||||
'question_wise_correct': is_question_correct,
|
||||
}
|
||||
results = {
|
||||
'blank_level_correctness':
|
||||
|
Loading…
Reference in New Issue
Block a user