mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
Update HuStandardFIB dataset and evaluation details to use data version 250205
This commit is contained in:
parent
9395dc2b60
commit
9ae714a577
@ -3,10 +3,11 @@ from mmengine.config import read_base
|
|||||||
with read_base():
|
with read_base():
|
||||||
from opencompass.configs.datasets.OpenHuEval.HuStandardFIB.HuStandardFIB import hu_standard_fib_datasets
|
from opencompass.configs.datasets.OpenHuEval.HuStandardFIB.HuStandardFIB import hu_standard_fib_datasets
|
||||||
|
|
||||||
from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
|
# from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
|
||||||
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
|
# from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
|
||||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model
|
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model
|
||||||
# from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
|
# from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
|
||||||
|
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as lmdeploy_internlm2_5_7b_chat_model
|
||||||
|
|
||||||
datasets = hu_standard_fib_datasets
|
datasets = hu_standard_fib_datasets
|
||||||
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
||||||
|
@ -22,6 +22,6 @@ The questions are:
|
|||||||
'Initial version, using 1shot, incontext, #0# as place holder, output in JSON format',
|
'Initial version, using 1shot, incontext, #0# as place holder, output in JSON format',
|
||||||
}
|
}
|
||||||
|
|
||||||
OpenHuEval_Path = '/mnt/hwfile/opendatalab/wj/proj/polyglot_24July/OpenHuEval'
|
OpenHuEval_Path = '/mnt/hwfile/opendatalab/weixingjian/OpenHuEval'
|
||||||
DATA_VERSION = '250126'
|
DATA_VERSION = '250205'
|
||||||
DATA_PATH = f'{OpenHuEval_Path}/data/HuStandardFIB/HuStandardFIB_{DATA_VERSION}/HuStandardFIB.jsonl'
|
DATA_PATH = f'{OpenHuEval_Path}/data/HuStandardFIB/HuStandardFIB_{DATA_VERSION}/HuStandardFIB.jsonl'
|
||||||
|
@ -28,9 +28,8 @@ class HuStandardFIBDataset(BaseDataset):
|
|||||||
out_dict_list = []
|
out_dict_list = []
|
||||||
|
|
||||||
for obj in objs:
|
for obj in objs:
|
||||||
instruction = obj['question'] # TODO: question -> instruction
|
instruction = obj['instruction']
|
||||||
questions = obj[
|
questions = obj['questions']
|
||||||
'question_sub'] # TODO: update question_sub -> questions
|
|
||||||
hu_specific_dim = obj['hu_specific_dim']
|
hu_specific_dim = obj['hu_specific_dim']
|
||||||
tmp = obj
|
tmp = obj
|
||||||
new_obj = dict(instruction=instruction,
|
new_obj = dict(instruction=instruction,
|
||||||
@ -54,11 +53,11 @@ class HuStandardFIBEvaluator(BaseEvaluator):
|
|||||||
blank_correct, blank_total = 0, 0
|
blank_correct, blank_total = 0, 0
|
||||||
question_correct, question_total = 0, 0
|
question_correct, question_total = 0, 0
|
||||||
|
|
||||||
for idx, (pred, refer, prompt) in enumerate(
|
for i, (pred, refer, prompt) in enumerate(
|
||||||
zip(predictions, references, origin_prompt)):
|
zip(predictions, references, origin_prompt)):
|
||||||
std_ans = [
|
std_ans = [
|
||||||
re.sub(r'#\d+#', '', ans).split(';')
|
re.sub(r'#\d+#', '', ans).split(';')
|
||||||
for ans in refer['answer'] # TODO: answer -> answers
|
for ans in refer['answers']
|
||||||
] # Remove "#0#" and "#1#", then split refer['formatted_std_ans']
|
] # Remove "#0#" and "#1#", then split refer['formatted_std_ans']
|
||||||
model_ans = []
|
model_ans = []
|
||||||
pred = pred.strip()
|
pred = pred.strip()
|
||||||
@ -68,12 +67,14 @@ class HuStandardFIBEvaluator(BaseEvaluator):
|
|||||||
else:
|
else:
|
||||||
blank_total += len(std_ans)
|
blank_total += len(std_ans)
|
||||||
question_total += 1
|
question_total += 1
|
||||||
details[idx] = {
|
details[i] = {
|
||||||
'detail': refer,
|
'reference': refer,
|
||||||
'model_ans': model_ans,
|
'model_ans': model_ans,
|
||||||
'gt': std_ans,
|
'gt': std_ans,
|
||||||
'prompt': prompt,
|
'prompt': prompt,
|
||||||
'raw_pred': pred,
|
'raw_pred': pred,
|
||||||
|
'blank_wise_correctness': [False] * len(std_ans),
|
||||||
|
'question_wise_correctness': False,
|
||||||
}
|
}
|
||||||
continue
|
continue
|
||||||
json_str = json_str.strip()
|
json_str = json_str.strip()
|
||||||
@ -86,7 +87,7 @@ class HuStandardFIBEvaluator(BaseEvaluator):
|
|||||||
data = json.loads(formatted_json_str)
|
data = json.loads(formatted_json_str)
|
||||||
to_end_flag = True
|
to_end_flag = True
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
print(f'Invalid JSON format. {idx}')
|
print(f'Invalid JSON format. {i}')
|
||||||
blank_total += len(std_ans)
|
blank_total += len(std_ans)
|
||||||
question_total += 1
|
question_total += 1
|
||||||
|
|
||||||
@ -98,18 +99,21 @@ class HuStandardFIBEvaluator(BaseEvaluator):
|
|||||||
question_total += 1
|
question_total += 1
|
||||||
|
|
||||||
model_ans = []
|
model_ans = []
|
||||||
|
blank_wise_correctness = []
|
||||||
|
is_question_correct = True
|
||||||
if to_end_flag:
|
if to_end_flag:
|
||||||
model_ans = [
|
model_ans = [
|
||||||
re.sub(r'#\d+#', '', ans).split(';')
|
re.sub(r'#\d+#', '', ans).split(';')
|
||||||
for ans in data.get('answers', [])
|
for ans in data.get('answers', [])
|
||||||
] # Preprocess model_ans in the same way as std_ans
|
] # Preprocess model_ans in the same way as std_ans
|
||||||
is_question_correct = True
|
|
||||||
for idx, ans_list in enumerate(std_ans):
|
for idx, ans_list in enumerate(std_ans):
|
||||||
if idx >= len(model_ans):
|
if idx >= len(model_ans):
|
||||||
is_question_correct = False
|
is_question_correct = False
|
||||||
break
|
blank_wise_correctness.append(False)
|
||||||
|
continue
|
||||||
|
|
||||||
model_list = model_ans[idx]
|
model_list = model_ans[idx]
|
||||||
|
is_blank_correct = True
|
||||||
for ans in ans_list:
|
for ans in ans_list:
|
||||||
best_match = max(
|
best_match = max(
|
||||||
model_list,
|
model_list,
|
||||||
@ -117,18 +121,22 @@ class HuStandardFIBEvaluator(BaseEvaluator):
|
|||||||
if fuzz.ratio(ans, best_match) > 70: # check threshold
|
if fuzz.ratio(ans, best_match) > 70: # check threshold
|
||||||
blank_correct += 1
|
blank_correct += 1
|
||||||
else:
|
else:
|
||||||
|
is_blank_correct = False
|
||||||
is_question_correct = False
|
is_question_correct = False
|
||||||
|
blank_wise_correctness.append(is_blank_correct)
|
||||||
|
|
||||||
blank_total += len(std_ans)
|
blank_total += len(std_ans)
|
||||||
question_total += 1
|
question_total += 1
|
||||||
question_correct += 1 if is_question_correct else 0
|
question_correct += 1 if is_question_correct else 0
|
||||||
|
|
||||||
details[idx] = {
|
details[i] = {
|
||||||
'detail': refer,
|
'reference': refer,
|
||||||
|
'std_ans': std_ans,
|
||||||
'model_ans': model_ans,
|
'model_ans': model_ans,
|
||||||
'gt': std_ans,
|
|
||||||
'prompt': prompt,
|
'prompt': prompt,
|
||||||
'raw_pred': pred,
|
'raw_pred': pred,
|
||||||
|
'blank_wise_correctness': blank_wise_correctness,
|
||||||
|
'question_wise_correctness': is_question_correct,
|
||||||
}
|
}
|
||||||
results = {
|
results = {
|
||||||
'blank_level_correctness':
|
'blank_level_correctness':
|
||||||
|
Loading…
Reference in New Issue
Block a user