mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Fix] fix ifeval (#909)
This commit is contained in:
parent
45c606bcd0
commit
53fe788d27
@ -36,10 +36,12 @@ Hark! Hearken to the tale of thy journey to the land of the rising sun, Japan. \
|
||||
## Evaluation results
|
||||
|
||||
```
|
||||
dataset version metric mode qwen-72b-chat-hf mistral-7b-instruct-v0.2-hf mixtral-8x7b-instruct-v0.1 chatglm3-6b-hf
|
||||
--------- --------- ---------- ------ ------------------ ----------------------------- ---------------------------- ----------------
|
||||
IFEval 27a9cc strict_acc gen 43.62 49.17 48.98 29.76
|
||||
IFEval 27a9cc loose_acc gen 45.47 53.97 54.71 32.16
|
||||
dataset version metric mode baichuan2-7b-chat-hf baichuan2-13b-chat-hf internlm2-chat-7b-hf internlm2-chat-20b-hf llama-2-7b-chat-hf llama-2-13b-chat-hf
|
||||
--------- --------- ---------------------------- ------ ---------------------- ----------------------- ---------------------- ----------------------- -------------------- ---------------------
|
||||
IFEval 3321a3 Prompt-level-strict-accuracy gen 36.04 35.49 38.26 33.09 33.46 33.64
|
||||
IFEval 3321a3 Inst-level-strict-accuracy gen 46.76 46.76 49.16 45.32 45.68 45.44
|
||||
IFEval 3321a3 Prompt-level-loose-accuracy gen 37.52 37.71 42.51 39.37 43.81 47.32
|
||||
IFEval 3321a3 Inst-level-loose-accuracy gen 48.44 49.16 53.72 51.08 55.64 58.03
|
||||
```
|
||||
|
||||
## Reference
|
||||
|
@ -27,7 +27,9 @@ class IFEvalDataset(BaseDataset):
|
||||
class IFEvaluator(BaseEvaluator):
|
||||
|
||||
def score(self, predictions, references):
|
||||
results = []
|
||||
results = dict()
|
||||
for metric in ('strict', 'loose'):
|
||||
results[metric] = []
|
||||
for pred, refer in zip(predictions, references):
|
||||
input = InputExample(
|
||||
key=refer['key'],
|
||||
@ -38,15 +40,29 @@ class IFEvaluator(BaseEvaluator):
|
||||
for k in list(kwarg.keys()):
|
||||
if kwarg[k] is None:
|
||||
kwarg.pop(k, None)
|
||||
result = dict(
|
||||
strict=test_instruction_following_strict(input, pred),
|
||||
loose=test_instruction_following_loose(input, pred),
|
||||
)
|
||||
results.append(result)
|
||||
strict = sum(
|
||||
[result['strict'].follow_all_instructions
|
||||
for result in results]) / len(results)
|
||||
loose = sum(
|
||||
[result['loose'].follow_all_instructions
|
||||
for result in results]) / len(results)
|
||||
return dict(strict_acc=strict * 100, loose_acc=loose * 100)
|
||||
results['strict'].append(
|
||||
test_instruction_following_strict(input, pred))
|
||||
results['loose'].append(
|
||||
test_instruction_following_loose(input, pred))
|
||||
final_scores = dict()
|
||||
for metric in ('strict', 'loose'):
|
||||
prompt_total = 0
|
||||
prompt_correct = 0
|
||||
inst_total = 0
|
||||
inst_correct = 0
|
||||
|
||||
for example in results[metric]:
|
||||
follow_instruction_list = example.follow_instruction_list
|
||||
instruction_id_list = example.instruction_id_list
|
||||
|
||||
prompt_total += 1
|
||||
if all(follow_instruction_list):
|
||||
prompt_correct += 1
|
||||
|
||||
inst_total += len(instruction_id_list)
|
||||
inst_correct += sum(follow_instruction_list)
|
||||
prompt_score = f'Prompt-level-{metric}-accuracy'
|
||||
inst_score = f'Inst-level-{metric}-accuracy'
|
||||
final_scores[prompt_score] = prompt_correct / prompt_total * 100
|
||||
final_scores[inst_score] = inst_correct / inst_total * 100
|
||||
return final_scores
|
||||
|
Loading…
Reference in New Issue
Block a user