mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
Merge remote-tracking branch 'upstream/main' into needlebench
This commit is contained in:
commit
af82a7f085
@ -36,10 +36,12 @@ Hark! Hearken to the tale of thy journey to the land of the rising sun, Japan. \
|
|||||||
## Evaluation results
|
## Evaluation results
|
||||||
|
|
||||||
```
|
```
|
||||||
dataset version metric mode qwen-72b-chat-hf mistral-7b-instruct-v0.2-hf mixtral-8x7b-instruct-v0.1 chatglm3-6b-hf
|
dataset version metric mode baichuan2-7b-chat-hf baichuan2-13b-chat-hf internlm2-chat-7b-hf internlm2-chat-20b-hf llama-2-7b-chat-hf llama-2-13b-chat-hf
|
||||||
--------- --------- ---------- ------ ------------------ ----------------------------- ---------------------------- ----------------
|
--------- --------- ---------------------------- ------ ---------------------- ----------------------- ---------------------- ----------------------- -------------------- ---------------------
|
||||||
IFEval 27a9cc strict_acc gen 43.62 49.17 48.98 29.76
|
IFEval 3321a3 Prompt-level-strict-accuracy gen 36.04 35.49 38.26 33.09 33.46 33.64
|
||||||
IFEval 27a9cc loose_acc gen 45.47 53.97 54.71 32.16
|
IFEval 3321a3 Inst-level-strict-accuracy gen 46.76 46.76 49.16 45.32 45.68 45.44
|
||||||
|
IFEval 3321a3 Prompt-level-loose-accuracy gen 37.52 37.71 42.51 39.37 43.81 47.32
|
||||||
|
IFEval 3321a3 Inst-level-loose-accuracy gen 48.44 49.16 53.72 51.08 55.64 58.03
|
||||||
```
|
```
|
||||||
|
|
||||||
## Reference
|
## Reference
|
||||||
|
@ -27,7 +27,9 @@ class IFEvalDataset(BaseDataset):
|
|||||||
class IFEvaluator(BaseEvaluator):
|
class IFEvaluator(BaseEvaluator):
|
||||||
|
|
||||||
def score(self, predictions, references):
|
def score(self, predictions, references):
|
||||||
results = []
|
results = dict()
|
||||||
|
for metric in ('strict', 'loose'):
|
||||||
|
results[metric] = []
|
||||||
for pred, refer in zip(predictions, references):
|
for pred, refer in zip(predictions, references):
|
||||||
input = InputExample(
|
input = InputExample(
|
||||||
key=refer['key'],
|
key=refer['key'],
|
||||||
@ -38,15 +40,29 @@ class IFEvaluator(BaseEvaluator):
|
|||||||
for k in list(kwarg.keys()):
|
for k in list(kwarg.keys()):
|
||||||
if kwarg[k] is None:
|
if kwarg[k] is None:
|
||||||
kwarg.pop(k, None)
|
kwarg.pop(k, None)
|
||||||
result = dict(
|
results['strict'].append(
|
||||||
strict=test_instruction_following_strict(input, pred),
|
test_instruction_following_strict(input, pred))
|
||||||
loose=test_instruction_following_loose(input, pred),
|
results['loose'].append(
|
||||||
)
|
test_instruction_following_loose(input, pred))
|
||||||
results.append(result)
|
final_scores = dict()
|
||||||
strict = sum(
|
for metric in ('strict', 'loose'):
|
||||||
[result['strict'].follow_all_instructions
|
prompt_total = 0
|
||||||
for result in results]) / len(results)
|
prompt_correct = 0
|
||||||
loose = sum(
|
inst_total = 0
|
||||||
[result['loose'].follow_all_instructions
|
inst_correct = 0
|
||||||
for result in results]) / len(results)
|
|
||||||
return dict(strict_acc=strict * 100, loose_acc=loose * 100)
|
for example in results[metric]:
|
||||||
|
follow_instruction_list = example.follow_instruction_list
|
||||||
|
instruction_id_list = example.instruction_id_list
|
||||||
|
|
||||||
|
prompt_total += 1
|
||||||
|
if all(follow_instruction_list):
|
||||||
|
prompt_correct += 1
|
||||||
|
|
||||||
|
inst_total += len(instruction_id_list)
|
||||||
|
inst_correct += sum(follow_instruction_list)
|
||||||
|
prompt_score = f'Prompt-level-{metric}-accuracy'
|
||||||
|
inst_score = f'Inst-level-{metric}-accuracy'
|
||||||
|
final_scores[prompt_score] = prompt_correct / prompt_total * 100
|
||||||
|
final_scores[inst_score] = inst_correct / inst_total * 100
|
||||||
|
return final_scores
|
||||||
|
Loading…
Reference in New Issue
Block a user