# Mirror of https://github.com/open-compass/opencompass.git
# Synced 2025-05-30 16:03:24 +08:00
#
# Commit message: * add rewardbench * add rewardbench * add rmb datasets * add rmb datasets * add judgebench * add judgebench
# File info: 54 lines, 2.7 KiB, Python
# Summarizer configuration for RewardBench.
#
# Each ``_<Category>_weights`` dict maps a subset name to the weight that
# subset carries inside its category; within every category the weights sum
# to 1.
# NOTE(review): the weights look like per-subset sample-count proportions --
# confirm against the upstream RewardBench definition before changing them.
RewardBench_summary_groups = []

_Chat_weights = {
    'alpacaeval-easy': 0.32355305466237944,
    'alpacaeval-length': 0.32355305466237944,
    'alpacaeval-hard': 0.32355305466237944,
    'mt-bench-easy': 0.011254019292604502,
    'mt-bench-med': 0.018086816720257234,
}

_Chat_Hard_weights = {
    'mt-bench-hard': 0.09698275862068965,
    'llmbar-natural': 0.21551724137931033,
    'llmbar-adver-neighbor': 0.28879310344827586,
    'llmbar-adver-GPTInst': 0.19827586206896552,
    'llmbar-adver-GPTOut': 0.10129310344827586,
    'llmbar-adver-manual': 0.09913793103448276,
}

_Safety_weights = {
    'refusals-dangerous': 0.13513513513513514,
    'refusals-offensive': 0.13513513513513514,
    'xstest-should-refuse': 0.20810810810810812,
    'xstest-should-respond': 0.33783783783783783,
    'donotanswer': 0.1837837837837838,
}

_Reasoning_weights = {
    'math-prm': 0.31236897274633124,
    'hep-cpp': 0.1146051712089448,
    'hep-go': 0.1146051712089448,
    'hep-java': 0.1146051712089448,
    'hep-js': 0.1146051712089448,
    'hep-python': 0.1146051712089448,
    'hep-rust': 0.1146051712089448,
}

# Overall score: every subset weight here is one quarter of its weight in the
# category dict above, i.e. the four categories contribute equally and the
# whole dict sums to 1. Key order is Chat, Chat Hard, Safety, Reasoning; the
# literals are kept as written rather than recomputed so the stored values do
# not drift by a floating-point rounding step.
_RewardBench_weights = {
    # Chat
    'alpacaeval-easy': 0.08088826366559486,
    'alpacaeval-length': 0.08088826366559486,
    'alpacaeval-hard': 0.08088826366559486,
    'mt-bench-easy': 0.0028135048231511255,
    'mt-bench-med': 0.004521704180064309,
    # Chat Hard
    'mt-bench-hard': 0.024245689655172414,
    'llmbar-natural': 0.05387931034482758,
    'llmbar-adver-neighbor': 0.07219827586206896,
    'llmbar-adver-GPTInst': 0.04956896551724138,
    'llmbar-adver-GPTOut': 0.025323275862068964,
    'llmbar-adver-manual': 0.02478448275862069,
    # Safety
    'refusals-dangerous': 0.033783783783783786,
    'refusals-offensive': 0.033783783783783786,
    'xstest-should-refuse': 0.05202702702702703,
    'xstest-should-respond': 0.08445945945945946,
    'donotanswer': 0.04594594594594595,
    # Reasoning
    'math-prm': 0.07809224318658281,
    'hep-cpp': 0.0286512928022362,
    'hep-go': 0.0286512928022362,
    'hep-java': 0.0286512928022362,
    'hep-js': 0.0286512928022362,
    'hep-python': 0.0286512928022362,
    'hep-rust': 0.0286512928022362,
}

# One summary group per category plus the overall 'RewardBench' group. Each
# group lists its subsets in the insertion order of the matching weight dict.
RewardBench_summary_groups.append({
    'name': 'Chat',
    'subsets': list(_Chat_weights),
    'weights': _Chat_weights,
})
RewardBench_summary_groups.append({
    'name': 'Chat Hard',
    'subsets': list(_Chat_Hard_weights),
    'weights': _Chat_Hard_weights,
})
RewardBench_summary_groups.append({
    'name': 'Safety',
    'subsets': list(_Safety_weights),
    'weights': _Safety_weights,
})
RewardBench_summary_groups.append({
    'name': 'Reasoning',
    'subsets': list(_Reasoning_weights),
    'weights': _Reasoning_weights,
})
RewardBench_summary_groups.append({
    'name': 'RewardBench',
    'subsets': list(_RewardBench_weights),
    'weights': _RewardBench_weights,
})

# Rows reported by the summarizer, in display order; each entry matches the
# 'name' of a group registered above.
summarizer = dict(
    dataset_abbrs=[
        'Chat',
        'Chat Hard',
        'Safety',
        'Reasoning',
        'RewardBench',
    ],
    summary_groups=RewardBench_summary_groups,
)