[Fix] Fix wildbench (#1508)

* fix pip version

* fix pip version

* fix_wildbench
bittersweet1999 authored 2024-09-10 17:35:07 +08:00, committed by GitHub
parent a31a77c5c1
commit c2bcd8725e
3 changed files with 15 additions and 9 deletions

View File

@@ -3,7 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import WildBenchDataset
+from opencompass.summarizers import WildBenchPairSummarizer
 
 subjective_reader_cfg = dict(
     input_columns=['dialogue', 'prompt'],
@@ -61,5 +61,6 @@ wildbench_datasets.append(
             {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}],
         mode='m2n',  # m models are compared against n models
         infer_order='random',
-        base_models = [llama_2_70b, gpt4, claude]
+        base_models = [llama_2_70b, gpt4, claude],
+        summarizer = dict(type=WildBenchPairSummarizer),
     ))

View File

@@ -3,7 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import WildBenchDataset
+from opencompass.summarizers import WildBenchPairSummarizer
 
 subjective_reader_cfg = dict(
     input_columns=['dialogue', 'prompt'],
@@ -61,5 +61,6 @@ wildbench_datasets.append(
             {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}],
         mode='m2n',  # m models are compared against n models
         infer_order='random',
-        base_models = [llama_2_70b, gpt4, claude]
+        base_models = [llama_2_70b, gpt4, claude],
+        summarizer = dict(type=WildBenchPairSummarizer),
     ))
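
Both config diffs above make the same change to what appears to be the same wildbench pair-judge config kept in two locations. For orientation, a minimal sketch of the patched tail of the dataset entry follows; only mode, infer_order, base_models, and summarizer come from the diff, while the stand-in model configs and elided fields are assumptions, not the real config.

from opencompass.summarizers import WildBenchPairSummarizer

# Stand-ins only: in the real file llama_2_70b, gpt4 and claude are full
# model configs defined earlier; the abbr values here are assumed.
llama_2_70b = dict(abbr='llama-2-70b-chat-vllm')
gpt4 = dict(abbr='gpt4')
claude = dict(abbr='claude')

wildbench_datasets = []
wildbench_datasets.append(
    dict(
        # ... dataset fields unchanged by this commit ...
        mode='m2n',  # m models are compared against n models
        infer_order='random',
        base_models=[llama_2_70b, gpt4, claude],
        summarizer=dict(type=WildBenchPairSummarizer),  # added by this commit
    ))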

View File

@@ -156,8 +156,8 @@ class WildBenchPairSummarizer(CompassArenaSummarizer):
         self.tasks = []
         self.cfg = config
-        self.base_models = self.cfg['eval']['partitioner']['base_models']
-        self.compare_models = self.cfg['eval']['partitioner']['compare_models']
+        self.base_models = self.cfg['datasets'][0]['base_models']
+        self.compare_models = self.cfg['eval']['partitioner']['models']
         self.judge_models = self.cfg.get('judge_models', None)
         self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
         self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
@@ -247,8 +247,10 @@ class WildBenchPairSummarizer(CompassArenaSummarizer):
             pd.DataFrame: The summary results.
         """
         scores = self.get_score(time_str)
+        all_scores = {}
         output_dir, results_folder = get_outdir(self.cfg, time_str)
         for idx, judge_model in enumerate(self.judge_models):
+            score_by_judgemodel = {}
             judge_abbr = model_abbr_from_cfg(judge_model)
             for dataset in self.cfg['datasets']:
                 dataset_abbr = dataset_abbr_from_cfg(dataset)
@@ -258,7 +260,7 @@ class WildBenchPairSummarizer(CompassArenaSummarizer):
                 row_headers = [dataset_abbr, 'position_bias'] + row_headers
                 table = []
-                for row_header in row_headers:
+                for idx, row_header in enumerate(row_headers):
                     row = [row_header]
                     headers = ['']
                     for model_cfg in self.compare_models:
@@ -276,12 +278,13 @@ class WildBenchPairSummarizer(CompassArenaSummarizer):
                                 s = str(s)
                             row.append(s)
                         avg = avg/len(self.base_models)
+                        if idx == 0:
+                            score_by_judgemodel[model_abbr] = {'score': avg}
                         row.append(f'{avg:.2f}')
                     headers.append('Avg')
                     table.append(row)
                 txt = tabulate(table, headers=headers)
-                print(txt)
                 if idx == len(self.judge_models):
                     output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
@@ -292,4 +295,5 @@ class WildBenchPairSummarizer(CompassArenaSummarizer):
                     f.write(','.join(headers) + '\n')
                     for line in table:
                         f.write(','.join(line) + '\n')
-                print(output_filename)
+            all_scores[judge_abbr] = score_by_judgemodel
+        return {'Wildbench': all_scores}
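
For reference, a minimal self-contained sketch of the config lookups this commit switches to and the score structure summarize() now returns. The dict literals below are stand-ins for the real OpenCompass config objects; all abbr values are assumptions for illustration only.

# Stand-in config showing where the summarizer now reads its model lists from.
cfg = {
    'datasets': [{
        'abbr': 'wildbench',
        'base_models': [{'abbr': 'llama-2-70b-chat-vllm'},
                        {'abbr': 'gpt4'},
                        {'abbr': 'claude'}],
    }],
    'eval': {'partitioner': {'models': [{'abbr': 'my-chat-model'}]}},
    'judge_models': [{'abbr': 'judge-model'}],
}

# New lookups (previous lookups shown in the comments):
base_models = cfg['datasets'][0]['base_models']        # was cfg['eval']['partitioner']['base_models']
compare_models = cfg['eval']['partitioner']['models']  # was cfg['eval']['partitioner']['compare_models']

# summarize() now also returns the averaged pair scores keyed by judge model
# and compared model, roughly:
#   {'Wildbench': {'judge-model': {'my-chat-model': {'score': 0.0}}}}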