From c2bcd8725e615ec455bf5b7301f8d09962cd64e3 Mon Sep 17 00:00:00 2001
From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com>
Date: Tue, 10 Sep 2024 17:35:07 +0800
Subject: [PATCH] [Fix] Fix wildbench (#1508)

* fix pip version

* fix pip version

* fix_wildbench
---
 .../subjective/wildbench/wildbench_pair_judge.py  |  5 +++--
 .../subjective/wildbench/wildbench_pair_judge.py  |  5 +++--
 opencompass/summarizers/subjective/wildbench.py   | 14 +++++++++-----
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/configs/datasets/subjective/wildbench/wildbench_pair_judge.py b/configs/datasets/subjective/wildbench/wildbench_pair_judge.py
index a07b1a7d..0dfcb0d8 100644
--- a/configs/datasets/subjective/wildbench/wildbench_pair_judge.py
+++ b/configs/datasets/subjective/wildbench/wildbench_pair_judge.py
@@ -3,7 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import WildBenchDataset
-
+from opencompass.summarizers import WildBenchPairSummarizer
 
 subjective_reader_cfg = dict(
     input_columns=['dialogue', 'prompt'],
@@ -61,5 +61,6 @@ wildbench_datasets.append(
                     {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}],
         mode='m2n',  # m models compete against n models
         infer_order='random',
-        base_models = [llama_2_70b, gpt4, claude]
+        base_models = [llama_2_70b, gpt4, claude],
+        summarizer = dict(type=WildBenchPairSummarizer),
     ))
diff --git a/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge.py b/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge.py
index a07b1a7d..0dfcb0d8 100644
--- a/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge.py
+++ b/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge.py
@@ -3,7 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import WildBenchDataset
-
+from opencompass.summarizers import WildBenchPairSummarizer
 
 subjective_reader_cfg = dict(
     input_columns=['dialogue', 'prompt'],
@@ -61,5 +61,6 @@ wildbench_datasets.append(
                     {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}],
         mode='m2n',  # m models compete against n models
         infer_order='random',
-        base_models = [llama_2_70b, gpt4, claude]
+        base_models = [llama_2_70b, gpt4, claude],
+        summarizer = dict(type=WildBenchPairSummarizer),
     ))
diff --git a/opencompass/summarizers/subjective/wildbench.py b/opencompass/summarizers/subjective/wildbench.py
index 875b2c3f..98e58cd8 100644
--- a/opencompass/summarizers/subjective/wildbench.py
+++ b/opencompass/summarizers/subjective/wildbench.py
@@ -156,8 +156,8 @@ class WildBenchPairSummarizer(CompassArenaSummarizer):
         self.tasks = []
         self.cfg = config
 
-        self.base_models = self.cfg['eval']['partitioner']['base_models']
-        self.compare_models = self.cfg['eval']['partitioner']['compare_models']
+        self.base_models = self.cfg['datasets'][0]['base_models']
+        self.compare_models = self.cfg['eval']['partitioner']['models']
         self.judge_models = self.cfg.get('judge_models', None)
         self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
         self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
@@ -247,8 +247,10 @@ class WildBenchPairSummarizer(CompassArenaSummarizer):
             pd.DataFrame: The summary results.
         """
         scores = self.get_score(time_str)
+        all_scores = {}
         output_dir, results_folder = get_outdir(self.cfg, time_str)
         for idx, judge_model in enumerate(self.judge_models):
+            score_by_judgemodel = {}
             judge_abbr = model_abbr_from_cfg(judge_model)
             for dataset in self.cfg['datasets']:
                 dataset_abbr = dataset_abbr_from_cfg(dataset)
@@ -258,7 +260,7 @@ class WildBenchPairSummarizer(CompassArenaSummarizer):
                 row_headers = [dataset_abbr, 'position_bias'] + row_headers
 
                 table = []
-                for row_header in row_headers:
+                for idx, row_header in enumerate(row_headers):
                     row = [row_header]
                     headers = ['']
                     for model_cfg in self.compare_models:
@@ -276,12 +278,13 @@ class WildBenchPairSummarizer(CompassArenaSummarizer):
                                 s = str(s)
                             row.append(s)
                         avg = avg/len(self.base_models)
+                        if idx == 0:
+                            score_by_judgemodel[model_abbr] = {'score': avg}
                         row.append(f'{avg:.2f}')
                         headers.append('Avg')
                     table.append(row)
 
                 txt = tabulate(table, headers=headers)
-                print(txt)
 
                 if idx == len(self.judge_models):
                     output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
@@ -292,4 +295,5 @@ class WildBenchPairSummarizer(CompassArenaSummarizer):
                 f.write(','.join(headers) + '\n')
                 for line in table:
                     f.write(','.join(line) + '\n')
-                print(output_filename)
+            all_scores[judge_abbr] = score_by_judgemodel
+        return {'Wildbench': all_scores}
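
Note on the new return value: with this change, summarize() returns a nested dict keyed by judge model and compared model instead of only printing the table and CSV path. Below is a minimal sketch, not part of the patch, of how that structure could be consumed; the judge and model abbreviations ('gpt4-judge', 'qwen-7b-chat', 'internlm2-chat-7b') and the score values are hypothetical placeholders.

# Shape returned by WildBenchPairSummarizer.summarize() after this patch:
# {'Wildbench': {judge_abbr: {compare_model_abbr: {'score': avg}}}}
# Abbreviations and numbers below are hypothetical examples, not real results.
results = {
    'Wildbench': {
        'gpt4-judge': {                                # one entry per judge model
            'qwen-7b-chat': {'score': 12.34},          # avg over all base_models
            'internlm2-chat-7b': {'score': -3.21},
        },
    },
}

# Iterate judges and compared models, printing the averaged score.
for judge_abbr, by_model in results['Wildbench'].items():
    for model_abbr, metrics in by_model.items():
        print(f"{judge_abbr} -> {model_abbr}: {metrics['score']:.2f}")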