Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
[Fix] Fix wildbench (#1508)
* fix pip version
* fix pip version
* fix_wildbench

parent a31a77c5c1
commit c2bcd8725e
@@ -3,7 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import WildBenchDataset
-
+from opencompass.summarizers import WildBenchPairSummarizer
 
 subjective_reader_cfg = dict(
     input_columns=['dialogue', 'prompt'],
@@ -61,5 +61,6 @@ wildbench_datasets.append(
         {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}],
         mode='m2n',  # m models versus n models
         infer_order='random',
-        base_models = [llama_2_70b, gpt4, claude]
+        base_models = [llama_2_70b, gpt4, claude],
+        summarizer = dict(type=WildBenchPairSummarizer),
     ))
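Taken together, the two hunks above register the pair summarizer directly on the dataset config: the trailing comma after `base_models` is needed because `summarizer = dict(type=WildBenchPairSummarizer)` now follows it inside the same dict. The identical hunks repeated below apply the same fix to a second copy of the config. A minimal sketch of the resulting block, with keys not shown in the diff elided and indentation assumed:

    wildbench_datasets.append(
        dict(
            # ... dataset fields untouched by this commit ...
            mode='m2n',  # m models versus n models
            infer_order='random',
            base_models=[llama_2_70b, gpt4, claude],
            summarizer=dict(type=WildBenchPairSummarizer),
        ))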
|
@@ -3,7 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import WildBenchDataset
-
+from opencompass.summarizers import WildBenchPairSummarizer
 
 subjective_reader_cfg = dict(
     input_columns=['dialogue', 'prompt'],
@@ -61,5 +61,6 @@ wildbench_datasets.append(
         {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}],
         mode='m2n',  # m models versus n models
         infer_order='random',
-        base_models = [llama_2_70b, gpt4, claude]
+        base_models = [llama_2_70b, gpt4, claude],
+        summarizer = dict(type=WildBenchPairSummarizer),
     ))
|
@@ -156,8 +156,8 @@ class WildBenchPairSummarizer(CompassArenaSummarizer):
         self.tasks = []
         self.cfg = config
 
-        self.base_models = self.cfg['eval']['partitioner']['base_models']
-        self.compare_models = self.cfg['eval']['partitioner']['compare_models']
+        self.base_models = self.cfg['datasets'][0]['base_models']
+        self.compare_models = self.cfg['eval']['partitioner']['models']
         self.judge_models = self.cfg.get('judge_models', None)
         self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
         self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
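The summarizer-side fix mirrors the config change: `base_models` now lives on each dataset config rather than on the eval partitioner, and the partitioner exposes the models under test as `models`, not `compare_models`. A hypothetical minimal config illustrating the new lookups (key layout follows the diff; all abbreviations are invented for the example):

    # Hypothetical config; only the keys exercised by the diff are shown.
    cfg = {
        'datasets': [
            {'abbr': 'wildbench', 'base_models': [{'abbr': 'gpt4'}, {'abbr': 'claude'}]},
        ],
        'eval': {'partitioner': {'models': [{'abbr': 'my-chat-model'}]}},
        'judge_models': [{'abbr': 'gpt4-judge'}],
    }
    base_models = cfg['datasets'][0]['base_models']        # new lookup
    compare_models = cfg['eval']['partitioner']['models']  # was ['compare_models']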
@@ -247,8 +247,10 @@ class WildBenchPairSummarizer(CompassArenaSummarizer):
             pd.DataFrame: The summary results.
         """
         scores = self.get_score(time_str)
+        all_scores = {}
         output_dir, results_folder = get_outdir(self.cfg, time_str)
         for idx, judge_model in enumerate(self.judge_models):
+            score_by_judgemodel = {}
             judge_abbr = model_abbr_from_cfg(judge_model)
             for dataset in self.cfg['datasets']:
                 dataset_abbr = dataset_abbr_from_cfg(dataset)
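The two new accumulators give `summarize` something to return: `score_by_judgemodel` collects one averaged score per compared model, and `all_scores` keys those collections by judge. A sketch of the intended shape, with the abbreviations and number invented:

    all_scores = {
        'gpt4-judge': {                        # one entry per judge model
            'my-chat-model': {'score': 0.57},  # one entry per compared model
        },
    }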
@@ -258,7 +260,7 @@ class WildBenchPairSummarizer(CompassArenaSummarizer):
                 row_headers = [dataset_abbr, 'position_bias'] + row_headers
 
                 table = []
-                for row_header in row_headers:
+                for idx, row_header in enumerate(row_headers):
                     row = [row_header]
                     headers = ['']
                     for model_cfg in self.compare_models:
@@ -276,12 +278,13 @@ class WildBenchPairSummarizer(CompassArenaSummarizer):
                             s = str(s)
                         row.append(s)
                         avg = avg/len(self.base_models)
+                        if idx == 0:
+                            score_by_judgemodel[model_abbr] = {'score': avg}
                         row.append(f'{avg:.2f}')
                         headers.append('Avg')
                     table.append(row)
 
                 txt = tabulate(table, headers=headers)
-                print(txt)
 
                 if idx == len(self.judge_models):
                     output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
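Note how the added guard interacts with the `enumerate` change in the previous hunk: at this point `idx` is the row index over `row_headers`, so each model's averaged score is recorded only on row 0 (the overall `dataset_abbr` row), not on the 'position_bias' row or any per-task rows. A schematic, self-contained reading of that guard; all data here is invented:

    row_headers = ['wildbench', 'position_bias', 'task_A']  # row 0 is the overall row
    averaged = {'my-chat-model': 0.57}  # hypothetical per-model averages
    score_by_judgemodel = {}
    for idx, row_header in enumerate(row_headers):
        for model_abbr, avg in averaged.items():
            if idx == 0:  # matches the guard added above
                score_by_judgemodel[model_abbr] = {'score': avg}
    print(score_by_judgemodel)  # {'my-chat-model': {'score': 0.57}}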
@@ -292,4 +295,5 @@ class WildBenchPairSummarizer(CompassArenaSummarizer):
                         f.write(','.join(headers) + '\n')
                         for line in table:
                             f.write(','.join(line) + '\n')
-                print(output_filename)
+            all_scores[judge_abbr] = score_by_judgemodel
+        return {'Wildbench': all_scores}
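With the final hunk, `summarize` returns its results instead of only printing output paths, so callers can consume the scores programmatically. A runnable sketch of iterating the returned structure; the literal stands in for a real return value and its contents are invented:

    results = {'Wildbench': {'gpt4-judge': {'my-chat-model': {'score': 0.57}}}}
    for judge_abbr, model_scores in results['Wildbench'].items():
        for model_abbr, entry in model_scores.items():
            print(f"{judge_abbr} scores {model_abbr} at {entry['score']:.2f}")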
|