Mirror of https://github.com/open-compass/opencompass.git · synced 2025-05-30 16:03:24 +08:00
[Feature] Add CompassArena (#828)
* add compass arena
* add compass_arena
* add compass arena
* Update opencompass/summarizers/subjective/compass_arena.py
  Co-authored-by: Songyang Zhang <tonysy@users.noreply.github.com>
* Update opencompass/summarizers/subjective/__init__.py
  Co-authored-by: Songyang Zhang <tonysy@users.noreply.github.com>
* Update opencompass/datasets/subjective/compass_arena.py
  Co-authored-by: Songyang Zhang <tonysy@users.noreply.github.com>
* Update opencompass/datasets/subjective/__init__.py
  Co-authored-by: Songyang Zhang <tonysy@users.noreply.github.com>
* Update configs/eval_subjective_compassarena.py
  Co-authored-by: Songyang Zhang <tonysy@users.noreply.github.com>
* Update configs/datasets/subjective/compassarena/compassarena_compare.py
  Co-authored-by: Songyang Zhang <tonysy@users.noreply.github.com>
* Update configs/eval_subjective_compassarena.py
  Co-authored-by: Songyang Zhang <tonysy@users.noreply.github.com>
* Update configs/datasets/subjective/compassarena/compassarena_compare.py
  Co-authored-by: Songyang Zhang <tonysy@users.noreply.github.com>
* fix check position bias

---------

Co-authored-by: Songyang Zhang <tonysy@users.noreply.github.com>
This commit is contained in:
parent 40a2441deb
commit 2d4da8dd02
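Usage note (a sketch, not part of the commit): with these files in place, the arena evaluation would typically be launched through the standard OpenCompass entry point, e.g. "python run.py configs/eval_subjective_compassarena.py"; the Slurm-specific settings in that config (partition='llm_dev2', quotatype='auto') are cluster-specific and would need to be adapted locally.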
configs/datasets/subjective/compassarena/compassarena_compare.py (new file, 160 lines)
@@ -0,0 +1,160 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassArenaDataset

subjective_reader_cfg = dict(
    input_columns=['question', 'ref'],
    output_column='judge',
    )

data_path = "data/subjective/"

subjective_datasets = []

base_prompt = """

[回答1开始]
{prediction}
[回答1结束]

[回答2开始]
{prediction2}
[回答2结束]

根据评分要求,在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答1、2平局
并提供你的解释原因。

如果你认为回答1更好,你的输出应形如:
选择:A
原因:blahblah blahblah\n

如果你认为回答2更好,你的输出应形如:
选择:B
原因:blahblah blahblah\n

如果你认为回答1、2打成平手,你的输出应形如:
选择:C
原因:blahblah blahblah\n
"""

knowledge_prompt = """
请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 更好的回答能与参考答案吻合或表明参考答案的意思。
2. 在都准确答对问题的前提下,更好的回答能对知识点进行额外补充,且补充的知识准确无误。
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等。

[用户问题]
{question}

[参考答案]
{ref}
""" + base_prompt


language_prompt = """
请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 在有明确的参考答案的情况下,越贴近参考答案或表明了参考答案的意思的回答越好。
2. 更好的回答在语言表达上更流畅,更加符合与人类对话的习惯,包括语气、情调等
3. 在都准确答对问题的前提下,更好的回答能进行额外补充,且补充的内容准确无误。

[用户问题]
{question}

[参考答案]
{ref}
""" + base_prompt


math_prompt = """
请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 更好的回答的答案能和参考答案一致。
2. 若两个回答的答案都与参考答案不一致,则更好的回答的推理过程应更加合理。
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等。

[用户问题]
{question}

[参考答案]
{ref}
""" + base_prompt

reason_prompt = math_prompt

qa_prompt = """
请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 好的回答必须首先具有事实正确性,即除了想象的内容外,所引用或阐述的各种信息都是真实正确的
2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答,且前后连贯,逻辑没有问题
3. 在都准确答对问题的前提下,更好的回答能进行额外补充,且补充的内容准确无误

[用户问题]
{question}
""" + base_prompt


creation_prompt = """
请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 好的回答必须首先符合用户问题里的各种需求,不能跑题
2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度

[用户问题]
{question}
""" + base_prompt


subjective_all_sets = ["knowledge", "language", "math", "reason", "qa", "creationv2_zh"]
prompt_all_sets = [knowledge_prompt, language_prompt, math_prompt, reason_prompt, qa_prompt, creation_prompt]

for _name, _prompt in zip(subjective_all_sets, prompt_all_sets):
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt="{question}"
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            infer_order='double',
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt=_prompt
                    ),
                ]),
            ),
        ),
        pred_role="BOT",
    )

    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=CompassArenaDataset,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
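The judge prompts above are Chinese throughout: each one lists scoring criteria in decreasing order of importance, shows the two candidate answers between the [回答1开始]/[回答1结束] and [回答2开始]/[回答2结束] markers, and asks the judge to answer 选择:A (answer 1 is better), 选择:B (answer 2 is better) or 选择:C (tie), followed by a reason. The evaluator is configured with infer_order='double'; a minimal sketch of what that setting is understood to mean (an assumption about LMEvaluator behavior, not code from this commit) is that every comparison is judged twice, once per answer ordering, which is what later lets the summarizer count position bias:

# Sketch only: the assumed effect of infer_order='double' in LMEvaluator.
def double_order(samples):
    # Each sample holds 'prediction' (answer 1) and 'prediction2' (answer 2).
    doubled = []
    for s in samples:
        doubled.append(dict(s))  # original ordering
        swapped = dict(s)
        # Judge the same pair again with the two answers swapped.
        swapped['prediction'], swapped['prediction2'] = s['prediction2'], s['prediction']
        doubled.append(swapped)
    return doubled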
configs/eval_subjective_compassarena.py (new file, 95 lines)
@@ -0,0 +1,95 @@
from os import getenv as gv
from opencompass.models import HuggingFaceCausalLM
from mmengine.config import read_base
with read_base():
    from .models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
    from .models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
    from .datasets.subjective.compassarena.compassarena_compare import subjective_datasets

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.models.openai_api import OpenAIAllesAPIN
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import CompassArenaSummarizer

infer = dict(
    # partitioner=dict(type=NaivePartitioner),
    partitioner=dict(type=SizePartitioner, max_task_size=10000),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llm_dev2',
        quotatype='auto',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask)),
)

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

gpt4 = dict(
    abbr='gpt4-turbo',
    type=OpenAI, path='gpt-4-1106-preview',
    key='',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=2048,
    max_seq_len=4096,
    batch_size=4,
    retry=20,
    temperature=1,
)

models = [*chatglm3_6b_32k_model, *yi_6b_chat_model]
datasets = [*subjective_datasets]

work_dir = 'outputs/compass_arena/'

# ------------- Inference Stage ----------------------------------------

judge_model = dict(
    abbr='GPT4-Turbo',
    type=OpenAI, path='gpt-4-1106-preview',
    key='',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=1024,
    max_seq_len=4096,
    batch_size=2,
    retry=20,
    temperature=0,
)

# ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        strategy='split',
        max_task_size=10000,
        mode='m2n',
        base_models=[gpt4],
        compare_models=[*chatglm3_6b_32k_model, *yi_6b_chat_model],
    ),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llm_dev2',
        quotatype='auto',
        max_num_workers=32,
        task=dict(
            type=SubjectiveEvalTask,
            judge_cfg=judge_model
        )),
)

summarizer = dict(
    type=CompassArenaSummarizer
)
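For orientation, here is a small sketch of the model pairings that mode='m2n' with base_models=[gpt4] is expected to produce (the abbreviations are placeholders standing in for whatever the imported model configs define): every base model is paired with every compare model, and identical pairs are dropped.

# Illustrative only; the abbreviations below are placeholders for the imported model configs.
from itertools import product

base_abbrs = ['gpt4-turbo']
compare_abbrs = ['chatglm3-6b-32k', 'yi-6b-chat']
pairs = [(b, c) for b, c in product(base_abbrs, compare_abbrs) if b != c]
# pairs == [('gpt4-turbo', 'chatglm3-6b-32k'), ('gpt4-turbo', 'yi-6b-chat')]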
opencompass/datasets/subjective/__init__.py
@@ -1,4 +1,5 @@
 from .alignbench import AlignmentBenchDataset  # noqa: F401, F403
+from .compass_arena import CompassArenaDataset  # noqa: F401, F403
 from .corev2 import Corev2Dataset  # noqa: F401, F403
 from .creationbench import CreationBenchDataset  # noqa: F401, F403
 from .information_retrival import IRDataset  # noqa: F401, F403
opencompass/datasets/subjective/compass_arena.py (new file, 28 lines)
@@ -0,0 +1,28 @@
from datasets import Dataset

from opencompass.registry import LOAD_DATASET

from .subjective_cmp import SubjectiveCmpDataset


@LOAD_DATASET.register_module()
class CompassArenaDataset(SubjectiveCmpDataset):

    def load(
        self,
        path: str,
        name: str,
    ):
        dataset = list(super().load(path, name))
        creation_dataset = []
        for data in dataset:
            if 'reference' in data['others']:
                if data['others']['reference'] is not None:
                    data['ref'] = data['others']['reference']
                else:
                    data['ref'] = '满足用户需求,言之有理即可'
            else:
                data['ref'] = '满足用户需求,言之有理即可'
            creation_dataset.append(data)
        dataset = Dataset.from_list(creation_dataset)
        return dataset
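In plain terms, CompassArenaDataset.load copies others['reference'] into a ref column and, when no reference exists or it is None, falls back to the fixed Chinese instruction 满足用户需求,言之有理即可 (roughly: "any answer that meets the user's need and is reasonably argued is acceptable"). A hypothetical pair of samples to illustrate the two branches:

# Hypothetical samples, only to show how the 'ref' field ends up filled by load():
with_ref = {'question': '...', 'others': {'reference': '标准答案'}}
without_ref = {'question': '...', 'others': {}}
# After load(): with_ref['ref'] == '标准答案'
#               without_ref['ref'] == '满足用户需求,言之有理即可'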
opencompass/datasets/subjective/subjective_cmp.py
@@ -26,7 +26,8 @@ class SubjectiveCmpDataset(BaseDataset):
                     'capability': capability,
                     'others': others,
                     'judge': {
-                        'capability': capability
+                        'capability': capability,
+                        'question': question
                     }
                 })
         dataset = Dataset.from_list(raw_data)
opencompass/summarizers/subjective/__init__.py
@@ -1,5 +1,6 @@
 # flake8: noqa: F401, E501
 from .alignmentbench import AlignmentBenchSummarizer
+from .compass_arena import CompassArenaSummarizer
 from .corev2 import Corev2Summarizer
 from .creationbench import CreationBenchSummarizer
 from .information_retrival import IRSummarizer
opencompass/summarizers/subjective/compass_arena.py (new file, 174 lines)
@@ -0,0 +1,174 @@
# flake8: noqa: E501
import ast
import csv
import os
import os.path as osp
import re
from collections import defaultdict
from datetime import datetime
from itertools import product

import mmengine
from mmengine import ConfigDict
from prettytable import from_csv

from opencompass.partitioners.sub_naive import remove_duplicate_pairs
from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg

from .utils import get_judgeanswer_and_reference, get_outdir


def post_process_compass_arena(s):
    if result := re.findall('(?:选择:|Choice: )([ABC])', s):
        return result[0]
    else:
        return None


def check_position_bias(judged_answers, references, banned_choice=['C']):
    """Check position bias in the judge LLM's judgements.

    Args:
        judged_answers: The successfully extracted judgements.
        references: The references contain the original question, which is
            used to locate the two judgements of the same question under the
            two answer orderings.
    """
    position_bias_flag = 0
    position_bias_dict = {}
    for judge, ref in zip(judged_answers, references):
        question = ref['others']['question']
        question_hash = hash(question)
        if question_hash not in position_bias_dict:
            position_bias_dict[question_hash] = {
                'question': question,
                'judge': judge
            }
        else:
            first_judge = position_bias_dict[question_hash]['judge']
            if judge == first_judge and first_judge not in banned_choice and judge not in banned_choice:
                # If the second judgement (with the answer order swapped) picks
                # the same choice as the first one, the judge is position biased.
                position_bias_flag += 1
    return position_bias_flag


class CompassArenaSummarizer:
    """Do the subjective analysis based on evaluation results.

    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
    """

    def __init__(self, config: ConfigDict, judge_type='general') -> None:
        self.tasks = []
        self.cfg = config
        self.base_models = self.cfg['eval']['partitioner']['base_models']
        self.compare_models = self.cfg['eval']['partitioner']['compare_models']
        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
        self.judge_type = judge_type
        assert self.judge_type in ['general']
        self.judge_map = {
            'general': post_process_compass_arena,
        }
        self.judge_function = self.judge_map[self.judge_type]

    def summarize(self,
                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
                  check_pos_bias=True):
        """Summarize the subjective analysis based on evaluation results.

        Args:
            time_str (str): Timestamp for file naming.

        Returns:
            pd.DataFrame: The summary results.
        """
        dataset_cfgs = self.cfg['datasets']
        output_dir, results_folder = get_outdir(self.cfg, time_str)
        model_combinations = list(
            product(self.base_models, self.compare_models))
        unique_combinations = remove_duplicate_pairs(
            [combo for combo in model_combinations if combo[0] != combo[1]])
        fout_list = []
        for model_pair in unique_combinations:
            model1, model2, judge_model = model_pair[0]['abbr'], model_pair[1][
                'abbr'], self.judge_abbr
            subdir = model1 + '_' + model2 + '_judged-by--' + self.judge_abbr
            subdir_path = os.path.join(results_folder, subdir)
            if os.path.isdir(subdir_path):
                for dataset in dataset_cfgs:
                    dataset_abbr = dataset_abbr_from_cfg(dataset)
                    fout = osp.join(
                        output_dir, 'judged-by--' + judge_model + '-' +
                        dataset_abbr + '-report.csv')
                    fout_list.append(fout)
                    judged_answers, references = get_judgeanswer_and_reference(
                        dataset,
                        subdir_path,
                        self.judge_function,
                    )
                    if check_pos_bias:
                        bias_num = check_position_bias(judged_answers,
                                                       references)
                    else:
                        bias_num = 0
                    win_model1, win_model2, categories = defaultdict(
                        float), defaultdict(float), defaultdict(float)
                    model1, model2 = references[0]['answer1'], references[0][
                        'answer2']
                    for prediction, reference in zip(judged_answers,
                                                     references):
                        if dataset_abbr == 'zhihu_hot_0113':
                            reference['capability'] = 'QA'
                        categories['total'] += 1
                        categories[reference['capability']] += 1
                        if prediction == 'A':
                            if reference['answer1'] == model1:
                                win_model1[reference['capability']] += 1
                                win_model1['total'] += 1
                            else:
                                win_model2[reference['capability']] += 1
                                win_model2['total'] += 1
                        elif prediction == 'B':
                            if reference['answer1'] == model1:
                                win_model2[reference['capability']] += 1
                                win_model2['total'] += 1
                            else:
                                win_model1[reference['capability']] += 1
                                win_model1['total'] += 1
                    for capability in categories:
                        if capability not in win_model1:
                            win_model1[capability] = 0.0
                        else:
                            win_model1[capability] = round(
                                (win_model1[capability] /
                                 categories[capability]) * 100, 2)
                        if capability not in win_model2:
                            win_model2[capability] = 0.0
                        else:
                            win_model2[capability] = round(
                                (win_model2[capability] /
                                 categories[capability]) * 100, 2)
                    win_model1['position_bias'] = bias_num
                    win_model2['position_bias'] = bias_num
                    scores = {
                        'win_' + model1: win_model1,
                        'win_' + model2: win_model2
                    }
                    rows = list(scores.keys())
                    columns = list(scores[rows[0]].keys())
                    columns.insert(0, columns.pop(columns.index('total')))
                    columns.insert(1,
                                   columns.pop(columns.index('position_bias')))
                    with open(fout, 'a+', newline='') as csvfile:
                        writer = csv.writer(csvfile)
                        writer.writerow([model1 + '_vs_' + model2] + columns)
                        for row in rows:
                            writer.writerow(
                                [row] +
                                [scores[row][column] for column in columns])
            else:
                print(subdir_path + ' does not exist! Please check!')
        for fout in fout_list:
            with open(fout, 'r') as f:
                x = from_csv(f)
            print(x)
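A quick usage example for the two helpers above (the judgement strings are invented for illustration): post_process_compass_arena pulls the letter that follows 选择: or Choice:, and check_position_bias counts question pairs where the judge picked the same non-tie letter for both answer orderings.

assert post_process_compass_arena('选择:A\n原因:回答1更贴近参考答案') == 'A'
assert post_process_compass_arena('Choice: B\nReason: answer 2 is more fluent') == 'B'
assert post_process_compass_arena('无法判断') is None

# With infer_order='double', the same question appears twice in `references`,
# once per answer ordering.  Picking the same non-tie letter both times counts
# as one position-biased pair.
judged = ['A', 'A']
refs = [{'others': {'question': '1+1=?'}}, {'others': {'question': '1+1=?'}}]
assert check_position_bias(judged, refs) == 1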