mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Feature] Add other judgelm prompts for Alignbench (#731)
* add judgellm prompts
* add judgelm prompts
* update import info
* fix situation that no abbr in config
* fix situation that no abbr in config
* add summarizer for other judgellm
* change config name
* add maxlen
* add maxlen
* dict assert
* dict assert
* fix strings
* fix strings
This commit is contained in:
parent 54345c56b7
commit dfd9ac0fd9

.gitignore (vendored): 1 change
@@ -4,6 +4,7 @@ outputs/
 icl_inference_output/
 .vscode/
 tmp/
+configs/eval_subjective_alignbench_test.py
 configs/openai_key.py
 configs/secrets.py
 configs/datasets/log.json
@@ -0,0 +1,71 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import LMEvaluator
+from opencompass.datasets import AlignmentBenchDataset
+
+subjective_reader_cfg = dict(
+    input_columns=['question', 'capability', 'ref'],
+    output_column='judge',
+    )
+
+subjective_all_sets = [
+    "alignment_bench",
+]
+data_path ="data/subjective/alignment_bench"
+
+subjective_datasets = []
+
+for _name in subjective_all_sets:
+    subjective_infer_cfg = dict(
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(round=[
+                    dict(
+                        role='HUMAN',
+                        prompt="{question}"
+                    ),
+                ]),
+            ),
+            retriever=dict(type=ZeroRetriever),
+            inferencer=dict(type=GenInferencer, max_out_len=2048),
+        )
+
+    subjective_eval_cfg = dict(
+        evaluator=dict(
+            type=LMEvaluator,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(round=[
+                    dict(
+                        role='HUMAN',
+                        prompt = """为上传的针对给定用户问题的回应撰写评论, 并为该回复打分:
+
+[BEGIN DATA]
+***
+[用户问询]: {question}
+***
+[回应]: {prediction}
+***
+[参考答案]: {ref}
+***
+[END DATA]
+
+请根据参考答案为这个回应撰写评论. 在这之后, 你应该按照如下格式给这个回应一个最终的1-10范围的评分: "[[评分]]", 例如: "评分: [[5]]"."""
+                    ),
+                ]),
+            ),
+        ),
+        pred_role="BOT",
+    )
+
+    subjective_datasets.append(
+        dict(
+            abbr=f"{_name}",
+            type=AlignmentBenchDataset,
+            path=data_path,
+            name=_name,
+            reader_cfg=subjective_reader_cfg,
+            infer_cfg=subjective_infer_cfg,
+            eval_cfg=subjective_eval_cfg
+        ))
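Like the existing AlignBench config, the new file above only defines a subjective_datasets list, so an entry config pulls it in through read_base (the same pattern as the "with read_base():" hunk further below). A minimal sketch, with the dataset module path assumed for illustration since the diff viewer does not show the new file's name:

# Hypothetical entry config; the dataset module path below is assumed, not taken from this commit.
from mmengine.config import read_base

with read_base():
    from .datasets.subjective_alignbench.alignbench_judgeby_autoj import \
        subjective_datasets  # adjust to the actual file name of the config above

datasets = [*subjective_datasets]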
@@ -3,10 +3,9 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import AlignmentBenchDataset
-from mmengine.config import read_base
 
 subjective_reader_cfg = dict(
-    input_columns=['question', 'capability', 'prefix', 'suffix'],
+    input_columns=['question', 'capability', 'critiquellm_prefix'],
     output_column='judge',
     )
 
@@ -32,7 +31,7 @@ for _name in subjective_all_sets:
                 ]),
             ),
             retriever=dict(type=ZeroRetriever),
-            inferencer=dict(type=GenInferencer, max_out_len=1024),
+            inferencer=dict(type=GenInferencer, max_out_len=2048),
         )
 
     subjective_eval_cfg = dict(
@@ -43,7 +42,7 @@ for _name in subjective_all_sets:
                 template=dict(round=[
                     dict(
                         role='HUMAN',
-                        prompt = "{prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n"
+                        prompt = "{critiquellm_prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n"
                     ),
                 ]),
             ),
@@ -0,0 +1,59 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import LMEvaluator
+from opencompass.datasets import AlignmentBenchDataset
+
+subjective_reader_cfg = dict(
+    input_columns=['question', 'capability', 'ref'],
+    output_column='judge',
+    )
+
+subjective_all_sets = [
+    "alignment_bench",
+]
+data_path ="data/subjective/alignment_bench"
+
+subjective_datasets = []
+
+for _name in subjective_all_sets:
+    subjective_infer_cfg = dict(
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(round=[
+                    dict(
+                        role='HUMAN',
+                        prompt="{question}"
+                    ),
+                ]),
+            ),
+            retriever=dict(type=ZeroRetriever),
+            inferencer=dict(type=GenInferencer, max_out_len=2048),
+        )
+
+    subjective_eval_cfg = dict(
+        evaluator=dict(
+            type=LMEvaluator,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(round=[
+                    dict(
+                        role='HUMAN',
+                        prompt = """You are a helpful and precise assistant for checking the quality of the answer.\n[Question]\n{question}\n\n[The Start of Assistant 1's Answer]\n{ref}\n\n[The End of Assistant 1's Answer]\n\n[The Start of Assistant 2's Answer]\n{prediction}\n\n[The End of Assistant 2's Answer]\n\n[System]\nWe would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space. In the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.\n\n### Response:10"""
+                    ),
+                ]),
+            ),
+        ),
+        pred_role="BOT",
+    )
+
+    subjective_datasets.append(
+        dict(
+            abbr=f"{_name}",
+            type=AlignmentBenchDataset,
+            path=data_path,
+            name=_name,
+            reader_cfg=subjective_reader_cfg,
+            infer_cfg=subjective_infer_cfg,
+            eval_cfg=subjective_eval_cfg
+        ))
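This prompt asks the judge for two space-separated 1-10 scores (reference answer vs. the evaluated answer) and ends with "### Response:10", which appears to pre-fill the reference answer's score so the judge continues with the score for the evaluated answer. An illustrative check, with invented completions, of how post_process_judgelm (added to opencompass/summarizers/alignmentbench.py later in this commit) recovers that leading score:

from opencompass.summarizers.alignmentbench import post_process_judgelm

# Invented judge completions, not real model output.
print(post_process_judgelm('8\nAssistant 2 is accurate and well structured.'))  # {'score': 8}
print(post_process_judgelm('10'))                                               # {'score': 10}
print(post_process_judgelm('no score given'))                                   # None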
@@ -27,7 +27,7 @@ for _name in subjective_all_sets:
                 ]),
             ),
             retriever=dict(type=ZeroRetriever),
-            inferencer=dict(type=GenInferencer, max_out_len=1024),
+            inferencer=dict(type=GenInferencer, max_out_len=2048),
         )
 
     subjective_eval_cfg = dict(
@@ -30,7 +30,7 @@ for _name in subjective_all_sets:
                 ]),
             ),
             retriever=dict(type=ZeroRetriever),
-            inferencer=dict(type=GenInferencer, max_out_len=1024),
+            inferencer=dict(type=GenInferencer, max_out_len=2048),
         )
 
     subjective_eval_cfg = dict(
@@ -28,7 +28,7 @@ for _name in subjective_all_sets:
                 ]),
             ),
             retriever=dict(type=ZeroRetriever),
-            inferencer=dict(type=GenInferencer, max_out_len=1024),
+            inferencer=dict(type=GenInferencer, max_out_len=2048),
         )
 
     subjective_eval_cfg = dict(
@@ -7,7 +7,10 @@ with read_base():
     from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
     from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b
     from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b
-    from .datasets.subjective_cmp.alignment_bench import subjective_datasets
+    from .models.judge_llm.auto_j.hf_autoj_eng_13b import models as hf_autoj
+    from .models.judge_llm.judgelm.hf_judgelm_33b_v1 import models as hf_judgelm
+    from .models.judge_llm.pandalm.hf_pandalm_7b_v1 import models as hf_pandalm
+    from .datasets.subjective_alignbench.alignbench_judgeby_critiquellm import subjective_datasets
 
 datasets = [*subjective_datasets]
 
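The three judge-model lists imported above (hf_autoj, hf_judgelm, hf_pandalm) each contain a single HuggingFaceCausalLM config (see the model configs below). How the chosen judge is wired into the eval runner is outside this hunk; a hedged sketch continuing the config above, with the variable name judge_model assumed for illustration:

# Not shown in this diff: pick one judge list and use its single entry as the judge config.
# SubjectiveEvalTask (changed later in this commit) asserts that the judge_cfg it receives
# is a single ConfigDict rather than a list of judges.
judge_model = hf_judgelm[0]  # or hf_autoj[0] / hf_pandalm[0]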
@@ -7,8 +7,7 @@ and its Chinese translation, which can be find in
 https://huggingface.co/GAIR/autoj-bilingual-6b
 '''
 
-models = [
-    dict(
+models = [dict(
         type=HuggingFaceCausalLM,
         abbr='autoj-bilingual-6b',
         path="GAIR/autoj-bilingual-6b",
@@ -22,5 +21,4 @@ models = [
         batch_size=8,
         model_kwargs=dict(device_map='auto', trust_remote_code=True),
         run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+    )]
@@ -1,8 +1,7 @@
 from opencompass.models import HuggingFaceCausalLM
 
 
-models = [
-    dict(
+models = [dict(
         type=HuggingFaceCausalLM,
         abbr='autoj-13b-GPTQ-4bits',
         path="GAIR/autoj-13b-GPTQ-4bits",
@@ -16,5 +15,4 @@ models = [
         batch_size=8,
         model_kwargs=dict(device_map='auto', trust_remote_code=True),
         run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+    )]
@@ -6,8 +6,7 @@ which is available on huggingface-hub:
 https://huggingface.co/GAIR/autoj-13b-GPTQ-4bits
 '''
 
-models = [
-    dict(
+models = [dict(
         type=HuggingFaceCausalLM,
         abbr='autoj-13b',
         path="GAIR/autoj-13b",
@@ -21,5 +20,4 @@ models = [
         batch_size=8,
         model_kwargs=dict(device_map='auto', trust_remote_code=True),
         run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+    )]
@@ -1,8 +1,7 @@
 from opencompass.models import HuggingFaceCausalLM
 
 
-models = [
-    dict(
+models = [dict(
         type=HuggingFaceCausalLM,
         abbr='autoj-scenario-classifier',
         path="GAIR/autoj-scenario-classifier",
@@ -16,5 +15,4 @@ models = [
         batch_size=8,
         model_kwargs=dict(device_map='auto', trust_remote_code=True),
         run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+    )]
@@ -1,12 +1,11 @@
 from opencompass.models import HuggingFaceCausalLM
 
 
-models = [
-    dict(
+models = [dict(
         type=HuggingFaceCausalLM,
         abbr='judgelm-13b-v1-hf',
-        path="BAAI/JudgeLM-13b-v1.0",
-        tokenizer_path='BAAI/JudgeLM-13b-v1.0',
+        path="BAAI/JudgeLM-13B-v1.0",
+        tokenizer_path='BAAI/JudgeLM-13B-v1.0',
         tokenizer_kwargs=dict(padding_side='left',
                               truncation_side='left',
                               trust_remote_code=True,
@@ -16,5 +15,4 @@ models = [
         batch_size=8,
         model_kwargs=dict(device_map='auto', trust_remote_code=True),
         run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+    )]
@@ -1,12 +1,11 @@
 from opencompass.models import HuggingFaceCausalLM
 
 
-models = [
-    dict(
+models = [dict(
         type=HuggingFaceCausalLM,
         abbr='judgelm-33b-v1-hf',
-        path="BAAI/JudgeLM-33b-v1.0",
-        tokenizer_path='BAAI/JudgeLM-33b-v1.0',
+        path="BAAI/JudgeLM-33B-v1.0",
+        tokenizer_path='BAAI/JudgeLM-33B-v1.0',
         tokenizer_kwargs=dict(padding_side='left',
                               truncation_side='left',
                               trust_remote_code=True,
@@ -16,5 +15,4 @@ models = [
         batch_size=8,
         model_kwargs=dict(device_map='auto', trust_remote_code=True),
         run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+    )]
@@ -1,8 +1,7 @@
 from opencompass.models import HuggingFaceCausalLM
 
 
-models = [
-    dict(
+models = [dict(
         type=HuggingFaceCausalLM,
         abbr='judgelm-7b-v1-hf',
         path="BAAI/JudgeLM-7B-v1.0",
@@ -16,5 +15,4 @@ models = [
         batch_size=8,
         model_kwargs=dict(device_map='auto', trust_remote_code=True),
         run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+    )]
@@ -1,8 +1,7 @@
 from opencompass.models import HuggingFaceCausalLM
 
 
-models = [
-    dict(
+models = [dict(
         type=HuggingFaceCausalLM,
         abbr='alpaca-pandalm-7b-v1-hf',
         path="WeOpenML/PandaLM-Alpaca-7B-v1",
@@ -16,5 +15,4 @@ models = [
         batch_size=8,
         model_kwargs=dict(device_map='auto', trust_remote_code=True),
         run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+    )]
@@ -1,8 +1,7 @@
 from opencompass.models import HuggingFaceCausalLM
 
 
-models = [
-    dict(
+models = [dict(
         type=HuggingFaceCausalLM,
         abbr='pandalm-7b-v1-hf',
         path="WeOpenML/PandaLM-7B-v1",
@@ -16,5 +15,4 @@ models = [
         batch_size=8,
         model_kwargs=dict(device_map='auto', trust_remote_code=True),
         run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+    )]
@@ -2,6 +2,7 @@
 import json
 import os.path as osp
 import re
+from typing import Optional
 
 from datasets import Dataset, DatasetDict
 
@@ -83,16 +84,25 @@ def prompt_construct(sample, config: Config):
 @LOAD_DATASET.register_module()
 class AlignmentBenchDataset(SubjectiveCmpDataset):
 
-    def load(self, path: str, name: str, alignment_bench_config_path: str,
-             alignment_bench_config_name: str):
-        alignmentbenchconfig = Config(alignment_bench_config_path,
-                                      alignment_bench_config_name)
+    def load(self,
+             path: str,
+             name: str,
+             alignment_bench_config_path: Optional[str] = '',
+             alignment_bench_config_name: Optional[str] = ''):
+        if alignment_bench_config_path != '':
+            alignmentbench_config = Config(alignment_bench_config_path,
+                                           alignment_bench_config_name)
+        else:
+            alignmentbench_config = None
         dataset = list(super().load(path, name))
         corev2_dataset = []
         for data in dataset:
-            dimensions, prefix = prompt_construct(data, alignmentbenchconfig)
-            data['prefix'], data['suffix'] = prefix, ''
+            if alignmentbench_config:
+                dimensions, prefix = prompt_construct(data,
+                                                      alignmentbench_config)
+                data['critiquellm_prefix'] = prefix
             data['judge']['others'] = data['others']
+            data['ref'] = data['others']['reference']
             corev2_dataset.append(data)
         dataset = Dataset.from_list(corev2_dataset)
         return dataset
@@ -108,5 +118,5 @@ if __name__ == '__main__':
             'question_id': 1
         }
     }
-    prefix = prompt_construct(data, alignmentbenchconfig)
+    prefix = prompt_construct(data, alignmentbench_config)
     print(prefix)
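To make the field changes concrete, here is a hedged sketch of one record after load() (field names come from the code above, values are invented): ref feeds the {ref} placeholder in the new judge prompts, critiquellm_prefix is only set when an alignment_bench_config_path is supplied, and judge is what later shows up as the 'gold' entry in the results files.

# Invented values; field names match the code above.
sample = {
    'question': '计算 3 + 5 的值。',
    'capability': '数学计算',                  # one of the CATEGORIES keys used by the summarizers
    'others': {'reference': '8'},
    'ref': '8',                                # fills {ref} in the autoj/judgelm judge prompts
    'critiquellm_prefix': '...',               # fills {critiquellm_prefix}; only set with a CritiqueLLM config
    'judge': {'others': {'reference': '8'}},   # returned as 'gold' alongside each judged prediction
}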
@@ -1,4 +1,6 @@
-from .alignmentbench import AlignmentBenchSummarizer  # noqa: F401
+# flake8: noqa: F401, E501
+from .alignmentbench import (AlignmentBenchSummarizer, AutojSummarizer,
+                             JudgeLMSummarizer)
 from .circular import CircularSummarizer  # noqa: F401
 from .corev2 import Corev2Summarizer  # noqa: F401
 from .creationv01 import Creationv01Summarizer  # noqa: F401
@@ -6,7 +6,6 @@ import re
 from collections import defaultdict
 from datetime import datetime
 
-import mmengine
 import numpy as np
 from mmengine import ConfigDict
 
@@ -15,7 +14,9 @@ try:
 except ImportError:
     from_csv = None
 
-from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
+from opencompass.utils import model_abbr_from_cfg
+
+from .utils import get_judgeanswer_and_reference, get_outdir
 
 CATEGORIES = {
     '中文推理': ['数学计算', '逻辑推理'],
@@ -28,7 +29,12 @@ all_dimensions = [
 ]
 
 
-def post_process(judgment: str):
+def post_process(judgement: str):
+    """Input a string like below:
+
+    xxx{'事实正确性': 1, '满足用户需求': 1, '清晰度': 2, '完备性': 1, '综合得分': 1}xxx,
+    and extract each score
+    """
 
     def extract_rating(text):
         pattern = r'{(.*?)}(?![^{]*{)'  # match last brackets
@@ -61,13 +67,13 @@ def post_process(judgment: str):
             return None
         return rating
 
-    judgment = judgment.replace('\n', '')
-    rating = extract_rating(judgment)
+    judgement = judgement.replace('\n', '')
+    rating = extract_rating(judgement)
 
     if rating is not None:
         score = rating.get('综合得分', -1)
         if score == -1:
-            score = extract_score(judgment)
+            score = extract_score(judgement)
         if score >= 0 and score <= 10:
             pass
         else:
@@ -75,7 +81,127 @@ def post_process(judgment: str):
         rating = check_rating(rating)
     else:
         score = -1
-    return rating, score
+    if rating == None or score == -1:
+        return None
+    else:
+        return {'rating': rating, 'score': score}
+
+
+def post_process_autoj(judgement: str):
+    """Input a string like below:
+
+    xxx[[5]]xxx, and extract the score
+    """
+    pattern = r'\[(\d+)\]'
+    matched_result = re.findall(pattern, judgement)
+    if matched_result:
+        score = int(matched_result[0])
+    else:
+        return None
+    return {'score': score}
+
+
+def post_process_judgelm(judgement: str):
+    """Input a string like below:
+
+    5, reason:xxx and extract the score
+    """
+    if len(judgement) >= 2:
+        first_two_chars = judgement[:2]
+        if first_two_chars.isdigit() and first_two_chars == '10':
+            score = 10
+        else:
+            first_char = judgement[0]
+            if first_char.isdigit() and 0 <= int(first_char) <= 9:
+                score = int(first_char)
+            else:
+                return None
+    elif len(judgement) == 1:
+        if judgement.isdigit() and 0 <= int(judgement) <= 9:
+            score = int(judgement)
+        else:
+            return None
+    else:
+        return None
+    return {'score': score}
+
+
+def get_dimension_results(judged_answers, references, fout, fout_flag, model):
+    dimension_ratings = defaultdict(int)
+    dimension_counts = defaultdict(int)
+    for ans, ref in zip(judged_answers, references):
+        for k, v in ans['rating'].items():
+            if k != '综合得分':
+                dimension_ratings[k] += v
+                dimension_counts[k] += 1
+        dimension_ratings['综合得分'] += ans['score']
+        dimension_counts['综合得分'] += 1
+
+    dimension_avg_ratings = defaultdict(float)
+    for dimension, total_score in dimension_ratings.items():
+        dimension_avg_ratings[
+            dimension] = total_score / dimension_counts[dimension]
+
+    scores = {model: dimension_avg_ratings}
+    rows = list(scores.keys())
+    columns = list(scores[rows[0]].keys())
+    with open(fout, 'a+', newline='') as csvfile:
+        writer = csv.writer(csvfile)
+        if fout_flag == 0:
+            writer.writerow(['模型'] + columns)
+            fout_flag += 1
+        for row in rows:
+            writer.writerow([row] +
+                            [scores[row][column] for column in columns])
+
+
+def get_capability_results(judged_answers, references, fout, fout_flag, model):
+    capability_ratings = defaultdict(int)
+    capability_counts = defaultdict(int)
+    for ans, ref in zip(judged_answers, references):
+        capability_ratings[ref['capability']] += ans['score']
+        capability_counts[ref['capability']] += 1
+
+    capability_avg_ratings = defaultdict(float)
+
+    for capability, total_score in capability_ratings.items():
+        capability_avg_ratings[
+            capability] = total_score / capability_counts[capability]
+
+    capability_avg_ratings['中文推理总分'] = np.mean(
+        [np.mean(capability_avg_ratings[cat]) for cat in CATEGORIES['中文推理']])
+    capability_avg_ratings['中文语言总分'] = np.mean(
+        [np.mean(capability_avg_ratings[cat]) for cat in CATEGORIES['中文语言']])
+    capability_avg_ratings['总分'] = (capability_avg_ratings['中文推理总分'] +
+                                    capability_avg_ratings['中文语言总分']) / 2
+
+    scores = {model: capability_avg_ratings}
+    with open(fout, 'a+', newline='') as csvfile:
+        writer = csv.writer(csvfile)
+        if fout_flag == 0:
+            num_header = [str(i) for i in range(12)]
+            writer.writerow(num_header)
+
+            header = ['模型', '总分']
+            for category, sub_categories in CATEGORIES.items():
+                header.append(category)
+                header.extend([None for _ in range(len(sub_categories))])
+            writer.writerow(header)
+
+            sub_header = ['模型', '总分']
+            for category, sub_categories in CATEGORIES.items():
+                sub_header.extend([category + '总分'])
+                sub_header.extend(sub_categories)
+            writer.writerow(sub_header)
+            fout_flag += 1
+
+        row = [model]
+        row.append(scores[model]['总分'])
+        for category, sub_categories in CATEGORIES.items():
+            row.append(scores[model][category + '总分'])
+            for sub_category in sub_categories:
+                row.append(scores[model][sub_category])
+        writer.writerow(row)
 
 
 class AlignmentBenchSummarizer:
@@ -93,7 +219,7 @@ class AlignmentBenchSummarizer:
         self.eval_model_abbrs = [
             model_abbr_from_cfg(model) for model in self.eval_model_cfgs
         ]
-        self.judge_abbr = self.cfg['judge_model']['abbr']
+        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
 
     def summarize(self,
                   time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
@@ -105,18 +231,8 @@ class AlignmentBenchSummarizer:
         Returns:
             pd.DataFrame: The summary results.
         """
-
         dataset_cfgs = self.cfg['datasets']
-        work_dir = self.cfg['work_dir']
-        self.work_dir = work_dir
-
-        self.time_str = time_str
-        output_path = osp.join(self.work_dir, 'summary',
-                               f'summary_{self.time_str}.txt')
-        output_dir = osp.join(osp.split(output_path)[0], f'{self.time_str}')
-        mmengine.mkdir_or_exist(output_dir)
-        results_folder = osp.join(work_dir, 'results')
-
+        output_dir, results_folder = get_outdir(self.cfg, time_str)
         fout_flag, fout_flag2 = 0, 0
         for eval_model_abbr in self.eval_model_abbrs:
             subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
@@ -129,137 +245,12 @@ class AlignmentBenchSummarizer:
                     output_dir,
                     'judged-by--' + judge_model + '-capability.csv')
                 for dataset in dataset_cfgs:
-                    dataset_abbr = dataset_abbr_from_cfg(dataset)
-                    filename = os.path.join(subdir_path,
-                                            dataset_abbr + '.json')
-                    partial_filename = os.path.join(subdir_path,
-                                                    dataset_abbr + '_0.json')
-                    if osp.exists(osp.realpath(filename)):
-                        result = mmengine.load(filename)
-                    elif osp.exists(osp.realpath(partial_filename)):
-                        filename = partial_filename
-                        result = {}
-                        i = 1
-                        partial_dict_flag = 0
-                        while osp.exists(osp.realpath(filename)):
-                            res = mmengine.load(filename)
-                            for k, v in res.items():
-                                result[partial_dict_flag] = v
-                                partial_dict_flag += 1
-                            filename = os.path.join(
-                                subdir_path,
-                                dataset_abbr + '_' + str(i) + '.json')
-                            i += 1
-                    else:
-                        result = {}
-
-                    if len(result) == 0:
-                        print('*' * 100)
-                        print('There are no results for ' + filename + ' or ' +
-                              partial_filename)
-                        print('*' * 100)
-                    assert len(result > 0)
-
-                    judged_answers = []
-                    references = []
-                    for k, v in result.items():
-                        rating, score = post_process(v['prediction'])
-                        if rating is not None and score != -1:
-                            judged_answers.append({
-                                'rating': rating,
-                                'score': score
-                            })
-                            references.append(v['gold'])
-                    print(
-                        f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements.'
-                    )
-                    if len(judged_answers) == 0:
-                        print('*' * 100)
-                        print(
-                            'There are no extracted judgements, please change your judge model or check your prompt!!!'
-                        )
-                        print('*' * 100)
-                    assert len(judged_answers) > 0
-
-                    dimension_ratings = defaultdict(int)
-                    dimension_counts = defaultdict(int)
-                    capability_ratings = defaultdict(int)
-                    capability_counts = defaultdict(int)
-                    for ans, ref in zip(judged_answers, references):
-                        for k, v in ans['rating'].items():
-                            if k != '综合得分':
-                                dimension_ratings[k] += v
-                                dimension_counts[k] += 1
-                        dimension_ratings['综合得分'] += ans['score']
-                        dimension_counts['综合得分'] += 1
-                        capability_ratings[ref['capability']] += ans['score']
-                        capability_counts[ref['capability']] += 1
-
-                    dimension_avg_ratings = defaultdict(float)
-                    capability_avg_ratings = defaultdict(float)
-                    for dimension, total_score in dimension_ratings.items():
-                        dimension_avg_ratings[
-                            dimension] = total_score / dimension_counts[
-                                dimension]
-
-                    for capability, total_score in capability_ratings.items():
-                        capability_avg_ratings[
-                            capability] = total_score / capability_counts[
-                                capability]
-
-                    capability_avg_ratings['中文推理总分'] = np.mean([
-                        np.mean(capability_avg_ratings[cat])
-                        for cat in CATEGORIES['中文推理']
-                    ])
-                    capability_avg_ratings['中文语言总分'] = np.mean([
-                        np.mean(capability_avg_ratings[cat])
-                        for cat in CATEGORIES['中文语言']
-                    ])
-                    capability_avg_ratings['总分'] = (
-                        capability_avg_ratings['中文推理总分'] +
-                        capability_avg_ratings['中文语言总分']) / 2
-
-                    scores = {model: dimension_avg_ratings}
-                    rows = list(scores.keys())
-                    columns = list(scores[rows[0]].keys())
-                    with open(fout, 'a+', newline='') as csvfile:
-                        writer = csv.writer(csvfile)
-                        if fout_flag == 0:
-                            writer.writerow(['模型'] + columns)
-                            fout_flag += 1
-                        for row in rows:
-                            writer.writerow(
-                                [row] +
-                                [scores[row][column] for column in columns])
-
-                    scores = {model: capability_avg_ratings}
-                    with open(fout2, 'a+', newline='') as csvfile:
-                        writer = csv.writer(csvfile)
-                        if fout_flag2 == 0:
-                            num_header = [str(i) for i in range(12)]
-                            writer.writerow(num_header)
-
-                            header = ['模型', '总分']
-                            for category, sub_categories in CATEGORIES.items():
-                                header.append(category)
-                                header.extend(
-                                    [None for _ in range(len(sub_categories))])
-                            writer.writerow(header)
-
-                            sub_header = ['模型', '总分']
-                            for category, sub_categories in CATEGORIES.items():
-                                sub_header.extend([category + '总分'])
-                                sub_header.extend(sub_categories)
-                            writer.writerow(sub_header)
-                            fout_flag2 += 1
-
-                        row = [model]
-                        row.append(scores[model]['总分'])
-                        for category, sub_categories in CATEGORIES.items():
-                            row.append(scores[model][category + '总分'])
-                            for sub_category in sub_categories:
-                                row.append(scores[model][sub_category])
-                        writer.writerow(row)
+                    judged_answers, references = get_judgeanswer_and_reference(
+                        dataset, subdir_path, post_process)
+                    get_dimension_results(judged_answers, references, fout,
+                                          fout_flag, model)
+                    get_capability_results(judged_answers, references, fout2,
+                                           fout_flag2, model)
             else:
                 print(subdir_path + ' is not exist! please check!')
         with open(fout, 'r') as f:
@@ -268,3 +259,73 @@ class AlignmentBenchSummarizer:
         with open(fout2, 'r') as f:
             x = from_csv(f)
         print(x)
+
+
+class AutojSummarizer(AlignmentBenchSummarizer):
+    """Do the subjectivity analyze based on evaluation results.
+
+    Args:
+        config (ConfigDict): The configuration object of the evaluation task.
+            It's expected to be filled out at runtime.
+    """
+
+    def __init__(self, config: ConfigDict) -> None:
+        super().__init__(config)
+
+    def summarize(self,
+                  post_process=post_process_autoj,
+                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
+        """Summarize the subjectivity analysis based on evaluation results.
+
+        Args:
+            time_str (str): Timestamp for file naming.
+
+        Returns:
+            pd.DataFrame: The summary results.
+        """
+        dataset_cfgs = self.cfg['datasets']
+        output_dir, results_folder = get_outdir(self.cfg, time_str)
+        fout_flag = 0
+        for eval_model_abbr in self.eval_model_abbrs:
+            subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
+            subdir_path = os.path.join(results_folder, subdir)
+            if os.path.isdir(subdir_path):
+                model, judge_model = eval_model_abbr, self.judge_abbr
+                fout = osp.join(
+                    output_dir,
+                    'judged-by--' + judge_model + '-capability.csv')
+                for dataset in dataset_cfgs:
+                    judged_answers, references = get_judgeanswer_and_reference(
+                        dataset, subdir_path, post_process)
+                    get_capability_results(judged_answers, references, fout,
+                                           fout_flag, model)
+            else:
+                print(subdir_path + ' is not exist! please check!')
+        with open(fout, 'r') as f:
+            x = from_csv(f)
+        print(x)
+
+
+class JudgeLMSummarizer(AutojSummarizer):
+    """Do the subjectivity analyze based on evaluation results.
+
+    Args:
+        config (ConfigDict): The configuration object of the evaluation task.
+            It's expected to be filled out at runtime.
+    """
+
+    def __init__(self, config: ConfigDict) -> None:
+        super().__init__(config)
+
+    def summarize(self,
+                  post_process=post_process_judgelm,
+                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
+        """Summarize the subjectivity analysis based on evaluation results.
+
+        Args:
+            time_str (str): Timestamp for file naming.
+
+        Returns:
+            pd.DataFrame: The summary results.
+        """
+        super().summarize(post_process, time_str)
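A quick illustration of the extractors defined in this file; the inputs are invented strings in the formats the docstrings describe, not real judge output:

from opencompass.summarizers.alignmentbench import post_process, post_process_autoj

# CritiqueLLM-style judgement: a trailing dict of per-dimension scores.
print(post_process("评价如下{'事实正确性': 1, '满足用户需求': 1, '清晰度': 2, '完备性': 1, '综合得分': 1}"))
# expected: {'rating': {...}, 'score': 1}

# Auto-J-style judgement: the final rating wrapped in double brackets.
print(post_process_autoj('回答基本符合要求。评分: [[5]]'))   # expected: {'score': 5}
print(post_process_autoj('模型拒绝给出评分。'))              # expected: None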
@@ -16,7 +16,7 @@ except ImportError:
     from_csv = None
 
 from opencompass.partitioners.sub_naive import remove_duplicate_pairs
-from opencompass.utils import dataset_abbr_from_cfg
+from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
 
 
 def match_general_answer(s):
@@ -58,7 +58,7 @@ class Corev2Summarizer:
         self.match_method = match_method
         self.base_models = self.cfg['eval']['partitioner']['base_models']
         self.compare_models = self.cfg['eval']['partitioner']['compare_models']
-        self.judge_abbr = self.cfg['judge_model']['abbr']
+        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
 
     def summarize(self,
                   time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
opencompass/summarizers/utils.py (new file): 77 lines

@@ -0,0 +1,77 @@
+# flake8: noqa: E501
+import os.path as osp
+
+import mmengine
+
+from opencompass.utils import dataset_abbr_from_cfg
+
+
+def get_outdir(cfg, time_str):
+    """Get out put path.
+
+    Args:
+        cfg (ConfigDict): The running config.
+        time_str (str): Current time.
+    """
+    work_dir = cfg['work_dir']
+    output_path = osp.join(work_dir, 'summary', f'summary_{time_str}.txt')
+    output_dir = osp.join(osp.split(output_path)[0], f'{time_str}')
+    mmengine.mkdir_or_exist(output_dir)
+    results_folder = osp.join(work_dir, 'results')
+    return output_dir, results_folder
+
+
+def get_judgeanswer_and_reference(dataset, subdir_path, post_process):
+    """Extract judgements (scores) and references.
+
+    Args:
+        dataset (ConfigDict): Dataset config.
+        subdir_path (str): Model path in results dir.
+        post_process (function): The pre-defined extract function.
+    """
+    dataset_abbr = dataset_abbr_from_cfg(dataset)
+    filename = osp.join(subdir_path, dataset_abbr + '.json')
+    partial_filename = osp.join(subdir_path, dataset_abbr + '_0.json')
+    if osp.exists(osp.realpath(filename)):
+        result = mmengine.load(filename)
+    elif osp.exists(osp.realpath(partial_filename)):
+        filename = partial_filename
+        result = {}
+        i = 1
+        partial_dict_flag = 0
+        while osp.exists(osp.realpath(filename)):
+            res = mmengine.load(filename)
+            for k, v in res.items():
+                result[partial_dict_flag] = v
+                partial_dict_flag += 1
+            filename = osp.join(subdir_path,
+                                dataset_abbr + '_' + str(i) + '.json')
+            i += 1
+    else:
+        result = {}
+
+    if len(result) == 0:
+        print('*' * 100)
+        print('There are no results for ' + filename + ' or ' +
+              partial_filename)
+        print('*' * 100)
+    assert len(result) > 0
+
+    judged_answers = []
+    references = []
+    for k, v in result.items():
+        processed_judge = post_process(v['prediction'])
+        if processed_judge is not None:
+            judged_answers.append(processed_judge)
+            references.append(v['gold'])
+    print(
+        f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements.'
+    )
+    if len(judged_answers) == 0:
+        print('*' * 100)
+        print(
+            'There are no extracted judgements, please change your judge model or check your prompt!!!'
+        )
+        print('*' * 100)
+    assert len(judged_answers) > 0
+    return judged_answers, references
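For context on what get_judgeanswer_and_reference consumes: each results JSON maps an index to a dict with a 'prediction' (the raw judge output) and a 'gold' (the reader's output_column, i.e. the 'judge' dict). A hedged sketch with invented contents that mimics the extraction loop above:

from opencompass.summarizers.alignmentbench import post_process_autoj

# Invented stand-in for results/<model>_judged-by--<judge>/alignment_bench.json
example_result = {
    '0': {'prediction': '回答完整准确。评分: [[7]]',
          'gold': {'capability': '数学计算', 'others': {'reference': '8'}}},
    '1': {'prediction': '无法确定评分',  # extraction fails, so this item is skipped
          'gold': {'capability': '逻辑推理', 'others': {'reference': '...'}}},
}

judged_answers, references = [], []
for _, v in example_result.items():
    processed = post_process_autoj(v['prediction'])
    if processed is not None:
        judged_answers.append(processed)
        references.append(v['gold'])

print(judged_answers)   # [{'score': 7}]
print(references)       # [{'capability': '数学计算', 'others': {'reference': '8'}}]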
@@ -14,7 +14,7 @@ from opencompass.registry import ICL_EVALUATORS, MODELS, TEXT_POSTPROCESSORS
 from opencompass.tasks.base import BaseTask
 from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg,
                                get_infer_output_path, get_logger,
-                               task_abbr_from_cfg)
+                               model_abbr_from_cfg, task_abbr_from_cfg)
 
 
 class SubjectiveEvalTask(BaseTask):
@@ -35,6 +35,16 @@ class SubjectiveEvalTask(BaseTask):
         super().__init__(cfg)
         self.logger = get_logger()
         judge_cfg = cfg.eval.runner.task.get('judge_cfg', {})
+        if type(judge_cfg) != ConfigDict:
+            print('*' * 100)
+            print('Due to different Judge model needs different summarizer and'
+                  " prompts, we don't support multi judge model evaluation at "
+                  'one time, please do not use list to set your judge cfg, jus'
+                  't use a dict or list[0] should be fine. If you want to eval'
+                  'uation multi judge model in one script, we suggest you to u'
+                  'se a bash or bat script to start multi configs evaluation!')
+            print('*' * 100)
+            assert type(judge_cfg) == ConfigDict
         run_cfg = judge_cfg.get('run_cfg', {})
         self.num_gpus = run_cfg.get('num_gpus', 0)
         self.num_procs = run_cfg.get('num_procs', 1)
@@ -63,16 +73,14 @@ class SubjectiveEvalTask(BaseTask):
         # model_cfg can be a list of model configs
         for model_cfg, dataset_cfgs in zip(self.model_cfgs, self.dataset_cfgs):
             for dataset_cfg in dataset_cfgs:
-                # self.model_cfg = model_cfg
-                # self.dataset_cfg = dataset_cfg
-
                 # Load Dataset
                 eval_cfg = dataset_cfg.get('eval_cfg')
                 output_column = dataset_cfg['reader_cfg']['output_column']
                 if type(model_cfg) == ConfigDict:
                     model_cfg = (model_cfg, )
                 model_cfg += ({
-                    'abbr': 'judged-by--' + self.judge_cfg['abbr']
+                    'abbr':
+                    'judged-by--' + model_abbr_from_cfg(self.judge_cfg)
                 }, )
                 out_path = get_infer_output_path(
                     model_cfg, dataset_cfg, osp.join(self.work_dir, 'results'))
@@ -142,7 +150,10 @@ class SubjectiveEvalTask(BaseTask):
             kwargs = pred_postprocessor or eval_cfg['pred_postprocessor']
             proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
             pred_strs = [proc(s, **kwargs) for s in pred_strs]
-        return {'model_name': model_cfg['abbr'], 'model_preds': pred_strs}
+        return {
+            'model_name': model_abbr_from_cfg(model_cfg),
+            'model_preds': pred_strs
+        }
 
     def _score(self, model_cfg, dataset_cfg, eval_cfg, output_column):
         test_set = build_dataset_from_cfg(dataset_cfg).test
@@ -241,7 +252,10 @@ class SubjectiveEvalTask(BaseTask):
         for dataset in datasets:
             if type(model) == ConfigDict:
                 model = (model, )
-            model += ({'abbr': 'judged-by--' + self.judge_cfg['abbr']}, )
+            model += ({
+                'abbr':
+                'judged-by--' + model_abbr_from_cfg(self.judge_cfg)
+            }, )
             output_paths.append(
                 get_infer_output_path(
                     model, dataset,