mirror of https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00

[Doc] Update Subjective docs (#510)

* rename
* add en subdoc
* fix name
* fix writing
* update

Co-authored-by: Leymore <zfz-960727@163.com>

This commit is contained in:
parent e3d4901bed
commit b62842335d
```diff
@@ -2,21 +2,21 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
-from opencompass.datasets.subjectivity_cmp import SubjectivityCmpDataset
+from opencompass.datasets.subjective_cmp import SubjectiveCmpDataset

-subjectivity_reader_cfg = dict(
+subjective_reader_cfg = dict(
     input_columns=['question', 'index', 'reference_answer', 'evaluating_guidance', 'capability', 'prompt'],
     output_column=None,
     train_split='test')

-subjectivity_all_sets = [
-    "sub_test",
+subjective_all_sets = [
+    "subjective_demo",
 ]

-subjectivity_datasets = []
+subjective_datasets = []

-for _name in subjectivity_all_sets:
-    subjectivity_infer_cfg = dict(
+for _name in subjective_all_sets:
+    subjective_infer_cfg = dict(
         prompt_template=dict(
             type=PromptTemplate,
             template=dict(round=[
@@ -30,7 +30,7 @@ for _name in subjectivity_all_sets:
         inferencer=dict(type=GenInferencer),
     )

-    subjectivity_eval_cfg = dict(
+    subjective_eval_cfg = dict(
         evaluator=dict(
             type=LMEvaluator,
             cmp_order='both',
@@ -49,13 +49,13 @@ for _name in subjectivity_all_sets:
             pred_role="BOT",
         )

-    subjectivity_datasets.append(
+    subjective_datasets.append(
         dict(
             abbr=f"{_name}",
-            type=SubjectivityCmpDataset,
-            path="./data/subjectivity/",
+            type=SubjectiveCmpDataset,
+            path="./data/subjective/",
             name=_name,
-            reader_cfg=subjectivity_reader_cfg,
-            infer_cfg=subjectivity_infer_cfg,
-            eval_cfg=subjectivity_eval_cfg
+            reader_cfg=subjective_reader_cfg,
+            infer_cfg=subjective_infer_cfg,
+            eval_cfg=subjective_eval_cfg
         ))
```
```diff
@@ -1,9 +1,9 @@
 from mmengine.config import read_base
 with read_base():
-    from .datasets.subjectivity_cmp.subjectivity_cmp import subjectivity_datasets
+    from .datasets.subjective_cmp.subjective_cmp import subjective_datasets
     from .summarizers.subjective import summarizer

-datasets = [*subjectivity_datasets]
+datasets = [*subjective_datasets]

 from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI
 from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
@@ -35,22 +35,20 @@ models = [
         tokenizer_kwargs=dict(
             padding_side='left',
             truncation_side='left',
-            trust_remote_code=True,
-            revision='b1502f4f75c71499a3d566b14463edd62620ce9f'),
+            trust_remote_code=True),
         max_out_len=100,
         max_seq_len=2048,
         batch_size=8,
         model_kwargs=dict(
             trust_remote_code=True,
-            device_map='auto',
-            revision='b1502f4f75c71499a3d566b14463edd62620ce9f'),
+            device_map='auto'),
         run_cfg=dict(num_gpus=1, num_procs=1),
     ),
     dict(
         type=HuggingFaceCausalLM,
         abbr='qwen-7b-chat-hf',
-        path="/mnt/petrelfs/share_data/duanhaodong/Qwen-7B-Chat",
-        tokenizer_path='/mnt/petrelfs/share_data/duanhaodong/Qwen-7B-Chat',
+        path="Qwen/Qwen-7B-Chat",
+        tokenizer_path='Qwen/Qwen-7B-Chat',
         tokenizer_kwargs=dict(
             padding_side='left',
             truncation_side='left',
@@ -74,16 +72,14 @@ models = [
             padding_side='left',
             truncation_side='left',
             use_fast=False,
-            trust_remote_code=True,
-            revision="ed5e35564ac836710817c51e8e8d0a5d4ff03102"),
+            trust_remote_code=True),
         max_out_len=100,
         max_seq_len=2048,
         batch_size=8,
         meta_template=_meta_template2,
         model_kwargs=dict(
             trust_remote_code=True,
-            device_map='auto',
-            revision="ed5e35564ac836710817c51e8e8d0a5d4ff03102"),
+            device_map='auto'),
         run_cfg=dict(num_gpus=1, num_procs=1),
     )
 ]
```
docs/en/advanced_guides/subjective_evaluation.md (new file, 150 lines)

@@ -0,0 +1,150 @@
# Subjective Evaluation Guidance

## Introduction

Subjective evaluation aims to assess a model's performance on tasks that align with human preferences. The gold standard for such evaluation is human preference, but annotation is costly.

To explore a model's subjective capabilities, we employ a state-of-the-art LLM (GPT-4) as a substitute for human assessors ([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685)).

A popular evaluation method is to compare model responses pairwise and compute their win rate ([Chatbot Arena](https://chat.lmsys.org/)).

Based on this method, we support using GPT-4 for the subjective evaluation of models.

## Data Preparation

We provide a demo test set, [subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx), based on [z-bench](https://github.com/zhenbench/z-bench).

Store the set of subjective questions in .xlsx format in the `data/subjective/` directory.

The table includes the following fields (a minimal loading check is sketched below):

- 'question': Question description
- 'index': Question number
- 'reference_answer': Reference answer
- 'evaluating_guidance': Evaluation guidance
- 'capability': The capability dimension of the question
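A quick way to verify that a custom question set matches this layout is to load it and check the columns. This is only a sketch: it assumes `pandas` (with `openpyxl`) is installed and that the demo file has been saved as `data/subjective/subjective_demo.xlsx`.

```python
# Sanity-check a subjective question set before running the evaluation.
# Assumes: pip install pandas openpyxl, and the .xlsx file placed under data/subjective/.
import pandas as pd

df = pd.read_excel('data/subjective/subjective_demo.xlsx')
expected = ['question', 'index', 'reference_answer',
            'evaluating_guidance', 'capability']
missing = [col for col in expected if col not in df.columns]

print(df.head())
print('missing columns:', missing or 'none')
```
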
## Evaluation Configuration

The process consists of three steps:

1. Inference of model responses
2. Pairwise comparison by the GPT-4 evaluator
3. Generation of the evaluation report

For `config/subjective.py`, we provide annotations to help users understand the meaning of the configuration file.

```python
# Import datasets and the subjective evaluation summarizer
from mmengine.config import read_base
with read_base():
    from .datasets.subjective_cmp.subjective_cmp import subjective_datasets
    from .summarizers.subjective import summarizer

datasets = [*subjective_datasets]

from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI

# Import the partitioner and task required for subjective evaluation
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks.subjective_eval import SubjectiveEvalTask


# Define the model configurations for inference and evaluation
# Including the inference models chatglm2-6b, qwen-7b-chat, internlm-chat-7b, and the evaluation model gpt4
models = [...]

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True)
    ],
    reserved_roles=[
        dict(role='SYSTEM', api_role='SYSTEM'),
    ],
)

# Define the configuration for subjective evaluation
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='all',  # alternately constructs two comparisons for each pair
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=2,  # supports parallel comparisons
        task=dict(
            type=SubjectiveEvalTask,  # used to read the inputs of a pair of models
            judge_cfg=dict(
                abbr='GPT4',
                type=OpenAI,
                path='gpt-4-0613',
                key='ENV',
                meta_template=api_meta_template,
                query_per_second=1,
                max_out_len=2048,
                max_seq_len=2048,
                batch_size=2),
        )),
)
```
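The `models` list is elided above. For reference, a single inference-model entry might look like the sketch below, adapted from the `qwen-7b-chat-hf` entry in this commit's config diff; it assumes the `HuggingFaceCausalLM` import from the block above, and the exact fields may differ in your setup.

```python
# Sketch of one entry in the models list, based on the qwen-7b-chat-hf
# config shown earlier in this commit; adjust paths and batch sizes as needed.
models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='qwen-7b-chat-hf',
        path="Qwen/Qwen-7B-Chat",
        tokenizer_path='Qwen/Qwen-7B-Chat',
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto'),
        run_cfg=dict(num_gpus=1, num_procs=1),
    ),
]
```
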
## Launching the Evaluation

```shell
python run.py config/subjective.py -r
```

The `-r` parameter allows the reuse of model inference and GPT-4 evaluation results.
## Evaluation Report

The evaluation report is written to `output/.../summary/timestamp/report.md`, and includes win rate statistics, battle scores, and ELO ratings. The format is as follows:

```markdown
# Subjective Analysis

A total of 30 comparisons, of which 30 comparisons are meaningful (A / B answers inconsistent)
A total of 30 answer comparisons, successfully extracted 30 answers from GPT-4 replies, with an extraction success rate of 100.00%

### Basic statistics (4 stats: win / tie / lose / not bad)

| Dimension \ Stat [W / T / L / NB] | chatglm2-6b-hf                | qwen-7b-chat-hf              | internlm-chat-7b-hf           |
| --------------------------------- | ----------------------------- | ---------------------------- | ----------------------------- |
| LANG: Overall                     | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% |
| LANG: CN                          | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% |
| LANG: EN                          | N/A                           | N/A                          | N/A                           |
| CAPA: common                      | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% |




### Model scores (base score is 0, win +3, both +1, neither -1, lose -3)

| Dimension \ Score | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf |
| ----------------- | -------------- | --------------- | ------------------- |
| LANG: Overall     | -8             | 0               | -8                  |
| LANG: CN          | -8             | 0               | -8                  |
| LANG: EN          | N/A            | N/A             | N/A                 |
| CAPA: common      | -8             | 0               | -8                  |

### Bootstrap ELO, Median of n=1000 times

|                  | chatglm2-6b-hf | internlm-chat-7b-hf | qwen-7b-chat-hf |
| ---------------- | -------------- | ------------------- | --------------- |
| elo_score [Mean] | 999.504        | 999.912             | 1000.26         |
| elo_score [Std]  | 0.621362       | 0.400226            | 0.694434        |
```
When comparing the answers of models A and B, there are four choices:

1. A is better than B.
2. A and B are equally good.
3. A is worse than B.
4. Neither A nor B is good.

So `win` / `tie` / `lose` / `not bad` denote, respectively, the proportions of comparisons in which the model wins / ties / loses / wins or is equally good.
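One reading consistent with the tables above (the exact implementation may differ): `tie` counts both "equally good" and "neither is good", `not bad` counts "win" plus "equally good", and the score applies the +3 / +1 / -1 / -3 rule per comparison. A minimal sketch, with illustrative outcome labels:

```python
# Derive the four basic stats and the model score from per-comparison
# judge outcomes. The outcome labels are illustrative, not OpenCompass's own.
def summarize(outcomes):
    """outcomes: list of 'win', 'both_good', 'lose' or 'both_bad'."""
    n = len(outcomes)
    stats = {
        'win': sum(o == 'win' for o in outcomes) / n,
        'tie': sum(o in ('both_good', 'both_bad') for o in outcomes) / n,
        'lose': sum(o == 'lose' for o in outcomes) / n,
        'not bad': sum(o in ('win', 'both_good') for o in outcomes) / n,
    }
    # base score 0; win +3, both good +1, neither good -1, lose -3
    score = sum({'win': 3, 'both_good': 1, 'both_bad': -1, 'lose': -3}[o]
                for o in outcomes)
    return stats, score
```
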
`Bootstrap ELO` is the median ELO score obtained by replaying the match results in 1000 random orders.
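A minimal sketch of that bootstrap procedure is shown below. The K-factor, base rating, and data layout are illustrative assumptions rather than the exact OpenCompass implementation.

```python
# Bootstrap ELO sketch: replay the pairwise results in many random orders
# and report the median rating per model.
import random
import statistics

def elo_once(matches, k=32, base=1000.0):
    """matches: list of (model_a, model_b, winner); winner is a model name or 'tie'."""
    ratings = {}
    for model_a, model_b, winner in matches:
        ra = ratings.setdefault(model_a, base)
        rb = ratings.setdefault(model_b, base)
        expected_a = 1.0 / (1.0 + 10 ** ((rb - ra) / 400.0))
        score_a = 1.0 if winner == model_a else (0.5 if winner == 'tie' else 0.0)
        ratings[model_a] = ra + k * (score_a - expected_a)
        ratings[model_b] = rb + k * ((1.0 - score_a) - (1.0 - expected_a))
    return ratings

def bootstrap_elo(matches, n=1000):
    samples = []
    for _ in range(n):
        shuffled = list(matches)
        random.shuffle(shuffled)
        samples.append(elo_once(shuffled))
    return {m: statistics.median(s[m] for s in samples) for m in samples[0]}
```
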
```diff
@@ -64,6 +64,7 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
    advanced_guides/multimodal_eval.md
    advanced_guides/prompt_attack.md
    advanced_guides/longeval.md
+   advanced_guides/subjective_evaluation.md

 .. _Tools:
 .. toctree::
```
docs/zh_cn/advanced_guides/subjective_evaluation.md (new file, 149 lines)

@@ -0,0 +1,149 @@
# Subjective Evaluation Guidance

## Introduction

Subjective evaluation aims to assess a model's performance on abilities that align with human preferences. The gold standard for such evaluation is human preference, but annotation is very costly.

To explore a model's subjective capabilities, we employ a state-of-the-art LLM (GPT-4) as a substitute for human assessors ([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685)).

A popular evaluation method is to compare model responses pairwise and compute their win rate ([Chatbot Arena](https://chat.lmsys.org/)).

Based on this method, we support using GPT-4 for the subjective evaluation of models.

## Data Preparation

We provide a demo test set based on [z-bench](https://github.com/zhenbench/z-bench): [subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx).

Store the set of subjective questions in .xlsx format under `data/subjective/`.

The table includes the following fields:

- 'question': Question description
- 'index': Question number
- 'reference_answer': Reference answer
- 'evaluating_guidance': Evaluation guidance
- 'capability': The capability dimension of the question

## Evaluation Configuration

The process consists of three steps:

1. Inference of model responses
2. Pairwise comparison by the GPT-4 evaluator
3. Generation of the evaluation report

For `config/subjective.py`, we provide annotations to help users understand the meaning of the configuration file.

```python
# Import datasets and the subjective evaluation summarizer
from mmengine.config import read_base
with read_base():
    from .datasets.subjective_cmp.subjective_cmp import subjective_datasets
    from .summarizers.subjective import summarizer

datasets = [*subjective_datasets]

from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI

# Import the partitioner and task required for subjective evaluation
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks.subjective_eval import SubjectiveEvalTask


# Define the model configurations required for inference and evaluation
# Including the inference models chatglm2-6b, qwen-7b-chat, internlm-chat-7b, and the evaluation model gpt4
models = [...]

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True)
    ],
    reserved_roles=[
        dict(role='SYSTEM', api_role='SYSTEM'),
    ],
)

# Define the configuration for subjective evaluation
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='all',  # new parameter: alternately constructs two comparisons for each pair
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=2,  # supports parallel comparisons
        task=dict(
            type=SubjectiveEvalTask,  # new task, used to read the inputs of a pair of models
            judge_cfg=dict(
                abbr='GPT4',
                type=OpenAI,
                path='gpt-4-0613',
                key='ENV',
                meta_template=api_meta_template,
                query_per_second=1,
                max_out_len=2048,
                max_seq_len=2048,
                batch_size=2),
        )),
)
```

## Launching the Evaluation

```shell
python run.py config/subjective.py -r
```

The `-r` parameter allows the reuse of model inference and GPT-4 evaluation results.

## Evaluation Report

The evaluation report is written to `output/.../summary/timestamp/report.md`, and includes win rate statistics, battle scores, and ELO ratings. The format is as follows:

```markdown
# Subjective Analysis

A total of 30 comparisons, of which 30 comparisons are meaningful (A / B answers inconsistent)
A total of 30 answer comparisons, successfully extracted 30 answers from GPT-4 replies, with an extraction success rate of 100.00%

### Basic statistics (4 stats: win / tie / lose / not bad)

| Dimension \ Stat [W / T / L / NB] | chatglm2-6b-hf                | qwen-7b-chat-hf              | internlm-chat-7b-hf           |
| --------------------------------- | ----------------------------- | ---------------------------- | ----------------------------- |
| LANG: Overall                     | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% |
| LANG: CN                          | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% |
| LANG: EN                          | N/A                           | N/A                          | N/A                           |
| CAPA: common                      | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% |




### Model scores (base score is 0, win +3, both +1, neither -1, lose -3)

| Dimension \ Score | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf |
| ----------------- | -------------- | --------------- | ------------------- |
| LANG: Overall     | -8             | 0               | -8                  |
| LANG: CN          | -8             | 0               | -8                  |
| LANG: EN          | N/A            | N/A             | N/A                 |
| CAPA: common      | -8             | 0               | -8                  |

### Bootstrap ELO, Median of n=1000 times

|                  | chatglm2-6b-hf | internlm-chat-7b-hf | qwen-7b-chat-hf |
| ---------------- | -------------- | ------------------- | --------------- |
| elo_score [Mean] | 999.504        | 999.912             | 1000.26         |
| elo_score [Std]  | 0.621362       | 0.400226            | 0.694434        |
```

When comparing the answers of models A and B, there are four choices:

1. A is better than B.
2. A and B are equally good.
3. A is worse than B.
4. Neither A nor B is good.

So `win` / `tie` / `lose` / `not bad` denote, respectively, the proportions of comparisons in which the model wins / ties / loses / wins or is equally good.
`Bootstrap ELO` is the median ELO score obtained by replaying the match results in 1000 random orders.
```diff
@@ -64,6 +64,7 @@ OpenCompass 上手路线
    advanced_guides/multimodal_eval.md
    advanced_guides/prompt_attack.md
    advanced_guides/longeval.md
+   advanced_guides/subjective_evaluation.md

 .. _工具:
 .. toctree::
```
```diff
@@ -187,20 +187,20 @@ learns valuable lessons, and discovers his own heroism.

 examples = [example1, example2, example3, example4]

-subjectivity_reader_cfg = dict(input_columns=[
+subjective_reader_cfg = dict(input_columns=[
     'question', 'index', 'reference_answer', 'evaluating_guidance',
     'capability'
 ],
-                               output_column=None,
-                               train_split='test')
+                              output_column=None,
+                              train_split='test')

-subjectivity_all_sets = [
+subjective_all_sets = [
     'sub_test',
 ]


 @LOAD_DATASET.register_module()
-class SubjectivityCmpDataset(BaseDataset):
+class SubjectiveCmpDataset(BaseDataset):

     @staticmethod
     def load(path: str, name: str):
```