mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
Merge branch 'open-compass:main' into main
This commit is contained in:
commit
c4aa7825ad
@ -1,7 +1,7 @@
|
|||||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
from opencompass.openicl.icl_evaluator import RougeEvaluator
|
from opencompass.openicl.icl_evaluator import JiebaRougeEvaluator
|
||||||
from opencompass.datasets.rolebench import InstructionGeneralizationChineseDataset
|
from opencompass.datasets.rolebench import InstructionGeneralizationChineseDataset
|
||||||
|
|
||||||
instruction_generalization_zh_reader_cfg = dict(
|
instruction_generalization_zh_reader_cfg = dict(
|
||||||
@ -27,7 +27,7 @@ instruction_generalization_zh_infer_cfg = dict(
|
|||||||
)
|
)
|
||||||
|
|
||||||
instruction_generalization_zh_eval_cfg = dict(
|
instruction_generalization_zh_eval_cfg = dict(
|
||||||
evaluator=dict(type=RougeEvaluator),
|
evaluator=dict(type=JiebaRougeEvaluator),
|
||||||
pred_role='BOT'
|
pred_role='BOT'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -26,7 +26,7 @@ for _name in subjective_all_sets:
|
|||||||
template="""{dialogue}""",
|
template="""{dialogue}""",
|
||||||
),
|
),
|
||||||
retriever=dict(type=ZeroRetriever),
|
retriever=dict(type=ZeroRetriever),
|
||||||
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=512, temperature=temperature, do_sample=do_sample,infer_mode='every'),
|
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=1024, temperature=temperature, do_sample=do_sample,infer_mode='every'),
|
||||||
)
|
)
|
||||||
|
|
||||||
subjective_eval_cfg = dict(
|
subjective_eval_cfg = dict(
|
||||||
|
@ -119,3 +119,10 @@ If you have already downloaded the checkpoints of the model, you can specify the l
|
|||||||
```bash
|
```bash
|
||||||
python run.py --datasets siqa_gen winograd_ppl --hf-type base --hf-path /path/to/model
|
python run.py --datasets siqa_gen winograd_ppl --hf-type base --hf-path /path/to/model
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Dataset
|
||||||
|
|
||||||
|
### How to build a new dataset?
|
||||||
|
|
||||||
|
- To build a new objective dataset, see [new_dataset](../advanced_guides/new_dataset.md)
|
||||||
|
- To build a new subjective dataset, see [subjective_evaluation](../advanced_guides/subjective_evaluation.md)
|
||||||
|
@ -119,3 +119,10 @@ OpenCompass 中的每个任务代表等待评估的特定模型和数据集部
|
|||||||
```bash
|
```bash
|
||||||
python run.py --datasets siqa_gen winograd_ppl --hf-type base --hf-path /path/to/model
|
python run.py --datasets siqa_gen winograd_ppl --hf-type base --hf-path /path/to/model
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## 数据集
|
||||||
|
|
||||||
|
### 如何构建自己的评测数据集
|
||||||
|
|
||||||
|
- 客观数据集构建参见:[支持新数据集](../advanced_guides/new_dataset.md)
|
||||||
|
- 主观数据集构建参见:[主观评测指引](../advanced_guides/subjective_evaluation.md)
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
# flake8: noqa
|
# flake8: noqa
|
||||||
# yapf: disable
|
# yapf: disable
|
||||||
import argparse
|
import argparse
|
||||||
|
import copy
|
||||||
import getpass
|
import getpass
|
||||||
import os
|
import os
|
||||||
import os.path as osp
|
import os.path as osp
|
||||||
|
@ -369,6 +369,9 @@ class AlignmentBenchSummarizer:
|
|||||||
if os.path.isdir(subdir_path):
|
if os.path.isdir(subdir_path):
|
||||||
judged_answers, references = get_judgeanswer_and_reference(
|
judged_answers, references = get_judgeanswer_and_reference(
|
||||||
dataset, subdir_path, self.judge_function)
|
dataset, subdir_path, self.judge_function)
|
||||||
|
if len(judged_answers) == 0:
|
||||||
|
score_by_judgemodel[model] = None
|
||||||
|
continue
|
||||||
if self.judge_type == 'general':
|
if self.judge_type == 'general':
|
||||||
get_dimension_results(judged_answers, references, fout,
|
get_dimension_results(judged_answers, references, fout,
|
||||||
fout_flag, model)
|
fout_flag, model)
|
||||||
|
Loading…
Reference in New Issue
Block a user