diff --git a/configs/datasets/rolebench/instruction_generalization_zh.py b/configs/datasets/rolebench/instruction_generalization_zh.py
index 646aa7a1..5f53e98d 100644
--- a/configs/datasets/rolebench/instruction_generalization_zh.py
+++ b/configs/datasets/rolebench/instruction_generalization_zh.py
@@ -1,7 +1,7 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import RougeEvaluator
+from opencompass.openicl.icl_evaluator import JiebaRougeEvaluator
 from opencompass.datasets.rolebench import InstructionGeneralizationChineseDataset
 
 instruction_generalization_zh_reader_cfg = dict(
@@ -27,7 +27,7 @@ instruction_generalization_zh_infer_cfg = dict(
 )
 
 instruction_generalization_zh_eval_cfg = dict(
-    evaluator=dict(type=RougeEvaluator),
+    evaluator=dict(type=JiebaRougeEvaluator),
     pred_role='BOT'
 )
 
diff --git a/configs/datasets/subjective/multiround/mtbench_single_judge_diff_temp.py b/configs/datasets/subjective/multiround/mtbench_single_judge_diff_temp.py
index f5563f20..57ee2750 100644
--- a/configs/datasets/subjective/multiround/mtbench_single_judge_diff_temp.py
+++ b/configs/datasets/subjective/multiround/mtbench_single_judge_diff_temp.py
@@ -26,7 +26,7 @@ for _name in subjective_all_sets:
             template="""{dialogue}""",
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=512, temperature=temperature, do_sample=do_sample,infer_mode='every'),
+        inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=1024, temperature=temperature, do_sample=do_sample,infer_mode='every'),
     )
 
     subjective_eval_cfg = dict(
diff --git a/docs/en/get_started/faq.md b/docs/en/get_started/faq.md
index 570605c7..5d5117e1 100644
--- a/docs/en/get_started/faq.md
+++ b/docs/en/get_started/faq.md
@@ -119,3 +119,10 @@ If you have already download the checkpoints of the model, you can specify the l
 ```bash
 python run.py --datasets siqa_gen winograd_ppl --hf-type base --hf-path /path/to/model
 ```
+
+## Dataset
+
+### How to build a new dataset?
+
+- To build a new objective dataset, see [new_dataset](../advanced_guides/new_dataset.md)
+- To build a new subjective dataset, see [subjective_evaluation](../advanced_guides/subjective_evaluation.md)
diff --git a/docs/zh_cn/get_started/faq.md b/docs/zh_cn/get_started/faq.md
index 44e0f9ea..30d34676 100644
--- a/docs/zh_cn/get_started/faq.md
+++ b/docs/zh_cn/get_started/faq.md
@@ -119,3 +119,10 @@ OpenCompass 中的每个任务代表等待评估的特定模型和数据集部
 ```bash
 python run.py --datasets siqa_gen winograd_ppl --hf-type base --hf-path /path/to/model
 ```
+
+## 数据集
+
+### 如何构建自己的评测数据集
+
+- 客观数据集构建参见：[支持新数据集](../advanced_guides/new_dataset.md)
+- 主观数据集构建参见：[主观评测指引](../advanced_guides/subjective_evaluation.md)
diff --git a/opencompass/cli/main.py b/opencompass/cli/main.py
index bb47221e..d1539d5c 100644
--- a/opencompass/cli/main.py
+++ b/opencompass/cli/main.py
@@ -1,6 +1,7 @@
 # flake8: noqa
 # yapf: disable
 import argparse
+import copy
 import getpass
 import os
 import os.path as osp
diff --git a/opencompass/summarizers/subjective/alignmentbench.py b/opencompass/summarizers/subjective/alignmentbench.py
index 48fc7111..ce357d89 100644
--- a/opencompass/summarizers/subjective/alignmentbench.py
+++ b/opencompass/summarizers/subjective/alignmentbench.py
@@ -369,6 +369,9 @@ class AlignmentBenchSummarizer:
             if os.path.isdir(subdir_path):
                 judged_answers, references = get_judgeanswer_and_reference(
                     dataset, subdir_path, self.judge_function)
+                if len(judged_answers) == 0:
+                    score_by_judgemodel[model] = None
+                    continue
                 if self.judge_type == 'general':
                     get_dimension_results(judged_answers, references, fout,
                                           fout_flag, model)