diff --git a/opencompass/configs/datasets/SciKnowEval/SciKnowEval_gen_ebe47d.py b/opencompass/configs/datasets/SciKnowEval/SciKnowEval_gen_ebe47d.py
index 70fd48f3..4ede75f8 100644
--- a/opencompass/configs/datasets/SciKnowEval/SciKnowEval_gen_ebe47d.py
+++ b/opencompass/configs/datasets/SciKnowEval/SciKnowEval_gen_ebe47d.py
@@ -3,6 +3,7 @@ from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 
+
 ZERO_SHOT_PROMPT = '{q4}'
 
 # Reader configuration
diff --git a/opencompass/configs/datasets/SciKnowEval/SciKnowEval_llmjudge_gen_ebe47d.py b/opencompass/configs/datasets/SciKnowEval/SciKnowEval_llmjudge_gen_ebe47d.py
index feb699e1..a681efe9 100644
--- a/opencompass/configs/datasets/SciKnowEval/SciKnowEval_llmjudge_gen_ebe47d.py
+++ b/opencompass/configs/datasets/SciKnowEval/SciKnowEval_llmjudge_gen_ebe47d.py
@@ -6,6 +6,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.evaluator import GenericLLMEvaluator
 
 ZERO_SHOT_PROMPT = '{q4}'
+
 GRADER_TEMPLATE = """
 Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
 
diff --git a/opencompass/datasets/SciKnowEval.py b/opencompass/datasets/SciKnowEval.py
index 867f2c7a..d9635d96 100644
--- a/opencompass/datasets/SciKnowEval.py
+++ b/opencompass/datasets/SciKnowEval.py
@@ -1,23 +1,22 @@
 import re
 
-from datasets import Dataset, load_dataset
+from datasets import load_dataset
 
 from opencompass.openicl import BaseEvaluator
 from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
-from opencompass.utils import get_logger
 
 from .base import BaseDataset
 
 
 def _parse(item, prompt_mode, discipline):
     choices = item['choices']
-    item['q4'] = f'You are an expert in {discipline}.\n' + item['prompt'][
-        'default'] + '\n' + item['question'] + '\n' + '\n'.join([
-            f'{l}. {t}' for l, t in zip(choices['label'], choices['text'])
-        ])  # noqa: E501, E741, E741
-    item['start'] = chr(65)
-    item['end'] = chr(65 + len(item.get('choices', {'label': []})['label']) -
-                      1)
+
+    item['q4'] = f'You are an expert in {discipline}.\n'
+    item['q4'] += item['prompt']['default'] + '\n' + item['question'] + '\n'
+    label_texts = []
+    for label_meta, text_meta in zip(choices['label'], choices['text']):
+        label_texts.append(f'{label_meta}. {text_meta}')
+    item['q4'] += '\n'.join(label_texts)  # noqa: E501, E741, E741
     item['prompt_mode'] = prompt_mode
     return item
 
@@ -34,10 +33,10 @@ class SciKnowEvalDataset(BaseDataset):
             return s[0].upper() + s[1:]
 
         subset = kwargs['subset']
-        data_files = {
-            'test':
-            f'data/{capitalize_first_letter(subset)}/sciknoweval_{subset}_test.jsonl'
-        }
+        data_files = {}
+        test_file = f'data/{capitalize_first_letter(subset)}/'
+        test_file += f'sciknoweval_{subset}_test.jsonl'
+        data_files['test'] = test_file
         dataset = load_dataset(path, data_files=data_files, split='test')
         # dataset = dataset.select(range(20))
         if prompt_mode == 'zero-shot':