From b69fe2343bb0dc5663fcbf96c89ada68ec232ed8 Mon Sep 17 00:00:00 2001
From: Francis-llgg <102740968+Francis-llgg@users.noreply.github.com>
Date: Mon, 1 Jan 2024 15:54:40 +0800
Subject: [PATCH] [Feature] Add GPQA Dataset (#729)

* check

* message

* add

* change prompt

* change a para nameq

* modify name of the file

* delete an useless file
---
 configs/datasets/gpqa/gpqa_gen.py        |  4 ++
 configs/datasets/gpqa/gpqa_gen_a27c4d.py | 46 ++++++++++++
 opencompass/datasets/__init__.py         |  1 +
 opencompass/datasets/gpqa.py             | 90 ++++++++++++++++++++++++
 4 files changed, 141 insertions(+)
 create mode 100644 configs/datasets/gpqa/gpqa_gen.py
 create mode 100644 configs/datasets/gpqa/gpqa_gen_a27c4d.py
 create mode 100644 opencompass/datasets/gpqa.py

diff --git a/configs/datasets/gpqa/gpqa_gen.py b/configs/datasets/gpqa/gpqa_gen.py
new file mode 100644
index 00000000..2f132824
--- /dev/null
+++ b/configs/datasets/gpqa/gpqa_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .gpqa_gen_a27c4d import gpqa_datasets
diff --git a/configs/datasets/gpqa/gpqa_gen_a27c4d.py b/configs/datasets/gpqa/gpqa_gen_a27c4d.py
new file mode 100644
index 00000000..51d6c52e
--- /dev/null
+++ b/configs/datasets/gpqa/gpqa_gen_a27c4d.py
@@ -0,0 +1,46 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import GPQADataset, GPQAEvaluator
+from opencompass.utils import first_option_postprocess
+
+gpqa_reader_cfg = dict(
+    input_columns=['question', 'A', 'B', 'C', 'D'],
+    output_column='answer')
+
+gpqa_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='{question}\nChoices:\n'
+                                          '(A){A}\n'
+                                          '(B){B}\n'
+                                          '(C){C}\n'
+                                          '(D){D}\n'
+                                          'Format your response as follows: "The correct answer is (insert answer here)"'),
+            ], )),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+gpqa_eval_cfg = dict(evaluator=dict(type=GPQAEvaluator),
+                     pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
+
+gpqa_datasets = []
+gpqa_subsets = {
+    'extended': 'gpqa_extended.csv',
+    'main': 'gpqa_main.csv',
+    'diamond': 'gpqa_diamond.csv'
+}
+
+for split in list(gpqa_subsets.keys()):
+    gpqa_datasets.append(
+        dict(
+            abbr='GPQA_' + split,
+            type=GPQADataset,
+            path='./data/gpqa/',
+            name=gpqa_subsets[split],
+            reader_cfg=gpqa_reader_cfg,
+            infer_cfg=gpqa_infer_cfg,
+            eval_cfg=gpqa_eval_cfg)
+    )
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index 8cbb17ae..27b32d1a 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -39,6 +39,7 @@ from .flores import *  # noqa: F401, F403
 from .game24 import *  # noqa: F401, F403
 from .GaokaoBench import *  # noqa: F401, F403
 from .govrepcrs import *  # noqa: F401, F403
+from .gpqa import *  # noqa: F401, F403
 from .gsm8k import *  # noqa: F401, F403
 from .gsm_hard import *  # noqa: F401, F403
 from .hellaswag import *  # noqa: F401, F403
diff --git a/opencompass/datasets/gpqa.py b/opencompass/datasets/gpqa.py
new file mode 100644
index 00000000..1c039647
--- /dev/null
+++ b/opencompass/datasets/gpqa.py
@@ -0,0 +1,90 @@
+import csv
+import os
+
+from datasets import Dataset
+
+from opencompass.openicl import BaseEvaluator
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+# Option labels, in the order csv columns 8-11 are read.
+_OPTION_LABELS = 'ABCD'
+# Rotations of the option order; data row n (1-based) uses entry n % 4.
+_CIRCULAR_PATTERNS = ('ABCD', 'BCDA', 'CDAB', 'DABC')
+
+
+@LOAD_DATASET.register_module()
+class GPQADataset(BaseDataset):
+    """Load a GPQA csv file as a four-option multiple-choice dataset."""
+
+    @staticmethod
+    def load(path: str, name: str):
+        """Read the csv file ``name`` under ``path`` into a dataset.
+
+        Column 7 of a row is used as the question, column 8 as the correct
+        answer and columns 9-11 as the remaining options. The options of
+        data row n (1-based, header excluded) are rotated according to
+        ``_CIRCULAR_PATTERNS[n % 4]`` and ``answer`` is set to the label
+        the correct option ends up under.
+
+        Args:
+            path (str): Directory containing the csv file.
+            name (str): File name of the csv file.
+
+        Returns:
+            Dataset: One record per data row with the keys ``question``,
+                ``A``, ``B``, ``C``, ``D``, ``options`` (the unrotated
+                option list) and ``answer``.
+        """
+        records = []
+        # newline='' is what the csv module asks for, so that line breaks
+        # inside quoted fields are handled by the csv parser.
+        with open(os.path.join(path, name), 'r', encoding='utf-8',
+                  newline='') as f:
+            for row in csv.reader(f, delimiter=','):
+                # Skip blank lines and the header row.
+                if not row or row[7] == 'Question':
+                    continue
+                options = [row[8], row[9], row[10], row[11]]
+                # 1-based index of this data row selects the rotation.
+                pattern = _CIRCULAR_PATTERNS[(len(records) + 1) % 4]
+                record = {'question': row[7]}
+                for label, source in zip(_OPTION_LABELS, pattern):
+                    record[label] = options[_OPTION_LABELS.index(source)]
+                record['options'] = options
+                # The correct option comes from source label 'A', so its
+                # new label is the position of 'A' within the pattern.
+                record['answer'] = _OPTION_LABELS[pattern.index('A')]
+                records.append(record)
+
+        return Dataset.from_list(records)
+
+
+class GPQAEvaluator(BaseEvaluator):
+    """Exact-match accuracy evaluator for GPQA predictions."""
+
+    def score(self, predictions, references):
+        """Score predictions against references by equality.
+
+        Args:
+            predictions (list): Predicted answers.
+            references (list): Reference answers.
+
+        Returns:
+            dict: ``accuracy`` in percent and the per-sample ``details``;
+                an ``error`` entry instead if the lengths differ.
+        """
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different length'
+            }
+        details = [{
+            'pred': pred,
+            'answer': ref,
+            'correct': pred == ref
+        } for pred, ref in zip(predictions, references)]
+        correct = sum(detail['correct'] for detail in details)
+        # An empty evaluation set would otherwise divide by zero.
+        accuracy = 100 * correct / len(details) if details else 0.0
+        return {'accuracy': accuracy, 'details': details}