mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Feature] Add GPQA Dataset (#729)
* check * message * add * change prompt * change a para nameq * modify name of the file * delete an useless file
This commit is contained in:
parent
ef3ae63539
commit
b69fe2343b
4
configs/datasets/gpqa/gpqa_gen.py
Normal file
4
configs/datasets/gpqa/gpqa_gen.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
from mmengine.config import read_base

# Default GPQA entry point: re-export the dataset list from the concrete
# generation config so users can simply import `gpqa_gen`.
with read_base():
    from .gpqa_gen_a27c4d import gpqa_datasets
|
46
configs/datasets/gpqa/gpqa_gen_a27c4d.py
Normal file
46
configs/datasets/gpqa/gpqa_gen_a27c4d.py
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GPQADataset, GPQAEvaluator
from opencompass.utils import first_option_postprocess

# Reader: each example provides the question plus four answer options;
# the gold option letter is stored in `answer`.
gpqa_reader_cfg = dict(
    input_columns=['question', 'A', 'B', 'C', 'D'],
    output_column='answer')

# Zero-shot generation: present the question with lettered choices and ask
# the model to answer in a fixed, parseable format.
gpqa_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{question}\nChoices:\n'
                     '(A){A}\n'
                     '(B){B}\n'
                     '(C){C}\n'
                     '(D){D}\n'
                     'Format your response as follows: "The correct answer is (insert answer here)"'),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

# Extract the first option letter (A-D) from the raw model output before
# comparing against the gold answer.
gpqa_eval_cfg = dict(evaluator=dict(type=GPQAEvaluator),
                     pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))

gpqa_datasets = []
gpqa_subsets = {
    'extended': 'gpqa_extended.csv',
    'main': 'gpqa_main.csv',
    'diamond': 'gpqa_diamond.csv'
}

# One dataset entry per GPQA split, sharing reader/infer/eval configs.
for split, filename in gpqa_subsets.items():
    gpqa_datasets.append(
        dict(
            abbr='GPQA_' + split,
            type=GPQADataset,
            path='./data/gpqa/',
            name=filename,
            reader_cfg=gpqa_reader_cfg,
            infer_cfg=gpqa_infer_cfg,
            eval_cfg=gpqa_eval_cfg))
|
@ -5,4 +5,4 @@ with read_base():
|
|||||||
from .models.llama.llama2_7b import models
|
from .models.llama.llama2_7b import models
|
||||||
|
|
||||||
|
|
||||||
datasets = [*piqa_datasets, *siqa_datasets]
|
datasets = [*piqa_datasets, *siqa_datasets]
|
@ -39,6 +39,7 @@ from .flores import * # noqa: F401, F403
|
|||||||
from .game24 import * # noqa: F401, F403
|
from .game24 import * # noqa: F401, F403
|
||||||
from .GaokaoBench import * # noqa: F401, F403
|
from .GaokaoBench import * # noqa: F401, F403
|
||||||
from .govrepcrs import * # noqa: F401, F403
|
from .govrepcrs import * # noqa: F401, F403
|
||||||
|
from .gpqa import * # noqa: F401, F403
|
||||||
from .gsm8k import * # noqa: F401, F403
|
from .gsm8k import * # noqa: F401, F403
|
||||||
from .gsm_hard import * # noqa: F401, F403
|
from .gsm_hard import * # noqa: F401, F403
|
||||||
from .hellaswag import * # noqa: F401, F403
|
from .hellaswag import * # noqa: F401, F403
|
||||||
|
81
opencompass/datasets/gpqa.py
Normal file
81
opencompass/datasets/gpqa.py
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
import copy
|
||||||
|
import csv
|
||||||
|
import os
|
||||||
|
|
||||||
|
from datasets import Dataset
|
||||||
|
|
||||||
|
from opencompass.openicl import BaseEvaluator
|
||||||
|
from opencompass.registry import LOAD_DATASET
|
||||||
|
|
||||||
|
from .base import BaseDataset
|
||||||
|
|
||||||
|
|
||||||
|
@LOAD_DATASET.register_module()
class GPQADataset(BaseDataset):
    """GPQA multiple-choice dataset loader.

    Reads a GPQA CSV export and applies a per-row circular shift to the
    answer options so the gold answer does not always sit at option 'A'.
    """

    @staticmethod
    def load(path: str, name: str):
        """Load ``path``/``name`` (a GPQA CSV file) into a ``Dataset``.

        Columns used per row: 7 = question, 8 = the gold answer text,
        9-11 = the three incorrect options (so before shifting, the gold
        option is always 'A').

        Returns:
            datasets.Dataset: examples with keys ``question``, ``A``-``D``
            (shifted option texts), ``options`` (original order) and
            ``answer`` (gold option letter after the shift).
        """
        # Rotation patterns applied cyclically by row index; hoisted out of
        # the loop since the list is constant.
        circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC']

        cnt = 0
        data = []
        data_new = []
        with open(os.path.join(path, name), 'r', encoding='utf-8') as f:
            reader = csv.reader(f, delimiter=',')
            for row in reader:
                # Skip the header row (column 7 holds the literal 'Question').
                if row[7] == 'Question':
                    continue
                cnt += 1
                options = [row[8], row[9], row[10], row[11]]
                data.append({
                    'question': row[7],
                    'A': row[8],
                    'B': row[9],
                    'C': row[10],
                    'D': row[11],
                    'options': options,
                    # row[8] is the gold answer in the raw file.
                    'answer': 'A'
                })

                # Re-order the displayed options by this row's rotation
                # pattern and record the gold option's new letter.
                # (First data row has cnt == 1, i.e. pattern 'BCDA'.)
                pattern = circular_patterns[cnt % 4]
                line = copy.deepcopy(data[cnt - 1])
                gold_text = line['A']
                for i in range(4):
                    line['ABCD'[i]] = line['options'][ord(pattern[i]) - ord('A')]

                for i in range(4):
                    if line['ABCD'[i]] == gold_text:
                        line['answer'] = 'ABCD'[i]
                        break
                data_new.append(line)

        return Dataset.from_list(data_new)
|
||||||
|
|
||||||
|
|
||||||
|
class GPQAEvaluator(BaseEvaluator):
    """Exact-match accuracy evaluator for GPQA option-letter predictions."""

    def score(self, predictions, references):
        """Score ``predictions`` against gold ``references``.

        Args:
            predictions: Predicted option letters (e.g. 'A'..'D').
            references: Gold option letters, same length as predictions.

        Returns:
            dict: ``accuracy`` (percentage) and per-sample ``details``;
            or an ``error`` dict when the input lengths differ.
        """
        if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different length'
            }
        count = len(predictions)
        correct = 0
        details = []
        for pred, ref in zip(predictions, references):
            is_correct = pred == ref
            details.append({'pred': pred, 'answer': ref, 'correct': is_correct})
            if is_correct:
                correct += 1
        # Guard against empty input: avoid ZeroDivisionError and report 0.
        accuracy = 100 * correct / count if count else 0
        return {'accuracy': accuracy, 'details': details}
|
Loading…
Reference in New Issue
Block a user