update earth silver benchmark

2025-05-30 16:03:24 +08:00 · 2025-05-18 19:18:10 +08:00 · 2025-05-18 19:18:10 +08:00 · d9de21a8c3
commit d9de21a8c3
parent 8c0ccf9a6b
3 changed files with 91 additions and 0 deletions
--- a/opencompass/configs/datasets/Earth_Silver/Earth_Silver_gen.py
+++ b/opencompass/configs/datasets/Earth_Silver/Earth_Silver_gen.py
@ -0,0 +1,57 @@
+from opencompass.datasets import Earth_Silver_MCQDataset
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_evaluator import AccEvaluator
+
+
+# System-level instruction placed at the start of every conversation.
+SYSTEM_PROMPT = (
+    'You are a helpful assistant for answering earth science '
+    'multiple-choice questions.\n\n'
+)
+
+
+# Per-sample prompt; `{question}` is filled from the `question` column
+# listed in `reader_cfg` below.
+ZERO_SHOT_PROMPT = (
+    'Q: {question}\n'
+    'Please select the correct answer from the options above and output '
+    'only the corresponding letter (A, B, C, or D) without any explanation '
+    'or additional text.\n'
+)
+
+
+# Columns consumed by the prompt template and the column holding the gold
+# label.
+reader_cfg = dict(input_columns=['question'], output_column='answer')
+
+
+# Accuracy over the predictions.
+# NOTE(review): no prediction post-processor is configured here, so the
+# evaluator presumably sees the raw generation -- confirm that is intended.
+eval_cfg = dict(
+    evaluator=dict(type=AccEvaluator),
+    pred_role='BOT',
+)
+
+
+# Zero-shot generation: one system message followed by a single human turn,
+# with no in-context examples retrieved.
+infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(
+                    role='SYSTEM',
+                    fallback_role='HUMAN',
+                    prompt=SYSTEM_PROMPT,
+                ),
+            ],
+            round=[dict(role='HUMAN', prompt=ZERO_SHOT_PROMPT)],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+
+# Single dataset entry; `prompt_mode` matches the keyword argument of
+# `Earth_Silver_MCQDataset.load`.
+earth_silver_mcq_dataset = dict(
+    type=Earth_Silver_MCQDataset,
+    abbr='earth_silver_mcq',
+    path='ai-earth/Earth-Silver',
+    prompt_mode='zero-shot',
+    reader_cfg=reader_cfg,
+    infer_cfg=infer_cfg,
+    eval_cfg=eval_cfg,
+)
+
+
+# List form of the entry above.
+earth_silver_mcq_datasets = [earth_silver_mcq_dataset]
--- a/opencompass/datasets/Earth_Silver.py
+++ b/opencompass/datasets/Earth_Silver.py
@ -0,0 +1,33 @@
+from datasets import load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class Earth_Silver_MCQDataset(BaseDataset):
+    """Loader for a multiple-choice dataset with question/answer columns."""
+
+    # NOTE(review): this identifier does not mention Earth-Silver and looks
+    # carried over from another dataset -- confirm before renaming, since
+    # nothing visible here shows who reads it.
+    name = 'msearth_mcq'
+
+    @staticmethod
+    def load(path: str, prompt_mode: str = 'zero-shot', **kwargs):
+        """Load the dataset and expose its `question` / `answer` columns.
+
+        Args:
+            path: Dataset identifier passed to `datasets.load_dataset`.
+            prompt_mode: Either 'zero-shot' or 'few-shot'. Only 'zero-shot'
+                is implemented.
+            **kwargs: Accepted for call compatibility and not used.
+
+        Returns:
+            The object returned by `load_dataset(path=path)` after a `map`
+            pass that re-emits the `question` and `answer` fields.
+
+        Raises:
+            NotImplementedError: If `prompt_mode` is 'few-shot'.
+            ValueError: If `prompt_mode` is any other unsupported value.
+        """
+        # Validate the mode first so that a bad configuration fails before
+        # the dataset is loaded and mapped, not afterwards.
+        if prompt_mode == 'few-shot':
+            raise NotImplementedError('few-shot prompt 尚未实现')
+        if prompt_mode != 'zero-shot':
+            raise ValueError(f'Unsupported prompt_mode: {prompt_mode}')
+
+        dataset = load_dataset(path=path)
+
+        # Both fields are read from every record, so a record lacking
+        # either one makes this pass fail instead of surfacing later.
+        return dataset.map(lambda item: {
+            'question': item['question'],
+            'answer': item['answer'],
+        })
--- a/opencompass/datasets/init.py
+++ b/opencompass/datasets/init.py
@ -48,6 +48,7 @@ from .drop import *  # noqa: F401, F403
 from .drop_simple_eval import *  # noqa: F401, F403
 from .ds1000 import *  # noqa: F401, F403
 from .ds1000_interpreter import *  # noqa: F401, F403
+from .Earth_Silver import *  # noqa: F401, F403
 from .eprstmt import *  # noqa: F401, F403
 from .FinanceIQ import *  # noqa: F401, F403
 from .flores import *  # noqa: F401, F403