[Feature] re-implement ceval load dataset (#446)

2025-05-30 16:03:24 +08:00 · 2023-09-27 21:18:48 +08:00 · 2023-09-27 21:18:48 +08:00 · 9db5652638
commit 9db5652638
parent d9f3e88dfe
2 changed files with 37 additions and 24 deletions
--- a/configs/summarizers/groups/ceval.py
+++ b/configs/summarizers/groups/ceval.py
@@ -22,3 +22,26 @@ ceval_summary_groups.append({'name': 'ceval-hard', 'subsets': _ceval_hard})
 _ceval_all = _ceval_stem + _ceval_social_science + _ceval_humanities + _ceval_other
 ceval_summary_groups.append({'name': 'ceval', 'subsets': _ceval_all})
 _ceval_stem = ['computer_network', 'operating_system', 'computer_architecture', 'college_programming', 'college_physics', 'college_chemistry', 'advanced_mathematics', 'probability_and_statistics', 'discrete_mathematics', 'electrical_engineer', 'metrology_engineer', 'high_school_mathematics', 'high_school_physics', 'high_school_chemistry', 'high_school_biology', 'middle_school_mathematics', 'middle_school_biology', 'middle_school_physics', 'middle_school_chemistry', 'veterinary_medicine']
 _ceval_stem = ['ceval-test-' + s for s in _ceval_stem]
 ceval_summary_groups.append({'name': 'ceval-test-stem', 'subsets': _ceval_stem})
 _ceval_social_science = ['college_economics', 'business_administration', 'marxism', 'mao_zedong_thought', 'education_science', 'teacher_qualification', 'high_school_politics', 'high_school_geography', 'middle_school_politics', 'middle_school_geography']
 _ceval_social_science = ['ceval-test-' + s for s in _ceval_social_science]
 ceval_summary_groups.append({'name': 'ceval-test-social-science', 'subsets': _ceval_social_science})
 _ceval_humanities = ['modern_chinese_history', 'ideological_and_moral_cultivation', 'logic', 'law', 'chinese_language_and_literature', 'art_studies', 'professional_tour_guide', 'legal_professional', 'high_school_chinese', 'high_school_history', 'middle_school_history']
 _ceval_humanities = ['ceval-test-' + s for s in _ceval_humanities]
 ceval_summary_groups.append({'name': 'ceval-test-humanities', 'subsets': _ceval_humanities})
 _ceval_other = ['civil_servant', 'sports_science', 'plant_protection', 'basic_medicine', 'clinical_medicine', 'urban_and_rural_planner', 'accountant', 'fire_engineer', 'environmental_impact_assessment_engineer', 'tax_accountant', 'physician']
 _ceval_other = ['ceval-test-' + s for s in _ceval_other]
 ceval_summary_groups.append({'name': 'ceval-test-other', 'subsets': _ceval_other})
 _ceval_hard = ['advanced_mathematics', 'discrete_mathematics', 'probability_and_statistics', 'college_chemistry', 'college_physics', 'high_school_mathematics', 'high_school_chemistry', 'high_school_physics']
 _ceval_hard = ['ceval-test-' + s for s in _ceval_hard]
 ceval_summary_groups.append({'name': 'ceval-test-hard', 'subsets': _ceval_hard})
 _ceval_all = _ceval_stem + _ceval_social_science + _ceval_humanities + _ceval_other
 ceval_summary_groups.append({'name': 'ceval-test', 'subsets': _ceval_all})
--- a/opencompass/datasets/ceval.py
+++ b/opencompass/datasets/ceval.py
@@ -1,6 +1,7 @@
 import csv
 import os.path as osp
-from datasets import DatasetDict, load_dataset
+from datasets import Dataset, DatasetDict
 from opencompass.registry import LOAD_DATASET
@@ -12,26 +13,15 @@ class CEvalDataset(BaseDataset):
    @staticmethod
    def load(path: str, name: str):
-        dev_dataset = load_dataset('csv',
+        dataset = {}
-                                   data_files=osp.join(path, 'dev',
+        for split in ['dev', 'val', 'test']:
-                                                       f'{name}_dev.csv'),
+            with open(osp.join(path, split, f'{name}_{split}.csv')) as f:
-                                   split='train')
+                reader = csv.reader(f)
-        val_dataset = load_dataset('csv',
+                header = next(reader)
-                                   data_files=osp.join(path, 'val',
+                for row in reader:
-                                                       f'{name}_val.csv'),
+                    item = dict(zip(header, row))
-                                   split='train')
+                    item.setdefault('explanation', '')
-        val_dataset = val_dataset.add_column('explanation',
+                    item.setdefault('answer', '')
-                                             [''] * len(val_dataset))
+                    dataset.setdefault(split, []).append(item)
-        test_dataset = load_dataset('csv',
+        dataset = {i: Dataset.from_list(dataset[i]) for i in dataset}
-                                    data_files=osp.join(
+        return DatasetDict(dataset)
                                        path, 'test', f'{name}_test.csv'),
                                    split='train')
        test_dataset = test_dataset.add_column(
            'answer',
            [''] * len(test_dataset)).add_column('explanation',
                                                 [''] * len(test_dataset))
        return DatasetDict({
            'val': val_dataset,
            'dev': dev_dataset,
            'test': test_dataset
        })