[Fix] modelscope dataset load problem (#1406)

* fix modelscope dataset load

* fix lint
Yunlin Mao 2024-08-08 14:01:06 +08:00 committed by GitHub
parent 264fd23129
commit 818d72a650
6 changed files with 18 additions and 21 deletions
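Summary of the change: every ModelScope branch now calls MsDataset.load without the trust_remote_code=True argument, and the StrategyQA loader uses the configured path instead of a hard-coded dataset id. A minimal sketch of the resulting load pattern, not taken verbatim from any one file (load_ceval_like and its arguments are illustrative only):

    from os import environ

    def load_ceval_like(path: str, name: str):
        # Illustrative helper, not part of the commit. When DATASET_SOURCE is set to
        # 'ModelScope', the dataset is fetched through MsDataset.load without the
        # trust_remote_code argument that this commit removes.
        if environ.get('DATASET_SOURCE') == 'ModelScope':
            from modelscope import MsDataset
            return MsDataset.load(dataset_name=path, subset_name=name)
        # Otherwise the loaders fall back to reading local files (omitted here).
        raise NotImplementedError('local file loading is not shown in this sketch')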

View File

@@ -20,9 +20,7 @@ class CEvalDataset(BaseDataset):
         dataset = {}
         if environ.get('DATASET_SOURCE') == 'ModelScope':
             from modelscope import MsDataset
-            dataset = MsDataset.load(dataset_name=path,
-                                     subset_name=name,
-                                     trust_remote_code=True)
+            dataset = MsDataset.load(dataset_name=path, subset_name=name)
         else:
             for split in ['dev', 'val', 'test']:
                 filename = osp.join(path, split, f'{name}_{split}.csv')

View File

@@ -18,9 +18,10 @@ class CMMLUDataset(BaseDataset):
         path = get_data_path(path)
         if environ.get('DATASET_SOURCE') == 'ModelScope':
             from modelscope import MsDataset
-            dataset = MsDataset.load(path,
-                                     subset_name=name,
-                                     trust_remote_code=True)
+            dataset = MsDataset.load(
+                path,
+                subset_name=name,
+            )
             modified_dataset = DatasetDict()
             for split in dataset.keys():
                 raw_data = []

View File

@@ -20,7 +20,7 @@ class GSM8KDataset(BaseDataset):
         path = get_data_path(path)
         if environ.get('DATASET_SOURCE') == 'ModelScope':
             from modelscope import MsDataset
-            dataset = MsDataset.load(dataset_name=path, trust_remote_code=True)
+            dataset = MsDataset.load(dataset_name=path)
         else:
             datasets = {}
             for split in ['train', 'test']:

View File

@@ -34,9 +34,7 @@ class StrategyQADataset(BaseDataset):
         if environ.get('DATASET_SOURCE') == 'ModelScope':
             from modelscope import MsDataset
-            dataset = MsDataset.load('opencompass/strategy_qa',
-                                     split='train',
-                                     trust_remote_code=True)
+            dataset = MsDataset.load(path, split='train')
         else:
             with open(path, 'r', encoding='utf-8') as f:
                 dataset = json.load(f)
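Besides dropping trust_remote_code, the StrategyQA loader above now passes the configured path to MsDataset.load instead of the hard-coded 'opencompass/strategy_qa' id. A minimal sketch of that branch, assuming path has already been resolved (for example by get_data_path, as in the other loaders):

    import json
    from os import environ

    def load_strategy_qa(path: str):
        # Illustrative sketch, not the exact loader from the commit.
        if environ.get('DATASET_SOURCE') == 'ModelScope':
            from modelscope import MsDataset
            # The configured path is used instead of a hard-coded dataset id.
            return MsDataset.load(path, split='train')
        # Local fallback mirrors the else-branch shown in the diff above.
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)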

View File

@@ -21,8 +21,7 @@ class WinograndeDataset(BaseDataset):
             from modelscope import MsDataset
             ms_dataset = MsDataset.load(path,
                                         subset_name='winogrande_xs',
-                                        split='validation',
-                                        trust_remote_code=True)
+                                        split='validation')
             dataset_list = []
             for line in ms_dataset:
                 prompt = line['sentence']
@@ -71,8 +70,7 @@ class WinograndeDatasetV2(BaseDataset):
             from modelscope import MsDataset
             ms_dataset = MsDataset.load(path,
                                         subset_name='winogrande_xs',
-                                        split='validation',
-                                        trust_remote_code=True)
+                                        split='validation')
             dataset_list = []
             for line in ms_dataset:
                 prompt = line['sentence']
@@ -127,8 +125,7 @@ class WinograndeDatasetV3(BaseDataset):
             for split in ['train', 'validation']:
                 ms_dataset = MsDataset.load(path,
                                             subset_name='winogrande_xs',
-                                            split=split,
-                                            trust_remote_code=True)
+                                            split=split)
                 dataset_list = []
                 for line in ms_dataset:
                     prompt = line['sentence']

View File

@@ -34,7 +34,6 @@ def reload_datasets():
from configs.datasets.commonsenseqa.commonsenseqa_gen import commonsenseqa_datasets
from configs.datasets.mmlu.mmlu_gen import mmlu_datasets
from configs.datasets.strategyqa.strategyqa_gen import strategyqa_datasets
from configs.datasets.bbh.bbh_gen import bbh_datasets
from configs.datasets.Xsum.Xsum_gen import Xsum_datasets
from configs.datasets.winogrande.winogrande_gen import winogrande_datasets
@@ -51,15 +50,17 @@ def reload_datasets():
from configs.datasets.storycloze.storycloze_ppl import storycloze_datasets as storycloze_ppl_datasets
from configs.datasets.summedits.summedits_gen import summedits_datasets as summedits_v2_datasets
from configs.datasets.strategyqa.strategyqa_gen import strategyqa_datasets
from configs.datasets.mbpp.mbpp_gen import mbpp_datasets as mbpp_v1_datasets
from configs.datasets.lcsts.lcsts_gen import lcsts_datasets
from configs.datasets.hellaswag.hellaswag_gen import hellaswag_datasets as hellaswag_v2_datasets
from configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets as hellaswag_ice_datasets
from configs.datasets.hellaswag.hellaswag_ppl_9dbb12 import hellaswag_datasets as hellaswag_v1_datasets
from configs.datasets.hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets as hellaswag_v3_datasets
from configs.datasets.mbpp.mbpp_gen import mbpp_datasets as mbpp_v1_datasets
from configs.datasets.mbpp.mbpp_passk_gen_830460 import mbpp_datasets as mbpp_v2_datasets
from configs.datasets.mbpp.sanitized_mbpp_gen_830460 import sanitized_mbpp_datasets
from configs.datasets.nq.nq_gen import nq_datasets
from configs.datasets.lcsts.lcsts_gen import lcsts_datasets
from configs.datasets.math.math_gen import math_datasets
from configs.datasets.piqa.piqa_gen import piqa_datasets as piqa_v2_datasets
from configs.datasets.piqa.piqa_ppl import piqa_datasets as piqa_v1_datasets
@@ -105,7 +106,8 @@ def load_datasets(source, conf):
         return dataset
     try:
         dataset = conf['type'].load(path=conf['path'])
-    except Exception:
+    except Exception as e:
+        print(e)
         dataset = conf['type'].load(**conf)
     return dataset
@@ -147,7 +149,7 @@ class TestingMsDatasets(unittest.TestCase):
                print(exception)
                return 'failure', f'{modelscope_path_name} is not the same as {local_path_name}'
-        with ThreadPoolExecutor(16) as executor:
+        with ThreadPoolExecutor(thread) as executor:
            futures = {
                executor.submit(compare_datasets, ms_conf, local_conf): (ms_conf, local_conf)
                for ms_conf, local_conf in zip(ms_datasets_conf, local_datasets_conf)
@@ -220,4 +222,5 @@ def _check_data(ms_dataset: Dataset | DatasetDict,
 if __name__ == '__main__':
     sample_size = 100
+    thread = 1
     unittest.main()
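The test now sizes its thread pool from the module-level thread variable set under __main__ (1 by default), so dataset comparisons run serially unless that value is raised. A minimal, self-contained sketch of the same pattern; compare and pairs below are placeholders, not names from the test file:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    thread = 1  # mirrors the serial default set under __main__ in the test

    def compare(ms_conf, local_conf):
        # Placeholder for the real compare_datasets(ms_conf, local_conf) check.
        return 'success', f'{ms_conf} matches {local_conf}'

    pairs = [('ms_conf_a', 'local_conf_a'), ('ms_conf_b', 'local_conf_b')]
    with ThreadPoolExecutor(thread) as executor:
        futures = {executor.submit(compare, ms, local): (ms, local) for ms, local in pairs}
        for future in as_completed(futures):
            status, message = future.result()
            print(status, message)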