From 12b84aeb3bf5bd1f89a0b1287db5ee435b4c3511 Mon Sep 17 00:00:00 2001 From: jxd <44057635+jxd0712@users.noreply.github.com> Date: Fri, 26 Jul 2024 18:42:30 +0800 Subject: [PATCH 1/2] [Feature] Update CHARM Memorization (#1230) * update gemini api and add gemini models * add openai models * update CHARM evaluation * add CHARM memorization tasks * add CharmMemSummarizer (output eval details for memorization-independent reasoning analysis) * update CHARM readme --------- Co-authored-by: wujiang --- configs/datasets/CHARM/README.md | 62 +++++- configs/datasets/CHARM/README_ZH.md | 62 +++++- .../datasets/CHARM/charm_memory_gen_bbbd53.py | 63 ++++++ .../datasets/CHARM/charm_memory_settings.py | 31 +++ configs/eval_charm_mem.py | 94 ++++++++ configs/{eval_charm.py => eval_charm_rea.py} | 64 ++++-- configs/models/gemini/gemini_1_5_flash.py | 22 ++ configs/models/gemini/gemini_1_5_pro.py | 22 ++ configs/models/gemini/gemini_pro.py | 3 +- configs/models/openai/gpt_3_5_turbo_0125.py | 20 ++ configs/models/openai/gpt_4o_2024_05_13.py | 20 ++ opencompass/datasets/charm.py | 100 ++++++++- opencompass/models/gemini_api.py | 31 ++- .../summarizers/subjective/__init__.py | 1 + opencompass/summarizers/subjective/charm.py | 208 ++++++++++++++++++ 15 files changed, 762 insertions(+), 41 deletions(-) create mode 100644 configs/datasets/CHARM/charm_memory_gen_bbbd53.py create mode 100644 configs/datasets/CHARM/charm_memory_settings.py create mode 100644 configs/eval_charm_mem.py rename configs/{eval_charm.py => eval_charm_rea.py} (62%) create mode 100644 configs/models/gemini/gemini_1_5_flash.py create mode 100644 configs/models/gemini/gemini_1_5_pro.py create mode 100644 configs/models/openai/gpt_3_5_turbo_0125.py create mode 100644 configs/models/openai/gpt_4o_2024_05_13.py create mode 100644 opencompass/summarizers/subjective/charm.py diff --git a/configs/datasets/CHARM/README.md b/configs/datasets/CHARM/README.md index b297f852..a89b2475 100644 --- 
a/configs/datasets/CHARM/README.md +++ b/configs/datasets/CHARM/README.md @@ -86,15 +86,69 @@ Below are the steps for quickly downloading CHARM and using OpenCompass for eval ### 1. Download CHARM ```bash git clone https://github.com/opendatalab/CHARM ${path_to_CHARM_repo} + +cd ${path_to_opencompass} +mkdir data +ln -snf ${path_to_CHARM_repo}/data/CHARM ./data/CHARM ``` ### 2. Run Inference and Evaluation ```bash cd ${path_to_opencompass} -mkdir -p data -ln -snf ${path_to_CHARM_repo}/data/CHARM ./data/CHARM -# Infering and evaluating CHARM with hf_llama3_8b_instruct model -python run.py --models hf_llama3_8b_instruct --datasets charm_gen +# modify config file `configs/eval_charm_rea.py`: uncomment or add models you want to evaluate +python run.py configs/eval_charm_rea.py -r --dump-eval-details + +# modify config file `configs/eval_charm_mem.py`: uncomment or add models you want to evaluate +python run.py configs/eval_charm_mem.py -r --dump-eval-details +``` +The inference and evaluation results would be in `${path_to_opencompass}/outputs`, like this: +```bash +outputs +├── CHARM_mem +│ └── chat +│ └── 20240605_151442 +│ ├── predictions +│ │ ├── internlm2-chat-1.8b-turbomind +│ │ ├── llama-3-8b-instruct-lmdeploy +│ │ └── qwen1.5-1.8b-chat-hf +│ ├── results +│ │ ├── internlm2-chat-1.8b-turbomind_judged-by--GPT-3.5-turbo-0125 +│ │ ├── llama-3-8b-instruct-lmdeploy_judged-by--GPT-3.5-turbo-0125 +│ │ └── qwen1.5-1.8b-chat-hf_judged-by--GPT-3.5-turbo-0125 +│   └── summary +│   └── 20240605_205020 # MEMORY_SUMMARY_DIR +│   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Anachronisms_Judgment +│   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Movie_and_Music_Recommendation +│   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Sport_Understanding +│   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Time_Understanding +│   └── judged-by--GPT-3.5-turbo-0125.csv # MEMORY_SUMMARY_CSV +└── CHARM_rea + └── chat + └── 20240605_152359 + ├── predictions 
+ │ ├── internlm2-chat-1.8b-turbomind + │ ├── llama-3-8b-instruct-lmdeploy + │ └── qwen1.5-1.8b-chat-hf + ├── results # REASON_RESULTS_DIR + │ ├── internlm2-chat-1.8b-turbomind + │ ├── llama-3-8b-instruct-lmdeploy + │ └── qwen1.5-1.8b-chat-hf + └── summary + ├── summary_20240605_205328.csv # REASON_SUMMARY_CSV + └── summary_20240605_205328.txt +``` +### 3. Generate Analysis Results +```bash +cd ${path_to_CHARM_repo} + +# generate Table5, Table6, Table9 and Table10 in https://arxiv.org/abs/2403.14112 +PYTHONPATH=. python tools/summarize_reasoning.py ${REASON_SUMMARY_CSV} + +# generate Figure3 and Figure9 in https://arxiv.org/abs/2403.14112 +PYTHONPATH=. python tools/summarize_mem_rea.py ${REASON_SUMMARY_CSV} ${MEMORY_SUMMARY_CSV} + +# generate Table7, Table12, Table13 and Figure11 in https://arxiv.org/abs/2403.14112 +PYTHONPATH=. python tools/analyze_mem_indep_rea.py data/CHARM ${REASON_RESULTS_DIR} ${MEMORY_SUMMARY_DIR} ${MEMORY_SUMMARY_CSV} ``` ## 🖊️ Citation diff --git a/configs/datasets/CHARM/README_ZH.md b/configs/datasets/CHARM/README_ZH.md index 414c65fe..c5381030 100644 --- a/configs/datasets/CHARM/README_ZH.md +++ b/configs/datasets/CHARM/README_ZH.md @@ -84,15 +84,69 @@ ### 1. 下载 CHARM ```bash git clone https://github.com/opendatalab/CHARM ${path_to_CHARM_repo} + +cd ${path_to_opencompass} +mkdir data +ln -snf ${path_to_CHARM_repo}/data/CHARM ./data/CHARM ``` ### 2. 
推理和评测 ```bash cd ${path_to_opencompass} -mkdir -p data -ln -snf ${path_to_CHARM_repo}/data/CHARM ./data/CHARM -# 在CHARM上对模型hf_llama3_8b_instruct做推理和评测 -python run.py --models hf_llama3_8b_instruct --datasets charm_gen +# 修改配置文件`configs/eval_charm_rea.py`: 将现有的模型取消注释,或者添加你想评测的模型 +python run.py configs/eval_charm_rea.py -r --dump-eval-details + +# 修改配置文件`configs/eval_charm_mem.py`: 将现有的模型取消注释,或者添加你想评测的模型 +python run.py configs/eval_charm_mem.py -r --dump-eval-details +``` +推理和评测的结果位于路径`${path_to_opencompass}/outputs`, 如下所示: +```bash +outputs +├── CHARM_mem +│ └── chat +│ └── 20240605_151442 +│ ├── predictions +│ │ ├── internlm2-chat-1.8b-turbomind +│ │ ├── llama-3-8b-instruct-lmdeploy +│ │ └── qwen1.5-1.8b-chat-hf +│ ├── results +│ │ ├── internlm2-chat-1.8b-turbomind_judged-by--GPT-3.5-turbo-0125 +│ │ ├── llama-3-8b-instruct-lmdeploy_judged-by--GPT-3.5-turbo-0125 +│ │ └── qwen1.5-1.8b-chat-hf_judged-by--GPT-3.5-turbo-0125 +│   └── summary +│   └── 20240605_205020 # MEMORY_SUMMARY_DIR +│   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Anachronisms_Judgment +│   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Movie_and_Music_Recommendation +│   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Sport_Understanding +│   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Time_Understanding +│   └── judged-by--GPT-3.5-turbo-0125.csv # MEMORY_SUMMARY_CSV +└── CHARM_rea + └── chat + └── 20240605_152359 + ├── predictions + │ ├── internlm2-chat-1.8b-turbomind + │ ├── llama-3-8b-instruct-lmdeploy + │ └── qwen1.5-1.8b-chat-hf + ├── results # REASON_RESULTS_DIR + │ ├── internlm2-chat-1.8b-turbomind + │ ├── llama-3-8b-instruct-lmdeploy + │ └── qwen1.5-1.8b-chat-hf + └── summary + ├── summary_20240605_205328.csv # REASON_SUMMARY_CSV + └── summary_20240605_205328.txt +``` +### 3. 生成分析结果 +```bash +cd ${path_to_CHARM_repo} + +# 生成论文中的Table5, Table6, Table9 and Table10,详见https://arxiv.org/abs/2403.14112 +PYTHONPATH=. 
python tools/summarize_reasoning.py ${REASON_SUMMARY_CSV} + +# 生成论文中的Figure3 and Figure9,详见https://arxiv.org/abs/2403.14112 +PYTHONPATH=. python tools/summarize_mem_rea.py ${REASON_SUMMARY_CSV} ${MEMORY_SUMMARY_CSV} + +# 生成论文中的Table7, Table12, Table13 and Figure11,详见https://arxiv.org/abs/2403.14112 +PYTHONPATH=. python tools/analyze_mem_indep_rea.py data/CHARM ${REASON_RESULTS_DIR} ${MEMORY_SUMMARY_DIR} ${MEMORY_SUMMARY_CSV} ``` ## 🖊️ 引用 diff --git a/configs/datasets/CHARM/charm_memory_gen_bbbd53.py b/configs/datasets/CHARM/charm_memory_gen_bbbd53.py new file mode 100644 index 00000000..2617dcee --- /dev/null +++ b/configs/datasets/CHARM/charm_memory_gen_bbbd53.py @@ -0,0 +1,63 @@ +import os +from mmengine.config import read_base + +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import CharmDataset, CharmMemoryEvaluator, LMEvaluator + +with read_base(): + from .charm_memory_settings import charm_memory_tasks, judge_system_prompts, dataset_path + +charm_memory_datasets = [] + +for _task in charm_memory_tasks: + + charm_memory_reader_cfg = dict(input_columns=['input'], + output_column='target') + + charm_memory_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt='请尽可能简短地回答下述问题。\n问题:{input}\n答:') + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ) + + if _task == 'Chinese_Movie_and_Music_Recommendation': + charm_memory_eval_cfg = dict( + evaluator=dict(type=CharmMemoryEvaluator), + pred_role='BOT', + ) + else: + judge_system_prompt = judge_system_prompts[_task] + charm_memory_eval_cfg = dict( + evaluator=dict( + type=LMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt=judge_system_prompt + + 
"\n\n[Question]\n{input}\n[The Start of Reference Answer]\n{target}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{prediction}\n[The End of Assistant's Answer]" # noqa + ), + ]), + ), + ), + pred_role='BOT', + ) + + charm_memory_datasets.append( + dict( + type=CharmDataset, + path=dataset_path, + name=_task, + abbr='charm-memory-' + _task, + reader_cfg=charm_memory_reader_cfg, + infer_cfg=charm_memory_infer_cfg.copy(), + eval_cfg=charm_memory_eval_cfg.copy(), + )) diff --git a/configs/datasets/CHARM/charm_memory_settings.py b/configs/datasets/CHARM/charm_memory_settings.py new file mode 100644 index 00000000..12f722d7 --- /dev/null +++ b/configs/datasets/CHARM/charm_memory_settings.py @@ -0,0 +1,31 @@ +import os + +charm_memory_tasks = [ + 'Chinese_Anachronisms_Judgment', + 'Chinese_Movie_and_Music_Recommendation', + 'Chinese_Sport_Understanding', + 'Chinese_Time_Understanding', +] + +dataset_path = 'data/CHARM/memorization' + +system_prompt_template = """Please act as an impartial judge, comparing the responses of the AI assistants to the reference answer and determining if the answers are correct. +You will receive the reference answer provided by a human and the responses of the AI assistants. +Your task is to judge whether the AI assistant's answers is correct. +{task_specific_prompt} +After providing your explanation, strictly output your final judgment in the following format: “[正确]” if the AI assistant's response is correct, “[错误]” if the AI assistant's response is incorrect. 
+""" + +task_specific_prompts = { + 'Chinese_Anachronisms_Judgment': + "If the provided reference answer is a list, the model's prediction is considered correct if it matches any item in the list.", + 'Chinese_Time_Understanding': + "When evaluating the AI assistant's response regarding Chinese solar terms, as long as the AI assistant's response falls within the time frame provided in the reference answer, consider it correct.", + 'Chinese_Sport_Understanding': + "If the provided reference answer is a list, the model's prediction is considered correct if it matches any item in the list." +} + +judge_system_prompts = { + k: system_prompt_template.format(task_specific_prompt=v) + for k, v in task_specific_prompts.items() +} diff --git a/configs/eval_charm_mem.py b/configs/eval_charm_mem.py new file mode 100644 index 00000000..9703a336 --- /dev/null +++ b/configs/eval_charm_mem.py @@ -0,0 +1,94 @@ +from mmengine.config import read_base + +from opencompass.models import OpenAI +from opencompass.runners import LocalRunner +from opencompass.partitioners.sub_size import SubjectiveSizePartitioner +from opencompass.tasks.subjective_eval import SubjectiveEvalTask +from opencompass.summarizers import CharmMemSummarizer + +with read_base(): + from .datasets.CHARM.charm_memory_gen_bbbd53 import charm_memory_datasets as datasets + + # ------>>>>>> https://arxiv.org/abs/2403.14112 + # from .models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model + # from .models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model + # from .models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model + # from .models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model + # from .models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model + # from .models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model + # from .models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model + # from 
.models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model + # from .models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1 + # from .models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1 + # from .models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model + # from .models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model + # from .models.yi.hf_yi_6b_chat import models as yi_6b_chat_model + # from .models.yi.hf_yi_34b_chat import models as yi_34b_chat_model + # from .models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model + # from .models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model + # from .models.qwen.hf_qwen_7b_chat import models as qwen_7b_chat_model + # from .models.qwen.hf_qwen_14b_chat import models as qwen_14b_chat_model + # from .models.qwen.hf_qwen_72b_chat import models as qwen_72b_chat_model + # <<<<<<------ https://arxiv.org/abs/2403.14112 + + # from .models.openai.gpt_3_5_turbo_0125 import models as gpt_3_5_turbo_0125_model + # from .models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model + # from .models.gemini.gemini_1_5_flash import models as gemini_1_5_flash_model + # from .models.gemini.gemini_1_5_pro import models as gemini_1_5_pro_model + + # from .models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model + # from .models.hf_llama.lmdeploy_llama3_70b_instruct import models as lmdeploy_llama3_70b_instruct_model + + # from .models.hf_internlm.lmdeploy_internlm2_chat_1_8b import models as lmdeploy_internlm2_chat_1_8b_model + # from .models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_internlm2_chat_7b_model + # from .models.hf_internlm.lmdeploy_internlm2_chat_20b import models as lmdeploy_internlm2_chat_20b_model + + # from .models.yi.hf_yi_1_5_6b_chat import models as 
yi_1_5_6b_chat_model + # from .models.yi.hf_yi_1_5_34b_chat import models as yi_1_5_34b_chat_model + + # from .models.deepseek.hf_deepseek_v2_chat import models as deepseek_v2_chat_model + + # from .models.qwen.hf_qwen1_5_1_8b_chat import models as qwen1_5_1_8b_chat_model + # from .models.qwen.hf_qwen1_5_7b_chat import models as qwen1_5_7b_chat_model + # from .models.qwen.hf_qwen1_5_14b_chat import models as qwen1_5_14b_chat_model + # from .models.qwen.hf_qwen1_5_72b_chat import models as qwen1_5_72b_chat_model + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + +## ------------- JudgeLLM Configuration +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +]) +judge_models = [ + dict( + abbr='GPT-3.5-turbo-0125', + type=OpenAI, + path='gpt-3.5-turbo-0125', + key='ENV', + meta_template=api_meta_template, + query_per_second=16, + max_out_len=2048, + max_seq_len=2048, + batch_size=8, + temperature=0, + ) +] + +## ------------- Evaluation Configuration +eval = dict( + partitioner=dict( + type=SubjectiveSizePartitioner, + max_task_size=1000, + mode='singlescore', + models=models, + judge_models=judge_models, + ), + runner=dict(type=LocalRunner, + max_num_workers=2, + task=dict(type=SubjectiveEvalTask)), +) + +summarizer = dict(type=CharmMemSummarizer) + +work_dir = './outputs/CHARM_mem/chat/' diff --git a/configs/eval_charm.py b/configs/eval_charm_rea.py similarity index 62% rename from configs/eval_charm.py rename to configs/eval_charm_rea.py index c1b65ccc..68e132c8 100644 --- a/configs/eval_charm.py +++ b/configs/eval_charm_rea.py @@ -2,35 +2,55 @@ from mmengine.config import read_base with read_base(): from .datasets.CHARM.charm_reason_gen_f8fca2 import charm_reason_datasets as datasets - from .models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_7b_chat_model - # from models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model - # 
from models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model - - # from .models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model - # from .models.yi.hf_yi_6b_chat import models as yi_6b_chat_model - # from .models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model - # from .models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model - # from .models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1 + # ------>>>>>> https://arxiv.org/abs/2403.14112 + # from .models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model + # from .models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model # from .models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model - # from .models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model - - # from .models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1 # from .models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model - # from .models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model - # from .models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model - - # from .models.yi.hf_yi_34b_chat import models as yi_34b_chat_model - - # from .models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model # from .models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model + # from .models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model + # from .models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model + # from .models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model + # from .models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1 + # from .models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1 + # 
from .models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model + # from .models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model + # from .models.yi.hf_yi_6b_chat import models as yi_6b_chat_model + # from .models.yi.hf_yi_34b_chat import models as yi_34b_chat_model + # from .models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model + # from .models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model + # from .models.qwen.hf_qwen_7b_chat import models as qwen_7b_chat_model + # from .models.qwen.hf_qwen_14b_chat import models as qwen_14b_chat_model + # from .models.qwen.hf_qwen_72b_chat import models as qwen_72b_chat_model + # <<<<<<------ https://arxiv.org/abs/2403.14112 - # from .models.hf_llama.hf_llama3_8b_instruct import models as llama3_8b_instruct_model - # from .models.hf_llama.hf_llama3_70b_instruct import models as llama3_70b_instruct_model - from .summarizers.charm_rea import summarizer + # from .models.openai.gpt_3_5_turbo_0125 import models as gpt_3_5_turbo_0125_model + # from .models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model + # from .models.gemini.gemini_1_5_flash import models as gemini_1_5_flash_model + # from .models.gemini.gemini_1_5_pro import models as gemini_1_5_pro_model + + # from .models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model + # from .models.hf_llama.lmdeploy_llama3_70b_instruct import models as lmdeploy_llama3_70b_instruct_model + + # from .models.hf_internlm.lmdeploy_internlm2_chat_1_8b import models as lmdeploy_internlm2_chat_1_8b_model + # from .models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_internlm2_chat_7b_model + # from .models.hf_internlm.lmdeploy_internlm2_chat_20b import models as lmdeploy_internlm2_chat_20b_model + + # from .models.yi.hf_yi_1_5_6b_chat import models as yi_1_5_6b_chat_model + # from .models.yi.hf_yi_1_5_34b_chat import 
models as yi_1_5_34b_chat_model + + # from .models.deepseek.hf_deepseek_v2_chat import models as deepseek_v2_chat_model + + # from .models.qwen.hf_qwen1_5_1_8b_chat import models as qwen1_5_1_8b_chat_model + # from .models.qwen.hf_qwen1_5_7b_chat import models as qwen1_5_7b_chat_model + # from .models.qwen.hf_qwen1_5_14b_chat import models as qwen1_5_14b_chat_model + # from .models.qwen.hf_qwen1_5_72b_chat import models as qwen1_5_72b_chat_model + + from .summarizers.charm_reason import summarizer models = sum([v for k, v in locals().items() if k.endswith('_model')], []) -work_dir = './outputs/CHARM/chat/' +work_dir = './outputs/CHARM_rea/chat/' # dataset version metric mode internlm2-chat-7b-turbomind # ------------------------------------------------------------- --------- ------------- ------ ----------------------------- diff --git a/configs/models/gemini/gemini_1_5_flash.py b/configs/models/gemini/gemini_1_5_flash.py new file mode 100644 index 00000000..79eea6a7 --- /dev/null +++ b/configs/models/gemini/gemini_1_5_flash.py @@ -0,0 +1,22 @@ +from opencompass.models import Gemini + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +], ) + +models = [ + dict( + abbr='gemini-1.5-flash', + type=Gemini, + path='gemini-1.5-flash', + key= + 'ENV', # The key will be obtained from $GEMINI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=15, + max_out_len=100, + max_seq_len=2048, + batch_size=1, + temperature=1, + ) +] diff --git a/configs/models/gemini/gemini_1_5_pro.py b/configs/models/gemini/gemini_1_5_pro.py new file mode 100644 index 00000000..1734849b --- /dev/null +++ b/configs/models/gemini/gemini_1_5_pro.py @@ -0,0 +1,22 @@ +from opencompass.models import Gemini + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +], ) + +models = [ + dict( + 
abbr='gemini-1.5-pro', + type=Gemini, + path='gemini-1.5-pro', + key= + 'ENV', # The key will be obtained from $GEMINI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=2, + max_out_len=100, + max_seq_len=2048, + batch_size=1, + temperature=1, + ) +] diff --git a/configs/models/gemini/gemini_pro.py b/configs/models/gemini/gemini_pro.py index 871e02b6..ff2124bc 100644 --- a/configs/models/gemini/gemini_pro.py +++ b/configs/models/gemini/gemini_pro.py @@ -12,8 +12,7 @@ models = [ dict(abbr='gemini', type=Gemini, path='gemini-pro', - key='your keys', # The key will be obtained from Environment, but you can write down your key here as well - url = 'your url', + key='ENV', # The key will be obtained from $GEMINI_API_KEY, but you can write down your key here as well meta_template=api_meta_template, query_per_second=16, max_out_len=100, diff --git a/configs/models/openai/gpt_3_5_turbo_0125.py b/configs/models/openai/gpt_3_5_turbo_0125.py new file mode 100644 index 00000000..efab611b --- /dev/null +++ b/configs/models/openai/gpt_3_5_turbo_0125.py @@ -0,0 +1,20 @@ +from opencompass.models import OpenAI + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +], ) + +models = [ + dict( + abbr='GPT-3.5-turbo-0125', + type=OpenAI, + path='gpt-3.5-turbo-0125', + key= + 'ENV', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=8), +] diff --git a/configs/models/openai/gpt_4o_2024_05_13.py b/configs/models/openai/gpt_4o_2024_05_13.py new file mode 100644 index 00000000..e742f8a4 --- /dev/null +++ b/configs/models/openai/gpt_4o_2024_05_13.py @@ -0,0 +1,20 @@ +from opencompass.models import OpenAI + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', 
api_role='BOT', generate=True), +], ) + +models = [ + dict( + abbr='GPT-4o-2024-05-13', + type=OpenAI, + path='gpt-4o-2024-05-13', + key= + 'ENV', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=8), +] diff --git a/opencompass/datasets/charm.py b/opencompass/datasets/charm.py index 652b0842..62e4ff04 100644 --- a/opencompass/datasets/charm.py +++ b/opencompass/datasets/charm.py @@ -1,12 +1,14 @@ import json import os.path as osp import re +from typing import List, Union from datasets import Dataset -from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.openicl.icl_evaluator import BaseEvaluator, LMEvaluator from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET, TEXT_POSTPROCESSORS) +from opencompass.utils import build_dataset_from_cfg from .base import BaseDataset @@ -44,6 +46,102 @@ class CharmReasonEvaluator(BaseEvaluator): return {'score': score, 'details': details} +UNCERTAIN_LIST = ['不确定', '无法确定', '无法回答', '不知道', '不认识'] + + +def charm_memory_eval(pred: str, ref: Union[str, List[str]]) -> str: + + for uncertain in UNCERTAIN_LIST: + if uncertain in pred: + return '[错误]' + + is_negative = False + if isinstance(ref, str): + if ref.startswith('[not]'): + # 部分CHARM记忆题目的ref是"[not]xxx" + # 即xxx是一个负例,pred中不应该出现xxx + # 例如:https://github.com/opendatalab/CHARM/blob/v1.0/data/CHARM/memorization/Chinese_Movie_and_Music_Recommendation.json#L45 + is_negative = True + + ref = ref[5:] # 去掉[not],保留xxx + references = [ref] + else: + references = ref # 部分CHARM记忆题目的ref是List[str] + assert isinstance(references, list) + + for r in references: + if r in pred: # pred中包含ref + if is_negative: + return '[错误]' + else: + return '[正确]' + + if is_negative: # 已验证pred中不包含ref,且ref是负例,所以pred是正确的 + return '[正确]' + else: + return '[错误]' + + +class CharmMemoryEvaluator(LMEvaluator): + 
"""本Evaluator是基于规则评判CHARM记忆题目的回答是否正确, + 只用于Chinese_Movie_and_Music_Recommendation这一个任务的评判。 + 由于CHARM其他的记忆任务需要使用LLM作为judge(使用LMEvaluator),因而整个eval使用的是SubjectiveEvalTask。 + 因此,本Evaluator的输入输出与LMEvaluator一致。""" + + def __init__(self, prompt_template=None, *nargs, **kwargs): + + if prompt_template is None: + prompt_template = dict( + type='PromptTemplate', + template=dict( + round=[dict(role='HUMAN', prompt='')])) # useless + + super().__init__(prompt_template, *nargs, **kwargs) + + def score(self, predictions, references, **kwargs): + + assert isinstance(predictions, dict) # single-model scoring + references = [{} for _ in range(len(predictions[0]['model_preds'])) + ] if references is None else references + predictions = predictions['model_preds'] + + if len(predictions) != len(references): + return { + 'error': 'predictions and references have different ' + 'length' + } + + eval_results = [ + charm_memory_eval(pred, ref) + for pred, ref in zip(predictions, references) + ] + + dataset = None + if self.dataset_cfg: + dataset = build_dataset_from_cfg(self.dataset_cfg) + + output = dict() + for i in range(len(predictions)): + if dataset is not None: + question = '' + for col in dataset.reader.input_columns: + question += dataset.reader['test'][col][i] + '\n' + output[str(i)] = { + 'origin_prompt': [{ + 'role': + 'HUMAN', + 'prompt': + f"[Question]: {question}[Assistant's Answer]: {predictions[i]}" # noqa + }], + 'prediction': + eval_results[i], + 'gold': + references[i], + } + + return output + + @LOAD_DATASET.register_module() class CharmDataset(BaseDataset): diff --git a/opencompass/models/gemini_api.py b/opencompass/models/gemini_api.py index 0e9b089f..7695b218 100644 --- a/opencompass/models/gemini_api.py +++ b/opencompass/models/gemini_api.py @@ -1,5 +1,6 @@ # flake8: noqa: E501 import json +import os import time from concurrent.futures import ThreadPoolExecutor from typing import Dict, List, Optional, Union @@ -48,7 +49,18 @@ class Gemini(BaseAPIModel): 
query_per_second=query_per_second, meta_template=meta_template, retry=retry) - self.url = f'https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?key={key}' + assert isinstance(key, str) + if key == 'ENV': + if 'GEMINI_API_KEY' not in os.environ: + raise ValueError('GEMINI API key is not set.') + key = os.getenv('GEMINI_API_KEY') + + assert path in [ + 'gemini-1.0-pro', 'gemini-pro', 'gemini-1.5-flash', + 'gemini-1.5-pro' + ] # https://ai.google.dev/gemini-api/docs/models/gemini#model-variations + + self.url = f'https://generativelanguage.googleapis.com/v1beta/models/{path}:generateContent?key={key}' self.temperature = temperature self.top_p = top_p self.top_k = top_k @@ -171,17 +183,20 @@ class Gemini(BaseAPIModel): str(raw_response.content)) time.sleep(1) continue - if raw_response.status_code == 200 and response['msg'] == 'ok': - body = response['body'] - if 'candidates' not in body: + if raw_response.status_code == 200: + if 'candidates' not in response: self.logger.error(response) else: - if 'content' not in body['candidates'][0]: + if 'content' not in response['candidates'][0]: return "Due to Google's restrictive policies, I am unable to respond to this question." 
else: - return body['candidates'][0]['content']['parts'][0][ - 'text'].strip() - self.logger.error(response['msg']) + return response['candidates'][0]['content']['parts'][ + 0]['text'].strip() + try: + msg = response['error']['message'] + self.logger.error(msg) + except KeyError: + pass self.logger.error(response) time.sleep(1) diff --git a/opencompass/summarizers/subjective/__init__.py b/opencompass/summarizers/subjective/__init__.py index 67e19e64..f6093566 100644 --- a/opencompass/summarizers/subjective/__init__.py +++ b/opencompass/summarizers/subjective/__init__.py @@ -3,6 +3,7 @@ from .alignmentbench import AlignmentBenchSummarizer from .all_obj import AllObjSummarizer from .alpacaeval import AlpacaSummarizer from .arenahard import ArenaHardSummarizer +from .charm import CharmMemSummarizer from .compass_arena import CompassArenaSummarizer from .compassbench import CompassBenchSummarizer from .corev2 import Corev2Summarizer diff --git a/opencompass/summarizers/subjective/charm.py b/opencompass/summarizers/subjective/charm.py new file mode 100644 index 00000000..c9c3fed6 --- /dev/null +++ b/opencompass/summarizers/subjective/charm.py @@ -0,0 +1,208 @@ +# flake8: noqa: E501 +import csv +import json +import os +import os.path as osp +import re +from collections import defaultdict +from datetime import datetime + +import mmengine +import numpy as np +import pandas as pd +from mmengine import ConfigDict +from prettytable import from_csv + +from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg, + model_abbr_from_cfg) + +from .utils import get_outdir + + +def post_process_charm_mem(judgement: str): + """Input a string like below: + + xxx[correct]xxx, and extract the judge + """ + pattern = r'(?i)\[(incorrect|correct|正确|错误|Yes|No)\]' + matched_result = re.findall(pattern, judgement) + if matched_result: + content = matched_result[0].lower() + if content in ['correct', '正确', 'yes']: + return {'correct': True} + elif content in ['incorrect', '错误', 
'no']: + return {'correct': False} + else: + return None + + +def get_judgeanswer_and_reference_charm_mem(dataset, subdir_path, + post_process): + """Extract judgements (scores), references and original judging prompts. + + Args: + dataset (ConfigDict): Dataset config. + subdir_path (str): Model path in results dir. + post_process (function): The pre-defined extract function. + """ + dataset_abbr = dataset_abbr_from_cfg(dataset) + filename = osp.join(subdir_path, dataset_abbr + '.json') + partial_filename = osp.join(subdir_path, dataset_abbr + '_0.json') + if osp.exists(osp.realpath(filename)): + result = mmengine.load(filename) + elif osp.exists(osp.realpath(partial_filename)): + filename = partial_filename + result = {} + i = 1 + partial_dict_flag = 0 + while osp.exists(osp.realpath(filename)): + res = mmengine.load(filename) + for k, v in res.items(): + result[partial_dict_flag] = v + partial_dict_flag += 1 + filename = osp.join(subdir_path, + dataset_abbr + '_' + str(i) + '.json') + i += 1 + else: + result = {} + + if len(result) == 0: + print('*' * 100) + print('There are no results for ' + filename + ' or ' + + partial_filename) + print('*' * 100) + assert len(result) > 0 + + judging_prompts = [] + judged_answers = [] + references = [] + for k, v in result.items(): + processed_judge = post_process(v['prediction']) + if processed_judge is not None: + judged_answers.append(processed_judge) + references.append(v['gold']) + judging_origin_prompts = v['origin_prompt'] + if len(judging_origin_prompts) > 0: + judging_prompts.append(judging_origin_prompts[0].get( + 'prompt', None)) + if len(judged_answers) != len(result): + print( + f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements, please check!' + ) + if len(judged_answers) == 0: + print('*' * 100) + print( + 'There are no extracted judgements, please change your judge model or check your prompt!!!' 
+ ) + print('*' * 100) + assert len(judged_answers) > 0 + return judged_answers, references, judging_prompts + + +def get_accuracy(judged_answers): + n_total = 0 + n_correct = 0 + for ans in judged_answers: + if ans.get('correct', False): + n_correct += 1 + n_total += 1 + + return round(n_correct / n_total * 100, 2) + + +class CharmMemSummarizer: + """Do the subjectivity analyze based on evaluation results. + + Args: + config (ConfigDict): The configuration object of the evaluation task. + It's expected to be filled out at runtime. + """ + + def __init__(self, config: ConfigDict, judge_type='single') -> None: + self.judge_type = judge_type + self.tasks = [] + self.cfg = config + if self.judge_type == 'single': + self.eval_model_cfgs = self.cfg['eval']['partitioner']['models'] + self.eval_model_abbrs = [ + model_abbr_from_cfg(model) for model in self.eval_model_cfgs + ] + else: + raise NotImplementedError + + self.judge_abbr = model_abbr_from_cfg( + self.cfg['eval']['partitioner']['judge_models'][0]) + self.judge_map = {'single': post_process_charm_mem} + self.judge_function = self.judge_map[self.judge_type] + + def summarize(self, + time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')): + """Summarize the subjectivity analysis based on evaluation results. + + Args: + time_str (str): Timestamp for file naming. + + Returns: + pd.DataFrame: The summary results. 
+ """ + if self.judge_type == 'single': + dataset_cfgs = self.cfg['datasets'] + judge_model = self.judge_abbr + output_dir, results_folder = get_outdir(self.cfg, time_str) + + accuracy_df = pd.DataFrame(columns=self.eval_model_abbrs) + for dataset in dataset_cfgs: + dataset_abbr = dataset_abbr_from_cfg(dataset) + dataset_instance = build_dataset_from_cfg(dataset) + out_dir = osp.join( + output_dir, + 'judged-by--' + judge_model + '-' + dataset_abbr) + os.makedirs(out_dir, exist_ok=True) + + cur_acc_dict = {'dataset': dataset_abbr} + for eval_model_abbr in self.eval_model_abbrs: + subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr + subdir_path = os.path.join(results_folder, subdir) + if os.path.isdir(subdir_path): + model = eval_model_abbr + (judged_answers, references, judging_prompts + ) = get_judgeanswer_and_reference_charm_mem( + dataset, + subdir_path, + self.judge_function, + ) + accuracy = get_accuracy(judged_answers) + cur_acc_dict[eval_model_abbr] = accuracy + + detail_dict = {} + for i in range(len(judged_answers)): + cur_dict = {} + cur_dict['judging_prompt'] = judging_prompts[i] + for input_col in dataset_instance.reader.input_columns: + cur_dict[input_col] = dataset_instance.reader[ + 'test'][input_col][i] + cur_dict['reference'] = references[i] + cur_dict.update(judged_answers[i]) + + detail_dict[str(i)] = cur_dict + + out_dict = {'score': accuracy, 'details': detail_dict} + fout = osp.join(out_dir, model + '.json') + with open(fout, 'w', encoding='utf-8') as f: + json.dump(out_dict, + f, + indent=4, + ensure_ascii=False) + else: + print(subdir_path + ' is not exist! 
please check!') + + accuracy_df = accuracy_df.append(cur_acc_dict, + ignore_index=True) + accuracy_df.set_index('dataset', inplace=True) + + accuracy_file = osp.join(output_dir, + 'judged-by--' + judge_model + '.csv') + accuracy_df.to_csv(accuracy_file, index=True) + with open(accuracy_file, 'r') as f: + x = from_csv(f) + print(x) From edab1c07baaebe3e6b6b48983d56656f6c93fd76 Mon Sep 17 00:00:00 2001 From: "Xingjun.Wang" Date: Mon, 29 Jul 2024 13:48:32 +0800 Subject: [PATCH 2/2] [Feature] Support ModelScope datasets (#1289) * add ceval, gsm8k modelscope surpport * update race, mmlu, arc, cmmlu, commonsenseqa, humaneval and unittest * update bbh, flores, obqa, siqa, storycloze, summedits, winogrande, xsum datasets * format file * format file * update dataset format * support ms_dataset * udpate dataset for modelscope support * merge myl_dev and update test_ms_dataset * udpate dataset for modelscope support * update readme * update eval_api_zhipu_v2 * remove unused code * add get_data_path function * update readme * remove tydiqa japanese subset * add ceval, gsm8k modelscope surpport * update race, mmlu, arc, cmmlu, commonsenseqa, humaneval and unittest * update bbh, flores, obqa, siqa, storycloze, summedits, winogrande, xsum datasets * format file * format file * update dataset format * support ms_dataset * udpate dataset for modelscope support * merge myl_dev and update test_ms_dataset * update readme * udpate dataset for modelscope support * update eval_api_zhipu_v2 * remove unused code * add get_data_path function * remove tydiqa japanese subset * update util * remove .DS_Store * fix md format * move util into package * update docs/get_started.md * restore eval_api_zhipu_v2.py, add environment setting * Update dataset * Update * Update * Update * Update --------- Co-authored-by: Yun lin Co-authored-by: Yunnglin Co-authored-by: Yun lin Co-authored-by: Yunnglin Co-authored-by: zhangsongyang --- .gitignore | 2 +- .pre-commit-config.yaml | 1 + README.md | 18 ++ 
README_zh-CN.md | 17 + configs/datasets/ARC_c/ARC_c_clean_ppl.py | 3 +- configs/datasets/ARC_c/ARC_c_gen_1e0de5.py | 3 +- configs/datasets/ARC_c/ARC_c_ppl_2ef631.py | 3 +- configs/datasets/ARC_c/ARC_c_ppl_a450bd.py | 3 +- configs/datasets/ARC_c/ARC_c_ppl_d52a21.py | 5 +- configs/datasets/ARC_e/ARC_e_gen_1e0de5.py | 3 +- configs/datasets/ARC_e/ARC_e_ppl_2ef631.py | 3 +- configs/datasets/ARC_e/ARC_e_ppl_a450bd.py | 3 +- configs/datasets/ARC_e/ARC_e_ppl_d52a21.py | 3 +- .../CLUE_CMRC/CLUE_CMRC_gen_1bd3c8.py | 2 +- .../CLUE_CMRC/CLUE_CMRC_gen_3749cd.py | 2 +- .../CLUE_CMRC/CLUE_CMRC_gen_8484b9.py | 2 +- .../CLUE_CMRC/CLUE_CMRC_gen_941108.py | 2 +- .../CLUE_DRCD/CLUE_DRCD_gen_1bd3c8.py | 2 +- .../CLUE_DRCD/CLUE_DRCD_gen_3749cd.py | 2 +- .../CLUE_DRCD/CLUE_DRCD_gen_8484b9.py | 2 +- .../CLUE_DRCD/CLUE_DRCD_gen_941108.py | 2 +- .../CLUE_afqmc/CLUE_afqmc_gen_901306.py | 6 +- .../CLUE_cmnli/CLUE_cmnli_gen_1abf97.py | 6 +- .../CLUE_cmnli/CLUE_cmnli_gen_51e956.py | 6 +- .../CLUE_cmnli/CLUE_cmnli_ppl_98dd6e.py | 6 +- .../CLUE_cmnli/CLUE_cmnli_ppl_ef69e7.py | 6 +- .../CLUE_cmnli/CLUE_cmnli_ppl_fdc6de.py | 6 +- .../CLUE_ocnli/CLUE_ocnli_gen_51e956.py | 6 +- .../CLUE_ocnli/CLUE_ocnli_gen_c4cb6c.py | 6 +- configs/datasets/ChemBench/ChemBench_gen.py | 2 +- .../FewCLUE_bustm/FewCLUE_bustm_gen_634f41.py | 8 +- .../FewCLUE_chid/FewCLUE_chid_gen_0a29a2.py | 6 +- .../FewCLUE_cluewsc_gen_c68933.py | 6 +- .../FewCLUE_csl/FewCLUE_csl_gen_28b223.py | 6 +- .../FewCLUE_csl/FewCLUE_csl_gen_87f4a8.py | 6 +- .../FewCLUE_eprstmt_gen_740ea0.py | 6 +- .../FewCLUE_ocnli_fc_gen_f97a97.py | 8 +- .../FewCLUE_tnews/FewCLUE_tnews_gen_b90e4a.py | 6 +- .../GaokaoBench/GaokaoBench_gen_5cfe9e.py | 2 + .../GaokaoBench/GaokaoBench_mixed_9af5ee.py | 3 +- .../GaokaoBench_no_subjective_gen_4c31db.py | 1 + .../GaokaoBench_no_subjective_gen_d21e37.py | 1 + .../NPHardEval/NPHardEval_gen_22aac5.py | 36 +-- .../OpenFinData/{OpenFinData.md => README.md} | 0 .../SuperGLUE_AX_b_gen_4dfefa.py | 4 +- 
.../SuperGLUE_AX_g_gen_68aac7.py | 4 +- .../SuperGLUE_BoolQ_gen_883d50.py | 4 +- .../SuperGLUE_BoolQ_ppl_314797.py | 4 +- .../SuperGLUE_CB/SuperGLUE_CB_gen_854c6c.py | 4 +- .../SuperGLUE_COPA_gen_91ca53.py | 4 +- .../SuperGLUE_MultiRC_gen_27071f.py | 4 +- .../SuperGLUE_RTE/SuperGLUE_RTE_gen_68aac7.py | 4 +- .../SuperGLUE_ReCoRD_gen_a69961.py | 4 +- .../SuperGLUE_WSC/SuperGLUE_WSC_gen_7902a7.py | 4 +- .../SuperGLUE_WSC/SuperGLUE_WSC_gen_fe4bf3.py | 4 +- .../SuperGLUE_WSC/SuperGLUE_WSC_ppl_1c4a90.py | 4 +- .../SuperGLUE_WSC/SuperGLUE_WSC_ppl_d0f531.py | 4 +- .../SuperGLUE_WiC/SuperGLUE_WiC_gen_d06864.py | 4 +- configs/datasets/Xsum/Xsum_gen_31397e.py | 2 +- configs/datasets/Xsum/Xsum_gen_8ea5f8.py | 2 +- .../adv_glue_mnli/adv_glue_mnli_gen_bd8ef0.py | 2 +- .../adv_glue_mnli_mm_gen_bd8ef0.py | 2 +- .../adv_glue_qnli/adv_glue_qnli_gen_0b7326.py | 2 +- .../adv_glue_qqp/adv_glue_qqp_gen_cdc277.py | 2 +- .../adv_glue_rte/adv_glue_rte_gen_8cc547.py | 2 +- .../adv_glue_sst2/adv_glue_sst2_gen_ee8d3b.py | 2 +- .../datasets/agieval/agieval_gen_397d81.py | 6 +- .../datasets/agieval/agieval_gen_617738.py | 6 +- .../datasets/agieval/agieval_gen_64afd3.py | 6 +- .../datasets/agieval/agieval_gen_a0c741.py | 4 +- .../datasets/agieval/agieval_mixed_0fa998.py | 6 +- configs/datasets/bbh/bbh_gen_2879b0.py | 2 +- configs/datasets/bbh/bbh_gen_5b92b0.py | 4 +- configs/datasets/bbh/bbh_gen_5bf00b.py | 4 +- configs/datasets/bbh/bbh_gen_98fba6.py | 4 +- configs/datasets/ceval/ceval_clean_ppl.py | 3 +- configs/datasets/ceval/ceval_gen_2daf24.py | 2 +- configs/datasets/ceval/ceval_gen_5f30c7.py | 3 +- .../ceval/ceval_internal_ppl_1cd8bf.py | 3 +- configs/datasets/ceval/ceval_ppl_1cd8bf.py | 3 +- configs/datasets/ceval/ceval_ppl_578f8d.py | 3 +- configs/datasets/ceval/ceval_ppl_93e5ce.py | 3 +- .../ceval/ceval_zero_shot_gen_bd40ef.py | 3 +- .../clozeTest_maxmin_gen_c205fb.py | 4 +- configs/datasets/cmmlu/cmmlu_gen_c13365.py | 3 +- configs/datasets/cmmlu/cmmlu_ppl_041cbf.py | 2 +- 
configs/datasets/cmmlu/cmmlu_ppl_8b9c76.py | 2 +- .../commonsenseqa/commonsenseqa_gen_1da2d0.py | 2 +- .../commonsenseqa/commonsenseqa_gen_c946f2.py | 2 +- .../commonsenseqa/commonsenseqa_ppl_3e9f2d.py | 2 +- .../commonsenseqa/commonsenseqa_ppl_5545e2.py | 2 +- .../commonsenseqa/commonsenseqa_ppl_716f78.py | 2 +- .../commonsenseqa/commonsenseqa_ppl_c49e77.py | 2 +- .../commonsenseqa/commonsenseqa_ppl_e51e32.py | 2 +- .../compassbench_v1_knowledge_gen_bd74e0.py | 4 +- .../compassbench_v1_knowledge_gen_bd74e0.py | 4 +- .../mbpp_contamination_ppl_f01cb6.py | 2 +- .../crowspairs/crowspairs_gen_02b6c1.py | 4 +- .../crowspairs/crowspairs_gen_381af0.py | 4 +- .../crowspairs/crowspairs_ppl_47f211.py | 4 +- .../crowspairs/crowspairs_ppl_e811e1.py | 4 +- .../crowspairs_cn/crowspairscn_gen_556dc9.py | 4 +- .../crowspairs_cn/crowspairscn_ppl_f53575.py | 4 +- configs/datasets/flores/flores_gen_806ede.py | 2 +- configs/datasets/flores/flores_gen_aad4fd.py | 2 +- .../gpqa_openai_simple_evals_gen_5aeece.py | 4 +- .../datasets/gsm8k/gsm8k_0shot_gen_a58960.py | 2 +- .../gsm8k/gsm8k_0shot_v2_gen_a58960.py | 2 +- .../datasets/gsm8k/gsm8k_agent_gen_c3dff3.py | 2 +- configs/datasets/gsm8k/gsm8k_gen_17d0dc.py | 3 +- configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py | 3 +- configs/datasets/gsm8k/gsm8k_gen_1dce88.py | 3 +- configs/datasets/gsm8k/gsm8k_gen_3309bd.py | 3 +- configs/datasets/gsm8k/gsm8k_gen_57b0b1.py | 3 +- configs/datasets/gsm8k/gsm8k_gen_701491.py | 3 +- configs/datasets/gsm8k/gsm8k_gen_a3e34a.py | 3 +- configs/datasets/gsm8k/gsm8k_gen_d6de81.py | 3 +- configs/datasets/gsm8k/gsm8k_gen_e9e91e.py | 3 +- configs/datasets/gsm8k/gsm8k_gen_ee684f.py | 3 +- .../hellaswag/hellaswag_10shot_gen_e42710.py | 6 +- .../hellaswag/hellaswag_10shot_ppl_59c85e.py | 6 +- .../datasets/hellaswag/hellaswag_clean_ppl.py | 6 +- .../hellaswag/hellaswag_gen_6faab5.py | 6 +- .../hellaswag/hellaswag_ppl_47bff9.py | 6 +- .../hellaswag/hellaswag_ppl_7d7f2d.py | 6 +- .../hellaswag/hellaswag_ppl_9dbb12.py | 6 +- 
.../hellaswag/hellaswag_ppl_a6e128.py | 6 +- .../deprecated_humaneval_gen_4a6eef.py | 2 +- .../deprecated_humaneval_gen_6d1cc2.py | 2 +- .../deprecated_humaneval_gen_a82cae.py | 2 +- .../deprecated_humaneval_gen_d2537e.py | 2 +- .../deprecated_humaneval_gen_fd5822.py | 2 +- .../deprecated_humaneval_gen_ff7054.py | 2 +- .../humaneval/humaneval_gen_66a7f4.py | 2 +- .../humaneval/humaneval_gen_8e312c.py | 2 +- ...umaneval_openai_sample_evals_gen_159614.py | 2 +- .../humaneval/humaneval_passk_gen_8e312c.py | 2 +- .../humaneval_repeat10_gen_8e312c.py | 2 +- .../humaneval_cn/humaneval_cn_gen_6313aa.py | 2 +- .../humaneval_cn_passk_gen_6313aa.py | 2 +- .../humaneval_cn_repeat10_gen_6313aa.py | 2 +- .../humaneval_plus_gen_66a7f4.py | 2 +- .../humaneval_plus_gen_8e312c.py | 2 +- .../humaneval_plus_passk_gen_8e312c.py | 2 +- .../humaneval_plus_repeat10_gen_8e312c.py | 2 +- .../datasets/lambada/lambada_gen_217e11.py | 2 +- .../datasets/lambada/lambada_gen_8b48a5.py | 2 +- configs/datasets/lcsts/lcsts_gen_8ee1fe.py | 2 +- configs/datasets/lcsts/lcsts_gen_9b0b89.py | 2 +- .../datasets/math/math_0shot_gen_393424.py | 2 +- .../math/math_4shot_base_gen_db136b.py | 2 +- .../math/math_agent_evaluatorv2_gen_0c1b4e.py | 3 +- .../datasets/math/math_agent_gen_0c1b4e.py | 3 +- .../datasets/math/math_agent_gen_861b4f.py | 3 +- .../datasets/math/math_agent_gen_af2293.py | 3 +- .../math/math_evaluatorv2_gen_2f4a71.py | 2 +- .../math/math_evaluatorv2_gen_cecb31.py | 2 +- configs/datasets/math/math_gen_0957ff.py | 2 +- configs/datasets/math/math_gen_1ed9c2.py | 2 +- configs/datasets/math/math_gen_265cce.py | 2 +- configs/datasets/math/math_gen_559593.py | 2 +- configs/datasets/math/math_gen_5e8458.py | 2 +- configs/datasets/math/math_gen_736506.py | 2 +- configs/datasets/math/math_gen_78ced2.py | 2 +- configs/datasets/math/math_gen_943d32.py | 2 +- .../math/math_intern_evaluator_gen_265cce.py | 2 +- configs/datasets/math/math_llm_judge.py | 2 +- .../mbpp/deprecated_mbpp_passk_gen_1e1056.py | 
4 +- .../deprecated_mbpp_repeat10_gen_1e1056.py | 4 +- configs/datasets/mbpp/mbpp_gen_830460.py | 2 +- .../datasets/mbpp/mbpp_passk_gen_830460.py | 6 +- .../datasets/mbpp/mbpp_repeat10_gen_830460.py | 6 +- .../mbpp/sanitized_mbpp_gen_742f0c.py | 2 +- .../mbpp/sanitized_mbpp_gen_830460.py | 2 +- .../mbpp/sanitized_mbpp_gen_a0fc46.py | 2 +- .../mbpp/sanitized_mbpp_mdblock_gen_a447ff.py | 2 +- .../mbpp/sanitized_mbpp_passk_gen_830460.py | 2 +- .../sanitized_mbpp_repeat10_gen_830460.py | 2 +- .../deprecated_mbpp_cn_passk_gen_1d1481.py | 4 +- .../deprecated_mbpp_cn_repeat10_gen_1d1481.py | 4 +- .../datasets/mbpp_cn/mbpp_cn_gen_9114d5.py | 1 + configs/datasets/mmlu/mmlu_clean_ppl.py | 2 +- configs/datasets/mmlu/mmlu_gen_23a9a9.py | 2 +- configs/datasets/mmlu/mmlu_gen_4d595a.py | 2 +- configs/datasets/mmlu/mmlu_gen_5d1409.py | 2 +- configs/datasets/mmlu/mmlu_gen_79e572.py | 2 +- configs/datasets/mmlu/mmlu_gen_a484b3.py | 2 +- .../mmlu_openai_simple_evals_gen_b618ea.py | 4 +- configs/datasets/mmlu/mmlu_ppl_ac766d.py | 2 +- .../mmlu/mmlu_zero_shot_gen_47e2c0.py | 2 +- configs/datasets/nq/nq_gen_0356ec.py | 2 +- configs/datasets/nq/nq_gen_2463e2.py | 2 +- configs/datasets/nq/nq_gen_3dcea1.py | 2 +- configs/datasets/nq/nq_gen_68c1c6.py | 2 +- configs/datasets/nq/nq_gen_c788f6.py | 2 +- configs/datasets/nq_cn/nqcn_gen_141737.py | 6 +- configs/datasets/obqa/obqa_gen_9069e4.py | 6 +- configs/datasets/obqa/obqa_ppl_1defe8.py | 6 +- configs/datasets/obqa/obqa_ppl_6aac9e.py | 7 +- configs/datasets/obqa/obqa_ppl_c7c154.py | 6 +- configs/datasets/piqa/piqa_gen_1194eb.py | 6 +- configs/datasets/piqa/piqa_ppl_0cfff2.py | 6 +- configs/datasets/piqa/piqa_ppl_1cf9f0.py | 6 +- configs/datasets/piqa/piqa_ppl_3431ea.py | 6 +- .../promptbench_math_gen_abf776.py | 2 +- configs/datasets/py150/py150_gen_38b13d.py | 1 + configs/datasets/race/race_gen_69ee4f.py | 4 +- configs/datasets/race/race_gen_9302a5.py | 4 +- configs/datasets/race/race_ppl_5831a0.py | 4 +- 
configs/datasets/race/race_ppl_a138cd.py | 4 +- configs/datasets/race/race_ppl_abed12.py | 4 +- configs/datasets/s3eval/s3eval_gen.py | 2 +- ...val_gen_370cc2.py => s3eval_gen_b8ac80.py} | 2 +- configs/datasets/siqa/siqa_gen_18632c.py | 6 +- configs/datasets/siqa/siqa_gen_e78df3.py | 2 +- configs/datasets/siqa/siqa_ppl_42bc6e.py | 2 +- configs/datasets/siqa/siqa_ppl_7845b0.py | 2 +- configs/datasets/siqa/siqa_ppl_ced5f6.py | 2 +- configs/datasets/siqa/siqa_ppl_e8d8c5.py | 2 +- .../storycloze/storycloze_gen_7f656a.py | 6 +- .../storycloze/storycloze_ppl_496661.py | 6 +- .../storycloze/storycloze_ppl_afd16f.py | 6 +- .../strategyqa/strategyqa_gen_1180a7.py | 2 +- .../strategyqa/strategyqa_gen_934441.py | 2 +- .../summedits/summedits_gen_315438.py | 2 +- .../summedits/summedits_gen_4fb38b.py | 2 +- .../datasets/triviaqa/triviaqa_gen_0356ec.py | 3 +- .../datasets/triviaqa/triviaqa_gen_2121ce.py | 2 +- .../datasets/triviaqa/triviaqa_gen_3e39a5.py | 2 +- .../datasets/triviaqa/triviaqa_gen_429db5.py | 2 +- .../datasets/triviaqa/triviaqa_gen_d297bb.py | 2 +- .../triviaqa_wiki_1shot_gen_20a989.py | 6 +- .../triviaqa_wiki_1shot_gen_bc5f21.py | 6 +- .../triviaqa_wiki_1shot_gen_eaf81e.py | 6 +- .../triviaqa/triviaqa_wiki_gen_d18bf4.py | 6 +- configs/datasets/tydiqa/tydiqa_gen_978d2a.py | 7 +- .../datasets/winograd/winograd_ppl_8f3049.py | 4 +- .../datasets/winograd/winograd_ppl_b6c7ed.py | 4 +- .../deprecated_winogrande_gen_a9ede5.py | 4 +- .../winogrande/winogrande_5shot_gen_6447e6.py | 6 +- .../winogrande/winogrande_5shot_gen_b36770.py | 6 +- .../winogrande/winogrande_5shot_ll_252f01.py | 6 +- .../winogrande/winogrande_gen_458220.py | 6 +- .../winogrande/winogrande_gen_a027b6.py | 6 +- .../winogrande/winogrande_ll_c5cf57.py | 6 +- .../winogrande/winogrande_ppl_55a66e.py | 6 +- .../winogrande/winogrande_ppl_9307fd.py | 6 +- configs/datasets/z_bench/z_bench_gen.py | 4 - .../datasets/z_bench/z_bench_gen_52ba2f.py | 25 -- .../datasets/z_bench/z_bench_gen_d8c84c.py | 28 -- 
configs/eval_modelscope_datasets.py | 82 +++++ docs/en/advanced_guides/code_eval.md | 8 +- docs/en/get_started/installation.md | 14 +- docs/en/user_guides/datasets.md | 2 +- docs/zh_cn/advanced_guides/code_eval.md | 8 +- docs/zh_cn/get_started/installation.md | 12 +- docs/zh_cn/user_guides/datasets.md | 2 +- opencompass/datasets/FinanceIQ.py | 2 + opencompass/datasets/GaokaoBench.py | 15 +- opencompass/datasets/IFEval/ifeval.py | 2 + opencompass/datasets/MMLUArabic.py | 2 + opencompass/datasets/NPHardEval/cmp_GCP_D.py | 6 +- opencompass/datasets/NPHardEval/cmp_KSP.py | 6 +- opencompass/datasets/NPHardEval/cmp_TSP_D.py | 6 +- opencompass/datasets/NPHardEval/hard_GCP.py | 6 +- opencompass/datasets/NPHardEval/hard_MSP.py | 6 +- opencompass/datasets/NPHardEval/hard_TSP.py | 6 +- opencompass/datasets/NPHardEval/p_BSP.py | 6 +- opencompass/datasets/NPHardEval/p_EDP.py | 6 +- opencompass/datasets/NPHardEval/p_SPP.py | 6 +- opencompass/datasets/OpenFinData.py | 2 + opencompass/datasets/QuALITY.py | 2 + opencompass/datasets/TheoremQA/legacy.py | 2 + opencompass/datasets/TheoremQA/main.py | 2 + opencompass/datasets/__init__.py | 1 + opencompass/datasets/advglue.py | 2 + opencompass/datasets/afqmcd.py | 26 +- opencompass/datasets/agieval/agieval.py | 60 +++- .../datasets/agieval/dataset_loader.py | 23 +- opencompass/datasets/arc.py | 131 ++++++-- opencompass/datasets/ax.py | 33 +- opencompass/datasets/bbh.py | 13 +- opencompass/datasets/boolq.py | 7 +- opencompass/datasets/cb.py | 4 +- opencompass/datasets/ceval.py | 113 ++++--- opencompass/datasets/chid.py | 7 +- opencompass/datasets/circular.py | 18 +- opencompass/datasets/clozeTest_maxmin.py | 3 + opencompass/datasets/cluewsc.py | 7 +- opencompass/datasets/cmb.py | 2 + opencompass/datasets/cmmlu.py | 55 +++- opencompass/datasets/cmnli.py | 59 +++- opencompass/datasets/cmrc.py | 2 + opencompass/datasets/commonsenseqa.py | 73 +++-- opencompass/datasets/commonsenseqa_cn.py | 3 + opencompass/datasets/copa.py | 4 +- 
opencompass/datasets/crowspairs.py | 4 +- opencompass/datasets/crowspairs_cn.py | 6 +- opencompass/datasets/csl.py | 7 +- opencompass/datasets/cvalues.py | 3 +- opencompass/datasets/drcd.py | 2 + opencompass/datasets/drop_simple_eval.py | 2 + opencompass/datasets/ds1000.py | 2 + opencompass/datasets/eprstmt.py | 4 +- opencompass/datasets/flames.py | 2 + opencompass/datasets/flores.py | 42 ++- opencompass/datasets/game24.py | 2 + opencompass/datasets/govrepcrs.py | 8 +- opencompass/datasets/gpqa.py | 6 +- opencompass/datasets/gsm8k.py | 28 +- opencompass/datasets/gsm_hard.py | 2 + opencompass/datasets/hellaswag.py | 196 +++++++++--- opencompass/datasets/huggingface.py | 4 + opencompass/datasets/humaneval.py | 25 +- opencompass/datasets/humaneval_multi.py | 2 + opencompass/datasets/humanevalx.py | 2 + opencompass/datasets/hungarian_math.py | 2 + opencompass/datasets/inference_ppl.py | 2 + .../infinitebench/infinitebench_codedebug.py | 2 + .../infinitebench/infinitebench_coderun.py | 2 + .../infinitebench/infinitebench_endia.py | 2 + .../infinitebench/infinitebench_enmc.py | 2 + .../infinitebench/infinitebench_enqa.py | 2 + .../infinitebench/infinitebench_ensum.py | 2 + .../infinitebench/infinitebench_mathcalc.py | 2 + .../infinitebench/infinitebench_mathfind.py | 2 + .../infinitebench/infinitebench_retrievekv.py | 2 + .../infinitebench_retrievenumber.py | 2 + .../infinitebench_retrievepasskey.py | 2 + .../infinitebench/infinitebench_zhqa.py | 2 + opencompass/datasets/jigsawmultilingual.py | 4 + opencompass/datasets/jsonl.py | 2 + opencompass/datasets/kaoshi.py | 2 + opencompass/datasets/lambada.py | 20 +- opencompass/datasets/lawbench/lawbench.py | 2 + opencompass/datasets/lcsts.py | 39 ++- opencompass/datasets/leval/leval_coursera.py | 4 + .../datasets/leval/leval_financial_qa.py | 4 + .../datasets/leval/leval_gov_report_summ.py | 4 + opencompass/datasets/leval/leval_gsm100.py | 4 + .../datasets/leval/leval_legal_contract_qa.py | 4 + 
.../datasets/leval/leval_meeting_summ.py | 4 + .../datasets/leval/leval_multidoc_qa.py | 4 + .../datasets/leval/leval_narrattive_qa.py | 4 + .../datasets/leval/leval_natural_question.py | 4 + opencompass/datasets/leval/leval_news_summ.py | 4 + .../datasets/leval/leval_paper_assistant.py | 4 + .../datasets/leval/leval_patent_summ.py | 4 + opencompass/datasets/leval/leval_quality.py | 4 + .../datasets/leval/leval_review_summ.py | 4 + .../datasets/leval/leval_scientific_qa.py | 4 + .../datasets/leval/leval_topic_retrieval.py | 4 + opencompass/datasets/leval/leval_tpo.py | 4 + .../datasets/leval/leval_tvshow_summ.py | 4 + opencompass/datasets/llm_compression.py | 2 + .../datasets/longbench/longbench_2wikim_qa.py | 4 + .../datasets/longbench/longbench_dureader.py | 4 + .../longbench/longbench_gov_report.py | 4 + .../datasets/longbench/longbench_hotpot_qa.py | 4 + .../datasets/longbench/longbench_lcc.py | 4 + .../datasets/longbench/longbench_lsht.py | 4 + .../longbench/longbench_multi_news.py | 4 + .../longbench/longbench_multifieldqa_en.py | 4 + .../longbench/longbench_multifieldqa_zh.py | 4 + .../datasets/longbench/longbench_musique.py | 4 + .../longbench/longbench_narrative_qa.py | 4 + .../longbench/longbench_passage_count.py | 4 + .../longbench_passage_retrieval_en.py | 4 + .../longbench_passage_retrieval_zh.py | 4 + .../datasets/longbench/longbench_qasper.py | 4 + .../datasets/longbench/longbench_qmsum.py | 4 + .../datasets/longbench/longbench_repobench.py | 4 + .../datasets/longbench/longbench_samsum.py | 4 + .../datasets/longbench/longbench_trec.py | 4 + .../datasets/longbench/longbench_trivia_qa.py | 4 + .../datasets/longbench/longbench_vcsum.py | 3 + .../datasets/lveval/lveval_cmrc_mixup.py | 4 + .../datasets/lveval/lveval_dureader_mixup.py | 4 + .../datasets/lveval/lveval_factrecall_en.py | 4 + .../datasets/lveval/lveval_factrecall_zh.py | 4 + .../lveval/lveval_hotpotwikiqa_mixup.py | 4 + .../datasets/lveval/lveval_lic_mixup.py | 4 + 
.../datasets/lveval/lveval_loogle_CR_mixup.py | 4 + .../lveval/lveval_loogle_MIR_mixup.py | 4 + .../datasets/lveval/lveval_loogle_SD_mixup.py | 4 + .../lveval/lveval_multifieldqa_en_mixup.py | 4 + .../lveval/lveval_multifieldqa_zh_mixup.py | 4 + opencompass/datasets/math.py | 30 +- opencompass/datasets/mathbench.py | 2 + opencompass/datasets/mbpp.py | 63 +++- opencompass/datasets/medbench/medbench.py | 2 + opencompass/datasets/mgsm.py | 2 + opencompass/datasets/mmlu.py | 137 +++++--- opencompass/datasets/multirc.py | 5 +- opencompass/datasets/narrativeqa.py | 2 + opencompass/datasets/natural_question.py | 35 +- opencompass/datasets/natural_question_cn.py | 6 +- opencompass/datasets/obqa.py | 77 +++-- opencompass/datasets/piqa.py | 108 +++++-- opencompass/datasets/py150.py | 2 + opencompass/datasets/qasper.py | 2 + opencompass/datasets/qaspercut.py | 2 + opencompass/datasets/race.py | 38 ++- opencompass/datasets/realtoxicprompts.py | 5 +- opencompass/datasets/record.py | 5 +- opencompass/datasets/rolebench.py | 4 + opencompass/datasets/safety.py | 2 + opencompass/datasets/scibench.py | 2 + opencompass/datasets/siqa.py | 105 +++++- opencompass/datasets/squad20.py | 2 + opencompass/datasets/storycloze.py | 50 ++- opencompass/datasets/strategyqa.py | 17 +- opencompass/datasets/summedits.py | 21 +- opencompass/datasets/summscreen.py | 2 + opencompass/datasets/svamp.py | 2 + opencompass/datasets/tabmwp.py | 2 + opencompass/datasets/taco.py | 2 + opencompass/datasets/teval/__init__.py | 2 + opencompass/datasets/tnews.py | 9 +- opencompass/datasets/triviaqa.py | 79 +++-- opencompass/datasets/triviaqarc.py | 2 + opencompass/datasets/tydiqa.py | 27 +- opencompass/datasets/wic.py | 7 +- opencompass/datasets/wikibench.py | 2 + opencompass/datasets/winograd.py | 2 +- opencompass/datasets/winogrande.py | 139 ++++++-- opencompass/datasets/wsc.py | 11 +- opencompass/datasets/xiezhi.py | 2 + opencompass/datasets/xsum.py | 40 ++- opencompass/utils/__init__.py | 1 + 
opencompass/utils/datasets.py | 301 ++++++++++++++++++ tests/dataset/test_local_datasets.py | 230 +++++++++++++ tests/dataset/test_ms_datasets.py | 223 +++++++++++++ 433 files changed, 3253 insertions(+), 1018 deletions(-) rename configs/datasets/OpenFinData/{OpenFinData.md => README.md} (100%) rename configs/datasets/s3eval/{s3eval_gen_370cc2.py => s3eval_gen_b8ac80.py} (86%) delete mode 100644 configs/datasets/z_bench/z_bench_gen.py delete mode 100644 configs/datasets/z_bench/z_bench_gen_52ba2f.py delete mode 100644 configs/datasets/z_bench/z_bench_gen_d8c84c.py create mode 100644 configs/eval_modelscope_datasets.py create mode 100644 opencompass/utils/datasets.py create mode 100644 tests/dataset/test_local_datasets.py create mode 100644 tests/dataset/test_ms_datasets.py diff --git a/.gitignore b/.gitignore index 8271d6c7..998d4af3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ - +.DS_Store output_*/ outputs/ scripts/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f5d9dcd2..d92e9de0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,7 @@ exclude: | (?x)^( tests/data/| + tests/dataset/| opencompass/models/internal/| opencompass/utils/internal/| opencompass/openicl/icl_evaluator/hf_metrics/| diff --git a/README.md b/README.md index 8390c030..1ca6c2ee 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,7 @@ Just like a compass guides us on our journey, OpenCompass will guide you through ## 🚀 What's New +- **\[2024.07.23\]** We supported the [ModelScope](www.modelscope.cn) datasets, you can load them on demand without downloading all the data to your local disk. Welcome to try! 🔥🔥🔥 - **\[2024.07.17\]** We have released the example data and configuration for the CompassBench-202408, welcome to [CompassBench](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/compassbench_intro.html) for more details. 
🔥🔥🔥 - **\[2024.07.17\]** We are excited to announce the release of NeedleBench's [technical report](http://arxiv.org/abs/2407.11963). We invite you to visit our [support documentation](https://opencompass.readthedocs.io/en/latest/advanced_guides/needleinahaystack_eval.html) for detailed evaluation guidelines. 🔥🔥🔥 - **\[2024.07.04\]** OpenCompass now supports InternLM2.5, which has **outstanding reasoning capability**, **1M Context window and** and **stronger tool use**, you can try the models in [OpenCompass Config](https://github.com/open-compass/opencompass/tree/main/configs/models/hf_internlm) and [InternLM](https://github.com/InternLM/InternLM) .🔥🔥🔥. @@ -136,12 +137,29 @@ pip install -e . ### 📂 Data Preparation +You can download and extract the datasets with the following commands: + ```bash # Download dataset to data/ folder wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip unzip OpenCompassData-core-20240207.zip ``` +Also, use the [ModelScope](www.modelscope.cn) to load the datasets on demand. + +Installation: + +```bash +pip install modelscope +export DATASET_SOURCE=ModelScope +``` + +Then submit the evaluation task without downloading all the data to your local disk. Available datasets include: + +```bash +humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ceval, math, LCSTS, Xsum, winogrande, openbookqa, AGIEval, gsm8k, nq, race, siqa, mbpp, mmlu, hellaswag, ARC, BBH, xstory_cloze, summedits, GAOKAO-BENCH, OCNLI, cmnli +``` + Some third-party features, like Humaneval and Llama, may require additional steps to work properly, for detailed steps please refer to the [Installation Guide](https://opencompass.readthedocs.io/en/latest/get_started/installation.html).

🔝Back to top

diff --git a/README_zh-CN.md b/README_zh-CN.md index 76ea1537..29719579 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -69,6 +69,7 @@ ## 🚀 最新进展 +- **\[2024.07.23\]** 我们支持了[ModelScope](www.modelscope.cn)数据集,您可以按需加载,无需事先下载全部数据到本地,欢迎试用!🔥🔥🔥 - **\[2024.07.17\]** 我们发布了CompassBench-202408榜单的示例数据和评测规则,敬请访问 [CompassBench](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/compassbench_intro.html) 获取更多信息。 🔥🔥🔥 - **\[2024.07.17\]** 我们正式发布 NeedleBench 的[技术报告](http://arxiv.org/abs/2407.11963)。诚邀您访问我们的[帮助文档](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/needleinahaystack_eval.html)进行评估。🔥🔥🔥 - **\[2024.07.04\]** OpenCompass 现已支持 InternLM2.5, 它拥有卓越的推理性能、有效支持百万字超长上下文以及工具调用能力整体升级,欢迎访问[OpenCompass Config](https://github.com/open-compass/opencompass/tree/main/configs/models/hf_internlm) 和 [InternLM](https://github.com/InternLM/InternLM) .🔥🔥🔥. @@ -138,12 +139,28 @@ pip install -e . ### 📂 数据准备 +OpenCompass支持使用本地数据集进行评测,数据集的下载和解压可以通过以下命令完成: + ```bash # 下载数据集到 data/ 处 wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip unzip OpenCompassData-core-20240207.zip ``` +另外,您还可以使用[ModelScope](www.modelscope.cn)来加载数据集: +环境准备: + +```bash +pip install modelscope +export DATASET_SOURCE=ModelScope +``` + +配置好环境后,无需下载全部数据,直接提交评测任务即可。目前支持的数据集有: + +```bash +humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ceval, math, LCSTS, Xsum, winogrande, openbookqa, AGIEval, gsm8k, nq, race, siqa, mbpp, mmlu, hellaswag, ARC, BBH, xstory_cloze, summedits, GAOKAO-BENCH, OCNLI, cmnli +``` + 有部分第三方功能,如 Humaneval 以及 Llama,可能需要额外步骤才能正常运行,详细步骤请参考[安装指南](https://opencompass.readthedocs.io/zh_CN/latest/get_started/installation.html)。

🔝返回顶部

diff --git a/configs/datasets/ARC_c/ARC_c_clean_ppl.py b/configs/datasets/ARC_c/ARC_c_clean_ppl.py index 09f6328a..75cad1bc 100644 --- a/configs/datasets/ARC_c/ARC_c_clean_ppl.py +++ b/configs/datasets/ARC_c/ARC_c_clean_ppl.py @@ -47,7 +47,8 @@ ARC_c_datasets = [ dict( type=ARCDataset, abbr='ARC-c-test', - path='./data/ARC/ARC-c/ARC-Challenge-Test.jsonl', + path='opencompass/ai2_arc-test', + name='ARC-Challenge', reader_cfg=ARC_c_reader_cfg, infer_cfg=ARC_c_infer_cfg, eval_cfg=ARC_c_eval_cfg) diff --git a/configs/datasets/ARC_c/ARC_c_gen_1e0de5.py b/configs/datasets/ARC_c/ARC_c_gen_1e0de5.py index 9f6314af..56022e14 100644 --- a/configs/datasets/ARC_c/ARC_c_gen_1e0de5.py +++ b/configs/datasets/ARC_c/ARC_c_gen_1e0de5.py @@ -35,7 +35,8 @@ ARC_c_datasets = [ dict( abbr='ARC-c', type=ARCDataset, - path='./data/ARC/ARC-c/ARC-Challenge-Dev.jsonl', + path='opencompass/ai2_arc-dev', + name='ARC-Challenge', reader_cfg=ARC_c_reader_cfg, infer_cfg=ARC_c_infer_cfg, eval_cfg=ARC_c_eval_cfg, diff --git a/configs/datasets/ARC_c/ARC_c_ppl_2ef631.py b/configs/datasets/ARC_c/ARC_c_ppl_2ef631.py index 814f82a9..2e00b59e 100644 --- a/configs/datasets/ARC_c/ARC_c_ppl_2ef631.py +++ b/configs/datasets/ARC_c/ARC_c_ppl_2ef631.py @@ -29,7 +29,8 @@ ARC_c_datasets = [ dict( type=ARCDataset, abbr='ARC-c', - path='./data/ARC/ARC-c/ARC-Challenge-Dev.jsonl', + path='opencompass/ai2_arc-dev', + name='ARC-Challenge', reader_cfg=ARC_c_reader_cfg, infer_cfg=ARC_c_infer_cfg, eval_cfg=ARC_c_eval_cfg) diff --git a/configs/datasets/ARC_c/ARC_c_ppl_a450bd.py b/configs/datasets/ARC_c/ARC_c_ppl_a450bd.py index 2b67ed49..b10c00c1 100644 --- a/configs/datasets/ARC_c/ARC_c_ppl_a450bd.py +++ b/configs/datasets/ARC_c/ARC_c_ppl_a450bd.py @@ -46,7 +46,8 @@ ARC_c_datasets = [ dict( type=ARCDataset, abbr='ARC-c', - path='./data/ARC/ARC-c/ARC-Challenge-Dev.jsonl', + path='opencompass/ai2_arc-dev', + name='ARC-Challenge', reader_cfg=ARC_c_reader_cfg, infer_cfg=ARC_c_infer_cfg, eval_cfg=ARC_c_eval_cfg) diff --git 
a/configs/datasets/ARC_c/ARC_c_ppl_d52a21.py b/configs/datasets/ARC_c/ARC_c_ppl_d52a21.py index 9bcb1d8f..a3733804 100644 --- a/configs/datasets/ARC_c/ARC_c_ppl_d52a21.py +++ b/configs/datasets/ARC_c/ARC_c_ppl_d52a21.py @@ -1,3 +1,5 @@ +from mmengine.config import read_base +# with read_base(): from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer @@ -26,7 +28,8 @@ ARC_c_datasets = [ dict( type=ARCDataset, abbr='ARC-c', - path='./data/ARC/ARC-c/ARC-Challenge-Dev.jsonl', + path='opencompass/ai2_arc-dev', + name='ARC-Challenge', reader_cfg=ARC_c_reader_cfg, infer_cfg=ARC_c_infer_cfg, eval_cfg=ARC_c_eval_cfg) diff --git a/configs/datasets/ARC_e/ARC_e_gen_1e0de5.py b/configs/datasets/ARC_e/ARC_e_gen_1e0de5.py index 8f818ef4..22c4ead7 100644 --- a/configs/datasets/ARC_e/ARC_e_gen_1e0de5.py +++ b/configs/datasets/ARC_e/ARC_e_gen_1e0de5.py @@ -35,7 +35,8 @@ ARC_e_datasets = [ dict( abbr='ARC-e', type=ARCDataset, - path='./data/ARC/ARC-e/ARC-Easy-Dev.jsonl', + path='opencompass/ai2_arc-easy-dev', + name='ARC-Easy', reader_cfg=ARC_e_reader_cfg, infer_cfg=ARC_e_infer_cfg, eval_cfg=ARC_e_eval_cfg, diff --git a/configs/datasets/ARC_e/ARC_e_ppl_2ef631.py b/configs/datasets/ARC_e/ARC_e_ppl_2ef631.py index 2791ded0..e96d85db 100644 --- a/configs/datasets/ARC_e/ARC_e_ppl_2ef631.py +++ b/configs/datasets/ARC_e/ARC_e_ppl_2ef631.py @@ -29,7 +29,8 @@ ARC_e_datasets = [ dict( type=ARCDataset, abbr='ARC-e', - path='./data/ARC/ARC-e/ARC-Easy-Dev.jsonl', + path='opencompass/ai2_arc-easy-dev', + name='ARC-Easy', reader_cfg=ARC_e_reader_cfg, infer_cfg=ARC_e_infer_cfg, eval_cfg=ARC_e_eval_cfg) diff --git a/configs/datasets/ARC_e/ARC_e_ppl_a450bd.py b/configs/datasets/ARC_e/ARC_e_ppl_a450bd.py index 79d350e0..d51accc1 100644 --- a/configs/datasets/ARC_e/ARC_e_ppl_a450bd.py +++ b/configs/datasets/ARC_e/ARC_e_ppl_a450bd.py @@ -46,7 +46,8 @@ ARC_e_datasets = [ 
dict( type=ARCDataset, abbr='ARC-e', - path='./data/ARC/ARC-e/ARC-Easy-Dev.jsonl', + path='opencompass/ai2_arc-easy-dev', + name='ARC-Easy', reader_cfg=ARC_e_reader_cfg, infer_cfg=ARC_e_infer_cfg, eval_cfg=ARC_e_eval_cfg) diff --git a/configs/datasets/ARC_e/ARC_e_ppl_d52a21.py b/configs/datasets/ARC_e/ARC_e_ppl_d52a21.py index 1a89c140..127936af 100644 --- a/configs/datasets/ARC_e/ARC_e_ppl_d52a21.py +++ b/configs/datasets/ARC_e/ARC_e_ppl_d52a21.py @@ -26,7 +26,8 @@ ARC_e_datasets = [ dict( type=ARCDataset, abbr='ARC-e', - path='./data/ARC/ARC-e/ARC-Easy-Dev.jsonl', + path='opencompass/ai2_arc-easy-dev', + name='ARC-Easy', reader_cfg=ARC_e_reader_cfg, infer_cfg=ARC_e_infer_cfg, eval_cfg=ARC_e_eval_cfg) diff --git a/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_1bd3c8.py b/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_1bd3c8.py index 4db6b40e..dcee88cd 100644 --- a/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_1bd3c8.py +++ b/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_1bd3c8.py @@ -28,7 +28,7 @@ CMRC_datasets = [ dict( type=CMRCDataset, abbr='CMRC_dev', - path='./data/CLUE/CMRC/dev.json', + path='opencompass/cmrc_dev', reader_cfg=CMRC_reader_cfg, infer_cfg=CMRC_infer_cfg, eval_cfg=CMRC_eval_cfg), diff --git a/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_3749cd.py b/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_3749cd.py index 3cf28366..fd289be6 100644 --- a/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_3749cd.py +++ b/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_3749cd.py @@ -26,7 +26,7 @@ CMRC_datasets = [ dict( type=CMRCDataset, abbr='CMRC_dev', - path='./data/CLUE/CMRC/dev.json', + path='opencompass/cmrc_dev', reader_cfg=CMRC_reader_cfg, infer_cfg=CMRC_infer_cfg, eval_cfg=CMRC_eval_cfg), diff --git a/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_8484b9.py b/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_8484b9.py index d95de3c5..a1c0e442 100644 --- a/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_8484b9.py +++ b/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_8484b9.py @@ -20,7 +20,7 @@ CMRC_datasets = [ dict( 
type=CMRCDataset, abbr='CMRC_dev', - path='./data/CLUE/CMRC/dev.json', + path='opencompass/cmrc_dev', reader_cfg=CMRC_reader_cfg, infer_cfg=CMRC_infer_cfg, eval_cfg=CMRC_eval_cfg), diff --git a/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_941108.py b/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_941108.py index 1ccef48b..d192a86c 100644 --- a/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_941108.py +++ b/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_941108.py @@ -27,7 +27,7 @@ CMRC_datasets = [ dict( type=CMRCDataset, abbr='CMRC_dev', - path='./data/CLUE/CMRC/dev.json', + path='opencompass/cmrc_dev', reader_cfg=CMRC_reader_cfg, infer_cfg=CMRC_infer_cfg, eval_cfg=CMRC_eval_cfg), diff --git a/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_1bd3c8.py b/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_1bd3c8.py index 67270061..b7df6302 100644 --- a/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_1bd3c8.py +++ b/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_1bd3c8.py @@ -29,7 +29,7 @@ DRCD_datasets = [ dict( type=DRCDDataset, abbr='DRCD_dev', - path='./data/CLUE/DRCD/dev.json', + path='opencompass/drcd_dev', reader_cfg=DRCD_reader_cfg, infer_cfg=DRCD_infer_cfg, eval_cfg=DRCD_eval_cfg), diff --git a/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_3749cd.py b/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_3749cd.py index db958e71..ca4f6394 100644 --- a/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_3749cd.py +++ b/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_3749cd.py @@ -26,7 +26,7 @@ DRCD_datasets = [ dict( type=DRCDDataset, abbr='DRCD_dev', - path='./data/CLUE/DRCD/dev.json', + path='opencompass/drcd_dev', reader_cfg=DRCD_reader_cfg, infer_cfg=DRCD_infer_cfg, eval_cfg=DRCD_eval_cfg), diff --git a/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_8484b9.py b/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_8484b9.py index 1f735f7e..633f66fb 100644 --- a/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_8484b9.py +++ b/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_8484b9.py @@ -20,7 +20,7 @@ DRCD_datasets = [ dict( type=DRCDDataset, abbr='DRCD_dev', - 
path='./data/CLUE/DRCD/dev.json', + path='opencompass/drcd_dev', reader_cfg=DRCD_reader_cfg, infer_cfg=DRCD_infer_cfg, eval_cfg=DRCD_eval_cfg), diff --git a/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_941108.py b/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_941108.py index 5d2acf77..14e91ff4 100644 --- a/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_941108.py +++ b/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_941108.py @@ -27,7 +27,7 @@ DRCD_datasets = [ dict( type=DRCDDataset, abbr='DRCD_dev', - path='./data/CLUE/DRCD/dev.json', + path='opencompass/drcd_dev', reader_cfg=DRCD_reader_cfg, infer_cfg=DRCD_infer_cfg, eval_cfg=DRCD_eval_cfg), diff --git a/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen_901306.py b/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen_901306.py index 214c35cb..6d2d8ff8 100644 --- a/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen_901306.py +++ b/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen_901306.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import AFQMCDataset_V2 +from opencompass.datasets import AFQMCDatasetV2 from opencompass.utils.text_postprocessors import first_capital_postprocess afqmc_reader_cfg = dict( @@ -34,8 +34,8 @@ afqmc_eval_cfg = dict( afqmc_datasets = [ dict( abbr='afqmc-dev', - type=AFQMCDataset_V2, - path='./data/CLUE/AFQMC/dev.json', + type=AFQMCDatasetV2, + path='opencompass/afqmc-dev', reader_cfg=afqmc_reader_cfg, infer_cfg=afqmc_infer_cfg, eval_cfg=afqmc_eval_cfg, diff --git a/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_1abf97.py b/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_1abf97.py index e20d45a0..c4979b86 100644 --- a/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_1abf97.py +++ b/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_1abf97.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate 
from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import cmnliDataset_V2 +from opencompass.datasets import CMNLIDatasetV2 from opencompass.utils.text_postprocessors import first_capital_postprocess cmnli_reader_cfg = dict( @@ -34,8 +34,8 @@ cmnli_eval_cfg = dict( cmnli_datasets = [ dict( abbr='cmnli', - type=cmnliDataset_V2, - path='./data/CLUE/cmnli/cmnli_public/dev.json', + type=CMNLIDatasetV2, + path='opencompass/cmnli-dev', reader_cfg=cmnli_reader_cfg, infer_cfg=cmnli_infer_cfg, eval_cfg=cmnli_eval_cfg, diff --git a/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_51e956.py b/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_51e956.py index 298db208..344d6081 100644 --- a/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_51e956.py +++ b/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_51e956.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import cmnliDataset_V2 +from opencompass.datasets import CMNLIDatasetV2 from opencompass.utils.text_postprocessors import first_capital_postprocess cmnli_reader_cfg = dict( @@ -34,8 +34,8 @@ cmnli_eval_cfg = dict( cmnli_datasets = [ dict( abbr='cmnli', - type=cmnliDataset_V2, - path='./data/CLUE/cmnli/cmnli_public/dev.json', + type=CMNLIDatasetV2, + path='opencompass/cmnli-dev', reader_cfg=cmnli_reader_cfg, infer_cfg=cmnli_infer_cfg, eval_cfg=cmnli_eval_cfg, diff --git a/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl_98dd6e.py b/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl_98dd6e.py index 8f040fbf..db38a213 100644 --- a/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl_98dd6e.py +++ b/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl_98dd6e.py @@ -2,7 +2,7 @@ from 
opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import cmnliDataset +from opencompass.datasets import CMNLIDataset cmnli_reader_cfg = dict( input_columns=['sentence1', 'sentence2'], @@ -26,8 +26,8 @@ cmnli_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) cmnli_datasets = [ dict( abbr='cmnli', - type=cmnliDataset, - path='./data/CLUE/cmnli/cmnli_public/dev.json', + type=CMNLIDataset, + path='opencompass/cmnli-dev', reader_cfg=cmnli_reader_cfg, infer_cfg=cmnli_infer_cfg, eval_cfg=cmnli_eval_cfg) diff --git a/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl_ef69e7.py b/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl_ef69e7.py index 2f88fc20..c89a9419 100644 --- a/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl_ef69e7.py +++ b/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl_ef69e7.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import cmnliDataset +from opencompass.datasets import CMNLIDataset cmnli_reader_cfg = dict( input_columns=['sentence1', 'sentence2'], @@ -42,8 +42,8 @@ cmnli_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) cmnli_datasets = [ dict( abbr='cmnli', - type=cmnliDataset, - path='./data/CLUE/cmnli/cmnli_public/dev.json', + type=CMNLIDataset, + path='opencompass/cmnli-dev', reader_cfg=cmnli_reader_cfg, infer_cfg=cmnli_infer_cfg, eval_cfg=cmnli_eval_cfg) diff --git a/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl_fdc6de.py b/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl_fdc6de.py index fc786ea0..460c6873 100644 --- a/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl_fdc6de.py +++ 
b/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl_fdc6de.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import cmnliDataset +from opencompass.datasets import CMNLIDataset cmnli_reader_cfg = dict( input_columns=['sentence1', 'sentence2'], @@ -46,8 +46,8 @@ cmnli_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) cmnli_datasets = [ dict( abbr='cmnli', - type=cmnliDataset, - path='./data/CLUE/cmnli/cmnli_public/dev.json', + type=CMNLIDataset, + path='opencompass/cmnli-dev', reader_cfg=cmnli_reader_cfg, infer_cfg=cmnli_infer_cfg, eval_cfg=cmnli_eval_cfg) diff --git a/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen_51e956.py b/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen_51e956.py index 4d4ddb4c..bb1cdd58 100644 --- a/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen_51e956.py +++ b/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen_51e956.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import cmnliDataset_V2 +from opencompass.datasets import CMNLIDatasetV2 from opencompass.utils.text_postprocessors import first_capital_postprocess ocnli_reader_cfg = dict( @@ -35,8 +35,8 @@ ocnli_eval_cfg = dict( ocnli_datasets = [ dict( abbr='ocnli', - type=cmnliDataset_V2, # ocnli share the same format with cmnli - path='./data/CLUE/OCNLI/dev.json', + type=CMNLIDatasetV2, # ocnli share the same format with cmnli + path='opencompass/OCNLI-dev', reader_cfg=ocnli_reader_cfg, infer_cfg=ocnli_infer_cfg, eval_cfg=ocnli_eval_cfg, diff --git a/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen_c4cb6c.py 
b/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen_c4cb6c.py index 28d1e71a..5b2bb9fc 100644 --- a/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen_c4cb6c.py +++ b/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen_c4cb6c.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import cmnliDataset_V2 +from opencompass.datasets import CMNLIDatasetV2 from opencompass.utils.text_postprocessors import first_capital_postprocess ocnli_reader_cfg = dict( @@ -35,8 +35,8 @@ ocnli_eval_cfg = dict( ocnli_datasets = [ dict( abbr='ocnli', - type=cmnliDataset_V2, # ocnli share the same format with cmnli - path='./data/CLUE/OCNLI/dev.json', + type=CMNLIDatasetV2, # ocnli share the same format with cmnli + path='opencompass/OCNLI-dev', reader_cfg=ocnli_reader_cfg, infer_cfg=ocnli_infer_cfg, eval_cfg=ocnli_eval_cfg, diff --git a/configs/datasets/ChemBench/ChemBench_gen.py b/configs/datasets/ChemBench/ChemBench_gen.py index c81c6df0..67fff5c9 100644 --- a/configs/datasets/ChemBench/ChemBench_gen.py +++ b/configs/datasets/ChemBench/ChemBench_gen.py @@ -67,7 +67,7 @@ for _name in chembench_all_sets: dict( abbr=f'ChemBench_{_name}', type=ChemBenchDataset, - path='./data/ChemBench/', + path='opencompass/ChemBench', name=_name, reader_cfg=chembench_reader_cfg, infer_cfg=chembench_infer_cfg, diff --git a/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen_634f41.py b/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen_634f41.py index fc7bbaf4..beb5d0a3 100644 --- a/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen_634f41.py +++ b/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen_634f41.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import 
GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import AFQMCDataset_V2 +from opencompass.datasets import AFQMCDatasetV2 from opencompass.utils.text_postprocessors import first_capital_postprocess bustm_reader_cfg = dict( @@ -34,16 +34,18 @@ bustm_eval_cfg = dict( bustm_datasets = [ dict( abbr='bustm-dev', - type=AFQMCDataset_V2, # bustm share the same format with AFQMC + type=AFQMCDatasetV2, # bustm share the same format with AFQMC path='./data/FewCLUE/bustm/dev_few_all.json', + local_mode=True, reader_cfg=bustm_reader_cfg, infer_cfg=bustm_infer_cfg, eval_cfg=bustm_eval_cfg, ), dict( abbr='bustm-test', - type=AFQMCDataset_V2, # bustm share the same format with AFQMC + type=AFQMCDatasetV2, # bustm share the same format with AFQMC path='./data/FewCLUE/bustm/test_public.json', + local_mode=True, reader_cfg=bustm_reader_cfg, infer_cfg=bustm_infer_cfg, eval_cfg=bustm_eval_cfg, diff --git a/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen_0a29a2.py b/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen_0a29a2.py index 38459679..85d90e43 100644 --- a/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen_0a29a2.py +++ b/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen_0a29a2.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import CHIDDataset_V2 +from opencompass.datasets import CHIDDatasetV2 from opencompass.utils.text_postprocessors import first_capital_postprocess chid_reader_cfg = dict( @@ -34,7 +34,7 @@ chid_eval_cfg = dict( chid_datasets = [ dict( abbr='chid-dev', - type=CHIDDataset_V2, + type=CHIDDatasetV2, path='./data/FewCLUE/chid/dev_few_all.json', reader_cfg=chid_reader_cfg, infer_cfg=chid_infer_cfg, @@ -42,7 +42,7 @@ chid_datasets = [ ), dict( abbr='chid-test', - type=CHIDDataset_V2, + 
type=CHIDDatasetV2, path='./data/FewCLUE/chid/test_public.json', reader_cfg=chid_reader_cfg, infer_cfg=chid_infer_cfg, diff --git a/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen_c68933.py b/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen_c68933.py index f2c0ad03..41d54256 100644 --- a/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen_c68933.py +++ b/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen_c68933.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import CluewscDataset_V2 +from opencompass.datasets import CluewscDatasetV2 from opencompass.utils.text_postprocessors import first_capital_postprocess cluewsc_reader_cfg = dict( @@ -34,7 +34,7 @@ cluewsc_eval_cfg = dict( cluewsc_datasets = [ dict( abbr='cluewsc-dev', - type=CluewscDataset_V2, + type=CluewscDatasetV2, path='./data/FewCLUE/cluewsc/dev_few_all.json', reader_cfg=cluewsc_reader_cfg, infer_cfg=cluewsc_infer_cfg, @@ -42,7 +42,7 @@ cluewsc_datasets = [ ), dict( abbr='cluewsc-test', - type=CluewscDataset_V2, + type=CluewscDatasetV2, path='./data/FewCLUE/cluewsc/test_public.json', reader_cfg=cluewsc_reader_cfg, infer_cfg=cluewsc_infer_cfg, diff --git a/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_28b223.py b/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_28b223.py index ea7c80e5..4f37d361 100644 --- a/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_28b223.py +++ b/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_28b223.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import CslDataset_V2 +from opencompass.datasets import 
CslDatasetV2 from opencompass.utils.text_postprocessors import first_capital_postprocess csl_reader_cfg = dict( @@ -34,7 +34,7 @@ csl_eval_cfg = dict( csl_datasets = [ dict( abbr='csl_dev', - type=CslDataset_V2, + type=CslDatasetV2, path='./data/FewCLUE/csl/dev_few_all.json', reader_cfg=csl_reader_cfg, infer_cfg=csl_infer_cfg, @@ -42,7 +42,7 @@ csl_datasets = [ ), dict( abbr='csl_test', - type=CslDataset_V2, + type=CslDatasetV2, path='./data/FewCLUE/csl/test_public.json', reader_cfg=csl_reader_cfg, infer_cfg=csl_infer_cfg, diff --git a/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_87f4a8.py b/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_87f4a8.py index e3f3e980..db74b7a7 100644 --- a/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_87f4a8.py +++ b/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_87f4a8.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import CslDataset_V2 +from opencompass.datasets import CslDatasetV2 from opencompass.utils.text_postprocessors import first_capital_postprocess csl_reader_cfg = dict( @@ -34,7 +34,7 @@ csl_eval_cfg = dict( csl_datasets = [ dict( abbr='csl_dev', - type=CslDataset_V2, + type=CslDatasetV2, path='./data/FewCLUE/csl/dev_few_all.json', reader_cfg=csl_reader_cfg, infer_cfg=csl_infer_cfg, @@ -42,7 +42,7 @@ csl_datasets = [ ), dict( abbr='csl_test', - type=CslDataset_V2, + type=CslDatasetV2, path='./data/FewCLUE/csl/test_public.json', reader_cfg=csl_reader_cfg, infer_cfg=csl_infer_cfg, diff --git a/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen_740ea0.py b/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen_740ea0.py index a0c3341d..534639ec 100644 --- a/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen_740ea0.py +++ b/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen_740ea0.py @@ 
-2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import eprstmtDataset_V2 +from opencompass.datasets import EprstmtDatasetV2 from opencompass.utils.text_postprocessors import first_capital_postprocess eprstmt_reader_cfg = dict( @@ -32,7 +32,7 @@ eprstmt_eval_cfg = dict( eprstmt_datasets = [ dict( abbr='eprstmt-dev', - type=eprstmtDataset_V2, + type=EprstmtDatasetV2, path='./data/FewCLUE/eprstmt/dev_few_all.json', reader_cfg=eprstmt_reader_cfg, infer_cfg=eprstmt_infer_cfg, @@ -40,7 +40,7 @@ eprstmt_datasets = [ ), dict( abbr='eprstmt-test', - type=eprstmtDataset_V2, + type=EprstmtDatasetV2, path='./data/FewCLUE/eprstmt/test_public.json', reader_cfg=eprstmt_reader_cfg, infer_cfg=eprstmt_infer_cfg, diff --git a/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen_f97a97.py b/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen_f97a97.py index 38b1ce4d..242e3ad6 100644 --- a/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen_f97a97.py +++ b/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen_f97a97.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import cmnliDataset_V2 +from opencompass.datasets import CMNLIDatasetV2 from opencompass.utils.text_postprocessors import first_capital_postprocess ocnli_fc_reader_cfg = dict( @@ -33,16 +33,18 @@ ocnli_fc_eval_cfg = dict( ocnli_fc_datasets = [ dict( abbr='ocnli_fc-dev', - type=cmnliDataset_V2, # ocnli_fc share the same format with cmnli + type=CMNLIDatasetV2, # ocnli_fc share the same format with cmnli 
path='./data/FewCLUE/ocnli/dev_few_all.json', + local_mode=True, reader_cfg=ocnli_fc_reader_cfg, infer_cfg=ocnli_fc_infer_cfg, eval_cfg=ocnli_fc_eval_cfg, ), dict( abbr='ocnli_fc-test', - type=cmnliDataset_V2, # ocnli_fc share the same format with cmnli + type=CMNLIDatasetV2, # ocnli_fc share the same format with cmnli path='./data/FewCLUE/ocnli/test_public.json', + local_mode=True, reader_cfg=ocnli_fc_reader_cfg, infer_cfg=ocnli_fc_infer_cfg, eval_cfg=ocnli_fc_eval_cfg, diff --git a/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen_b90e4a.py b/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen_b90e4a.py index fc9ad76a..ae8979df 100644 --- a/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen_b90e4a.py +++ b/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen_b90e4a.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import TNewsDataset_V2 +from opencompass.datasets import TNewsDatasetV2 from opencompass.utils.text_postprocessors import first_capital_postprocess tnews_reader_cfg = dict( @@ -56,7 +56,7 @@ tnews_eval_cfg = dict( tnews_datasets = [ dict( abbr='tnews-dev', - type=TNewsDataset_V2, + type=TNewsDatasetV2, path='./data/FewCLUE/tnews/dev_few_all.json', reader_cfg=tnews_reader_cfg, infer_cfg=tnews_infer_cfg, @@ -64,7 +64,7 @@ tnews_datasets = [ ), dict( abbr='tnews-test', - type=TNewsDataset_V2, + type=TNewsDatasetV2, path='./data/FewCLUE/tnews/test_public.json', reader_cfg=tnews_reader_cfg, infer_cfg=tnews_infer_cfg, diff --git a/configs/datasets/GaokaoBench/GaokaoBench_gen_5cfe9e.py b/configs/datasets/GaokaoBench/GaokaoBench_gen_5cfe9e.py index 5e45b1ca..480eeadb 100644 --- a/configs/datasets/GaokaoBench/GaokaoBench_gen_5cfe9e.py +++ b/configs/datasets/GaokaoBench/GaokaoBench_gen_5cfe9e.py @@ -3,6 +3,7 @@ from 
opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import GaokaoBenchDataset + _MCQ_prompts = [ { 'type': 'single_choice', @@ -288,6 +289,7 @@ for _folder, _prompts in [ 'type': GaokaoBenchDataset, 'abbr': 'GaokaoBench_' + _p['keyword'], 'path': _base_path + '/' + _folder + '/' + _p['keyword'] + '.json', + 'name': _p['keyword'], 'reader_cfg': _reader_cfg, 'infer_cfg': _infer_cfg, 'eval_cfg': _eval_cfg, diff --git a/configs/datasets/GaokaoBench/GaokaoBench_mixed_9af5ee.py b/configs/datasets/GaokaoBench/GaokaoBench_mixed_9af5ee.py index 540430f0..637f4f51 100644 --- a/configs/datasets/GaokaoBench/GaokaoBench_mixed_9af5ee.py +++ b/configs/datasets/GaokaoBench/GaokaoBench_mixed_9af5ee.py @@ -2,7 +2,6 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer from opencompass.datasets import GaokaoBenchDataset - _MCQ_prompts = [ { 'type': 'single_choice', @@ -290,6 +289,7 @@ for _folder, _prompts in [ 'type': GaokaoBenchDataset, 'abbr': 'GaokaoBench_' + _p['keyword'], 'path': _base_path + '/' + _folder + '/' + _p['keyword'] + '.json', + 'name': _p['keyword'], 'reader_cfg': _reader_cfg, 'infer_cfg': _infer_cfg, 'eval_cfg': _eval_cfg, @@ -340,6 +340,7 @@ for _p in _MCQ_prompts: 'type': GaokaoBenchDataset, 'abbr': 'GaokaoBench_' + _p['keyword'], 'path': _base_path + '/' + _folder + '/' + _p['keyword'] + '.json', + 'name': _p['keyword'], 'reader_cfg': _reader_cfg, 'infer_cfg': _infer_cfg, 'eval_cfg': _eval_cfg, diff --git a/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_4c31db.py b/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_4c31db.py index ebb20a58..e3c251aa 100644 --- a/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_4c31db.py +++ b/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_4c31db.py 
@@ -35,6 +35,7 @@ for folder, prompts in [ 'type': GaokaoBenchDataset, 'abbr': 'GaokaoBench_' + p['keyword'], 'path': os.path.join('data', 'GAOKAO-BENCH', 'data', folder, p['keyword'] + '.json'), + 'name': p['keyword'], 'reader_cfg': reader_cfg, 'infer_cfg': infer_cfg, 'eval_cfg': eval_cfg, diff --git a/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_d21e37.py b/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_d21e37.py index dd1c0d5c..1f50030b 100644 --- a/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_d21e37.py +++ b/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_d21e37.py @@ -34,6 +34,7 @@ for folder, prompts in [ 'type': GaokaoBenchDataset, 'abbr': 'GaokaoBench_' + p['keyword'], 'path': os.path.join('data', 'GAOKAO-BENCH', 'data', folder, p['keyword'] + '.json'), + 'name': p['keyword'], 'reader_cfg': reader_cfg, 'infer_cfg': infer_cfg, 'eval_cfg': eval_cfg, diff --git a/configs/datasets/NPHardEval/NPHardEval_gen_22aac5.py b/configs/datasets/NPHardEval/NPHardEval_gen_22aac5.py index 436721e9..e700c82f 100644 --- a/configs/datasets/NPHardEval/NPHardEval_gen_22aac5.py +++ b/configs/datasets/NPHardEval/NPHardEval_gen_22aac5.py @@ -2,27 +2,27 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets.NPHardEval import ( - hard_GCP_Dataset, hard_GCP_Evaluator, - hard_TSP_Dataset, hard_TSP_Evaluator, - hard_MSP_Dataset, hard_MSP_Evaluator, - cmp_GCP_D_Dataset, cmp_GCP_D_Evaluator, - cmp_TSP_D_Dataset, cmp_TSP_D_Evaluator, - cmp_KSP_Dataset, cmp_KSP_Evaluator, - p_BSP_Dataset, p_BSP_Evaluator, - p_EDP_Dataset, p_EDP_Evaluator, - p_SPP_Dataset, p_SPP_Evaluator, + HardGCPDataset, HardGCPEvaluator, + Hard_TSP_Dataset, Hard_TSP_Evaluator, + Hard_MSP_Dataset, Hard_MSP_Evaluator, + CMP_GCP_D_Dataset, CMP_GCP_D_Evaluator, + CMP_TSP_D_Dataset, CMP_TSP_D_Evaluator, + 
CMP_KSP_Dataset, CMP_KSP_Evaluator, + P_BSP_Dataset, P_BSP_Evaluator, + P_EDP_Dataset, P_EDP_Evaluator, + P_SPP_Dataset, P_SPP_Evaluator, ) NPHardEval_tasks = [ - ['hard_GCP', 'GCP', hard_GCP_Dataset, hard_GCP_Evaluator], - ['hard_TSP', 'TSP', hard_TSP_Dataset, hard_TSP_Evaluator], - ['hard_MSP', 'MSP', hard_MSP_Dataset, hard_MSP_Evaluator], - ['cmp_GCP_D', 'GCP_Decision', cmp_GCP_D_Dataset, cmp_GCP_D_Evaluator], - ['cmp_TSP_D', 'TSP_Decision', cmp_TSP_D_Dataset, cmp_TSP_D_Evaluator], - ['cmp_KSP', 'KSP', cmp_KSP_Dataset, cmp_KSP_Evaluator], - ['p_BSP', 'BSP', p_BSP_Dataset, p_BSP_Evaluator], - ['p_EDP', 'EDP', p_EDP_Dataset, p_EDP_Evaluator], - ['p_SPP', 'SPP', p_SPP_Dataset, p_SPP_Evaluator], + ['hard_GCP', 'GCP', HardGCPDataset, HardGCPEvaluator], + ['hard_TSP', 'TSP', Hard_TSP_Dataset, Hard_TSP_Evaluator], + ['hard_MSP', 'MSP', Hard_MSP_Dataset, Hard_MSP_Evaluator], + ['cmp_GCP_D', 'GCP_Decision', CMP_GCP_D_Dataset, CMP_GCP_D_Evaluator], + ['cmp_TSP_D', 'TSP_Decision', CMP_TSP_D_Dataset, CMP_TSP_D_Evaluator], + ['cmp_KSP', 'KSP', CMP_KSP_Dataset, CMP_KSP_Evaluator], + ['p_BSP', 'BSP', P_BSP_Dataset, P_BSP_Evaluator], + ['p_EDP', 'EDP', P_EDP_Dataset, P_EDP_Evaluator], + ['p_SPP', 'SPP', P_SPP_Dataset, P_SPP_Evaluator], ] NPHardEval_datasets = [] diff --git a/configs/datasets/OpenFinData/OpenFinData.md b/configs/datasets/OpenFinData/README.md similarity index 100% rename from configs/datasets/OpenFinData/OpenFinData.md rename to configs/datasets/OpenFinData/README.md diff --git a/configs/datasets/SuperGLUE_AX_b/SuperGLUE_AX_b_gen_4dfefa.py b/configs/datasets/SuperGLUE_AX_b/SuperGLUE_AX_b_gen_4dfefa.py index b1fd5443..1f81f20e 100644 --- a/configs/datasets/SuperGLUE_AX_b/SuperGLUE_AX_b_gen_4dfefa.py +++ b/configs/datasets/SuperGLUE_AX_b/SuperGLUE_AX_b_gen_4dfefa.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import 
GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import AXDataset_V2 +from opencompass.datasets import AXDatasetV2 from opencompass.utils.text_postprocessors import first_option_postprocess AX_b_reader_cfg = dict( @@ -34,7 +34,7 @@ AX_b_eval_cfg = dict( AX_b_datasets = [ dict( abbr='AX_b', - type=AXDataset_V2, + type=AXDatasetV2, path='./data/SuperGLUE/AX-b/AX-b.jsonl', reader_cfg=AX_b_reader_cfg, infer_cfg=AX_b_infer_cfg, diff --git a/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_gen_68aac7.py b/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_gen_68aac7.py index dbf7def4..b2746d55 100644 --- a/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_gen_68aac7.py +++ b/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_gen_68aac7.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import AXDataset_V2 +from opencompass.datasets import AXDatasetV2 from opencompass.utils.text_postprocessors import first_option_postprocess AX_g_reader_cfg = dict( @@ -34,7 +34,7 @@ AX_g_eval_cfg = dict( AX_g_datasets = [ dict( abbr='AX_g', - type=AXDataset_V2, + type=AXDatasetV2, path='./data/SuperGLUE/AX-g/AX-g.jsonl', reader_cfg=AX_g_reader_cfg, infer_cfg=AX_g_infer_cfg, diff --git a/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen_883d50.py b/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen_883d50.py index 973a6040..01301860 100644 --- a/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen_883d50.py +++ b/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen_883d50.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from 
opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import BoolQDataset_V2 +from opencompass.datasets import BoolQDatasetV2 from opencompass.utils.text_postprocessors import first_capital_postprocess BoolQ_reader_cfg = dict( @@ -32,7 +32,7 @@ BoolQ_eval_cfg = dict( BoolQ_datasets = [ dict( abbr='BoolQ', - type=BoolQDataset_V2, + type=BoolQDatasetV2, path='./data/SuperGLUE/BoolQ/val.jsonl', reader_cfg=BoolQ_reader_cfg, infer_cfg=BoolQ_infer_cfg, diff --git a/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314797.py b/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314797.py index 77249122..cb0980ce 100644 --- a/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314797.py +++ b/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314797.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import BoolQDataset_V3 +from opencompass.datasets import BoolQDatasetV3 BoolQ_reader_cfg = dict( input_columns=['question', 'passage'], @@ -34,7 +34,7 @@ BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) BoolQ_datasets = [ dict( abbr='BoolQ', - type=BoolQDataset_V3, + type=BoolQDatasetV3, path='./data/SuperGLUE/BoolQ/val.jsonl', reader_cfg=BoolQ_reader_cfg, infer_cfg=BoolQ_infer_cfg, diff --git a/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen_854c6c.py b/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen_854c6c.py index de750b9e..65d3752d 100644 --- a/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen_854c6c.py +++ b/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen_854c6c.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from 
opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import CBDataset_V2 +from opencompass.datasets import CBDatasetV2 from opencompass.utils.text_postprocessors import first_option_postprocess CB_reader_cfg = dict( @@ -35,7 +35,7 @@ CB_eval_cfg = dict( CB_datasets = [ dict( abbr='CB', - type=CBDataset_V2, + type=CBDatasetV2, path='./data/SuperGLUE/CB/val.jsonl', reader_cfg=CB_reader_cfg, infer_cfg=CB_infer_cfg, diff --git a/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen_91ca53.py b/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen_91ca53.py index 0abe3f50..97c5bacf 100644 --- a/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen_91ca53.py +++ b/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen_91ca53.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import COPADataset_V2 +from opencompass.datasets import COPADatasetV2 from opencompass.utils.text_postprocessors import first_option_postprocess COPA_reader_cfg = dict( @@ -35,7 +35,7 @@ COPA_eval_cfg = dict( COPA_datasets = [ dict( abbr='COPA', - type=COPADataset_V2, + type=COPADatasetV2, path='./data/SuperGLUE/COPA/val.jsonl', reader_cfg=COPA_reader_cfg, infer_cfg=COPA_infer_cfg, diff --git a/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen_27071f.py b/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen_27071f.py index 783b4379..8444e9db 100644 --- a/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen_27071f.py +++ b/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen_27071f.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import 
AccEvaluator -from opencompass.datasets import MultiRCDataset_V2 +from opencompass.datasets import MultiRCDatasetV2 from opencompass.utils.text_postprocessors import first_option_postprocess MultiRC_reader_cfg = dict( @@ -34,7 +34,7 @@ MultiRC_eval_cfg = dict( MultiRC_datasets = [ dict( abbr='MultiRC', - type=MultiRCDataset_V2, + type=MultiRCDatasetV2, path='./data/SuperGLUE/MultiRC/val.jsonl', reader_cfg=MultiRC_reader_cfg, infer_cfg=MultiRC_infer_cfg, diff --git a/configs/datasets/SuperGLUE_RTE/SuperGLUE_RTE_gen_68aac7.py b/configs/datasets/SuperGLUE_RTE/SuperGLUE_RTE_gen_68aac7.py index ac6a016d..b75dc133 100644 --- a/configs/datasets/SuperGLUE_RTE/SuperGLUE_RTE_gen_68aac7.py +++ b/configs/datasets/SuperGLUE_RTE/SuperGLUE_RTE_gen_68aac7.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import AXDataset_V2 +from opencompass.datasets import AXDatasetV2 from opencompass.utils.text_postprocessors import first_option_postprocess RTE_reader_cfg = dict( @@ -34,7 +34,7 @@ RTE_eval_cfg = dict( RTE_datasets = [ dict( abbr='RTE', - type=AXDataset_V2, # rte share the same format with ax + type=AXDatasetV2, # rte share the same format with ax path='./data/SuperGLUE/RTE/val.jsonl', reader_cfg=RTE_reader_cfg, infer_cfg=RTE_infer_cfg, diff --git a/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_a69961.py b/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_a69961.py index c3889075..40b64e30 100644 --- a/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_a69961.py +++ b/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_a69961.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import 
GenInferencer from opencompass.openicl.icl_evaluator import EMEvaluator -from opencompass.datasets import ReCoRDDataset_V2, ReCoRD_postprocess +from opencompass.datasets import ReCoRDDatasetV2, ReCoRD_postprocess ReCoRD_reader_cfg = dict( input_columns=['question', 'text'], output_column='answers') @@ -26,7 +26,7 @@ ReCoRD_eval_cfg = dict( ReCoRD_datasets = [ dict( - type=ReCoRDDataset_V2, + type=ReCoRDDatasetV2, abbr='ReCoRD', path='./data/SuperGLUE/ReCoRD/val.jsonl', reader_cfg=ReCoRD_reader_cfg, diff --git a/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen_7902a7.py b/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen_7902a7.py index 9308112c..c6f7c45d 100644 --- a/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen_7902a7.py +++ b/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen_7902a7.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import WSCDataset_V2 +from opencompass.datasets import WSCDatasetV2 from opencompass.utils.text_postprocessors import first_capital_postprocess WSC_reader_cfg = dict( @@ -34,7 +34,7 @@ WSC_eval_cfg = dict( WSC_datasets = [ dict( abbr='WSC', - type=WSCDataset_V2, + type=WSCDatasetV2, path='./data/SuperGLUE/WSC/val.jsonl', reader_cfg=WSC_reader_cfg, infer_cfg=WSC_infer_cfg, diff --git a/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen_fe4bf3.py b/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen_fe4bf3.py index 4945f4de..b9edc46c 100644 --- a/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen_fe4bf3.py +++ b/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen_fe4bf3.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from 
opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import WSCDataset_V3 +from opencompass.datasets import WSCDatasetV3 from opencompass.utils.text_postprocessors import first_capital_postprocess WSC_reader_cfg = dict( @@ -34,7 +34,7 @@ WSC_eval_cfg = dict( WSC_datasets = [ dict( abbr='WSC', - type=WSCDataset_V3, + type=WSCDatasetV3, path='./data/SuperGLUE/WSC/val.jsonl', reader_cfg=WSC_reader_cfg, infer_cfg=WSC_infer_cfg, diff --git a/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_ppl_1c4a90.py b/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_ppl_1c4a90.py index dc452635..56b86088 100644 --- a/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_ppl_1c4a90.py +++ b/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_ppl_1c4a90.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import WSCDataset_V3 +from opencompass.datasets import WSCDatasetV3 WSC_reader_cfg = dict( input_columns=['span1', 'span2', 'text'], @@ -40,7 +40,7 @@ WSC_eval_cfg = dict(evaluator=dict(type=AccEvaluator), ) WSC_datasets = [ dict( abbr='WSC', - type=WSCDataset_V3, + type=WSCDatasetV3, path='./data/SuperGLUE/WSC/val.jsonl', reader_cfg=WSC_reader_cfg, infer_cfg=WSC_infer_cfg, diff --git a/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_ppl_d0f531.py b/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_ppl_d0f531.py index dda2d6d6..49a9b6df 100644 --- a/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_ppl_d0f531.py +++ b/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_ppl_d0f531.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from 
opencompass.datasets import WSCDataset_V2 +from opencompass.datasets import WSCDatasetV2 WSC_reader_cfg = dict( input_columns=['span1', 'span2', 'text'], @@ -42,7 +42,7 @@ WSC_eval_cfg = dict(evaluator=dict(type=AccEvaluator), ) WSC_datasets = [ dict( abbr='WSC', - type=WSCDataset_V2, + type=WSCDatasetV2, path='./data/SuperGLUE/WSC/val.jsonl', reader_cfg=WSC_reader_cfg, infer_cfg=WSC_infer_cfg, diff --git a/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_gen_d06864.py b/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_gen_d06864.py index 76dd782f..c4f5c023 100644 --- a/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_gen_d06864.py +++ b/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_gen_d06864.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import WiCDataset_V2 +from opencompass.datasets import WiCDatasetV2 from opencompass.utils.text_postprocessors import first_capital_postprocess WiC_reader_cfg = dict( @@ -38,7 +38,7 @@ WiC_eval_cfg = dict( WiC_datasets = [ dict( abbr='WiC', - type=WiCDataset_V2, + type=WiCDatasetV2, path='./data/SuperGLUE/WiC/val.jsonl', reader_cfg=WiC_reader_cfg, infer_cfg=WiC_infer_cfg, diff --git a/configs/datasets/Xsum/Xsum_gen_31397e.py b/configs/datasets/Xsum/Xsum_gen_31397e.py index c1556ffb..7c92f5cc 100644 --- a/configs/datasets/Xsum/Xsum_gen_31397e.py +++ b/configs/datasets/Xsum/Xsum_gen_31397e.py @@ -31,7 +31,7 @@ Xsum_datasets = [ dict( type=XsumDataset, abbr='Xsum', - path='./data/Xsum/dev.jsonl', + path='opencompass/xsum', reader_cfg=Xsum_reader_cfg, infer_cfg=Xsum_infer_cfg, eval_cfg=Xsum_eval_cfg, diff --git a/configs/datasets/Xsum/Xsum_gen_8ea5f8.py b/configs/datasets/Xsum/Xsum_gen_8ea5f8.py index 364e7004..1e894452 100644 --- a/configs/datasets/Xsum/Xsum_gen_8ea5f8.py +++ 
b/configs/datasets/Xsum/Xsum_gen_8ea5f8.py @@ -23,7 +23,7 @@ Xsum_datasets = [ dict( type=XsumDataset, abbr='Xsum', - path='./data/Xsum/dev.jsonl', + path='opencompass/xsum', reader_cfg=Xsum_reader_cfg, infer_cfg=Xsum_infer_cfg, eval_cfg=Xsum_eval_cfg) diff --git a/configs/datasets/adv_glue/adv_glue_mnli/adv_glue_mnli_gen_bd8ef0.py b/configs/datasets/adv_glue/adv_glue_mnli/adv_glue_mnli_gen_bd8ef0.py index 3c2d7630..2129bf05 100644 --- a/configs/datasets/adv_glue/adv_glue_mnli/adv_glue_mnli_gen_bd8ef0.py +++ b/configs/datasets/adv_glue/adv_glue_mnli/adv_glue_mnli_gen_bd8ef0.py @@ -34,7 +34,7 @@ adv_mnli_datasets = [ dict( abbr='adv_mnli', type=AdvMnliDataset, - path='./data/adv_glue/dev_ann.json', + path='opencompass/advglue-dev', reader_cfg=adv_mnli_reader_cfg, infer_cfg=adv_mnli_infer_cfg, eval_cfg=adv_mnli_eval_cfg, diff --git a/configs/datasets/adv_glue/adv_glue_mnli_mm/adv_glue_mnli_mm_gen_bd8ef0.py b/configs/datasets/adv_glue/adv_glue_mnli_mm/adv_glue_mnli_mm_gen_bd8ef0.py index aef5f3f3..e4b42208 100644 --- a/configs/datasets/adv_glue/adv_glue_mnli_mm/adv_glue_mnli_mm_gen_bd8ef0.py +++ b/configs/datasets/adv_glue/adv_glue_mnli_mm/adv_glue_mnli_mm_gen_bd8ef0.py @@ -34,7 +34,7 @@ adv_mnli_mm_datasets = [ dict( abbr='adv_mnli_mm', type=AdvMnliMMDataset, - path='./data/adv_glue/dev_ann.json', + path='opencompass/advglue-dev', reader_cfg=adv_mnli_mm_reader_cfg, infer_cfg=adv_mnli_mm_infer_cfg, eval_cfg=adv_mnli_mm_eval_cfg, diff --git a/configs/datasets/adv_glue/adv_glue_qnli/adv_glue_qnli_gen_0b7326.py b/configs/datasets/adv_glue/adv_glue_qnli/adv_glue_qnli_gen_0b7326.py index c90a11cf..a42a6917 100644 --- a/configs/datasets/adv_glue/adv_glue_qnli/adv_glue_qnli_gen_0b7326.py +++ b/configs/datasets/adv_glue/adv_glue_qnli/adv_glue_qnli_gen_0b7326.py @@ -34,7 +34,7 @@ adv_qnli_datasets = [ dict( abbr='adv_qnli', type=AdvQnliDataset, - path='./data/adv_glue/dev_ann.json', + path='opencompass/advglue-dev', reader_cfg=adv_qnli_reader_cfg, infer_cfg=adv_qnli_infer_cfg, 
eval_cfg=adv_qnli_eval_cfg, diff --git a/configs/datasets/adv_glue/adv_glue_qqp/adv_glue_qqp_gen_cdc277.py b/configs/datasets/adv_glue/adv_glue_qqp/adv_glue_qqp_gen_cdc277.py index 0c82888d..01adfb01 100644 --- a/configs/datasets/adv_glue/adv_glue_qqp/adv_glue_qqp_gen_cdc277.py +++ b/configs/datasets/adv_glue/adv_glue_qqp/adv_glue_qqp_gen_cdc277.py @@ -34,7 +34,7 @@ adv_qqp_datasets = [ dict( abbr='adv_qqp', type=AdvQqpDataset, - path='./data/adv_glue/dev_ann.json', + path='opencompass/advglue-dev', reader_cfg=adv_qqp_reader_cfg, infer_cfg=adv_qqp_infer_cfg, eval_cfg=adv_qqp_eval_cfg, diff --git a/configs/datasets/adv_glue/adv_glue_rte/adv_glue_rte_gen_8cc547.py b/configs/datasets/adv_glue/adv_glue_rte/adv_glue_rte_gen_8cc547.py index 925a9985..ef95f47b 100644 --- a/configs/datasets/adv_glue/adv_glue_rte/adv_glue_rte_gen_8cc547.py +++ b/configs/datasets/adv_glue/adv_glue_rte/adv_glue_rte_gen_8cc547.py @@ -34,7 +34,7 @@ adv_rte_datasets = [ dict( abbr='adv_rte', type=AdvRteDataset, - path='./data/adv_glue/dev_ann.json', + path='opencompass/advglue-dev', reader_cfg=adv_rte_reader_cfg, infer_cfg=adv_rte_infer_cfg, eval_cfg=adv_rte_eval_cfg, diff --git a/configs/datasets/adv_glue/adv_glue_sst2/adv_glue_sst2_gen_ee8d3b.py b/configs/datasets/adv_glue/adv_glue_sst2/adv_glue_sst2_gen_ee8d3b.py index 19ce222d..dbb8d6bd 100644 --- a/configs/datasets/adv_glue/adv_glue_sst2/adv_glue_sst2_gen_ee8d3b.py +++ b/configs/datasets/adv_glue/adv_glue_sst2/adv_glue_sst2_gen_ee8d3b.py @@ -33,7 +33,7 @@ adv_sst2_datasets = [ dict( abbr='adv_sst2', type=AdvSst2Dataset, - path='./data/adv_glue/dev_ann.json', + path='opencompass/advglue-dev', reader_cfg=adv_sst2_reader_cfg, infer_cfg=adv_sst2_infer_cfg, eval_cfg=adv_sst2_eval_cfg, diff --git a/configs/datasets/agieval/agieval_gen_397d81.py b/configs/datasets/agieval/agieval_gen_397d81.py index 523cb074..0183f219 100644 --- a/configs/datasets/agieval/agieval_gen_397d81.py +++ b/configs/datasets/agieval/agieval_gen_397d81.py @@ -88,7 +88,7 @@ 
for _name in agieval_single_choice_sets: agieval_datasets.append( dict( type=AGIEvalDataset_v2, - path='./data/AGIEval/data/v1/', + path='opencompass/agieval', name=_name, abbr='agieval-' + _name, setting_name='zero-shot', @@ -117,7 +117,7 @@ for _name in agieval_multiple_choices_sets: agieval_datasets.append( dict( type=AGIEvalDataset_v2, - path='./data/AGIEval/data/v1/', + path='opencompass/agieval', name=_name, abbr='agieval-' + _name, setting_name='zero-shot', @@ -143,7 +143,7 @@ for _name in agieval_cloze_sets: agieval_datasets.append( dict( type=AGIEvalDataset_v2, - path='./data/AGIEval/data/v1/', + path='opencompass/agieval', name=_name, abbr='agieval-' + _name, setting_name='zero-shot', diff --git a/configs/datasets/agieval/agieval_gen_617738.py b/configs/datasets/agieval/agieval_gen_617738.py index f68d5d7c..03096ee6 100644 --- a/configs/datasets/agieval/agieval_gen_617738.py +++ b/configs/datasets/agieval/agieval_gen_617738.py @@ -92,7 +92,7 @@ for _name in agieval_single_choice_sets: agieval_datasets.append( dict( type=AGIEvalDataset_v2, - path='./data/AGIEval/data/v1/', + path='opencompass/agieval', name=_name, abbr='agieval-' + _name, setting_name='zero-shot', @@ -122,7 +122,7 @@ for _name in agieval_multiple_choices_sets: agieval_datasets.append( dict( type=AGIEvalDataset_v2, - path='./data/AGIEval/data/v1/', + path='opencompass/agieval', name=_name, abbr='agieval-' + _name, setting_name='zero-shot', @@ -148,7 +148,7 @@ for _name in agieval_cloze_sets: agieval_datasets.append( dict( type=AGIEvalDataset_v2, - path='./data/AGIEval/data/v1/', + path='opencompass/agieval', name=_name, abbr='agieval-' + _name, setting_name='zero-shot', diff --git a/configs/datasets/agieval/agieval_gen_64afd3.py b/configs/datasets/agieval/agieval_gen_64afd3.py index 2bb73f24..03378fc0 100644 --- a/configs/datasets/agieval/agieval_gen_64afd3.py +++ b/configs/datasets/agieval/agieval_gen_64afd3.py @@ -90,7 +90,7 @@ for _name in agieval_single_choice_sets: 
agieval_datasets.append( dict( type=AGIEvalDataset_v2, - path='./data/AGIEval/data/v1/', + path='opencompass/agieval', name=_name, abbr='agieval-' + _name, setting_name='zero-shot', @@ -120,7 +120,7 @@ for _name in agieval_multiple_choices_sets: agieval_datasets.append( dict( type=AGIEvalDataset_v2, - path='./data/AGIEval/data/v1/', + path='opencompass/agieval', name=_name, abbr='agieval-' + _name, setting_name='zero-shot', @@ -146,7 +146,7 @@ for _name in agieval_cloze_sets: agieval_datasets.append( dict( type=AGIEvalDataset_v2, - path='./data/AGIEval/data/v1/', + path='opencompass/agieval', name=_name, abbr='agieval-' + _name, setting_name='zero-shot', diff --git a/configs/datasets/agieval/agieval_gen_a0c741.py b/configs/datasets/agieval/agieval_gen_a0c741.py index df5fcbd2..8e726887 100644 --- a/configs/datasets/agieval/agieval_gen_a0c741.py +++ b/configs/datasets/agieval/agieval_gen_a0c741.py @@ -50,7 +50,7 @@ for name in agieval_single_choice_sets: agieval_datasets.append( dict( type=AGIEvalDataset, - path='./data/AGIEval/data/v1/', + path='opencompass/agieval', name=name, abbr='agieval-' + name, setting_name='zero-shot', @@ -74,7 +74,7 @@ for name in agieval_multiple_choices_sets + agieval_cloze_sets: agieval_datasets.append( dict( type=AGIEvalDataset, - path='./data/AGIEval/data/v1/', + path='opencompass/agieval', name=name, abbr='agieval-' + name, setting_name='zero-shot', diff --git a/configs/datasets/agieval/agieval_mixed_0fa998.py b/configs/datasets/agieval/agieval_mixed_0fa998.py index 552a1bad..6724b2ae 100644 --- a/configs/datasets/agieval/agieval_mixed_0fa998.py +++ b/configs/datasets/agieval/agieval_mixed_0fa998.py @@ -93,7 +93,7 @@ for _name in agieval_single_choice_sets: agieval_datasets.append( dict( type=AGIEvalDataset_v2, - path='./data/AGIEval/data/v1/', + path='opencompass/agieval', name=_name, abbr='agieval-' + _name, setting_name='zero-shot', @@ -124,7 +124,7 @@ for _name in agieval_multiple_choices_sets: agieval_datasets.append( dict( 
type=AGIEvalDataset_v2, - path='./data/AGIEval/data/v1/', + path='opencompass/agieval', name=_name, abbr='agieval-' + _name, setting_name='zero-shot', @@ -151,7 +151,7 @@ for _name in agieval_cloze_sets: agieval_datasets.append( dict( type=AGIEvalDataset_v2, - path='./data/AGIEval/data/v1/', + path='opencompass/agieval', name=_name, abbr='agieval-' + _name, setting_name='zero-shot', diff --git a/configs/datasets/bbh/bbh_gen_2879b0.py b/configs/datasets/bbh/bbh_gen_2879b0.py index ca247076..6cb4d01b 100644 --- a/configs/datasets/bbh/bbh_gen_2879b0.py +++ b/configs/datasets/bbh/bbh_gen_2879b0.py @@ -48,7 +48,7 @@ for name, test_type in settings: bbh_datasets.append( dict( type=BBHDataset, - path='./data/BBH/data', + path='opencompass/bbh', name=name, abbr='bbh-' + name, reader_cfg=bbh_reader_cfg.copy(), diff --git a/configs/datasets/bbh/bbh_gen_5b92b0.py b/configs/datasets/bbh/bbh_gen_5b92b0.py index 0720b855..4c675674 100644 --- a/configs/datasets/bbh/bbh_gen_5b92b0.py +++ b/configs/datasets/bbh/bbh_gen_5b92b0.py @@ -64,7 +64,7 @@ for _name in bbh_multiple_choice_sets: bbh_datasets.append( dict( type=BBHDataset, - path=f'./data/BBH/data', + path='opencompass/bbh', name=_name, abbr='bbh-' + _name, reader_cfg=bbh_reader_cfg, @@ -91,7 +91,7 @@ for _name in bbh_free_form_sets: bbh_datasets.append( dict( type=BBHDataset, - path=f'./data/BBH/data', + path='opencompass/bbh', name=_name, abbr='bbh-' + _name, reader_cfg=bbh_reader_cfg, diff --git a/configs/datasets/bbh/bbh_gen_5bf00b.py b/configs/datasets/bbh/bbh_gen_5bf00b.py index 8951ef3a..08288488 100644 --- a/configs/datasets/bbh/bbh_gen_5bf00b.py +++ b/configs/datasets/bbh/bbh_gen_5bf00b.py @@ -64,7 +64,7 @@ for _name in bbh_multiple_choice_sets: bbh_datasets.append( dict( type=BBHDataset, - path=f'./data/BBH/data', + path='opencompass/bbh', name=_name, abbr='bbh-' + _name, reader_cfg=bbh_reader_cfg, @@ -91,7 +91,7 @@ for _name in bbh_free_form_sets: bbh_datasets.append( dict( type=BBHDataset, - 
path=f'./data/BBH/data', + path='opencompass/bbh', name=_name, abbr='bbh-' + _name, reader_cfg=bbh_reader_cfg, diff --git a/configs/datasets/bbh/bbh_gen_98fba6.py b/configs/datasets/bbh/bbh_gen_98fba6.py index 78edd95b..5a7c7bdf 100644 --- a/configs/datasets/bbh/bbh_gen_98fba6.py +++ b/configs/datasets/bbh/bbh_gen_98fba6.py @@ -59,7 +59,7 @@ for _name in bbh_multiple_choice_sets: bbh_datasets.append( dict( type=BBHDataset, - path=f'./data/BBH/data', + path='opencompass/bbh', name=_name, abbr='bbh-' + _name, reader_cfg=bbh_reader_cfg, @@ -82,7 +82,7 @@ for _name in bbh_free_form_sets: bbh_datasets.append( dict( type=BBHDataset, - path=f'./data/BBH/data', + path='opencompass/bbh', name=_name, abbr='bbh-' + _name, reader_cfg=bbh_reader_cfg, diff --git a/configs/datasets/ceval/ceval_clean_ppl.py b/configs/datasets/ceval/ceval_clean_ppl.py index c3f60efd..f22b153f 100644 --- a/configs/datasets/ceval/ceval_clean_ppl.py +++ b/configs/datasets/ceval/ceval_clean_ppl.py @@ -5,6 +5,7 @@ from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccContaminationEvaluator from opencompass.datasets import CEvalDatasetClean as CEvalDataset + ceval_subject_mapping = { 'computer_network': ['Computer Network', '计算机网络', 'STEM'], 'operating_system': ['Operating System', '操作系统', 'STEM'], @@ -92,7 +93,7 @@ for _split in ['val']: ceval_datasets.append( dict( type=CEvalDataset, - path='./data/ceval/formal_ceval', + path='opencompass/ceval-exam', name=_name, abbr='ceval-' + _name if _split == 'val' else 'ceval-test-' + _name, reader_cfg=dict( diff --git a/configs/datasets/ceval/ceval_gen_2daf24.py b/configs/datasets/ceval/ceval_gen_2daf24.py index 408cca3b..4e3c5079 100644 --- a/configs/datasets/ceval/ceval_gen_2daf24.py +++ b/configs/datasets/ceval/ceval_gen_2daf24.py @@ -91,7 +91,7 @@ for _split in ['val', 'test']: ceval_datasets.append( dict( type=CEvalDataset, - path='./data/ceval/formal_ceval', + path='opencompass/ceval-exam', 
name=_name, abbr='ceval-' + _name if _split == 'val' else 'ceval-test-' + _name, diff --git a/configs/datasets/ceval/ceval_gen_5f30c7.py b/configs/datasets/ceval/ceval_gen_5f30c7.py index 75bf7bf1..92c7abcb 100644 --- a/configs/datasets/ceval/ceval_gen_5f30c7.py +++ b/configs/datasets/ceval/ceval_gen_5f30c7.py @@ -5,6 +5,7 @@ from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import CEvalDataset from opencompass.utils.text_postprocessors import first_capital_postprocess + ceval_subject_mapping = { 'computer_network': ['Computer Network', '计算机网络', 'STEM'], 'operating_system': ['Operating System', '操作系统', 'STEM'], @@ -91,7 +92,7 @@ for _split in ['val']: ceval_datasets.append( dict( type=CEvalDataset, - path='./data/ceval/formal_ceval', + path='opencompass/ceval-exam', name=_name, abbr='ceval-' + _name if _split == 'val' else 'ceval-test-' + _name, diff --git a/configs/datasets/ceval/ceval_internal_ppl_1cd8bf.py b/configs/datasets/ceval/ceval_internal_ppl_1cd8bf.py index 34cf52eb..1fbff6bb 100644 --- a/configs/datasets/ceval/ceval_internal_ppl_1cd8bf.py +++ b/configs/datasets/ceval/ceval_internal_ppl_1cd8bf.py @@ -4,6 +4,7 @@ from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import CEvalDataset + ceval_subject_mapping = { 'computer_network': ['Computer Network', '计算机网络', 'STEM'], 'operating_system': ['Operating System', '操作系统', 'STEM'], @@ -93,7 +94,7 @@ for _split in ['val', 'test']: ceval_datasets.append( dict( type=CEvalDataset, - path='./data/ceval_internal/formal_ceval', + path='opencompass/ceval-exam', name=_name, abbr='ceval-' + _name if _split == 'val' else 'ceval-test-' + _name, reader_cfg=ceval_reader_cfg, diff --git a/configs/datasets/ceval/ceval_ppl_1cd8bf.py b/configs/datasets/ceval/ceval_ppl_1cd8bf.py index dd8e19a3..1fbff6bb 100644 --- a/configs/datasets/ceval/ceval_ppl_1cd8bf.py +++ 
b/configs/datasets/ceval/ceval_ppl_1cd8bf.py @@ -4,6 +4,7 @@ from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import CEvalDataset + ceval_subject_mapping = { 'computer_network': ['Computer Network', '计算机网络', 'STEM'], 'operating_system': ['Operating System', '操作系统', 'STEM'], @@ -93,7 +94,7 @@ for _split in ['val', 'test']: ceval_datasets.append( dict( type=CEvalDataset, - path='./data/ceval/formal_ceval', + path='opencompass/ceval-exam', name=_name, abbr='ceval-' + _name if _split == 'val' else 'ceval-test-' + _name, reader_cfg=ceval_reader_cfg, diff --git a/configs/datasets/ceval/ceval_ppl_578f8d.py b/configs/datasets/ceval/ceval_ppl_578f8d.py index c337815c..508c6b9c 100644 --- a/configs/datasets/ceval/ceval_ppl_578f8d.py +++ b/configs/datasets/ceval/ceval_ppl_578f8d.py @@ -4,6 +4,7 @@ from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import CEvalDataset + ceval_subject_mapping = { 'computer_network': ['Computer Network', '计算机网络', 'STEM'], 'operating_system': ['Operating System', '操作系统', 'STEM'], @@ -91,7 +92,7 @@ for _split in ['val']: ceval_datasets.append( dict( type=CEvalDataset, - path='./data/ceval/formal_ceval', + path='opencompass/ceval-exam', name=_name, abbr='ceval-' + _name if _split == 'val' else 'ceval-test-' + _name, diff --git a/configs/datasets/ceval/ceval_ppl_93e5ce.py b/configs/datasets/ceval/ceval_ppl_93e5ce.py index f1869abc..0e17e16b 100644 --- a/configs/datasets/ceval/ceval_ppl_93e5ce.py +++ b/configs/datasets/ceval/ceval_ppl_93e5ce.py @@ -4,6 +4,7 @@ from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import CEvalDataset + ceval_subject_mapping = { 'computer_network': ['Computer Network', '计算机网络', 'STEM'], 'operating_system': ['Operating System', '操作系统', 
'STEM'], @@ -91,7 +92,7 @@ for _split in ['val', 'test']: ceval_datasets.append( dict( type=CEvalDataset, - path='./data/ceval/formal_ceval', + path='opencompass/ceval-exam', name=_name, abbr='ceval-' + _name if _split == 'val' else 'ceval-test-' + _name, diff --git a/configs/datasets/ceval/ceval_zero_shot_gen_bd40ef.py b/configs/datasets/ceval/ceval_zero_shot_gen_bd40ef.py index 5b96dd8f..d1ac8722 100644 --- a/configs/datasets/ceval/ceval_zero_shot_gen_bd40ef.py +++ b/configs/datasets/ceval/ceval_zero_shot_gen_bd40ef.py @@ -5,6 +5,7 @@ from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import CEvalDataset from opencompass.utils.text_postprocessors import first_option_postprocess + ceval_subject_mapping = { 'computer_network': ['Computer Network', '计算机网络', 'STEM'], 'operating_system': ['Operating System', '操作系统', 'STEM'], @@ -91,7 +92,7 @@ for _split in ['val']: ceval_datasets.append( dict( type=CEvalDataset, - path='./data/ceval/formal_ceval', + path='opencompass/ceval-exam', name=_name, abbr='ceval-' + _name if _split == 'val' else 'ceval-test-' + _name, diff --git a/configs/datasets/clozeTest_maxmin/clozeTest_maxmin_gen_c205fb.py b/configs/datasets/clozeTest_maxmin/clozeTest_maxmin_gen_c205fb.py index 7aaf9bf9..267d9925 100644 --- a/configs/datasets/clozeTest_maxmin/clozeTest_maxmin_gen_c205fb.py +++ b/configs/datasets/clozeTest_maxmin/clozeTest_maxmin_gen_c205fb.py @@ -33,8 +33,8 @@ maxmin_datasets = [ dict( type=MaxminDataset, abbr=f'maxmin', - test_path=f'data/clozeTest-maxmin/python/clozeTest.json', - answer_path=f'data/clozeTest-maxmin/python/answers.txt', + test_path='opencompass/clozeTest_maxmin', + answer_path='opencompass/clozeTest_maxmin_answers', reader_cfg=maxmin_reader_cfg, infer_cfg=maxmin_infer_cfg, eval_cfg=maxmin_eval_cfg, diff --git a/configs/datasets/cmmlu/cmmlu_gen_c13365.py b/configs/datasets/cmmlu/cmmlu_gen_c13365.py index 3b44c63b..acf3b045 100644 --- a/configs/datasets/cmmlu/cmmlu_gen_c13365.py +++ 
b/configs/datasets/cmmlu/cmmlu_gen_c13365.py @@ -5,6 +5,7 @@ from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator from opencompass.datasets import CMMLUDataset from opencompass.utils.text_postprocessors import first_capital_postprocess + cmmlu_subject_mapping = { 'agronomy': '农学', 'anatomy': '解剖学', @@ -107,7 +108,7 @@ for _name in cmmlu_all_sets: cmmlu_datasets.append( dict( type=CMMLUDataset, - path='./data/cmmlu/', + path='opencompass/cmmlu', name=_name, abbr=f'cmmlu-{_name}', reader_cfg=dict( diff --git a/configs/datasets/cmmlu/cmmlu_ppl_041cbf.py b/configs/datasets/cmmlu/cmmlu_ppl_041cbf.py index 87ce049f..66480897 100644 --- a/configs/datasets/cmmlu/cmmlu_ppl_041cbf.py +++ b/configs/datasets/cmmlu/cmmlu_ppl_041cbf.py @@ -102,7 +102,7 @@ for _name in cmmlu_all_sets: cmmlu_datasets.append( dict( type=CMMLUDataset, - path='./data/cmmlu/', + path='opencompass/cmmlu', name=_name, abbr=f'cmmlu-{_name}', reader_cfg=dict( diff --git a/configs/datasets/cmmlu/cmmlu_ppl_8b9c76.py b/configs/datasets/cmmlu/cmmlu_ppl_8b9c76.py index 77d37e3d..7bef4b7a 100644 --- a/configs/datasets/cmmlu/cmmlu_ppl_8b9c76.py +++ b/configs/datasets/cmmlu/cmmlu_ppl_8b9c76.py @@ -107,7 +107,7 @@ for _name in cmmlu_all_sets: cmmlu_datasets.append( dict( type=CMMLUDataset, - path='./data/cmmlu/', + path='opencompass/cmmlu', name=_name, abbr=f'cmmlu-{_name}', reader_cfg=dict( diff --git a/configs/datasets/commonsenseqa/commonsenseqa_gen_1da2d0.py b/configs/datasets/commonsenseqa/commonsenseqa_gen_1da2d0.py index 145a9ee1..baabf60b 100644 --- a/configs/datasets/commonsenseqa/commonsenseqa_gen_1da2d0.py +++ b/configs/datasets/commonsenseqa/commonsenseqa_gen_1da2d0.py @@ -45,7 +45,7 @@ commonsenseqa_datasets = [ dict( abbr='commonsense_qa', type=commonsenseqaDataset, - path='./data/commonsenseqa', + path='opencompass/commonsense_qa', reader_cfg=commonsenseqa_reader_cfg, infer_cfg=commonsenseqa_infer_cfg, eval_cfg=commonsenseqa_eval_cfg, diff --git 
a/configs/datasets/commonsenseqa/commonsenseqa_gen_c946f2.py b/configs/datasets/commonsenseqa/commonsenseqa_gen_c946f2.py index 985b2c1e..3a6e5473 100644 --- a/configs/datasets/commonsenseqa/commonsenseqa_gen_c946f2.py +++ b/configs/datasets/commonsenseqa/commonsenseqa_gen_c946f2.py @@ -52,7 +52,7 @@ commonsenseqa_datasets = [ dict( abbr='commonsense_qa', type=commonsenseqaDataset, - path='./data/commonsenseqa', + path='opencompass/commonsense_qa', reader_cfg=commonsenseqa_reader_cfg, infer_cfg=commonsenseqa_infer_cfg, eval_cfg=commonsenseqa_eval_cfg, diff --git a/configs/datasets/commonsenseqa/commonsenseqa_ppl_3e9f2d.py b/configs/datasets/commonsenseqa/commonsenseqa_ppl_3e9f2d.py index 3f4fca4c..1d7be03b 100644 --- a/configs/datasets/commonsenseqa/commonsenseqa_ppl_3e9f2d.py +++ b/configs/datasets/commonsenseqa/commonsenseqa_ppl_3e9f2d.py @@ -47,7 +47,7 @@ commonsenseqa_datasets = [ dict( abbr='commonsense_qa', type=commonsenseqaDataset, - path='./data/commonsenseqa', + path='opencompass/commonsense_qa', reader_cfg=commonsenseqa_reader_cfg, infer_cfg=commonsenseqa_infer_cfg, eval_cfg=commonsenseqa_eval_cfg) diff --git a/configs/datasets/commonsenseqa/commonsenseqa_ppl_5545e2.py b/configs/datasets/commonsenseqa/commonsenseqa_ppl_5545e2.py index c390762b..ac8fcb1e 100644 --- a/configs/datasets/commonsenseqa/commonsenseqa_ppl_5545e2.py +++ b/configs/datasets/commonsenseqa/commonsenseqa_ppl_5545e2.py @@ -42,7 +42,7 @@ commonsenseqa_datasets = [ dict( abbr='commonsense_qa', type=commonsenseqaDataset, - path='./data/commonsenseqa', + path='opencompass/commonsense_qa', reader_cfg=commonsenseqa_reader_cfg, infer_cfg=commonsenseqa_infer_cfg, eval_cfg=commonsenseqa_eval_cfg) diff --git a/configs/datasets/commonsenseqa/commonsenseqa_ppl_716f78.py b/configs/datasets/commonsenseqa/commonsenseqa_ppl_716f78.py index f7bf4cf5..10dec1e9 100644 --- a/configs/datasets/commonsenseqa/commonsenseqa_ppl_716f78.py +++ b/configs/datasets/commonsenseqa/commonsenseqa_ppl_716f78.py @@ -38,7 
+38,7 @@ commonsenseqa_datasets = [ dict( abbr='commonsense_qa', type=commonsenseqaDataset, - path='./data/commonsenseqa', + path='opencompass/commonsense_qa', reader_cfg=commonsenseqa_reader_cfg, infer_cfg=commonsenseqa_infer_cfg, eval_cfg=commonsenseqa_eval_cfg) diff --git a/configs/datasets/commonsenseqa/commonsenseqa_ppl_c49e77.py b/configs/datasets/commonsenseqa/commonsenseqa_ppl_c49e77.py index fe952b07..e5f202b0 100644 --- a/configs/datasets/commonsenseqa/commonsenseqa_ppl_c49e77.py +++ b/configs/datasets/commonsenseqa/commonsenseqa_ppl_c49e77.py @@ -34,7 +34,7 @@ commonsenseqa_datasets = [ dict( abbr='commonsense_qa', type=commonsenseqaDataset, - path='./data/commonsenseqa', + path='opencompass/commonsense_qa', reader_cfg=commonsenseqa_reader_cfg, infer_cfg=commonsenseqa_infer_cfg, eval_cfg=commonsenseqa_eval_cfg) diff --git a/configs/datasets/commonsenseqa/commonsenseqa_ppl_e51e32.py b/configs/datasets/commonsenseqa/commonsenseqa_ppl_e51e32.py index 526519c9..6f401f91 100644 --- a/configs/datasets/commonsenseqa/commonsenseqa_ppl_e51e32.py +++ b/configs/datasets/commonsenseqa/commonsenseqa_ppl_e51e32.py @@ -35,7 +35,7 @@ commonsenseqa_datasets = [ dict( abbr='commonsense_qa', type=commonsenseqaDataset, - path='./data/commonsenseqa', + path='opencompass/commonsense_qa', reader_cfg=commonsenseqa_reader_cfg, infer_cfg=commonsenseqa_infer_cfg, eval_cfg=commonsenseqa_eval_cfg) diff --git a/configs/datasets/compassbench_20_v1_1/knowledge/compassbench_v1_knowledge_gen_bd74e0.py b/configs/datasets/compassbench_20_v1_1/knowledge/compassbench_v1_knowledge_gen_bd74e0.py index c3163212..11df7175 100644 --- a/configs/datasets/compassbench_20_v1_1/knowledge/compassbench_v1_knowledge_gen_bd74e0.py +++ b/configs/datasets/compassbench_20_v1_1/knowledge/compassbench_v1_knowledge_gen_bd74e0.py @@ -92,7 +92,7 @@ for _split in list(compassbench_v1_knowledge_sets.keys()): ) -from opencompass.datasets import TriviaQADataset_V3, TriviaQAEvaluator +from opencompass.datasets import 
TriviaQADatasetV3, TriviaQAEvaluator triviaqa_and_nq_reader_cfg = dict(input_columns=['question'], output_column='answer') @@ -123,7 +123,7 @@ triviaqa_and_nq_eval_cfg = dict(evaluator=dict(type=TriviaQAEvaluator), pred_rol compassbench_v1_knowledge_datasets.append( dict( - type=TriviaQADataset_V3, + type=TriviaQADatasetV3, abbr='compassbench_v1_knowledge-mixed-cloze_en', path='data/compassbench_v1.1/knowledge/mixed/cloze_en.jsonl', reader_cfg=triviaqa_and_nq_reader_cfg, diff --git a/configs/datasets/compassbench_20_v1_1_public/knowledge/compassbench_v1_knowledge_gen_bd74e0.py b/configs/datasets/compassbench_20_v1_1_public/knowledge/compassbench_v1_knowledge_gen_bd74e0.py index 3c9dd284..b38f3cd7 100644 --- a/configs/datasets/compassbench_20_v1_1_public/knowledge/compassbench_v1_knowledge_gen_bd74e0.py +++ b/configs/datasets/compassbench_20_v1_1_public/knowledge/compassbench_v1_knowledge_gen_bd74e0.py @@ -92,7 +92,7 @@ for _split in list(compassbench_v1_knowledge_sets.keys()): ) -from opencompass.datasets import TriviaQADataset_V3, TriviaQAEvaluator +from opencompass.datasets import TriviaQADatasetV3, TriviaQAEvaluator triviaqa_and_nq_reader_cfg = dict(input_columns=['question'], output_column='answer') @@ -123,7 +123,7 @@ triviaqa_and_nq_eval_cfg = dict(evaluator=dict(type=TriviaQAEvaluator), pred_rol compassbench_v1_knowledge_datasets.append( dict( - type=TriviaQADataset_V3, + type=TriviaQADatasetV3, abbr='compassbench_v1_knowledge-mixed-cloze_en_public', path='data/compassbench_v1.1.public/knowledge/mixed/cloze_en.jsonl', reader_cfg=triviaqa_and_nq_reader_cfg, diff --git a/configs/datasets/contamination/mbpp_contamination_ppl_f01cb6.py b/configs/datasets/contamination/mbpp_contamination_ppl_f01cb6.py index d1547a0c..40091006 100644 --- a/configs/datasets/contamination/mbpp_contamination_ppl_f01cb6.py +++ b/configs/datasets/contamination/mbpp_contamination_ppl_f01cb6.py @@ -25,7 +25,7 @@ for split in ['train', 'test']: dict( abbr=f'mbpp-{split}-ppl', 
type=SanitizedMBPPDataset, - path='./data/mbpp/sanitized-mbpp.jsonl', + path='opencompass/sanitized_mbpp', reader_cfg=mbpp_reader_cfg, infer_cfg=mbpp_infer_cfg, eval_cfg=mbpp_eval_cfg) diff --git a/configs/datasets/crowspairs/crowspairs_gen_02b6c1.py b/configs/datasets/crowspairs/crowspairs_gen_02b6c1.py index 44072a48..a98473c7 100644 --- a/configs/datasets/crowspairs/crowspairs_gen_02b6c1.py +++ b/configs/datasets/crowspairs/crowspairs_gen_02b6c1.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import crowspairsDataset_V2 +from opencompass.datasets import CrowspairsDatasetV2 from opencompass.utils.text_postprocessors import first_capital_postprocess crowspairs_reader_cfg = dict( @@ -32,7 +32,7 @@ crowspairs_eval_cfg = dict( crowspairs_datasets = [ dict( - type=crowspairsDataset_V2, + type=CrowspairsDatasetV2, path='crows_pairs', reader_cfg=crowspairs_reader_cfg, infer_cfg=crowspairs_infer_cfg, diff --git a/configs/datasets/crowspairs/crowspairs_gen_381af0.py b/configs/datasets/crowspairs/crowspairs_gen_381af0.py index 8b0e62b3..ed597400 100644 --- a/configs/datasets/crowspairs/crowspairs_gen_381af0.py +++ b/configs/datasets/crowspairs/crowspairs_gen_381af0.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import (crowspairsDataset_V2, crowspairs_postprocess, +from opencompass.datasets import (CrowspairsDatasetV2, crowspairs_postprocess, CrowspairsEvaluator) crowspairs_reader_cfg = dict( @@ -41,7 +41,7 @@ crowspairs_eval_cfg = dict( crowspairs_datasets = [ dict( abbr='crows_pairs', - type=crowspairsDataset_V2, + type=CrowspairsDatasetV2, 
path='crows_pairs', reader_cfg=crowspairs_reader_cfg, infer_cfg=crowspairs_infer_cfg, diff --git a/configs/datasets/crowspairs/crowspairs_ppl_47f211.py b/configs/datasets/crowspairs/crowspairs_ppl_47f211.py index a70d3bda..9a21530f 100644 --- a/configs/datasets/crowspairs/crowspairs_ppl_47f211.py +++ b/configs/datasets/crowspairs/crowspairs_ppl_47f211.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import crowspairsDataset +from opencompass.datasets import CrowspairsDataset crowspairs_reader_cfg = dict( input_columns=['sent_more', 'sent_less'], @@ -24,7 +24,7 @@ crowspairs_eval_cfg = dict(evaluator=dict(type=AccEvaluator), ) crowspairs_datasets = [ dict( - type=crowspairsDataset, + type=CrowspairsDataset, path='crows_pairs', reader_cfg=crowspairs_reader_cfg, infer_cfg=crowspairs_infer_cfg, diff --git a/configs/datasets/crowspairs/crowspairs_ppl_e811e1.py b/configs/datasets/crowspairs/crowspairs_ppl_e811e1.py index 49f5a312..08809f6b 100644 --- a/configs/datasets/crowspairs/crowspairs_ppl_e811e1.py +++ b/configs/datasets/crowspairs/crowspairs_ppl_e811e1.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import crowspairsDataset +from opencompass.datasets import CrowspairsDataset crowspairs_reader_cfg = dict( input_columns=['sent_more', 'sent_less'], @@ -32,7 +32,7 @@ crowspairs_eval_cfg = dict(evaluator=dict(type=AccEvaluator), ) crowspairs_datasets = [ dict( - type=crowspairsDataset, + type=CrowspairsDataset, path='crows_pairs', reader_cfg=crowspairs_reader_cfg, 
infer_cfg=crowspairs_infer_cfg, diff --git a/configs/datasets/crowspairs_cn/crowspairscn_gen_556dc9.py b/configs/datasets/crowspairs_cn/crowspairscn_gen_556dc9.py index b0eacb08..5216363f 100644 --- a/configs/datasets/crowspairs_cn/crowspairscn_gen_556dc9.py +++ b/configs/datasets/crowspairs_cn/crowspairscn_gen_556dc9.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import CrowspairsDataset_CN +from opencompass.datasets import CrowspairsDatasetCN from opencompass.utils.text_postprocessors import first_capital_postprocess crowspairscn_reader_cfg = dict( @@ -55,7 +55,7 @@ crowspairscn_eval_cfg = dict( crowspairscn_datasets = [ dict( abbr='crowspairs_cn', - type=CrowspairsDataset_CN, + type=CrowspairsDatasetCN, path='./data/crowspairs_cn/test.jsonl', reader_cfg=crowspairscn_reader_cfg, infer_cfg=crowspairscn_infer_cfg, diff --git a/configs/datasets/crowspairs_cn/crowspairscn_ppl_f53575.py b/configs/datasets/crowspairs_cn/crowspairscn_ppl_f53575.py index d43f932e..8c3213c1 100644 --- a/configs/datasets/crowspairs_cn/crowspairscn_ppl_f53575.py +++ b/configs/datasets/crowspairs_cn/crowspairscn_ppl_f53575.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import CrowspairsDataset_CN +from opencompass.datasets import CrowspairsDatasetCN crowspairscn_reader_cfg = dict( input_columns=['sent_more', 'sent_less'], @@ -30,7 +30,7 @@ crowspairscn_eval_cfg = dict( crowspairscn_datasets = [ dict( abbr='crowspairs_cn', - type=CrowspairsDataset_CN, + type=CrowspairsDatasetCN, 
path='./data/crowspairs_cn/test.jsonl', reader_cfg=crowspairscn_reader_cfg, infer_cfg=crowspairscn_infer_cfg, diff --git a/configs/datasets/flores/flores_gen_806ede.py b/configs/datasets/flores/flores_gen_806ede.py index 2f1b4e4b..b1e4a155 100644 --- a/configs/datasets/flores/flores_gen_806ede.py +++ b/configs/datasets/flores/flores_gen_806ede.py @@ -154,7 +154,7 @@ for _flores_subtask in _flores_subtasks: dict( abbr=f'flores_100_{_src}-{_tgt}', type=FloresFirst100Dataset, - path='./data/flores_first100', + path='opencompass/flores', name=f'{_flores_source}-{_flores_target}', reader_cfg=flores_reader_cfg.copy(), infer_cfg=flores_infer_cfg.copy(), diff --git a/configs/datasets/flores/flores_gen_aad4fd.py b/configs/datasets/flores/flores_gen_aad4fd.py index 16a60f15..e87fe730 100644 --- a/configs/datasets/flores/flores_gen_aad4fd.py +++ b/configs/datasets/flores/flores_gen_aad4fd.py @@ -147,7 +147,7 @@ for _flores_subtask in _flores_subtasks: dict( abbr=f'flores_100_{_src}-{_tgt}', type=FloresFirst100Dataset, - path='./data/flores_first100', + path='opencompass/flores', name=f'{_flores_source}-{_flores_target}', reader_cfg=flores_reader_cfg.copy(), infer_cfg=flores_infer_cfg.copy(), diff --git a/configs/datasets/gpqa/gpqa_openai_simple_evals_gen_5aeece.py b/configs/datasets/gpqa/gpqa_openai_simple_evals_gen_5aeece.py index a82b01d3..1dbcc1cc 100644 --- a/configs/datasets/gpqa/gpqa_openai_simple_evals_gen_5aeece.py +++ b/configs/datasets/gpqa/gpqa_openai_simple_evals_gen_5aeece.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import GPQADataset_Simple_Eval, GPQA_Simple_Eval_postprocess, GPQAEvaluator +from opencompass.datasets import GPQASimpleEvalDataset, GPQA_Simple_Eval_postprocess, GPQAEvaluator # openai_simple_eval prompt align_prompt = """ @@ -43,7 +43,7 @@ for split in 
list(gpqa_subsets.keys()): gpqa_datasets.append( dict( abbr='GPQA_' + split, - type=GPQADataset_Simple_Eval, + type=GPQASimpleEvalDataset, path='./data/gpqa/', name=gpqa_subsets[split], reader_cfg=gpqa_reader_cfg, diff --git a/configs/datasets/gsm8k/gsm8k_0shot_gen_a58960.py b/configs/datasets/gsm8k/gsm8k_0shot_gen_a58960.py index 2bb42d83..ef97be06 100644 --- a/configs/datasets/gsm8k/gsm8k_0shot_gen_a58960.py +++ b/configs/datasets/gsm8k/gsm8k_0shot_gen_a58960.py @@ -28,7 +28,7 @@ gsm8k_datasets = [ dict( abbr='gsm8k', type=GSM8KDataset, - path='./data/gsm8k', + path='opencompass/gsm8k', reader_cfg=gsm8k_reader_cfg, infer_cfg=gsm8k_infer_cfg, eval_cfg=gsm8k_eval_cfg, diff --git a/configs/datasets/gsm8k/gsm8k_0shot_v2_gen_a58960.py b/configs/datasets/gsm8k/gsm8k_0shot_v2_gen_a58960.py index d958b88e..c55f8bcf 100644 --- a/configs/datasets/gsm8k/gsm8k_0shot_v2_gen_a58960.py +++ b/configs/datasets/gsm8k/gsm8k_0shot_v2_gen_a58960.py @@ -29,7 +29,7 @@ gsm8k_datasets = [ dict( abbr='gsm8k', type=GSM8KDataset, - path='./data/gsm8k', + path='opencompass/gsm8k', reader_cfg=gsm8k_reader_cfg, infer_cfg=gsm8k_infer_cfg, eval_cfg=gsm8k_eval_cfg, diff --git a/configs/datasets/gsm8k/gsm8k_agent_gen_c3dff3.py b/configs/datasets/gsm8k/gsm8k_agent_gen_c3dff3.py index f3feba57..c16f76a2 100644 --- a/configs/datasets/gsm8k/gsm8k_agent_gen_c3dff3.py +++ b/configs/datasets/gsm8k/gsm8k_agent_gen_c3dff3.py @@ -47,7 +47,7 @@ gsm8k_datasets = [ dict( abbr='gsm8k-agent', type=GSM8KDataset, - path='./data/gsm8k', + path='opencompass/gsm8k', reader_cfg=gsm8k_reader_cfg, infer_cfg=gsm8k_infer_cfg, eval_cfg=gsm8k_eval_cfg, diff --git a/configs/datasets/gsm8k/gsm8k_gen_17d0dc.py b/configs/datasets/gsm8k/gsm8k_gen_17d0dc.py index fd448cc8..cfbec9ec 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_17d0dc.py +++ b/configs/datasets/gsm8k/gsm8k_gen_17d0dc.py @@ -2,7 +2,6 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from 
opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator - gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') gsm8k_infer_cfg = dict( @@ -32,7 +31,7 @@ gsm8k_datasets = [ dict( abbr='gsm8k', type=GSM8KDataset, - path='./data/gsm8k', + path='opencompass/gsm8k', reader_cfg=gsm8k_reader_cfg, infer_cfg=gsm8k_infer_cfg, eval_cfg=gsm8k_eval_cfg) diff --git a/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py b/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py index 217f9e24..9897608e 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py +++ b/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator + gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') gsm8k_infer_cfg = dict( @@ -32,7 +33,7 @@ gsm8k_datasets = [ dict( abbr='gsm8k', type=GSM8KDataset, - path='./data/gsm8k', + path='opencompass/gsm8k', reader_cfg=gsm8k_reader_cfg, infer_cfg=gsm8k_infer_cfg, eval_cfg=gsm8k_eval_cfg) diff --git a/configs/datasets/gsm8k/gsm8k_gen_1dce88.py b/configs/datasets/gsm8k/gsm8k_gen_1dce88.py index 293dd70f..64484980 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_1dce88.py +++ b/configs/datasets/gsm8k/gsm8k_gen_1dce88.py @@ -2,7 +2,6 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator - gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') gsm8k_infer_cfg = dict( @@ -79,7 +78,7 @@ gsm8k_datasets = [ dict( abbr='gsm8k', type=GSM8KDataset, - 
path='./data/gsm8k', + path='opencompass/gsm8k', reader_cfg=gsm8k_reader_cfg, infer_cfg=gsm8k_infer_cfg, eval_cfg=gsm8k_eval_cfg) diff --git a/configs/datasets/gsm8k/gsm8k_gen_3309bd.py b/configs/datasets/gsm8k/gsm8k_gen_3309bd.py index 1c5837c4..85bee57f 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_3309bd.py +++ b/configs/datasets/gsm8k/gsm8k_gen_3309bd.py @@ -2,7 +2,6 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator - gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') gsm8k_infer_cfg = dict( @@ -32,7 +31,7 @@ gsm8k_datasets = [ dict( abbr='gsm8k', type=GSM8KDataset, - path='./data/gsm8k', + path='opencompass/gsm8k', reader_cfg=gsm8k_reader_cfg, infer_cfg=gsm8k_infer_cfg, eval_cfg=gsm8k_eval_cfg) diff --git a/configs/datasets/gsm8k/gsm8k_gen_57b0b1.py b/configs/datasets/gsm8k/gsm8k_gen_57b0b1.py index c052afff..e861026c 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_57b0b1.py +++ b/configs/datasets/gsm8k/gsm8k_gen_57b0b1.py @@ -2,7 +2,6 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import AgentInferencer from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kAgentEvaluator - # This config is for code interpreter gsm8k_example = """ Example: @@ -77,7 +76,7 @@ gsm8k_datasets = [ dict( abbr='gsm8k', type=GSM8KDataset, - path='./data/gsm8k', + path='opencompass/gsm8k', reader_cfg=gsm8k_reader_cfg, infer_cfg=gsm8k_infer_cfg, eval_cfg=gsm8k_eval_cfg) diff --git a/configs/datasets/gsm8k/gsm8k_gen_701491.py b/configs/datasets/gsm8k/gsm8k_gen_701491.py index 8b00f340..48b68bc8 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_701491.py +++ 
b/configs/datasets/gsm8k/gsm8k_gen_701491.py @@ -3,7 +3,6 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator - gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') gsm8k_infer_cfg = dict( @@ -26,7 +25,7 @@ gsm8k_datasets = [ dict( abbr='gsm8k', type=GSM8KDataset, - path='./data/gsm8k', + path='opencompass/gsm8k', reader_cfg=gsm8k_reader_cfg, infer_cfg=gsm8k_infer_cfg, eval_cfg=gsm8k_eval_cfg) diff --git a/configs/datasets/gsm8k/gsm8k_gen_a3e34a.py b/configs/datasets/gsm8k/gsm8k_gen_a3e34a.py index f038028b..467bc16e 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_a3e34a.py +++ b/configs/datasets/gsm8k/gsm8k_gen_a3e34a.py @@ -3,7 +3,6 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import SCInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator - gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer' ) generation_kwargs = dict(do_sample=True, temperature=0.7, top_k=40) @@ -82,7 +81,7 @@ gsm8k_datasets = [ dict( abbr='gsm8k', type=GSM8KDataset, - path='./data/gsm8k', + path='opencompass/gsm8k', reader_cfg=gsm8k_reader_cfg, infer_cfg=gsm8k_infer_cfg, eval_cfg=gsm8k_eval_cfg) diff --git a/configs/datasets/gsm8k/gsm8k_gen_d6de81.py b/configs/datasets/gsm8k/gsm8k_gen_d6de81.py index 42c04cd0..2c66b7c7 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_d6de81.py +++ b/configs/datasets/gsm8k/gsm8k_gen_d6de81.py @@ -2,7 +2,6 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from 
opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator - gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') gsm8k_infer_cfg = dict( @@ -30,7 +29,7 @@ gsm8k_datasets = [ dict( abbr='gsm8k', type=GSM8KDataset, - path='./data/gsm8k', + path='opencompass/gsm8k', reader_cfg=gsm8k_reader_cfg, infer_cfg=gsm8k_infer_cfg, eval_cfg=gsm8k_eval_cfg) diff --git a/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py b/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py index ab7f7dbf..8dc62013 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py +++ b/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py @@ -7,7 +7,6 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator - gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') gsm8k_infer_cfg = dict( @@ -47,7 +46,7 @@ gsm8k_datasets = [ dict( abbr='gsm8k', type=GSM8KDataset, - path='./data/gsm8k', + path='opencompass/gsm8k', reader_cfg=gsm8k_reader_cfg, infer_cfg=gsm8k_infer_cfg, eval_cfg=gsm8k_eval_cfg) diff --git a/configs/datasets/gsm8k/gsm8k_gen_ee684f.py b/configs/datasets/gsm8k/gsm8k_gen_ee684f.py index 125b229b..37bd9557 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_ee684f.py +++ b/configs/datasets/gsm8k/gsm8k_gen_ee684f.py @@ -2,7 +2,6 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator - gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') gsm8k_infer_cfg = dict( @@ -81,7 +80,7 @@ gsm8k_datasets = [ dict( abbr='gsm8k', type=GSM8KDataset, - 
path='./data/gsm8k', + path='opencompass/gsm8k', reader_cfg=gsm8k_reader_cfg, infer_cfg=gsm8k_infer_cfg, eval_cfg=gsm8k_eval_cfg) diff --git a/configs/datasets/hellaswag/hellaswag_10shot_gen_e42710.py b/configs/datasets/hellaswag/hellaswag_10shot_gen_e42710.py index 5b5136fe..9a12b53b 100644 --- a/configs/datasets/hellaswag/hellaswag_10shot_gen_e42710.py +++ b/configs/datasets/hellaswag/hellaswag_10shot_gen_e42710.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import FixKRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator -from opencompass.datasets import hellaswagDatasetwithICE +from opencompass.datasets import HellaswagDatasetwithICE from opencompass.utils.text_postprocessors import first_option_postprocess hellaswag_reader_cfg = dict( @@ -49,8 +49,8 @@ hellaswag_eval_cfg = dict( hellaswag_datasets = [ dict( abbr='hellaswag', - type=hellaswagDatasetwithICE, - path='./data/hellaswag/', + type=HellaswagDatasetwithICE, + path='opencompass/hellaswag_ice', reader_cfg=hellaswag_reader_cfg, infer_cfg=hellaswag_infer_cfg, eval_cfg=hellaswag_eval_cfg, diff --git a/configs/datasets/hellaswag/hellaswag_10shot_ppl_59c85e.py b/configs/datasets/hellaswag/hellaswag_10shot_ppl_59c85e.py index e0654282..61fbb8ae 100644 --- a/configs/datasets/hellaswag/hellaswag_10shot_ppl_59c85e.py +++ b/configs/datasets/hellaswag/hellaswag_10shot_ppl_59c85e.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import FixKRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator -from opencompass.datasets import hellaswagDatasetwithICE +from opencompass.datasets import HellaswagDatasetwithICE from opencompass.utils.text_postprocessors import first_capital_postprocess 
hellaswag_reader_cfg = dict( @@ -36,8 +36,8 @@ hellaswag_eval_cfg = dict( hellaswag_datasets = [ dict( abbr='hellaswag', - type=hellaswagDatasetwithICE, - path='./data/hellaswag/', + type=HellaswagDatasetwithICE, + path='opencompass/hellaswag_ice', reader_cfg=hellaswag_reader_cfg, infer_cfg=hellaswag_infer_cfg, eval_cfg=hellaswag_eval_cfg, diff --git a/configs/datasets/hellaswag/hellaswag_clean_ppl.py b/configs/datasets/hellaswag/hellaswag_clean_ppl.py index 1eab452c..b4f8942f 100644 --- a/configs/datasets/hellaswag/hellaswag_clean_ppl.py +++ b/configs/datasets/hellaswag/hellaswag_clean_ppl.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccContaminationEvaluator -from opencompass.datasets import hellaswagDatasetClean as hellaswagDataset +from opencompass.datasets import HellaswagDatasetClean as HellaswagDataset hellaswag_reader_cfg = dict( input_columns=['ctx', 'A', 'B', 'C', 'D'], @@ -27,8 +27,8 @@ hellaswag_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator), hellaswag_datasets = [ dict( abbr='hellaswag', - type=hellaswagDataset, - path='./data/hellaswag/hellaswag.jsonl', + type=HellaswagDataset, + path='opencompass/hellaswag', reader_cfg=hellaswag_reader_cfg, infer_cfg=hellaswag_infer_cfg, eval_cfg=hellaswag_eval_cfg) diff --git a/configs/datasets/hellaswag/hellaswag_gen_6faab5.py b/configs/datasets/hellaswag/hellaswag_gen_6faab5.py index 94339e89..37d29636 100644 --- a/configs/datasets/hellaswag/hellaswag_gen_6faab5.py +++ b/configs/datasets/hellaswag/hellaswag_gen_6faab5.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from 
opencompass.datasets import hellaswagDataset_V2 +from opencompass.datasets import HellaswagDataset_V2 from opencompass.utils.text_postprocessors import first_option_postprocess hellaswag_reader_cfg = dict( @@ -36,8 +36,8 @@ hellaswag_eval_cfg = dict( hellaswag_datasets = [ dict( abbr='hellaswag', - type=hellaswagDataset_V2, - path='./data/hellaswag/hellaswag.jsonl', + type=HellaswagDataset_V2, + path='opencompass/hellaswag', reader_cfg=hellaswag_reader_cfg, infer_cfg=hellaswag_infer_cfg, eval_cfg=hellaswag_eval_cfg) diff --git a/configs/datasets/hellaswag/hellaswag_ppl_47bff9.py b/configs/datasets/hellaswag/hellaswag_ppl_47bff9.py index 9c9474d7..6eeca3c0 100644 --- a/configs/datasets/hellaswag/hellaswag_ppl_47bff9.py +++ b/configs/datasets/hellaswag/hellaswag_ppl_47bff9.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import hellaswagDataset +from opencompass.datasets import HellaswagDataset hellaswag_reader_cfg = dict( input_columns=['ctx', 'A', 'B', 'C', 'D'], @@ -26,8 +26,8 @@ hellaswag_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) hellaswag_datasets = [ dict( abbr='hellaswag', - type=hellaswagDataset, - path='./data/hellaswag/hellaswag.jsonl', + type=HellaswagDataset, + path='opencompass/hellaswag', reader_cfg=hellaswag_reader_cfg, infer_cfg=hellaswag_infer_cfg, eval_cfg=hellaswag_eval_cfg) diff --git a/configs/datasets/hellaswag/hellaswag_ppl_7d7f2d.py b/configs/datasets/hellaswag/hellaswag_ppl_7d7f2d.py index 58dc2c66..ed1fd337 100644 --- a/configs/datasets/hellaswag/hellaswag_ppl_7d7f2d.py +++ b/configs/datasets/hellaswag/hellaswag_ppl_7d7f2d.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from 
opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import hellaswagDataset_V2 +from opencompass.datasets import HellaswagDataset_V2 hellaswag_reader_cfg = dict( input_columns=['query', 'A', 'B', 'C', 'D'], @@ -25,8 +25,8 @@ hellaswag_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) hellaswag_datasets = [ dict( abbr='hellaswag', - type=hellaswagDataset_V2, - path='./data/hellaswag/hellaswag.jsonl', + type=HellaswagDataset_V2, + path='opencompass/hellaswag', reader_cfg=hellaswag_reader_cfg, infer_cfg=hellaswag_infer_cfg, eval_cfg=hellaswag_eval_cfg) diff --git a/configs/datasets/hellaswag/hellaswag_ppl_9dbb12.py b/configs/datasets/hellaswag/hellaswag_ppl_9dbb12.py index 034980fb..e61f52d9 100644 --- a/configs/datasets/hellaswag/hellaswag_ppl_9dbb12.py +++ b/configs/datasets/hellaswag/hellaswag_ppl_9dbb12.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import hellaswagDataset +from opencompass.datasets import HellaswagDataset hellaswag_reader_cfg = dict( input_columns=['ctx', 'A', 'B', 'C', 'D'], @@ -26,8 +26,8 @@ hellaswag_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) hellaswag_datasets = [ dict( abbr='hellaswag', - type=hellaswagDataset, - path='./data/hellaswag/hellaswag.jsonl', + type=HellaswagDataset, + path='opencompass/hellaswag', reader_cfg=hellaswag_reader_cfg, infer_cfg=hellaswag_infer_cfg, eval_cfg=hellaswag_eval_cfg) diff --git a/configs/datasets/hellaswag/hellaswag_ppl_a6e128.py b/configs/datasets/hellaswag/hellaswag_ppl_a6e128.py index cb2d477a..22223751 100644 --- a/configs/datasets/hellaswag/hellaswag_ppl_a6e128.py +++ b/configs/datasets/hellaswag/hellaswag_ppl_a6e128.py @@ -2,7 +2,7 @@ from 
opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import hellaswagDataset_V3 +from opencompass.datasets import HellaswagDataset_V3 hellaswag_reader_cfg = dict( input_columns=['query', 'A', 'B', 'C', 'D'], @@ -33,8 +33,8 @@ hellaswag_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) hellaswag_datasets = [ dict( abbr='hellaswag', - type=hellaswagDataset_V3, - path='./data/hellaswag/hellaswag.jsonl', + type=HellaswagDataset_V3, + path='opencompass/hellaswag', reader_cfg=hellaswag_reader_cfg, infer_cfg=hellaswag_infer_cfg, eval_cfg=hellaswag_eval_cfg) diff --git a/configs/datasets/humaneval/deprecated_humaneval_gen_4a6eef.py b/configs/datasets/humaneval/deprecated_humaneval_gen_4a6eef.py index 1eac2c69..67dd58a5 100644 --- a/configs/datasets/humaneval/deprecated_humaneval_gen_4a6eef.py +++ b/configs/datasets/humaneval/deprecated_humaneval_gen_4a6eef.py @@ -29,7 +29,7 @@ humaneval_datasets = [ dict( abbr='openai_humaneval', type=HumanevalDataset, - path='./data/humaneval/human-eval-v2-20210705.jsonl', + path='opencompass/humaneval', reader_cfg=humaneval_reader_cfg, infer_cfg=humaneval_infer_cfg, eval_cfg=humaneval_eval_cfg) diff --git a/configs/datasets/humaneval/deprecated_humaneval_gen_6d1cc2.py b/configs/datasets/humaneval/deprecated_humaneval_gen_6d1cc2.py index c1f2aca5..830d391f 100644 --- a/configs/datasets/humaneval/deprecated_humaneval_gen_6d1cc2.py +++ b/configs/datasets/humaneval/deprecated_humaneval_gen_6d1cc2.py @@ -29,7 +29,7 @@ humaneval_datasets = [ dict( abbr='openai_humaneval', type=HumanevalDataset, - path='./data/humaneval/human-eval-v2-20210705.jsonl', + path='opencompass/humaneval', reader_cfg=humaneval_reader_cfg, infer_cfg=humaneval_infer_cfg, eval_cfg=humaneval_eval_cfg) diff --git 
a/configs/datasets/humaneval/deprecated_humaneval_gen_a82cae.py b/configs/datasets/humaneval/deprecated_humaneval_gen_a82cae.py index 78239204..dc5d10f5 100644 --- a/configs/datasets/humaneval/deprecated_humaneval_gen_a82cae.py +++ b/configs/datasets/humaneval/deprecated_humaneval_gen_a82cae.py @@ -29,7 +29,7 @@ humaneval_datasets = [ dict( abbr='openai_humaneval', type=HumanevalDataset, - path='./data/humaneval/human-eval-v2-20210705.jsonl', + path='opencompass/humaneval', reader_cfg=humaneval_reader_cfg, infer_cfg=humaneval_infer_cfg, eval_cfg=humaneval_eval_cfg) diff --git a/configs/datasets/humaneval/deprecated_humaneval_gen_d2537e.py b/configs/datasets/humaneval/deprecated_humaneval_gen_d2537e.py index 07158f0b..69231fdc 100644 --- a/configs/datasets/humaneval/deprecated_humaneval_gen_d2537e.py +++ b/configs/datasets/humaneval/deprecated_humaneval_gen_d2537e.py @@ -26,7 +26,7 @@ humaneval_datasets = [ dict( abbr='openai_humaneval', type=HumanevalDataset, - path='./data/humaneval/human-eval-v2-20210705.jsonl', + path='opencompass/humaneval', reader_cfg=humaneval_reader_cfg, infer_cfg=humaneval_infer_cfg, eval_cfg=humaneval_eval_cfg) diff --git a/configs/datasets/humaneval/deprecated_humaneval_gen_fd5822.py b/configs/datasets/humaneval/deprecated_humaneval_gen_fd5822.py index ff28e89e..ea56afd6 100644 --- a/configs/datasets/humaneval/deprecated_humaneval_gen_fd5822.py +++ b/configs/datasets/humaneval/deprecated_humaneval_gen_fd5822.py @@ -24,7 +24,7 @@ humaneval_datasets = [ dict( abbr='openai_humaneval', type=HumanevalDataset, - path='./data/humaneval/human-eval-v2-20210705.jsonl', + path='opencompass/humaneval', reader_cfg=humaneval_reader_cfg, infer_cfg=humaneval_infer_cfg, eval_cfg=humaneval_eval_cfg) diff --git a/configs/datasets/humaneval/deprecated_humaneval_gen_ff7054.py b/configs/datasets/humaneval/deprecated_humaneval_gen_ff7054.py index 63343c7c..a1be3ba9 100644 --- a/configs/datasets/humaneval/deprecated_humaneval_gen_ff7054.py +++ 
b/configs/datasets/humaneval/deprecated_humaneval_gen_ff7054.py @@ -34,7 +34,7 @@ humaneval_datasets = [ dict( abbr='openai_humaneval', type=HumanevalDataset, - path='./data/humaneval/human-eval-v2-20210705.jsonl', + path='opencompass/humaneval', reader_cfg=humaneval_reader_cfg, infer_cfg=humaneval_infer_cfg, eval_cfg=humaneval_eval_cfg) diff --git a/configs/datasets/humaneval/humaneval_gen_66a7f4.py b/configs/datasets/humaneval/humaneval_gen_66a7f4.py index a5c4ebfd..b4109925 100644 --- a/configs/datasets/humaneval/humaneval_gen_66a7f4.py +++ b/configs/datasets/humaneval/humaneval_gen_66a7f4.py @@ -27,7 +27,7 @@ humaneval_datasets = [ dict( abbr='openai_humaneval', type=HumanevalDataset, - path='./data/humaneval/human-eval-v2-20210705.jsonl', + path='opencompass/humaneval', reader_cfg=humaneval_reader_cfg, infer_cfg=humaneval_infer_cfg, eval_cfg=humaneval_eval_cfg, diff --git a/configs/datasets/humaneval/humaneval_gen_8e312c.py b/configs/datasets/humaneval/humaneval_gen_8e312c.py index 93d884c9..a8c6e587 100644 --- a/configs/datasets/humaneval/humaneval_gen_8e312c.py +++ b/configs/datasets/humaneval/humaneval_gen_8e312c.py @@ -30,7 +30,7 @@ humaneval_datasets = [ dict( abbr='openai_humaneval', type=HumanevalDataset, - path='./data/humaneval/human-eval-v2-20210705.jsonl', + path='opencompass/humaneval', reader_cfg=humaneval_reader_cfg, infer_cfg=humaneval_infer_cfg, eval_cfg=humaneval_eval_cfg) diff --git a/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_159614.py b/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_159614.py index dd06990e..d364f938 100644 --- a/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_159614.py +++ b/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_159614.py @@ -29,7 +29,7 @@ humaneval_datasets = [ dict( abbr='openai_humaneval', type=HumanevalDataset, - path='./data/humaneval/human-eval-v2-20210705.jsonl', + path='opencompass/humaneval', reader_cfg=humaneval_reader_cfg, 
infer_cfg=humaneval_infer_cfg, eval_cfg=humaneval_eval_cfg) diff --git a/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py b/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py index d9439c40..6224696f 100644 --- a/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py +++ b/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py @@ -29,7 +29,7 @@ humaneval_datasets = [ dict( abbr='openai_humaneval_passk', type=HumanevalDataset, - path='./data/humaneval/human-eval-v2-20210705.jsonl', + path='opencompass/humaneval', reader_cfg=humaneval_reader_cfg, infer_cfg=humaneval_infer_cfg, eval_cfg=humaneval_eval_cfg) diff --git a/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py b/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py index f2bf918d..adcabde9 100644 --- a/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py +++ b/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py @@ -29,7 +29,7 @@ humaneval_datasets = [ dict( abbr='openai_humaneval_repeat10', type=HumanevalDataset, - path='./data/humaneval/human-eval-v2-20210705.jsonl', + path='opencompass/humaneval', num_repeats=10, reader_cfg=humaneval_reader_cfg, infer_cfg=humaneval_infer_cfg, diff --git a/configs/datasets/humaneval_cn/humaneval_cn_gen_6313aa.py b/configs/datasets/humaneval_cn/humaneval_cn_gen_6313aa.py index c8611b5a..c8221cdb 100644 --- a/configs/datasets/humaneval_cn/humaneval_cn_gen_6313aa.py +++ b/configs/datasets/humaneval_cn/humaneval_cn_gen_6313aa.py @@ -29,7 +29,7 @@ humaneval_cn_datasets = [ dict( abbr='openai_humaneval_cn', type=HumanevalDataset, - path='./data/humaneval_cn/human-eval-cn-v2-20210705.jsonl', + path='opencompass/humaneval_cn', reader_cfg=humaneval_reader_cfg, infer_cfg=humaneval_infer_cfg, eval_cfg=humaneval_eval_cfg) diff --git a/configs/datasets/humaneval_cn/humaneval_cn_passk_gen_6313aa.py b/configs/datasets/humaneval_cn/humaneval_cn_passk_gen_6313aa.py index 1dda06d2..2d2b16a1 100644 --- 
a/configs/datasets/humaneval_cn/humaneval_cn_passk_gen_6313aa.py +++ b/configs/datasets/humaneval_cn/humaneval_cn_passk_gen_6313aa.py @@ -29,7 +29,7 @@ humaneval_cn_datasets = [ dict( abbr='openai_humaneval_cn_passk', type=HumanevalDataset, - path='./data/humaneval_cn/human-eval-cn-v2-20210705.jsonl', + path='opencompass/humaneval_cn', reader_cfg=humaneval_reader_cfg, infer_cfg=humaneval_infer_cfg, eval_cfg=humaneval_eval_cfg) diff --git a/configs/datasets/humaneval_cn/humaneval_cn_repeat10_gen_6313aa.py b/configs/datasets/humaneval_cn/humaneval_cn_repeat10_gen_6313aa.py index 34d2c5e4..5eac9eec 100644 --- a/configs/datasets/humaneval_cn/humaneval_cn_repeat10_gen_6313aa.py +++ b/configs/datasets/humaneval_cn/humaneval_cn_repeat10_gen_6313aa.py @@ -29,7 +29,7 @@ humaneval_cn_datasets = [ dict( abbr='openai_humaneval_cn_repeat10', type=HumanevalDataset, - path='./data/humaneval_cn/human-eval-cn-v2-20210705.jsonl', + path='opencompass/humaneval_cn', num_repeats=10, reader_cfg=humaneval_reader_cfg, infer_cfg=humaneval_infer_cfg, diff --git a/configs/datasets/humaneval_plus/humaneval_plus_gen_66a7f4.py b/configs/datasets/humaneval_plus/humaneval_plus_gen_66a7f4.py index cbe5a8dc..357ef91c 100644 --- a/configs/datasets/humaneval_plus/humaneval_plus_gen_66a7f4.py +++ b/configs/datasets/humaneval_plus/humaneval_plus_gen_66a7f4.py @@ -27,7 +27,7 @@ humaneval_plus_datasets = [ dict( abbr='humaneval_plus', type=HumanevalDataset, - path='./data/humaneval/human-eval-v2-20210705.jsonl', + path='opencompass/humaneval', reader_cfg=humaneval_plus_reader_cfg, infer_cfg=humaneval_plus_infer_cfg, eval_cfg=humaneval_plus_eval_cfg, diff --git a/configs/datasets/humaneval_plus/humaneval_plus_gen_8e312c.py b/configs/datasets/humaneval_plus/humaneval_plus_gen_8e312c.py index de988c09..740d63b2 100644 --- a/configs/datasets/humaneval_plus/humaneval_plus_gen_8e312c.py +++ b/configs/datasets/humaneval_plus/humaneval_plus_gen_8e312c.py @@ -30,7 +30,7 @@ humaneval_plus_datasets = [ dict( 
abbr='humaneval_plus', type=HumanevalDataset, - path='./data/humaneval/human-eval-v2-20210705.jsonl', + path='opencompass/humaneval', reader_cfg=humaneval_plus_reader_cfg, infer_cfg=humaneval_plus_infer_cfg, eval_cfg=humaneval_plus_eval_cfg) diff --git a/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py b/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py index f2ba72a3..d602d73b 100644 --- a/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py +++ b/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py @@ -29,7 +29,7 @@ humaneval_plus_datasets = [ dict( abbr='humaneval_plus_passk', type=HumanevalDataset, - path='./data/humaneval/human-eval-v2-20210705.jsonl', + path='opencompass/humaneval', reader_cfg=humaneval_plus_reader_cfg, infer_cfg=humaneval_plus_infer_cfg, eval_cfg=humaneval_plus_eval_cfg) diff --git a/configs/datasets/humaneval_plus/humaneval_plus_repeat10_gen_8e312c.py b/configs/datasets/humaneval_plus/humaneval_plus_repeat10_gen_8e312c.py index 45b5dd66..bbbafd90 100644 --- a/configs/datasets/humaneval_plus/humaneval_plus_repeat10_gen_8e312c.py +++ b/configs/datasets/humaneval_plus/humaneval_plus_repeat10_gen_8e312c.py @@ -29,7 +29,7 @@ humaneval_plus_datasets = [ dict( abbr='humaneval_plus_repeat10', type=HumanevalDataset, - path='./data/humaneval/human-eval-v2-20210705.jsonl', + path='opencompass/humaneval', num_repeats=10, reader_cfg=humaneval_plus_reader_cfg, infer_cfg=humaneval_plus_infer_cfg, diff --git a/configs/datasets/lambada/lambada_gen_217e11.py b/configs/datasets/lambada/lambada_gen_217e11.py index 0c125f57..eff0c226 100644 --- a/configs/datasets/lambada/lambada_gen_217e11.py +++ b/configs/datasets/lambada/lambada_gen_217e11.py @@ -26,7 +26,7 @@ lambada_datasets = [ dict( abbr='lambada', type=lambadaDataset, - path='./data/lambada/test.jsonl', + path='opencompass/lambada', reader_cfg=lambada_reader_cfg, infer_cfg=lambada_infer_cfg, eval_cfg=lambada_eval_cfg) diff --git 
a/configs/datasets/lambada/lambada_gen_8b48a5.py b/configs/datasets/lambada/lambada_gen_8b48a5.py index cd85b152..0ff9e612 100644 --- a/configs/datasets/lambada/lambada_gen_8b48a5.py +++ b/configs/datasets/lambada/lambada_gen_8b48a5.py @@ -22,7 +22,7 @@ lambada_datasets = [ dict( abbr='lambada', type=lambadaDataset, - path='./data/lambada/test.jsonl', + path='opencompass/lambada', reader_cfg=lambada_reader_cfg, infer_cfg=lambada_infer_cfg, eval_cfg=lambada_eval_cfg) diff --git a/configs/datasets/lcsts/lcsts_gen_8ee1fe.py b/configs/datasets/lcsts/lcsts_gen_8ee1fe.py index 5312de7b..fb02f42f 100644 --- a/configs/datasets/lcsts/lcsts_gen_8ee1fe.py +++ b/configs/datasets/lcsts/lcsts_gen_8ee1fe.py @@ -25,7 +25,7 @@ lcsts_datasets = [ dict( type=LCSTSDataset, abbr='lcsts', - path='./data/LCSTS', + path='opencompass/LCSTS', reader_cfg=lcsts_reader_cfg, infer_cfg=lcsts_infer_cfg, eval_cfg=lcsts_eval_cfg) diff --git a/configs/datasets/lcsts/lcsts_gen_9b0b89.py b/configs/datasets/lcsts/lcsts_gen_9b0b89.py index 4ba441ae..5171ca25 100644 --- a/configs/datasets/lcsts/lcsts_gen_9b0b89.py +++ b/configs/datasets/lcsts/lcsts_gen_9b0b89.py @@ -21,7 +21,7 @@ lcsts_datasets = [ dict( type=LCSTSDataset, abbr='lcsts', - path='./data/LCSTS', + path='opencompass/LCSTS', reader_cfg=lcsts_reader_cfg, infer_cfg=lcsts_infer_cfg, eval_cfg=lcsts_eval_cfg) diff --git a/configs/datasets/math/math_0shot_gen_393424.py b/configs/datasets/math/math_0shot_gen_393424.py index 25f7436b..d2fef53c 100644 --- a/configs/datasets/math/math_0shot_gen_393424.py +++ b/configs/datasets/math/math_0shot_gen_393424.py @@ -27,7 +27,7 @@ math_datasets = [ dict( type=MATHDataset, abbr='math', - path='./data/math/math.json', + path='opencompass/math', reader_cfg=math_reader_cfg, infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg, diff --git a/configs/datasets/math/math_4shot_base_gen_db136b.py b/configs/datasets/math/math_4shot_base_gen_db136b.py index 16883f37..95dd620e 100644 --- 
a/configs/datasets/math/math_4shot_base_gen_db136b.py +++ b/configs/datasets/math/math_4shot_base_gen_db136b.py @@ -23,7 +23,7 @@ math_datasets = [ dict( type=MATHDataset, abbr='math', - path='./data/math/math.json', + path='opencompass/math', reader_cfg=math_reader_cfg, infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg) diff --git a/configs/datasets/math/math_agent_evaluatorv2_gen_0c1b4e.py b/configs/datasets/math/math_agent_evaluatorv2_gen_0c1b4e.py index c756f0f4..996f9c68 100644 --- a/configs/datasets/math/math_agent_evaluatorv2_gen_0c1b4e.py +++ b/configs/datasets/math/math_agent_evaluatorv2_gen_0c1b4e.py @@ -4,6 +4,7 @@ from opencompass.openicl.icl_inferencer import AgentInferencer from opencompass.datasets import ( MATHDataset, MATHAgentEvaluator, math_postprocess_v2 ) + # use pal format but not perform well math_reader_cfg = dict(input_columns=['problem'], output_column='solution') @@ -91,7 +92,7 @@ math_datasets = [ dict( abbr='math-agent', type=MATHDataset, - path='./data/math/math.json', + path='opencompass/math', reader_cfg=math_reader_cfg, infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg, diff --git a/configs/datasets/math/math_agent_gen_0c1b4e.py b/configs/datasets/math/math_agent_gen_0c1b4e.py index 5d8fc5c1..0abbc0c0 100644 --- a/configs/datasets/math/math_agent_gen_0c1b4e.py +++ b/configs/datasets/math/math_agent_gen_0c1b4e.py @@ -4,6 +4,7 @@ from opencompass.openicl.icl_inferencer import AgentInferencer from opencompass.datasets import ( MATHDataset, MATHAgentEvaluator, math_postprocess ) + # use pal format but not perform well math_reader_cfg = dict(input_columns=['problem'], output_column='solution') @@ -90,7 +91,7 @@ math_datasets = [ dict( abbr='math-agent', type=MATHDataset, - path='./data/math/math.json', + path='opencompass/math', reader_cfg=math_reader_cfg, infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg, diff --git a/configs/datasets/math/math_agent_gen_861b4f.py b/configs/datasets/math/math_agent_gen_861b4f.py index 33cf661c..ad1a7272 
100644 --- a/configs/datasets/math/math_agent_gen_861b4f.py +++ b/configs/datasets/math/math_agent_gen_861b4f.py @@ -4,6 +4,7 @@ from opencompass.openicl.icl_inferencer import AgentInferencer from opencompass.datasets import ( MATHDataset, MATHAgentEvaluator, math_postprocess ) + # use pal format but not perform well math_reader_cfg = dict(input_columns=['problem'], output_column='solution') @@ -81,7 +82,7 @@ math_datasets = [ dict( abbr='math-agent', type=MATHDataset, - path='./data/math/math.json', + path='opencompass/math', reader_cfg=math_reader_cfg, infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg, diff --git a/configs/datasets/math/math_agent_gen_af2293.py b/configs/datasets/math/math_agent_gen_af2293.py index 2d1f4bf4..51b3500b 100644 --- a/configs/datasets/math/math_agent_gen_af2293.py +++ b/configs/datasets/math/math_agent_gen_af2293.py @@ -4,6 +4,7 @@ from opencompass.openicl.icl_inferencer import AgentInferencer from opencompass.datasets import ( MATHDataset, MATHAgentEvaluator, math_postprocess ) + # use pal format but not perform well math_reader_cfg = dict(input_columns=['problem'], output_column='solution') @@ -94,7 +95,7 @@ math_datasets = [ dict( abbr='math-agent', type=MATHDataset, - path='./data/math/math.json', + path='opencompass/math', reader_cfg=math_reader_cfg, infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg, diff --git a/configs/datasets/math/math_evaluatorv2_gen_2f4a71.py b/configs/datasets/math/math_evaluatorv2_gen_2f4a71.py index ca9b9b90..6405513a 100644 --- a/configs/datasets/math/math_evaluatorv2_gen_2f4a71.py +++ b/configs/datasets/math/math_evaluatorv2_gen_2f4a71.py @@ -49,7 +49,7 @@ math_datasets = [ dict( type=MATHDataset, abbr='math', - path='./data/math/math.json', + path='opencompass/math', reader_cfg=math_reader_cfg, infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg) diff --git a/configs/datasets/math/math_evaluatorv2_gen_cecb31.py b/configs/datasets/math/math_evaluatorv2_gen_cecb31.py index 6060a71c..6ca42154 100644 --- 
a/configs/datasets/math/math_evaluatorv2_gen_cecb31.py +++ b/configs/datasets/math/math_evaluatorv2_gen_cecb31.py @@ -31,7 +31,7 @@ math_datasets = [ dict( type=MATHDataset, abbr='math', - path='./data/math/math.json', + path='opencompass/math', reader_cfg=math_reader_cfg, infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg) diff --git a/configs/datasets/math/math_gen_0957ff.py b/configs/datasets/math/math_gen_0957ff.py index cafdd996..c112f035 100644 --- a/configs/datasets/math/math_gen_0957ff.py +++ b/configs/datasets/math/math_gen_0957ff.py @@ -29,7 +29,7 @@ math_datasets = [ dict( type=MATHDataset, abbr='math', - path='./data/math/math.json', + path='opencompass/math', reader_cfg=math_reader_cfg, infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg) diff --git a/configs/datasets/math/math_gen_1ed9c2.py b/configs/datasets/math/math_gen_1ed9c2.py index 472cf41e..a168d511 100644 --- a/configs/datasets/math/math_gen_1ed9c2.py +++ b/configs/datasets/math/math_gen_1ed9c2.py @@ -29,7 +29,7 @@ math_datasets = [ dict( type=MATHDataset, abbr='math', - path='./data/math/math.json', + path='opencompass/math', reader_cfg=math_reader_cfg, infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg) diff --git a/configs/datasets/math/math_gen_265cce.py b/configs/datasets/math/math_gen_265cce.py index ab94e50d..ef2a6af5 100644 --- a/configs/datasets/math/math_gen_265cce.py +++ b/configs/datasets/math/math_gen_265cce.py @@ -29,7 +29,7 @@ math_datasets = [ dict( type=MATHDataset, abbr='math', - path='./data/math/math.json', + path='opencompass/math', reader_cfg=math_reader_cfg, infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg) diff --git a/configs/datasets/math/math_gen_559593.py b/configs/datasets/math/math_gen_559593.py index 3fbdadbd..18da0028 100644 --- a/configs/datasets/math/math_gen_559593.py +++ b/configs/datasets/math/math_gen_559593.py @@ -46,7 +46,7 @@ math_datasets = [ dict( type=MATHDataset, abbr='math', - path='./data/math/math.json', + path='opencompass/math', 
reader_cfg=math_reader_cfg, infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg) diff --git a/configs/datasets/math/math_gen_5e8458.py b/configs/datasets/math/math_gen_5e8458.py index 9a6bf866..ed9c3e5f 100644 --- a/configs/datasets/math/math_gen_5e8458.py +++ b/configs/datasets/math/math_gen_5e8458.py @@ -43,7 +43,7 @@ math_datasets = [ dict( type=MATHDataset, abbr='math', - path='./data/math/math.json', + path='opencompass/math', reader_cfg=dict( input_columns=['problem'], output_column='solution', diff --git a/configs/datasets/math/math_gen_736506.py b/configs/datasets/math/math_gen_736506.py index dc2127e5..6f0b1de7 100644 --- a/configs/datasets/math/math_gen_736506.py +++ b/configs/datasets/math/math_gen_736506.py @@ -21,7 +21,7 @@ math_datasets = [ dict( type=MATHInternDataset, abbr='math', - path='./data/math/math.json', + path='opencompass/math', reader_cfg=math_reader_cfg, infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg) diff --git a/configs/datasets/math/math_gen_78ced2.py b/configs/datasets/math/math_gen_78ced2.py index 7341f09d..9088b975 100644 --- a/configs/datasets/math/math_gen_78ced2.py +++ b/configs/datasets/math/math_gen_78ced2.py @@ -30,7 +30,7 @@ math_datasets = [ dict( type=MATHDataset, abbr='math', - path='./data/math/math.json', + path='opencompass/math', reader_cfg=math_reader_cfg, infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg) diff --git a/configs/datasets/math/math_gen_943d32.py b/configs/datasets/math/math_gen_943d32.py index cd7b9be9..ee22ba48 100644 --- a/configs/datasets/math/math_gen_943d32.py +++ b/configs/datasets/math/math_gen_943d32.py @@ -54,7 +54,7 @@ math_datasets = [ dict( type=MATHDataset, abbr='math', - path='./data/math/math.json', + path='opencompass/math', reader_cfg=dict( input_columns=['problem'], output_column='solution', diff --git a/configs/datasets/math/math_intern_evaluator_gen_265cce.py b/configs/datasets/math/math_intern_evaluator_gen_265cce.py index be8fcc51..1a6cbf37 100644 --- 
a/configs/datasets/math/math_intern_evaluator_gen_265cce.py +++ b/configs/datasets/math/math_intern_evaluator_gen_265cce.py @@ -30,7 +30,7 @@ math_datasets = [ dict( type=MATHDataset, abbr='math', - path='./data/math/math.json', + path='opencompass/math', reader_cfg=math_reader_cfg, infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg) diff --git a/configs/datasets/math/math_llm_judge.py b/configs/datasets/math/math_llm_judge.py index ea6fb90f..6a81bea2 100644 --- a/configs/datasets/math/math_llm_judge.py +++ b/configs/datasets/math/math_llm_judge.py @@ -28,7 +28,7 @@ math_datasets = [ dict( type=MATHDataset, abbr='math', - path='./data/math/math.json', + path='opencompass/math', reader_cfg=math_reader_cfg, infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg) diff --git a/configs/datasets/mbpp/deprecated_mbpp_passk_gen_1e1056.py b/configs/datasets/mbpp/deprecated_mbpp_passk_gen_1e1056.py index 2bed2168..c6dcbe5a 100644 --- a/configs/datasets/mbpp/deprecated_mbpp_passk_gen_1e1056.py +++ b/configs/datasets/mbpp/deprecated_mbpp_passk_gen_1e1056.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator +from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column') @@ -32,7 +32,7 @@ mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT') mbpp_datasets = [ dict( - type=MBPPDataset_V2, + type=MBPPDatasetV2, abbr='mbpp_passk', path='./data/mbpp/mbpp.jsonl', reader_cfg=mbpp_reader_cfg, diff --git a/configs/datasets/mbpp/deprecated_mbpp_repeat10_gen_1e1056.py b/configs/datasets/mbpp/deprecated_mbpp_repeat10_gen_1e1056.py index ad461637..99263c9c 100644 --- a/configs/datasets/mbpp/deprecated_mbpp_repeat10_gen_1e1056.py +++ 
b/configs/datasets/mbpp/deprecated_mbpp_repeat10_gen_1e1056.py @@ -3,7 +3,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator +from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column') @@ -34,7 +34,7 @@ mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT') mbpp_datasets = [ dict( - type=MBPPDataset_V2, + type=MBPPDatasetV2, abbr='mbpp_repeat10', path='./data/mbpp/mbpp.jsonl', num_repeats=10, diff --git a/configs/datasets/mbpp/mbpp_gen_830460.py b/configs/datasets/mbpp/mbpp_gen_830460.py index e1087305..693b7ff3 100644 --- a/configs/datasets/mbpp/mbpp_gen_830460.py +++ b/configs/datasets/mbpp/mbpp_gen_830460.py @@ -34,7 +34,7 @@ mbpp_datasets = [ dict( type=MBPPDataset, abbr='mbpp', - path='./data/mbpp/mbpp.jsonl', + path='opencompass/mbpp', reader_cfg=mbpp_reader_cfg, infer_cfg=mbpp_infer_cfg, eval_cfg=mbpp_eval_cfg, diff --git a/configs/datasets/mbpp/mbpp_passk_gen_830460.py b/configs/datasets/mbpp/mbpp_passk_gen_830460.py index 16fc1232..af5a1057 100644 --- a/configs/datasets/mbpp/mbpp_passk_gen_830460.py +++ b/configs/datasets/mbpp/mbpp_passk_gen_830460.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator +from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column') @@ -32,9 +32,9 @@ mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT') mbpp_datasets = [ dict( - type=MBPPDataset_V2, + 
type=MBPPDatasetV2, abbr='mbpp_passk', - path='./data/mbpp/mbpp.jsonl', + path='opencompass/mbpp', reader_cfg=mbpp_reader_cfg, infer_cfg=mbpp_infer_cfg, eval_cfg=mbpp_eval_cfg, diff --git a/configs/datasets/mbpp/mbpp_repeat10_gen_830460.py b/configs/datasets/mbpp/mbpp_repeat10_gen_830460.py index b8a2c5a3..643022e3 100644 --- a/configs/datasets/mbpp/mbpp_repeat10_gen_830460.py +++ b/configs/datasets/mbpp/mbpp_repeat10_gen_830460.py @@ -3,7 +3,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator +from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column') @@ -34,9 +34,9 @@ mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT') mbpp_datasets = [ dict( - type=MBPPDataset_V2, + type=MBPPDatasetV2, abbr='mbpp_repeat10', - path='./data/mbpp/mbpp.jsonl', + path='opencompass/mbpp', num_repeats=10, reader_cfg=mbpp_reader_cfg, infer_cfg=mbpp_infer_cfg, diff --git a/configs/datasets/mbpp/sanitized_mbpp_gen_742f0c.py b/configs/datasets/mbpp/sanitized_mbpp_gen_742f0c.py index 5ed9f457..fbd24dbb 100644 --- a/configs/datasets/mbpp/sanitized_mbpp_gen_742f0c.py +++ b/configs/datasets/mbpp/sanitized_mbpp_gen_742f0c.py @@ -74,7 +74,7 @@ sanitized_mbpp_datasets = [ dict( type=SanitizedMBPPDataset, abbr='sanitized_mbpp', - path='./data/mbpp/sanitized-mbpp.jsonl', + path='opencompass/sanitized_mbpp', reader_cfg=sanitized_mbpp_reader_cfg, infer_cfg=sanitized_mbpp_infer_cfg, eval_cfg=sanitized_mbpp_eval_cfg, diff --git a/configs/datasets/mbpp/sanitized_mbpp_gen_830460.py b/configs/datasets/mbpp/sanitized_mbpp_gen_830460.py index 1a44fa63..1ff90404 100644 --- a/configs/datasets/mbpp/sanitized_mbpp_gen_830460.py +++ b/configs/datasets/mbpp/sanitized_mbpp_gen_830460.py 
@@ -34,7 +34,7 @@ sanitized_mbpp_datasets = [ dict( type=SanitizedMBPPDataset, abbr='sanitized_mbpp', - path='./data/mbpp/sanitized-mbpp.jsonl', + path='opencompass/sanitized_mbpp', reader_cfg=sanitized_mbpp_reader_cfg, infer_cfg=sanitized_mbpp_infer_cfg, eval_cfg=sanitized_mbpp_eval_cfg, diff --git a/configs/datasets/mbpp/sanitized_mbpp_gen_a0fc46.py b/configs/datasets/mbpp/sanitized_mbpp_gen_a0fc46.py index 48c12835..4890c3f0 100644 --- a/configs/datasets/mbpp/sanitized_mbpp_gen_a0fc46.py +++ b/configs/datasets/mbpp/sanitized_mbpp_gen_a0fc46.py @@ -33,7 +33,7 @@ sanitized_mbpp_datasets = [ dict( type=SanitizedMBPPDataset, abbr='sanitized_mbpp', - path='./data/mbpp/sanitized-mbpp.jsonl', + path='opencompass/sanitized_mbpp', reader_cfg=sanitized_mbpp_reader_cfg, infer_cfg=sanitized_mbpp_infer_cfg, eval_cfg=sanitized_mbpp_eval_cfg, diff --git a/configs/datasets/mbpp/sanitized_mbpp_mdblock_gen_a447ff.py b/configs/datasets/mbpp/sanitized_mbpp_mdblock_gen_a447ff.py index 957f793e..a67e0e5e 100644 --- a/configs/datasets/mbpp/sanitized_mbpp_mdblock_gen_a447ff.py +++ b/configs/datasets/mbpp/sanitized_mbpp_mdblock_gen_a447ff.py @@ -33,7 +33,7 @@ sanitized_mbpp_datasets = [ dict( type=SanitizedMBPPDataset, abbr='sanitized_mbpp', - path='./data/mbpp/sanitized-mbpp.jsonl', + path='opencompass/sanitized_mbpp', reader_cfg=sanitized_mbpp_reader_cfg, infer_cfg=sanitized_mbpp_infer_cfg, eval_cfg=sanitized_mbpp_eval_cfg, diff --git a/configs/datasets/mbpp/sanitized_mbpp_passk_gen_830460.py b/configs/datasets/mbpp/sanitized_mbpp_passk_gen_830460.py index a394885b..477e8d6a 100644 --- a/configs/datasets/mbpp/sanitized_mbpp_passk_gen_830460.py +++ b/configs/datasets/mbpp/sanitized_mbpp_passk_gen_830460.py @@ -34,7 +34,7 @@ sanitized_mbpp_datasets = [ dict( type=SanitizedMBPPDataset, abbr='sanitized_mbpp_passk', - path='./data/mbpp/sanitized-mbpp.jsonl', + path='opencompass/sanitized_mbpp', reader_cfg=sanitized_mbpp_reader_cfg, infer_cfg=sanitized_mbpp_infer_cfg, 
eval_cfg=sanitized_mbpp_eval_cfg, diff --git a/configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_830460.py b/configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_830460.py index 425871e4..28fb62f6 100644 --- a/configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_830460.py +++ b/configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_830460.py @@ -34,7 +34,7 @@ sanitized_mbpp_datasets = [ dict( type=SanitizedMBPPDataset, abbr='sanitized_mbpp_repeat10', - path='./data/mbpp/sanitized-mbpp.jsonl', + path='opencompass/sanitized_mbpp', num_repeats=10, reader_cfg=sanitized_mbpp_reader_cfg, infer_cfg=sanitized_mbpp_infer_cfg, diff --git a/configs/datasets/mbpp_cn/deprecated_mbpp_cn_passk_gen_1d1481.py b/configs/datasets/mbpp_cn/deprecated_mbpp_cn_passk_gen_1d1481.py index 9ecbc35c..fa4ad7d8 100644 --- a/configs/datasets/mbpp_cn/deprecated_mbpp_cn_passk_gen_1d1481.py +++ b/configs/datasets/mbpp_cn/deprecated_mbpp_cn_passk_gen_1d1481.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator +from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator mbpp_reader_cfg = dict( input_columns=['text', 'test_list'], output_column='test_column') @@ -55,7 +55,7 @@ mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT') mbpp_cn_datasets = [ dict( - type=MBPPDataset_V2, + type=MBPPDatasetV2, abbr='mbpp_cn_passk', path='./data/mbpp_cn/mbpp_cn.jsonl', reader_cfg=mbpp_reader_cfg, diff --git a/configs/datasets/mbpp_cn/deprecated_mbpp_cn_repeat10_gen_1d1481.py b/configs/datasets/mbpp_cn/deprecated_mbpp_cn_repeat10_gen_1d1481.py index d4667ef2..9f115493 100644 --- a/configs/datasets/mbpp_cn/deprecated_mbpp_cn_repeat10_gen_1d1481.py +++ b/configs/datasets/mbpp_cn/deprecated_mbpp_cn_repeat10_gen_1d1481.py @@ -1,7 +1,7 @@ from 
opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator +from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator mbpp_reader_cfg = dict( input_columns=['text', 'test_list'], output_column='test_column') @@ -55,7 +55,7 @@ mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT') mbpp_cn_datasets = [ dict( - type=MBPPDataset_V2, + type=MBPPDatasetV2, abbr='mbpp_cn_repeat10', path='./data/mbpp_cn/mbpp_cn.jsonl', num_repeats=10, diff --git a/configs/datasets/mbpp_cn/mbpp_cn_gen_9114d5.py b/configs/datasets/mbpp_cn/mbpp_cn_gen_9114d5.py index 56e77c13..b6466f57 100644 --- a/configs/datasets/mbpp_cn/mbpp_cn_gen_9114d5.py +++ b/configs/datasets/mbpp_cn/mbpp_cn_gen_9114d5.py @@ -58,6 +58,7 @@ mbpp_cn_datasets = [ type=MBPPDataset, abbr='mbpp_cn', path='./data/mbpp_cn/mbpp_cn.jsonl', + local_mode=True, reader_cfg=mbpp_reader_cfg, infer_cfg=mbpp_infer_cfg, eval_cfg=mbpp_eval_cfg) diff --git a/configs/datasets/mmlu/mmlu_clean_ppl.py b/configs/datasets/mmlu/mmlu_clean_ppl.py index f9a5cb90..52d23e29 100644 --- a/configs/datasets/mmlu/mmlu_clean_ppl.py +++ b/configs/datasets/mmlu/mmlu_clean_ppl.py @@ -104,7 +104,7 @@ for _name in mmlu_all_sets: dict( abbr=f'lukaemon_mmlu_{_name}', type=MMLUDataset, - path='./data/mmlu/', + path='opencompass/mmlu', name=_name, reader_cfg=mmlu_reader_cfg, infer_cfg=mmlu_infer_cfg, diff --git a/configs/datasets/mmlu/mmlu_gen_23a9a9.py b/configs/datasets/mmlu/mmlu_gen_23a9a9.py index 322eef85..71e3eba9 100644 --- a/configs/datasets/mmlu/mmlu_gen_23a9a9.py +++ b/configs/datasets/mmlu/mmlu_gen_23a9a9.py @@ -102,7 +102,7 @@ for _name in mmlu_all_sets: dict( abbr=f'lukaemon_mmlu_{_name}', type=MMLUDataset, - path='./data/mmlu/', + path='opencompass/mmlu', name=_name, reader_cfg=mmlu_reader_cfg, infer_cfg=mmlu_infer_cfg.copy(), 
diff --git a/configs/datasets/mmlu/mmlu_gen_4d595a.py b/configs/datasets/mmlu/mmlu_gen_4d595a.py index 0e40bf3c..36c0a21a 100644 --- a/configs/datasets/mmlu/mmlu_gen_4d595a.py +++ b/configs/datasets/mmlu/mmlu_gen_4d595a.py @@ -113,7 +113,7 @@ for _name in mmlu_all_sets: dict( abbr=f'lukaemon_mmlu_{_name}', type=MMLUDataset, - path='./data/mmlu/', + path='opencompass/mmlu', name=_name, reader_cfg=mmlu_reader_cfg, infer_cfg=mmlu_infer_cfg, diff --git a/configs/datasets/mmlu/mmlu_gen_5d1409.py b/configs/datasets/mmlu/mmlu_gen_5d1409.py index 83709b67..73bd3895 100644 --- a/configs/datasets/mmlu/mmlu_gen_5d1409.py +++ b/configs/datasets/mmlu/mmlu_gen_5d1409.py @@ -114,7 +114,7 @@ for _name in mmlu_all_sets: dict( abbr=f'lukaemon_mmlu_{_name}', type=MMLUDataset, - path='./data/mmlu/', + path='opencompass/mmlu', name=_name, reader_cfg=mmlu_reader_cfg, infer_cfg=mmlu_infer_cfg, diff --git a/configs/datasets/mmlu/mmlu_gen_79e572.py b/configs/datasets/mmlu/mmlu_gen_79e572.py index a6c56dd6..0ae6a0e4 100644 --- a/configs/datasets/mmlu/mmlu_gen_79e572.py +++ b/configs/datasets/mmlu/mmlu_gen_79e572.py @@ -100,7 +100,7 @@ for _name in mmlu_all_sets: dict( abbr=f'lukaemon_mmlu_{_name}', type=MMLUDataset, - path='./data/mmlu/', + path='opencompass/mmlu', name=_name, reader_cfg=mmlu_reader_cfg, infer_cfg=mmlu_infer_cfg, diff --git a/configs/datasets/mmlu/mmlu_gen_a484b3.py b/configs/datasets/mmlu/mmlu_gen_a484b3.py index c52027ce..76a4f9b4 100644 --- a/configs/datasets/mmlu/mmlu_gen_a484b3.py +++ b/configs/datasets/mmlu/mmlu_gen_a484b3.py @@ -114,7 +114,7 @@ for _name in mmlu_all_sets: dict( abbr=f'lukaemon_mmlu_{_name}', type=MMLUDataset, - path='./data/mmlu/', + path='opencompass/mmlu', name=_name, reader_cfg=mmlu_reader_cfg, infer_cfg=mmlu_infer_cfg, diff --git a/configs/datasets/mmlu/mmlu_openai_simple_evals_gen_b618ea.py b/configs/datasets/mmlu/mmlu_openai_simple_evals_gen_b618ea.py index 1121469b..b6b45d2f 100644 --- 
a/configs/datasets/mmlu/mmlu_openai_simple_evals_gen_b618ea.py +++ b/configs/datasets/mmlu/mmlu_openai_simple_evals_gen_b618ea.py @@ -7,7 +7,7 @@ from opencompass.datasets import MMLUDataset from opencompass.utils.text_postprocessors import match_answer_pattern with read_base(): - from .mmlu_all_sets import mmlu_all_sets + from .mmlu_all_sets import mmlu_all_sets # None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader # Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar @@ -51,7 +51,7 @@ for name in mmlu_all_sets: dict( abbr=f'lukaemon_mmlu_{name}', type=MMLUDataset, - path='./data/mmlu/', + path='opencompass/mmlu', name=name, reader_cfg=mmlu_reader_cfg, infer_cfg=mmlu_infer_cfg, diff --git a/configs/datasets/mmlu/mmlu_ppl_ac766d.py b/configs/datasets/mmlu/mmlu_ppl_ac766d.py index ab242c6f..ea39d57d 100644 --- a/configs/datasets/mmlu/mmlu_ppl_ac766d.py +++ b/configs/datasets/mmlu/mmlu_ppl_ac766d.py @@ -96,7 +96,7 @@ for _name in mmlu_all_sets: dict( abbr=f'lukaemon_mmlu_{_name}', type=MMLUDataset, - path='./data/mmlu/', + path='opencompass/mmlu', name=_name, reader_cfg=mmlu_reader_cfg, infer_cfg=mmlu_infer_cfg, diff --git a/configs/datasets/mmlu/mmlu_zero_shot_gen_47e2c0.py b/configs/datasets/mmlu/mmlu_zero_shot_gen_47e2c0.py index 693fd663..b8096e8c 100644 --- a/configs/datasets/mmlu/mmlu_zero_shot_gen_47e2c0.py +++ b/configs/datasets/mmlu/mmlu_zero_shot_gen_47e2c0.py @@ -115,7 +115,7 @@ for _name in mmlu_all_sets: dict( abbr=f'lukaemon_mmlu_{_name}', type=MMLUDataset, - path='./data/mmlu/', + path='opencompass/mmlu', name=_name, reader_cfg=mmlu_reader_cfg, infer_cfg=mmlu_infer_cfg, diff --git a/configs/datasets/nq/nq_gen_0356ec.py b/configs/datasets/nq/nq_gen_0356ec.py index 7a4d4480..e5995b26 100644 --- a/configs/datasets/nq/nq_gen_0356ec.py +++ b/configs/datasets/nq/nq_gen_0356ec.py @@ -54,7 +54,7 @@ for k in [0, 1, 5]: dict( type=NaturalQuestionDataset, abbr='nq' if k == 0 else 
f'nq_{k}shot', - path='./data/nq/', + path='opencompass/natural_question', reader_cfg=nq_reader_cfg, infer_cfg=nq_infer_cfg, eval_cfg=nq_eval_cfg) diff --git a/configs/datasets/nq/nq_gen_2463e2.py b/configs/datasets/nq/nq_gen_2463e2.py index 2a206015..0ae92d54 100644 --- a/configs/datasets/nq/nq_gen_2463e2.py +++ b/configs/datasets/nq/nq_gen_2463e2.py @@ -20,7 +20,7 @@ nq_datasets = [ dict( type=NaturalQuestionDataset, abbr='nq', - path='./data/nq/', + path='opencompass/natural_question', reader_cfg=nq_reader_cfg, infer_cfg=nq_infer_cfg, eval_cfg=nq_eval_cfg) diff --git a/configs/datasets/nq/nq_gen_3dcea1.py b/configs/datasets/nq/nq_gen_3dcea1.py index d438a616..a6793ab0 100644 --- a/configs/datasets/nq/nq_gen_3dcea1.py +++ b/configs/datasets/nq/nq_gen_3dcea1.py @@ -22,7 +22,7 @@ nq_datasets = [ dict( type=NaturalQuestionDataset, abbr='nq', - path='./data/nq/', + path='opencompass/natural_question', reader_cfg=nq_reader_cfg, infer_cfg=nq_infer_cfg, eval_cfg=nq_eval_cfg) diff --git a/configs/datasets/nq/nq_gen_68c1c6.py b/configs/datasets/nq/nq_gen_68c1c6.py index 515fa387..dacaa131 100644 --- a/configs/datasets/nq/nq_gen_68c1c6.py +++ b/configs/datasets/nq/nq_gen_68c1c6.py @@ -23,7 +23,7 @@ nq_datasets = [ dict( type=NaturalQuestionDataset, abbr='nq', - path='./data/nq/', + path='opencompass/natural_question', reader_cfg=nq_reader_cfg, infer_cfg=nq_infer_cfg, eval_cfg=nq_eval_cfg) diff --git a/configs/datasets/nq/nq_gen_c788f6.py b/configs/datasets/nq/nq_gen_c788f6.py index c3c1a391..adbdcea5 100644 --- a/configs/datasets/nq/nq_gen_c788f6.py +++ b/configs/datasets/nq/nq_gen_c788f6.py @@ -23,7 +23,7 @@ nq_datasets = [ dict( type=NaturalQuestionDataset, abbr='nq', - path='./data/nq/', + path='opencompass/natural_question', reader_cfg=nq_reader_cfg, infer_cfg=nq_infer_cfg, eval_cfg=nq_eval_cfg) diff --git a/configs/datasets/nq_cn/nqcn_gen_141737.py b/configs/datasets/nq_cn/nqcn_gen_141737.py index a3450ee5..19a3d456 100644 --- 
a/configs/datasets/nq_cn/nqcn_gen_141737.py +++ b/configs/datasets/nq_cn/nqcn_gen_141737.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import NaturalQuestionDataset_CN, NQEvaluator_CN +from opencompass.datasets import NaturalQuestionDatasetCN, NQEvaluatorCN nqcn_reader_cfg = dict( input_columns=['question'], output_column='answer', train_split='test' @@ -20,12 +20,12 @@ nqcn_infer_cfg = dict( inferencer=dict(type=GenInferencer), ) -nqcn_eval_cfg = dict(evaluator=dict(type=NQEvaluator_CN), pred_role='BOT') +nqcn_eval_cfg = dict(evaluator=dict(type=NQEvaluatorCN), pred_role='BOT') nqcn_datasets = [ dict( abbr='nq_cn', - type=NaturalQuestionDataset_CN, + type=NaturalQuestionDatasetCN, path='./data/nq_cn', reader_cfg=nqcn_reader_cfg, infer_cfg=nqcn_infer_cfg, diff --git a/configs/datasets/obqa/obqa_gen_9069e4.py b/configs/datasets/obqa/obqa_gen_9069e4.py index b008a3da..9cc86048 100644 --- a/configs/datasets/obqa/obqa_gen_9069e4.py +++ b/configs/datasets/obqa/obqa_gen_9069e4.py @@ -32,12 +32,14 @@ obqa_datasets = [ dict( abbr='openbookqa', type=OBQADataset, - path='./data/openbookqa/Main/test.jsonl', + path='opencompass/openbookqa_test', + name='main', ), dict( abbr='openbookqa_fact', type=OBQADataset, - path='./data/openbookqa/Additional/test_complete.jsonl', + path='opencompass/openbookqa_fact', + name='additional', ), ] diff --git a/configs/datasets/obqa/obqa_ppl_1defe8.py b/configs/datasets/obqa/obqa_ppl_1defe8.py index 63cf9bb1..168290a3 100644 --- a/configs/datasets/obqa/obqa_ppl_1defe8.py +++ b/configs/datasets/obqa/obqa_ppl_1defe8.py @@ -24,12 +24,14 @@ obqa_datasets = [ dict( abbr='openbookqa', type=OBQADataset, - path='./data/openbookqa/Main/test.jsonl', + path='opencompass/openbookqa_test', + name='main', ), dict( abbr='openbookqa_fact', type=OBQADataset, - 
path='./data/openbookqa/Additional/test_complete.jsonl', + path='opencompass/openbookqa_fact', + name='additional', ), ] for _i in range(2): diff --git a/configs/datasets/obqa/obqa_ppl_6aac9e.py b/configs/datasets/obqa/obqa_ppl_6aac9e.py index d25fe2dc..3dfd4a74 100644 --- a/configs/datasets/obqa/obqa_ppl_6aac9e.py +++ b/configs/datasets/obqa/obqa_ppl_6aac9e.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import OBQADataset_V2 +from opencompass.datasets import OBQADatasetV2 obqa_reader_cfg = dict( input_columns=['question_stem', 'A', 'B', 'C', 'D', 'fact1'], @@ -32,8 +32,9 @@ obqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator), ) obqa_datasets = [ dict( abbr='openbookqa_fact', - type=OBQADataset_V2, - path='./data/openbookqa/Additional/test_complete.jsonl', + type=OBQADatasetV2, + path='opencompass/openbookqa_fact', + name='additional', reader_cfg=obqa_reader_cfg, infer_cfg=obqa_infer_cfg, eval_cfg=obqa_eval_cfg, diff --git a/configs/datasets/obqa/obqa_ppl_c7c154.py b/configs/datasets/obqa/obqa_ppl_c7c154.py index d67c717e..ffbbf922 100644 --- a/configs/datasets/obqa/obqa_ppl_c7c154.py +++ b/configs/datasets/obqa/obqa_ppl_c7c154.py @@ -39,12 +39,14 @@ obqa_datasets = [ dict( abbr='openbookqa', type=OBQADataset, - path='./data/openbookqa/Main/test.jsonl', + path='opencompass/openbookqa_test', + name='main', ), dict( abbr='openbookqa_fact', type=OBQADataset, - path='./data/openbookqa/Additional/test_complete.jsonl', + path='opencompass/openbookqa_fact', + name='additional', ), ] for _i in range(2): diff --git a/configs/datasets/piqa/piqa_gen_1194eb.py b/configs/datasets/piqa/piqa_gen_1194eb.py index 10fc1b55..b6782399 100644 --- a/configs/datasets/piqa/piqa_gen_1194eb.py +++ b/configs/datasets/piqa/piqa_gen_1194eb.py @@ 
-2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import piqaDataset_V2 +from opencompass.datasets import PIQADatasetV2 from opencompass.utils.text_postprocessors import first_option_postprocess piqa_reader_cfg = dict( @@ -33,8 +33,8 @@ piqa_eval_cfg = dict( piqa_datasets = [ dict( abbr='piqa', - type=piqaDataset_V2, - path='./data/piqa', + type=PIQADatasetV2, + path='opencompass/piqa', reader_cfg=piqa_reader_cfg, infer_cfg=piqa_infer_cfg, eval_cfg=piqa_eval_cfg) diff --git a/configs/datasets/piqa/piqa_ppl_0cfff2.py b/configs/datasets/piqa/piqa_ppl_0cfff2.py index 0dd7a0a2..8be30a93 100644 --- a/configs/datasets/piqa/piqa_ppl_0cfff2.py +++ b/configs/datasets/piqa/piqa_ppl_0cfff2.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import piqaDataset_V3 +from opencompass.datasets import PIQADatasetV3 piqa_reader_cfg = dict( input_columns=['goal', 'sol1', 'sol2'], @@ -29,8 +29,8 @@ piqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) piqa_datasets = [ dict( abbr='piqa', - type=piqaDataset_V3, - path='./data/piqa', + type=PIQADatasetV3, + path='opencompass/piqa', reader_cfg=piqa_reader_cfg, infer_cfg=piqa_infer_cfg, eval_cfg=piqa_eval_cfg) diff --git a/configs/datasets/piqa/piqa_ppl_1cf9f0.py b/configs/datasets/piqa/piqa_ppl_1cf9f0.py index bb2a0f3b..abbe285a 100644 --- a/configs/datasets/piqa/piqa_ppl_1cf9f0.py +++ b/configs/datasets/piqa/piqa_ppl_1cf9f0.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import 
ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import piqaDataset +from opencompass.datasets import PIQADataset piqa_reader_cfg = dict( input_columns=['goal', 'sol1', 'sol2'], @@ -24,8 +24,8 @@ piqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) piqa_datasets = [ dict( abbr='piqa', - type=piqaDataset, - path='./data/piqa', + type=PIQADataset, + path='opencompass/piqa', reader_cfg=piqa_reader_cfg, infer_cfg=piqa_infer_cfg, eval_cfg=piqa_eval_cfg) diff --git a/configs/datasets/piqa/piqa_ppl_3431ea.py b/configs/datasets/piqa/piqa_ppl_3431ea.py index 7da5655d..ae9e25a7 100644 --- a/configs/datasets/piqa/piqa_ppl_3431ea.py +++ b/configs/datasets/piqa/piqa_ppl_3431ea.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import piqaDataset +from opencompass.datasets import PIQADataset piqa_reader_cfg = dict( input_columns=['goal', 'sol1', 'sol2'], @@ -34,8 +34,8 @@ piqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) piqa_datasets = [ dict( abbr='piqa', - type=piqaDataset, - path='./data/piqa', + type=PIQADataset, + path='opencompass/piqa', reader_cfg=piqa_reader_cfg, infer_cfg=piqa_infer_cfg, eval_cfg=piqa_eval_cfg) diff --git a/configs/datasets/promptbench/promptbench_math_gen_abf776.py b/configs/datasets/promptbench/promptbench_math_gen_abf776.py index 8511e06c..034426a4 100644 --- a/configs/datasets/promptbench/promptbench_math_gen_abf776.py +++ b/configs/datasets/promptbench/promptbench_math_gen_abf776.py @@ -37,7 +37,7 @@ math_datasets = [ dict( type=MATHDataset, abbr='math', - path='./data/math/math.json', + path='opencompass/math', reader_cfg=math_reader_cfg, infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg) 
diff --git a/configs/datasets/py150/py150_gen_38b13d.py b/configs/datasets/py150/py150_gen_38b13d.py index 17058167..85ec5b5f 100644 --- a/configs/datasets/py150/py150_gen_38b13d.py +++ b/configs/datasets/py150/py150_gen_38b13d.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import BleuEvaluator from opencompass.datasets import Py150Dataset + from opencompass.utils.text_postprocessors import first_capital_postprocess diff --git a/configs/datasets/race/race_gen_69ee4f.py b/configs/datasets/race/race_gen_69ee4f.py index 50f35ea0..64a87ee6 100644 --- a/configs/datasets/race/race_gen_69ee4f.py +++ b/configs/datasets/race/race_gen_69ee4f.py @@ -34,7 +34,7 @@ race_datasets = [ dict( abbr='race-middle', type=RaceDataset, - path='./data/race', + path='opencompass/race', name='middle', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, @@ -42,7 +42,7 @@ race_datasets = [ dict( abbr='race-high', type=RaceDataset, - path='./data/race', + path='opencompass/race', name='high', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, diff --git a/configs/datasets/race/race_gen_9302a5.py b/configs/datasets/race/race_gen_9302a5.py index f267a3ca..b00ece22 100644 --- a/configs/datasets/race/race_gen_9302a5.py +++ b/configs/datasets/race/race_gen_9302a5.py @@ -28,7 +28,7 @@ race_datasets = [ dict( abbr='race-middle', type=RaceDataset, - path='./data/race', + path='opencompass/race', name='middle', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, @@ -36,7 +36,7 @@ race_datasets = [ dict( abbr='race-high', type=RaceDataset, - path='./data/race', + path='opencompass/race', name='high', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, diff --git a/configs/datasets/race/race_ppl_5831a0.py b/configs/datasets/race/race_ppl_5831a0.py index 54e9c52f..e65cde7d 100644 --- a/configs/datasets/race/race_ppl_5831a0.py +++ 
b/configs/datasets/race/race_ppl_5831a0.py @@ -32,7 +32,7 @@ race_datasets = [ dict( abbr='race-middle', type=RaceDataset, - path='./data/race', + path='opencompass/race', name='middle', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, @@ -40,7 +40,7 @@ race_datasets = [ dict( abbr='race-high', type=RaceDataset, - path='./data/race', + path='opencompass/race', name='high', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, diff --git a/configs/datasets/race/race_ppl_a138cd.py b/configs/datasets/race/race_ppl_a138cd.py index 1e4e37f6..032bee61 100644 --- a/configs/datasets/race/race_ppl_a138cd.py +++ b/configs/datasets/race/race_ppl_a138cd.py @@ -34,7 +34,7 @@ race_datasets = [ dict( abbr='race-middle', type=RaceDataset, - path='./data/race', + path='opencompass/race', name='middle', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, @@ -42,7 +42,7 @@ race_datasets = [ dict( abbr='race-high', type=RaceDataset, - path='./data/race', + path='opencompass/race', name='high', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, diff --git a/configs/datasets/race/race_ppl_abed12.py b/configs/datasets/race/race_ppl_abed12.py index 637eab0d..ba35d930 100644 --- a/configs/datasets/race/race_ppl_abed12.py +++ b/configs/datasets/race/race_ppl_abed12.py @@ -26,7 +26,7 @@ race_datasets = [ dict( abbr='race-middle', type=RaceDataset, - path='./data/race', + path='opencompass/race', name='middle', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, @@ -34,7 +34,7 @@ race_datasets = [ dict( abbr='race-high', type=RaceDataset, - path='./data/race', + path='opencompass/race', name='high', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, diff --git a/configs/datasets/s3eval/s3eval_gen.py b/configs/datasets/s3eval/s3eval_gen.py index bbf70a71..32dcadca 100644 --- a/configs/datasets/s3eval/s3eval_gen.py +++ b/configs/datasets/s3eval/s3eval_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .s3eval_gen_370cc2 import s3eval_datasets # 
noqa: F401, F40 + from .s3eval_gen_b8ac80 import s3eval_datasets # noqa: F401, F40 diff --git a/configs/datasets/s3eval/s3eval_gen_370cc2.py b/configs/datasets/s3eval/s3eval_gen_b8ac80.py similarity index 86% rename from configs/datasets/s3eval/s3eval_gen_370cc2.py rename to configs/datasets/s3eval/s3eval_gen_b8ac80.py index 65c9c1d9..ec209d11 100644 --- a/configs/datasets/s3eval/s3eval_gen_370cc2.py +++ b/configs/datasets/s3eval/s3eval_gen_b8ac80.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import FixKRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import S3EvalDataset, S3EvalEvaluator +from opencompass.datasets.s3eval import S3EvalDataset, S3EvalEvaluator s3eval_cfg = dict(evaluator=dict(type=S3EvalEvaluator)) diff --git a/configs/datasets/siqa/siqa_gen_18632c.py b/configs/datasets/siqa/siqa_gen_18632c.py index b253ef6e..a763acb8 100644 --- a/configs/datasets/siqa/siqa_gen_18632c.py +++ b/configs/datasets/siqa/siqa_gen_18632c.py @@ -3,7 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.utils.text_postprocessors import first_option_postprocess -from opencompass.datasets import siqaDataset_V3 +from opencompass.datasets import SiqaDatasetV3 siqa_reader_cfg = dict( input_columns=['context', 'question', 'A', 'B', 'C'], @@ -34,8 +34,8 @@ siqa_eval_cfg = dict( siqa_datasets = [ dict( abbr='siqa', - type=siqaDataset_V3, - path='./data/siqa', + type=SiqaDatasetV3, + path='opencompass/siqa', reader_cfg=siqa_reader_cfg, infer_cfg=siqa_infer_cfg, eval_cfg=siqa_eval_cfg) diff --git a/configs/datasets/siqa/siqa_gen_e78df3.py b/configs/datasets/siqa/siqa_gen_e78df3.py index e7fed879..d891a904 100644 --- 
a/configs/datasets/siqa/siqa_gen_e78df3.py +++ b/configs/datasets/siqa/siqa_gen_e78df3.py @@ -34,7 +34,7 @@ siqa_datasets = [ dict( abbr='siqa', type=siqaDataset_V2, - path='./data/siqa', + path='opencompass/siqa', reader_cfg=siqa_reader_cfg, infer_cfg=siqa_infer_cfg, eval_cfg=siqa_eval_cfg) diff --git a/configs/datasets/siqa/siqa_ppl_42bc6e.py b/configs/datasets/siqa/siqa_ppl_42bc6e.py index 15c67d36..356fdad0 100644 --- a/configs/datasets/siqa/siqa_ppl_42bc6e.py +++ b/configs/datasets/siqa/siqa_ppl_42bc6e.py @@ -26,7 +26,7 @@ siqa_datasets = [ dict( abbr='siqa', type=siqaDataset, - path='./data/siqa', + path='opencompass/siqa', reader_cfg=siqa_reader_cfg, infer_cfg=siqa_infer_cfg, eval_cfg=siqa_eval_cfg) diff --git a/configs/datasets/siqa/siqa_ppl_7845b0.py b/configs/datasets/siqa/siqa_ppl_7845b0.py index b4d03d82..d8843e0c 100644 --- a/configs/datasets/siqa/siqa_ppl_7845b0.py +++ b/configs/datasets/siqa/siqa_ppl_7845b0.py @@ -26,7 +26,7 @@ siqa_datasets = [ dict( abbr='siqa', type=siqaDataset, - path='./data/siqa', + path='opencompass/siqa', reader_cfg=siqa_reader_cfg, infer_cfg=siqa_infer_cfg, eval_cfg=siqa_eval_cfg) diff --git a/configs/datasets/siqa/siqa_ppl_ced5f6.py b/configs/datasets/siqa/siqa_ppl_ced5f6.py index 99499399..27db38d0 100644 --- a/configs/datasets/siqa/siqa_ppl_ced5f6.py +++ b/configs/datasets/siqa/siqa_ppl_ced5f6.py @@ -38,7 +38,7 @@ siqa_datasets = [ dict( abbr='siqa', type=siqaDataset, - path='./data/siqa', + path='opencompass/siqa', reader_cfg=siqa_reader_cfg, infer_cfg=siqa_infer_cfg, eval_cfg=siqa_eval_cfg) diff --git a/configs/datasets/siqa/siqa_ppl_e8d8c5.py b/configs/datasets/siqa/siqa_ppl_e8d8c5.py index 395b5f5a..9126df75 100644 --- a/configs/datasets/siqa/siqa_ppl_e8d8c5.py +++ b/configs/datasets/siqa/siqa_ppl_e8d8c5.py @@ -38,7 +38,7 @@ siqa_datasets = [ dict( abbr='siqa', type=siqaDataset, - path='./data/siqa', + path='opencompass/siqa', reader_cfg=siqa_reader_cfg, infer_cfg=siqa_infer_cfg, eval_cfg=siqa_eval_cfg) diff --git 
a/configs/datasets/storycloze/storycloze_gen_7f656a.py b/configs/datasets/storycloze/storycloze_gen_7f656a.py index beb42c88..e63b2085 100644 --- a/configs/datasets/storycloze/storycloze_gen_7f656a.py +++ b/configs/datasets/storycloze/storycloze_gen_7f656a.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import storyclozeDataset_V2 +from opencompass.datasets import StoryClozeDatasetV2 from opencompass.utils.text_postprocessors import first_option_postprocess storycloze_reader_cfg = dict( @@ -36,8 +36,8 @@ storycloze_eval_cfg = dict( storycloze_datasets = [ dict( abbr='story_cloze', - type=storyclozeDataset_V2, - path='./data/xstory_cloze', + type=StoryClozeDatasetV2, + path='opencompass/xstory_cloze', lang='en', reader_cfg=storycloze_reader_cfg, infer_cfg=storycloze_infer_cfg, diff --git a/configs/datasets/storycloze/storycloze_ppl_496661.py b/configs/datasets/storycloze/storycloze_ppl_496661.py index 65b24094..c20591bf 100644 --- a/configs/datasets/storycloze/storycloze_ppl_496661.py +++ b/configs/datasets/storycloze/storycloze_ppl_496661.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import storyclozeDataset +from opencompass.datasets import StoryClozeDataset storycloze_reader_cfg = dict( input_columns=['context', 'sentence_quiz1', 'sentence_quiz2'], @@ -30,8 +30,8 @@ storycloze_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) storycloze_datasets = [ dict( abbr='story_cloze', - type=storyclozeDataset, - path='./data/xstory_cloze', + type=StoryClozeDataset, + 
path='opencompass/xstory_cloze', lang='en', reader_cfg=storycloze_reader_cfg, infer_cfg=storycloze_infer_cfg, diff --git a/configs/datasets/storycloze/storycloze_ppl_afd16f.py b/configs/datasets/storycloze/storycloze_ppl_afd16f.py index c7167f63..1c9420b0 100644 --- a/configs/datasets/storycloze/storycloze_ppl_afd16f.py +++ b/configs/datasets/storycloze/storycloze_ppl_afd16f.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import storyclozeDataset +from opencompass.datasets import StoryClozeDataset storycloze_reader_cfg = dict( input_columns=['context', 'sentence_quiz1', 'sentence_quiz2'], @@ -27,8 +27,8 @@ storycloze_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) storycloze_datasets = [ dict( abbr='story_cloze', - type=storyclozeDataset, - path='./data/xstory_cloze', + type=StoryClozeDataset, + path='opencompass/xstory_cloze', lang='en', reader_cfg=storycloze_reader_cfg, infer_cfg=storycloze_infer_cfg, diff --git a/configs/datasets/strategyqa/strategyqa_gen_1180a7.py b/configs/datasets/strategyqa/strategyqa_gen_1180a7.py index 2eb96593..4925468a 100644 --- a/configs/datasets/strategyqa/strategyqa_gen_1180a7.py +++ b/configs/datasets/strategyqa/strategyqa_gen_1180a7.py @@ -87,7 +87,7 @@ strategyqa_datasets = [ dict( abbr='strategyqa', type=StrategyQADataset, - path='./data/strategyqa/strategyQA_train.json', + path='opencompass/strategy_qa', reader_cfg=strategyqa_reader_cfg, infer_cfg=strategyqa_infer_cfg, eval_cfg=strategyqa_eval_cfg) diff --git a/configs/datasets/strategyqa/strategyqa_gen_934441.py b/configs/datasets/strategyqa/strategyqa_gen_934441.py index fa6270df..06542fc2 100644 --- a/configs/datasets/strategyqa/strategyqa_gen_934441.py +++ b/configs/datasets/strategyqa/strategyqa_gen_934441.py @@ -51,7 +51,7 @@ 
strategyqa_datasets = [ dict( abbr='strategyqa', type=StrategyQADataset, - path='./data/strategyqa/strategyQA_train.json', + path='opencompass/strategy_qa', reader_cfg=strategyqa_reader_cfg, infer_cfg=strategyqa_infer_cfg, eval_cfg=strategyqa_eval_cfg) diff --git a/configs/datasets/summedits/summedits_gen_315438.py b/configs/datasets/summedits/summedits_gen_315438.py index e9e2d92c..f02a38ff 100644 --- a/configs/datasets/summedits/summedits_gen_315438.py +++ b/configs/datasets/summedits/summedits_gen_315438.py @@ -44,7 +44,7 @@ summedits_datasets = [ dict( abbr='summedits', type=SummeditsDataset_V2, - path='./data/summedits/summedits.jsonl', + path='opencompass/summedits', reader_cfg=summedits_reader_cfg, infer_cfg=summedits_infer_cfg, eval_cfg=summedits_eval_cfg) diff --git a/configs/datasets/summedits/summedits_gen_4fb38b.py b/configs/datasets/summedits/summedits_gen_4fb38b.py index 4fcdef17..28ea6635 100644 --- a/configs/datasets/summedits/summedits_gen_4fb38b.py +++ b/configs/datasets/summedits/summedits_gen_4fb38b.py @@ -31,7 +31,7 @@ summedits_datasets = [ dict( abbr='summedits', type=SummeditsDataset_V2, - path='./data/summedits/summedits.jsonl', + path='opencompass/summedits', reader_cfg=summedits_reader_cfg, infer_cfg=summedits_infer_cfg, eval_cfg=summedits_eval_cfg) diff --git a/configs/datasets/triviaqa/triviaqa_gen_0356ec.py b/configs/datasets/triviaqa/triviaqa_gen_0356ec.py index e58d732b..fffb3b46 100644 --- a/configs/datasets/triviaqa/triviaqa_gen_0356ec.py +++ b/configs/datasets/triviaqa/triviaqa_gen_0356ec.py @@ -3,7 +3,6 @@ from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import TriviaQADataset, TriviaQAEvaluator - triviaqa_datasets = [] for k in [0, 1, 5]: triviaqa_reader_cfg = dict( @@ -55,7 +54,7 @@ for k in [0, 1, 5]: dict( type=TriviaQADataset, abbr='triviaqa' if k == 0 else f'triviaqa_{k}shot', - path='./data/triviaqa/', + 
path='opencompass/trivia_qa', reader_cfg=triviaqa_reader_cfg, infer_cfg=triviaqa_infer_cfg, eval_cfg=triviaqa_eval_cfg) diff --git a/configs/datasets/triviaqa/triviaqa_gen_2121ce.py b/configs/datasets/triviaqa/triviaqa_gen_2121ce.py index 9e5ed279..d8844c91 100644 --- a/configs/datasets/triviaqa/triviaqa_gen_2121ce.py +++ b/configs/datasets/triviaqa/triviaqa_gen_2121ce.py @@ -27,7 +27,7 @@ triviaqa_datasets = [ dict( type=TriviaQADataset, abbr='triviaqa', - path='./data/triviaqa/', + path='opencompass/trivia_qa', reader_cfg=triviaqa_reader_cfg, infer_cfg=triviaqa_infer_cfg, eval_cfg=triviaqa_eval_cfg) diff --git a/configs/datasets/triviaqa/triviaqa_gen_3e39a5.py b/configs/datasets/triviaqa/triviaqa_gen_3e39a5.py index ec60ad5a..27cc7d8b 100644 --- a/configs/datasets/triviaqa/triviaqa_gen_3e39a5.py +++ b/configs/datasets/triviaqa/triviaqa_gen_3e39a5.py @@ -26,7 +26,7 @@ triviaqa_datasets = [ dict( type=TriviaQADataset, abbr='triviaqa', - path='./data/triviaqa/', + path='opencompass/trivia_qa', reader_cfg=triviaqa_reader_cfg, infer_cfg=triviaqa_infer_cfg, eval_cfg=triviaqa_eval_cfg) diff --git a/configs/datasets/triviaqa/triviaqa_gen_429db5.py b/configs/datasets/triviaqa/triviaqa_gen_429db5.py index 3265a142..9100a303 100644 --- a/configs/datasets/triviaqa/triviaqa_gen_429db5.py +++ b/configs/datasets/triviaqa/triviaqa_gen_429db5.py @@ -23,7 +23,7 @@ triviaqa_datasets = [ dict( type=TriviaQADataset, abbr='triviaqa', - path='./data/triviaqa/', + path='opencompass/trivia_qa', reader_cfg=triviaqa_reader_cfg, infer_cfg=triviaqa_infer_cfg, eval_cfg=triviaqa_eval_cfg) diff --git a/configs/datasets/triviaqa/triviaqa_gen_d297bb.py b/configs/datasets/triviaqa/triviaqa_gen_d297bb.py index c8cf362e..a7681883 100644 --- a/configs/datasets/triviaqa/triviaqa_gen_d297bb.py +++ b/configs/datasets/triviaqa/triviaqa_gen_d297bb.py @@ -27,7 +27,7 @@ triviaqa_datasets = [ dict( type=TriviaQADataset, abbr='triviaqa', - path='./data/triviaqa/', + path='opencompass/trivia_qa', 
reader_cfg=triviaqa_reader_cfg, infer_cfg=triviaqa_infer_cfg, eval_cfg=triviaqa_eval_cfg) diff --git a/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_20a989.py b/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_20a989.py index b9fd821c..b746e6ea 100644 --- a/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_20a989.py +++ b/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_20a989.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import TriviaQADataset_V2, TriviaQAEvaluator +from opencompass.datasets import TriviaQADatasetV2, TriviaQAEvaluator triviaqa_datasets = [] @@ -37,9 +37,9 @@ for k in [1]: triviaqa_datasets.append( dict( - type=TriviaQADataset_V2, + type=TriviaQADatasetV2, abbr=f'triviaqa_wiki_{k}shot', - path='./data/triviaqa', + path='opencompass/trivia_qa', reader_cfg=triviaqa_reader_cfg, infer_cfg=triviaqa_infer_cfg, eval_cfg=triviaqa_eval_cfg) diff --git a/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_bc5f21.py b/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_bc5f21.py index 156534dd..2a0aacfd 100644 --- a/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_bc5f21.py +++ b/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_bc5f21.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import TriviaQADataset_V2, TriviaQAEvaluator +from opencompass.datasets import TriviaQADatasetV2, TriviaQAEvaluator triviaqa_datasets = [] @@ -53,9 +53,9 @@ for k in [1]: triviaqa_datasets.append( dict( - type=TriviaQADataset_V2, + type=TriviaQADatasetV2, abbr=f'triviaqa_wiki_{k}shot', - path='./data/triviaqa', + path='opencompass/trivia_qa', reader_cfg=triviaqa_reader_cfg, 
infer_cfg=triviaqa_infer_cfg, eval_cfg=triviaqa_eval_cfg) diff --git a/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_eaf81e.py b/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_eaf81e.py index 0980fd7c..88583c5b 100644 --- a/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_eaf81e.py +++ b/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_eaf81e.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import TriviaQADataset_V2, TriviaQAEvaluator +from opencompass.datasets import TriviaQADatasetV2, TriviaQAEvaluator triviaqa_datasets = [] @@ -53,9 +53,9 @@ for k in [1]: triviaqa_datasets.append( dict( - type=TriviaQADataset_V2, + type=TriviaQADatasetV2, abbr=f'triviaqa_wiki_{k}shot', - path='./data/triviaqa', + path='opencompass/trivia_qa', reader_cfg=triviaqa_reader_cfg, infer_cfg=triviaqa_infer_cfg, eval_cfg=triviaqa_eval_cfg) diff --git a/configs/datasets/triviaqa/triviaqa_wiki_gen_d18bf4.py b/configs/datasets/triviaqa/triviaqa_wiki_gen_d18bf4.py index e3735a11..4931d685 100644 --- a/configs/datasets/triviaqa/triviaqa_wiki_gen_d18bf4.py +++ b/configs/datasets/triviaqa/triviaqa_wiki_gen_d18bf4.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import TriviaQADataset_V2, TriviaQAEvaluator +from opencompass.datasets import TriviaQADatasetV2, TriviaQAEvaluator triviaqa_datasets = [] @@ -53,9 +53,9 @@ for k in [0, 1, 5, 25]: triviaqa_datasets.append( dict( - type=TriviaQADataset_V2, + type=TriviaQADatasetV2, abbr=f'triviaqa_wiki_{k}shot', - path='./data/triviaqa', + path='opencompass/trivia_qa', reader_cfg=triviaqa_reader_cfg, infer_cfg=triviaqa_infer_cfg, 
eval_cfg=triviaqa_eval_cfg) diff --git a/configs/datasets/tydiqa/tydiqa_gen_978d2a.py b/configs/datasets/tydiqa/tydiqa_gen_978d2a.py index 54dd4e4f..da4f7123 100644 --- a/configs/datasets/tydiqa/tydiqa_gen_978d2a.py +++ b/configs/datasets/tydiqa/tydiqa_gen_978d2a.py @@ -2,6 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import TydiQADataset, TydiQAEvaluator +from os import environ # All configs are for TydiQA Goldp task tydiqa_reader_cfg = dict( @@ -44,10 +45,14 @@ for _lang in langs: ds_column='answer', ) + # Skip japanese due to filter rules of ModelScope + if environ.get('DATASET_SOURCE') == 'ModelScope' and _lang == 'japanese': + continue + tydiqa_datasets.append( dict(abbr=f'tydiqa-goldp_{_lang}', type=TydiQADataset, - path='./data/tydiqa', + path='opencompass/tydiqa', lang=_lang, reader_cfg=tydiqa_reader_cfg, infer_cfg=tydiqa_infer_cfg, diff --git a/configs/datasets/winograd/winograd_ppl_8f3049.py b/configs/datasets/winograd/winograd_ppl_8f3049.py index c6330dee..4a88c6a4 100644 --- a/configs/datasets/winograd/winograd_ppl_8f3049.py +++ b/configs/datasets/winograd/winograd_ppl_8f3049.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import winogradDataset +from opencompass.datasets import WinogradDataset winograd_reader_cfg = dict( input_columns=['opt1', 'opt2'], @@ -27,7 +27,7 @@ winograd_eval_cfg = dict(evaluator=dict(type=AccEvaluator), ) winograd_datasets = [ dict( abbr='winograd', - type=winogradDataset, + type=WinogradDataset, path='winograd_wsc', name='wsc285', reader_cfg=winograd_reader_cfg, diff --git
a/configs/datasets/winograd/winograd_ppl_b6c7ed.py b/configs/datasets/winograd/winograd_ppl_b6c7ed.py index bbd20f16..603bf5e9 100644 --- a/configs/datasets/winograd/winograd_ppl_b6c7ed.py +++ b/configs/datasets/winograd/winograd_ppl_b6c7ed.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import winogradDataset +from opencompass.datasets import WinogradDataset winograd_reader_cfg = dict( input_columns=['prompt', 'pronoun', 'opt1', 'opt2'], @@ -31,7 +31,7 @@ winograd_eval_cfg = dict(evaluator=dict(type=AccEvaluator), ) winograd_datasets = [ dict( abbr='winograd', - type=winogradDataset, + type=WinogradDataset, path='winograd_wsc', name='wsc285', reader_cfg=winograd_reader_cfg, diff --git a/configs/datasets/winogrande/deprecated_winogrande_gen_a9ede5.py b/configs/datasets/winogrande/deprecated_winogrande_gen_a9ede5.py index b87f8551..69dcfa58 100644 --- a/configs/datasets/winogrande/deprecated_winogrande_gen_a9ede5.py +++ b/configs/datasets/winogrande/deprecated_winogrande_gen_a9ede5.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import winograndeDataset_V2 +from opencompass.datasets import WinograndeDatasetV2 from opencompass.utils.text_postprocessors import first_option_postprocess winogrande_reader_cfg = dict( @@ -34,7 +34,7 @@ winogrande_eval_cfg = dict( winogrande_datasets = [ dict( abbr='winogrande', - type=winograndeDataset_V2, + type=WinograndeDatasetV2, path='./data/winogrande', reader_cfg=winogrande_reader_cfg, infer_cfg=winogrande_infer_cfg, diff --git 
a/configs/datasets/winogrande/winogrande_5shot_gen_6447e6.py b/configs/datasets/winogrande/winogrande_5shot_gen_6447e6.py index fb7b37a5..d56a22ca 100644 --- a/configs/datasets/winogrande/winogrande_5shot_gen_6447e6.py +++ b/configs/datasets/winogrande/winogrande_5shot_gen_6447e6.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import FixKRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import winograndeDataset_V3 +from opencompass.datasets import WinograndeDatasetV3 from opencompass.utils.text_postprocessors import first_option_postprocess winogrande_reader_cfg = dict( @@ -37,8 +37,8 @@ winogrande_eval_cfg = dict( winogrande_datasets = [ dict( abbr='winogrande', - type=winograndeDataset_V3, - path='./data/winogrande', + type=WinograndeDatasetV3, + path='opencompass/winogrande', reader_cfg=winogrande_reader_cfg, infer_cfg=winogrande_infer_cfg, eval_cfg=winogrande_eval_cfg, diff --git a/configs/datasets/winogrande/winogrande_5shot_gen_b36770.py b/configs/datasets/winogrande/winogrande_5shot_gen_b36770.py index 45ee50aa..a5e6b90c 100644 --- a/configs/datasets/winogrande/winogrande_5shot_gen_b36770.py +++ b/configs/datasets/winogrande/winogrande_5shot_gen_b36770.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import FixKRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator -from opencompass.datasets import winograndeDataset_V3 +from opencompass.datasets import WinograndeDatasetV3 from opencompass.utils.text_postprocessors import first_option_postprocess winogrande_reader_cfg = dict( @@ -37,8 +37,8 @@ winogrande_eval_cfg = dict( winogrande_datasets = [ dict( abbr='winogrande', - type=winograndeDataset_V3, - 
path='./data/winogrande', + type=WinograndeDatasetV3, + path='opencompass/winogrande', reader_cfg=winogrande_reader_cfg, infer_cfg=winogrande_infer_cfg, eval_cfg=winogrande_eval_cfg, diff --git a/configs/datasets/winogrande/winogrande_5shot_ll_252f01.py b/configs/datasets/winogrande/winogrande_5shot_ll_252f01.py index 438dc203..29da086e 100644 --- a/configs/datasets/winogrande/winogrande_5shot_ll_252f01.py +++ b/configs/datasets/winogrande/winogrande_5shot_ll_252f01.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import FixKRetriever from opencompass.openicl.icl_inferencer import LLInferencer from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator -from opencompass.datasets import winograndeDataset_V3 +from opencompass.datasets import WinograndeDatasetV3 winogrande_reader_cfg = dict( input_columns=['opt1', 'opt2'], @@ -30,8 +30,8 @@ winogrande_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator)) winogrande_datasets = [ dict( abbr='winogrande', - type=winograndeDataset_V3, - path='./data/winogrande', + type=WinograndeDatasetV3, + path='opencompass/winogrande', reader_cfg=winogrande_reader_cfg, infer_cfg=winogrande_infer_cfg, eval_cfg=winogrande_eval_cfg) diff --git a/configs/datasets/winogrande/winogrande_gen_458220.py b/configs/datasets/winogrande/winogrande_gen_458220.py index b3413d8e..e6ac0aff 100644 --- a/configs/datasets/winogrande/winogrande_gen_458220.py +++ b/configs/datasets/winogrande/winogrande_gen_458220.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import winograndeDataset_V2 +from opencompass.datasets import WinograndeDatasetV2 from opencompass.utils.text_postprocessors import first_option_postprocess 
winogrande_reader_cfg = dict( @@ -32,8 +32,8 @@ winogrande_eval_cfg = dict( winogrande_datasets = [ dict( abbr='winogrande', - type=winograndeDataset_V2, - path='./data/winogrande', + type=WinograndeDatasetV2, + path='opencompass/winogrande', reader_cfg=winogrande_reader_cfg, infer_cfg=winogrande_infer_cfg, eval_cfg=winogrande_eval_cfg, diff --git a/configs/datasets/winogrande/winogrande_gen_a027b6.py b/configs/datasets/winogrande/winogrande_gen_a027b6.py index 001286fd..02d15fc2 100644 --- a/configs/datasets/winogrande/winogrande_gen_a027b6.py +++ b/configs/datasets/winogrande/winogrande_gen_a027b6.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import winograndeDataset_V2 +from opencompass.datasets import WinograndeDatasetV2 from opencompass.utils.text_postprocessors import first_option_postprocess winogrande_reader_cfg = dict( @@ -27,8 +27,8 @@ for _choice in _winogrande_prompt: winogrande_datasets.append( dict( abbr='winogrande_'+_choice, - type=winograndeDataset_V2, - path='./data/winogrande', + type=WinograndeDatasetV2, + path='opencompass/winogrande', reader_cfg=winogrande_reader_cfg, infer_cfg=dict( prompt_template=dict( diff --git a/configs/datasets/winogrande/winogrande_ll_c5cf57.py b/configs/datasets/winogrande/winogrande_ll_c5cf57.py index d0bc6843..7d0b595b 100644 --- a/configs/datasets/winogrande/winogrande_ll_c5cf57.py +++ b/configs/datasets/winogrande/winogrande_ll_c5cf57.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import LLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import winograndeDataset +from opencompass.datasets 
import WinograndeDataset winogrande_reader_cfg = dict( input_columns=['opt1', 'opt2'], @@ -25,8 +25,8 @@ winogrande_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) winogrande_datasets = [ dict( abbr='winogrande', - type=winograndeDataset, - path='./data/winogrande', + type=WinograndeDataset, + path='opencompass/winogrande', reader_cfg=winogrande_reader_cfg, infer_cfg=winogrande_infer_cfg, eval_cfg=winogrande_eval_cfg) diff --git a/configs/datasets/winogrande/winogrande_ppl_55a66e.py b/configs/datasets/winogrande/winogrande_ppl_55a66e.py index da0163ac..59470b52 100644 --- a/configs/datasets/winogrande/winogrande_ppl_55a66e.py +++ b/configs/datasets/winogrande/winogrande_ppl_55a66e.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import winograndeDataset +from opencompass.datasets import WinograndeDataset # WARNING: This config cannot reproduce results in the paper. # e.g. 
LLAMA2-7B Winogrande 69.2 (paper) -> 62.27 (this config) @@ -30,8 +30,8 @@ winogrande_eval_cfg = dict(evaluator=dict(type=AccEvaluator), ) winogrande_datasets = [ dict( abbr='winogrande', - type=winograndeDataset, - path='./data/winogrande', + type=WinograndeDataset, + path='opencompass/winogrande', reader_cfg=winogrande_reader_cfg, infer_cfg=winogrande_infer_cfg, eval_cfg=winogrande_eval_cfg) diff --git a/configs/datasets/winogrande/winogrande_ppl_9307fd.py b/configs/datasets/winogrande/winogrande_ppl_9307fd.py index 7e30fc9a..1c772098 100644 --- a/configs/datasets/winogrande/winogrande_ppl_9307fd.py +++ b/configs/datasets/winogrande/winogrande_ppl_9307fd.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import winograndeDataset +from opencompass.datasets import WinograndeDataset # WARNING: This config cannot reproduce results in the paper. # e.g. 
LLAMA2-7B Winogrande 69.2 (paper) -> 62.27 (this config) @@ -28,8 +28,8 @@ winogrande_eval_cfg = dict(evaluator=dict(type=AccEvaluator), ) winogrande_datasets = [ dict( abbr='winogrande', - type=winograndeDataset, - path='./data/winogrande', + type=WinograndeDataset, + path='opencompass/winogrande', reader_cfg=winogrande_reader_cfg, infer_cfg=winogrande_infer_cfg, eval_cfg=winogrande_eval_cfg) diff --git a/configs/datasets/z_bench/z_bench_gen.py b/configs/datasets/z_bench/z_bench_gen.py deleted file mode 100644 index a30a1a12..00000000 --- a/configs/datasets/z_bench/z_bench_gen.py +++ /dev/null @@ -1,4 +0,0 @@ -from mmengine.config import read_base - -with read_base(): - from .z_bench_gen_52ba2f import z_bench_datasets # noqa: F401, F403 diff --git a/configs/datasets/z_bench/z_bench_gen_52ba2f.py b/configs/datasets/z_bench/z_bench_gen_52ba2f.py deleted file mode 100644 index 39e3f1f9..00000000 --- a/configs/datasets/z_bench/z_bench_gen_52ba2f.py +++ /dev/null @@ -1,25 +0,0 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HFDataset - -z_bench_reader_cfg = dict( - input_columns=['text'], output_column='category', train_split='test') - -z_bench_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template='{text}', - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) - -z_bench_datasets = dict( - type=HFDataset, - path= - '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench', - data_dir= - '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench', - name='question', - reader_cfg=z_bench_reader_cfg, - infer_cfg=z_bench_infer_cfg) diff --git a/configs/datasets/z_bench/z_bench_gen_d8c84c.py b/configs/datasets/z_bench/z_bench_gen_d8c84c.py deleted file mode 100644 index 28a492d1..00000000 --- 
a/configs/datasets/z_bench/z_bench_gen_d8c84c.py +++ /dev/null @@ -1,28 +0,0 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HFDataset - -z_bench_reader_cfg = dict( - ds_size=4, - input_columns=['text'], - output_column='category', - train_split='test') - -z_bench_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict(round=[dict(role='HUMAN', prompt='{text}')]), - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) - -z_bench_datasets = dict( - type=HFDataset, - path= - '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench', - data_dir= - '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench', - name='question', - reader_cfg=z_bench_reader_cfg, - infer_cfg=z_bench_infer_cfg) diff --git a/configs/eval_modelscope_datasets.py b/configs/eval_modelscope_datasets.py new file mode 100644 index 00000000..945d2c23 --- /dev/null +++ b/configs/eval_modelscope_datasets.py @@ -0,0 +1,82 @@ + +# export DATASET_SOURCE='ModelScope' # before run this script +from datasets import Dataset, DatasetDict +from mmengine.config import read_base +from tqdm import tqdm + +with read_base(): + from .datasets.ceval.ceval_gen import ceval_datasets # ok + from .datasets.ceval.ceval_clean_ppl import ceval_datasets as ceval_clean_datasets # ok + + from .datasets.mmlu.mmlu_gen import mmlu_datasets # ok + from .datasets.mmlu.mmlu_clean_ppl import mmlu_datasets as mmlu_clean_datasets # ok + from .datasets.cmmlu.cmmlu_gen import cmmlu_datasets # ok + + from .datasets.GaokaoBench.GaokaoBench_gen import GaokaoBench_datasets # ok + from .datasets.GaokaoBench.GaokaoBench_mixed import GaokaoBench_datasets as GaokaoBench_mixed_datasets # ok + from .datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets as 
GaokaoBench_no_subjective_datasets # ok + + from .datasets.humaneval.humaneval_gen import humaneval_datasets # ok + from .datasets.humaneval.humaneval_repeat10_gen_8e312c import humaneval_datasets as humaneval_repeat10_datasets # ok + + + from .datasets.commonsenseqa.commonsenseqa_gen import commonsenseqa_datasets # extra handling for gpt (TODO: confirm meaning) + + from .datasets.strategyqa.strategyqa_gen import strategyqa_datasets + from .datasets.bbh.bbh_gen import bbh_datasets + from .datasets.Xsum.Xsum_gen import Xsum_datasets + from .datasets.winogrande.winogrande_gen import winogrande_datasets + from .datasets.winogrande.winogrande_ll import winogrande_datasets as winogrande_ll_datasets # ok + from .datasets.winogrande.winogrande_5shot_ll_252f01 import winogrande_datasets as winogrande_5shot_ll_datasets # ok + from .datasets.obqa.obqa_gen import obqa_datasets # ok + from .datasets.obqa.obqa_ppl_6aac9e import obqa_datasets as obqa_ppl_datasets # ok + from .datasets.agieval.agieval_gen import agieval_datasets as agieval_v2_datasets # ok + from .datasets.agieval.agieval_gen_a0c741 import agieval_datasets as agieval_v1_datasets # ok + + from .datasets.siqa.siqa_gen import siqa_datasets as siqa_v2_datasets # ok + from .datasets.siqa.siqa_gen_18632c import siqa_datasets as siqa_v3_datasets # ok + from .datasets.siqa.siqa_ppl_42bc6e import siqa_datasets as siqa_ppl_datasets # ok + from .datasets.storycloze.storycloze_gen import storycloze_datasets # ok + from .datasets.storycloze.storycloze_ppl import storycloze_datasets as storycloze_ppl_datasets # ok + from .datasets.summedits.summedits_gen import summedits_datasets as summedits_v2_datasets # ok + + from .datasets.hellaswag.hellaswag_gen import hellaswag_datasets as hellaswag_v2_datasets # ok + from .datasets.hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets as hellaswag_ice_datasets # ok + from .datasets.hellaswag.hellaswag_clean_ppl import hellaswag_datasets as hellaswag_clean_datasets # ok + from
.datasets.hellaswag.hellaswag_ppl_9dbb12 import hellaswag_datasets as hellaswag_v1_datasets # ok + from .datasets.hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets as hellaswag_v3_datasets # ok + + from .datasets.mbpp.mbpp_gen import mbpp_datasets as mbpp_v1_datasets # ok + from .datasets.mbpp.mbpp_passk_gen_830460 import mbpp_datasets as mbpp_v2_datasets # ok + from .datasets.mbpp.sanitized_mbpp_gen_830460 import sanitized_mbpp_datasets # ok + from .datasets.nq.nq_gen import nq_datasets # ok + from .datasets.lcsts.lcsts_gen import lcsts_datasets # ok + + from .datasets.math.math_gen import math_datasets # ok + from .datasets.piqa.piqa_gen import piqa_datasets as piqa_v2_datasets # ok + from .datasets.piqa.piqa_ppl import piqa_datasets as piqa_v1_datasets # ok + from .datasets.piqa.piqa_ppl_0cfff2 import piqa_datasets as piqa_v3_datasets # ok + from .datasets.lambada.lambada_gen import lambada_datasets # ok + from .datasets.tydiqa.tydiqa_gen import tydiqa_datasets # ok + + from .datasets.triviaqa.triviaqa_gen import triviaqa_datasets # ok + from .datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import triviaqa_datasets as triviaqa_wiki_1shot_datasets # ok + + from .datasets.CLUE_afqmc.CLUE_afqmc_gen import afqmc_datasets # ok + from .datasets.CLUE_cmnli.CLUE_cmnli_gen import cmnli_datasets # ok + from .datasets.CLUE_cmnli.CLUE_cmnli_ppl import cmnli_datasets as cmnli_ppl_datasets # ok + from .datasets.CLUE_ocnli.CLUE_ocnli_gen import ocnli_datasets # ok + + from .datasets.gsm8k.gsm8k_gen import gsm8k_datasets # ok + from .datasets.ARC_c.ARC_c_gen import ARC_c_datasets # ok + from .datasets.ARC_c.ARC_c_clean_ppl import ARC_c_datasets as ARC_c_clean_datasets # ok + from .datasets.ARC_e.ARC_e_gen import ARC_e_datasets # ok + from .datasets.race.race_ppl import race_datasets # ok + from .models.opt.hf_opt_125m import models + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) +for d in datasets: + d['reader_cfg'].update({ + 
'train_range':'[0:5]', + 'test_range':'[0:5]' + }) diff --git a/docs/en/advanced_guides/code_eval.md b/docs/en/advanced_guides/code_eval.md index 14748c6c..f7ff9206 100644 --- a/docs/en/advanced_guides/code_eval.md +++ b/docs/en/advanced_guides/code_eval.md @@ -17,13 +17,13 @@ If you need to generate multiple responses for a single example to evaluate the For most models that support the `num_return_sequences` parameter in HF's generation, we can use it directly to obtain multiple responses. Refer to the following configuration file: ```python -from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator +from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator with read_base(): from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets -mbpp_datasets[0]['type'] = MBPPDataset_V2 +mbpp_datasets[0]['type'] = MBPPDatasetV2 mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator mbpp_datasets[0]['reader_cfg']['output_column'] = 'test_column' @@ -59,7 +59,7 @@ You can specifically refer to the following configuration file [configs/eval_cod This applies to some HF models with poorly designed APIs or missing features. In this case, we need to repeatedly construct datasets to achieve multiple response effects. 
Refer to the following configuration: ```python -from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator +from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator with read_base(): from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets @@ -69,7 +69,7 @@ humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10' humaneval_datasets[0]['num_repeats'] = 10 mbpp_datasets[0]['abbr'] = 'mbpp_pass10' mbpp_datasets[0]['num_repeats'] = 10 -mbpp_datasets[0]['type'] = MBPPDataset_V2 +mbpp_datasets[0]['type'] = MBPPDatasetV2 mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator mbpp_datasets[0]['reader_cfg']['output_column'] = 'test_column' diff --git a/docs/en/get_started/installation.md b/docs/en/get_started/installation.md index 13cc00f7..b5f1b4c0 100644 --- a/docs/en/get_started/installation.md +++ b/docs/en/get_started/installation.md @@ -87,10 +87,19 @@ # Dataset Preparation -The datasets supported by OpenCompass mainly include two parts: +The datasets supported by OpenCompass mainly include three parts: 1. Huggingface datasets: The [Huggingface Datasets](https://huggingface.co/datasets) provide a large number of datasets, which will **automatically download** when running with this option. -2. Custom dataset: OpenCompass also provides some Chinese custom **self-built** datasets. Please run the following command to **manually download and extract** them. + +2. ModelScope Datasets: [ModelScope OpenCompass Dataset](https://modelscope.cn/organization/opencompass) supports automatic downloading of datasets from ModelScope. + + To enable this feature, set the environment variable: `export DATASET_SOURCE=ModelScope`.
The available datasets include (sourced from OpenCompassData-core.zip): + + ```plain + humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ceval, math, LCSTS, Xsum, winogrande, openbookqa, AGIEval, gsm8k, nq, race, siqa, mbpp, mmlu, hellaswag, ARC, BBH, xstory_cloze, summedits, GAOKAO-BENCH, OCNLI, cmnli + ``` + +3. Custom dataset: OpenCompass also provides some Chinese custom **self-built** datasets. Please run the following command to **manually download and extract** them. Run the following commands to download and place the datasets in the `${OpenCompass}/data` directory can complete dataset preparation. diff --git a/docs/en/user_guides/datasets.md b/docs/en/user_guides/datasets.md index 4f6fc33d..222b303a 100644 --- a/docs/en/user_guides/datasets.md +++ b/docs/en/user_guides/datasets.md @@ -39,7 +39,7 @@ In each dataset configuration file, the dataset will be defined in the `{}_datas afqmc_datasets = [ dict( abbr="afqmc-dev", - type=AFQMCDataset_V2, + type=AFQMCDatasetV2, path="./data/CLUE/AFQMC/dev.json", reader_cfg=afqmc_reader_cfg, infer_cfg=afqmc_infer_cfg, diff --git a/docs/zh_cn/advanced_guides/code_eval.md b/docs/zh_cn/advanced_guides/code_eval.md index 2eb3c67f..91065f40 100644 --- a/docs/zh_cn/advanced_guides/code_eval.md +++ b/docs/zh_cn/advanced_guides/code_eval.md @@ -17,13 +17,13 @@ 对于绝大多数模型来说,模型支持HF的generation中带有`num_return_sequences` 参数,我们可以直接使用来获取多回复。可以参考以下配置文件。 ```python -from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator +from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator with read_base(): from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets -mbpp_datasets[0]['type'] = MBPPDataset_V2 +mbpp_datasets[0]['type'] = MBPPDatasetV2 mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator mbpp_datasets[0]['reader_cfg']['output_column'] = 'test_column' @@ -60,7 +60,7 @@ models = [ 
适用于一些没有设计好的API以及功能缺失的HF模型。这个时候我们需要重复构造数据集来达到多回复的效果。这里可以参考以下配置文件。 ```python -from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator +from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator with read_base(): from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets @@ -70,7 +70,7 @@ humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10' humaneval_datasets[0]['num_repeats'] = 10 mbpp_datasets[0]['abbr'] = 'mbpp_pass10' mbpp_datasets[0]['num_repeats'] = 10 -mbpp_datasets[0]['type'] = MBPPDataset_V2 +mbpp_datasets[0]['type'] = MBPPDatasetV2 mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator mbpp_datasets[0]['reader_cfg']['output_column'] = 'test_column' diff --git a/docs/zh_cn/get_started/installation.md b/docs/zh_cn/get_started/installation.md index fd609675..919a2cc9 100644 --- a/docs/zh_cn/get_started/installation.md +++ b/docs/zh_cn/get_started/installation.md @@ -39,11 +39,19 @@ ## 数据集准备 -OpenCompass 支持的数据集主要包括两个部分: +OpenCompass 支持的数据集主要包括三个部分: 1. Huggingface 数据集: [Huggingface Dataset](https://huggingface.co/datasets) 提供了大量的数据集,这部分数据集运行时会**自动下载**。 -2. 自建以及第三方数据集:OpenCompass 还提供了一些第三方数据集及自建**中文**数据集。运行以下命令**手动下载解压**。 +2. ModelScope 数据集:[ModelScope OpenCompass Dataset](https://modelscope.cn/organization/opencompass) 支持从 ModelScope 自动下载数据集。 + + 要启用此功能,请设置环境变量:`export DATASET_SOURCE=ModelScope`,可用的数据集包括(来源于 OpenCompassData-core.zip): + + ```plain + humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ceval, math, LCSTS, Xsum, winogrande, openbookqa, AGIEval, gsm8k, nq, race, siqa, mbpp, mmlu, hellaswag, ARC, BBH, xstory_cloze, summedits, GAOKAO-BENCH, OCNLI, cmnli + ``` + +3. 
自建以及第三方数据集:OpenCompass 还提供了一些第三方数据集及自建**中文**数据集。运行以下命令**手动下载解压**。 在 OpenCompass 项目根目录下运行下面命令,将数据集准备至 `${OpenCompass}/data` 目录下: diff --git a/docs/zh_cn/user_guides/datasets.md b/docs/zh_cn/user_guides/datasets.md index 2aae7de8..4a0ee8b2 100644 --- a/docs/zh_cn/user_guides/datasets.md +++ b/docs/zh_cn/user_guides/datasets.md @@ -39,7 +39,7 @@ configs/datasets/ afqmc_datasets = [ dict( abbr="afqmc-dev", - type=AFQMCDataset_V2, + type=AFQMCDatasetV2, path="./data/CLUE/AFQMC/dev.json", reader_cfg=afqmc_reader_cfg, infer_cfg=afqmc_infer_cfg, diff --git a/opencompass/datasets/FinanceIQ.py b/opencompass/datasets/FinanceIQ.py index 0816e9f6..ed87d300 100644 --- a/opencompass/datasets/FinanceIQ.py +++ b/opencompass/datasets/FinanceIQ.py @@ -4,6 +4,7 @@ import os.path as osp from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -18,6 +19,7 @@ class FinanceIQDataset(BaseDataset): @staticmethod def load(path: str, name: str): + path = get_data_path(path, local_mode=True) dataset = DatasetDict() for split in ['dev', 'test']: raw_data = [] diff --git a/opencompass/datasets/GaokaoBench.py b/opencompass/datasets/GaokaoBench.py index 1fe9f2c3..38384535 100644 --- a/opencompass/datasets/GaokaoBench.py +++ b/opencompass/datasets/GaokaoBench.py @@ -1,10 +1,12 @@ import json import re +from os import environ from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -13,10 +15,15 @@ from .base import BaseDataset class GaokaoBenchDataset(BaseDataset): @staticmethod - def load(path: str): - with open(path, encoding='utf-8') as f: - data = json.load(f) - return Dataset.from_list(data['example']) + def load(path: str, name: str): + path = get_data_path(path, local_mode=True) + if environ.get('DATASET_SOURCE') ==
'ModelScope': + from modelscope import MsDataset + return MsDataset.load(path, subset_name=name, split='test') + else: + with open(path, encoding='utf-8') as f: + data = json.load(f) + return Dataset.from_list(data['example']) valid_gaokao_bench_question_types = [ diff --git a/opencompass/datasets/IFEval/ifeval.py b/opencompass/datasets/IFEval/ifeval.py index d9248025..521e5a8d 100644 --- a/opencompass/datasets/IFEval/ifeval.py +++ b/opencompass/datasets/IFEval/ifeval.py @@ -4,6 +4,7 @@ from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .evaluation_main import (InputExample, test_instruction_following_loose, @@ -15,6 +16,7 @@ class IFEvalDataset(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) datasets = [] with open(path, 'r', encoding='utf-8') as file: for line in file: diff --git a/opencompass/datasets/MMLUArabic.py b/opencompass/datasets/MMLUArabic.py index 3ee72a97..7b7de0c9 100644 --- a/opencompass/datasets/MMLUArabic.py +++ b/opencompass/datasets/MMLUArabic.py @@ -4,6 +4,7 @@ import os.path as osp from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -13,6 +14,7 @@ class MMLUArabicDataset(BaseDataset): @staticmethod def load(path: str, name: str): + path = get_data_path(path, local_mode=True) dataset = DatasetDict() for split in ['dev', 'test']: raw_data = [] diff --git a/opencompass/datasets/NPHardEval/cmp_GCP_D.py b/opencompass/datasets/NPHardEval/cmp_GCP_D.py index b8c5a6ad..5090fb50 100644 --- a/opencompass/datasets/NPHardEval/cmp_GCP_D.py +++ b/opencompass/datasets/NPHardEval/cmp_GCP_D.py @@ -9,6 +9,7 @@ from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, 
LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .prompts import gcp_dPrompts @@ -31,10 +32,11 @@ def q2text(q, p=gcp_dPrompts): @LOAD_DATASET.register_module(force=True) -class cmp_GCP_D_Dataset(BaseDataset): +class CMP_GCP_D_Dataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) raw_data = [] data_path = path all_data = [] @@ -55,7 +57,7 @@ class cmp_GCP_D_Dataset(BaseDataset): @ICL_EVALUATORS.register_module(force=True) -class cmp_GCP_D_Evaluator(BaseEvaluator): +class CMP_GCP_D_Evaluator(BaseEvaluator): def score(self, predictions, references): assert len(predictions) == len(references) diff --git a/opencompass/datasets/NPHardEval/cmp_KSP.py b/opencompass/datasets/NPHardEval/cmp_KSP.py index 1412376e..d1277a79 100644 --- a/opencompass/datasets/NPHardEval/cmp_KSP.py +++ b/opencompass/datasets/NPHardEval/cmp_KSP.py @@ -5,6 +5,7 @@ from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .prompts import kspPrompts @@ -25,10 +26,11 @@ def q2text(q, p=kspPrompts): @LOAD_DATASET.register_module(force=True) -class cmp_KSP_Dataset(BaseDataset): +class CMP_KSP_Dataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) raw_data = [] data_path = path all_data = [] @@ -49,7 +51,7 @@ class cmp_KSP_Dataset(BaseDataset): @ICL_EVALUATORS.register_module(force=True) -class cmp_KSP_Evaluator(BaseEvaluator): +class CMP_KSP_Evaluator(BaseEvaluator): def score(self, predictions, references): assert len(predictions) == len(references) diff --git a/opencompass/datasets/NPHardEval/cmp_TSP_D.py b/opencompass/datasets/NPHardEval/cmp_TSP_D.py index c56df0f0..bff15260 100644 --- a/opencompass/datasets/NPHardEval/cmp_TSP_D.py +++ b/opencompass/datasets/NPHardEval/cmp_TSP_D.py @@ 
-11,6 +11,7 @@ from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .prompts import tsp_dPrompts @@ -33,10 +34,11 @@ def q2text(adj_matrix, distance_limit, p=tsp_dPrompts): @LOAD_DATASET.register_module(force=True) -class cmp_TSP_D_Dataset(BaseDataset): +class CMP_TSP_D_Dataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) raw_data = [] data_path = path all_data = [] @@ -62,7 +64,7 @@ class cmp_TSP_D_Dataset(BaseDataset): @ICL_EVALUATORS.register_module(force=True) -class cmp_TSP_D_Evaluator(BaseEvaluator): +class CMP_TSP_D_Evaluator(BaseEvaluator): def score(self, predictions, references): assert len(predictions) == len(references) diff --git a/opencompass/datasets/NPHardEval/hard_GCP.py b/opencompass/datasets/NPHardEval/hard_GCP.py index a66577ad..24cc6fc5 100644 --- a/opencompass/datasets/NPHardEval/hard_GCP.py +++ b/opencompass/datasets/NPHardEval/hard_GCP.py @@ -5,6 +5,7 @@ from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .prompts import gcpPrompts @@ -28,10 +29,11 @@ def q2text(q, p=gcpPrompts): # q is the data for the HP-hard question, p is the @LOAD_DATASET.register_module(force=True) -class hard_GCP_Dataset(BaseDataset): +class HardGCPDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) raw_data = [] data_path = path all_data = [] @@ -52,7 +54,7 @@ class hard_GCP_Dataset(BaseDataset): @ICL_EVALUATORS.register_module(force=True) -class hard_GCP_Evaluator(BaseEvaluator): +class HardGCPEvaluator(BaseEvaluator): def score(self, predictions, references): assert len(predictions) == len(references) diff --git 
a/opencompass/datasets/NPHardEval/hard_MSP.py b/opencompass/datasets/NPHardEval/hard_MSP.py index 31e22343..7c4f4db3 100644 --- a/opencompass/datasets/NPHardEval/hard_MSP.py +++ b/opencompass/datasets/NPHardEval/hard_MSP.py @@ -6,6 +6,7 @@ from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .prompts import mspPrompts @@ -31,10 +32,11 @@ def q2text(q, p=mspPrompts): # q is the data for the HP-hard question, p is the @LOAD_DATASET.register_module(force=True) -class hard_MSP_Dataset(BaseDataset): +class Hard_MSP_Dataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) raw_data = [] data_path = path all_data = [] @@ -54,7 +56,7 @@ class hard_MSP_Dataset(BaseDataset): @ICL_EVALUATORS.register_module(force=True) -class hard_MSP_Evaluator(BaseEvaluator): +class Hard_MSP_Evaluator(BaseEvaluator): def score(self, predictions, references): assert len(predictions) == len(references) diff --git a/opencompass/datasets/NPHardEval/hard_TSP.py b/opencompass/datasets/NPHardEval/hard_TSP.py index 37bfe36a..fd74a17a 100644 --- a/opencompass/datasets/NPHardEval/hard_TSP.py +++ b/opencompass/datasets/NPHardEval/hard_TSP.py @@ -8,6 +8,7 @@ from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .prompts import tspPrompts @@ -29,10 +30,11 @@ def q2text(q, p=tspPrompts): # q is the data for the HP-hard question, p is the @LOAD_DATASET.register_module(force=True) -class hard_TSP_Dataset(BaseDataset): +class Hard_TSP_Dataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) raw_data = [] data_path = path all_data = [] @@ -56,7 +58,7 @@ class 
hard_TSP_Dataset(BaseDataset): @ICL_EVALUATORS.register_module(force=True) -class hard_TSP_Evaluator(BaseEvaluator): +class Hard_TSP_Evaluator(BaseEvaluator): def score(self, predictions, references): assert len(predictions) == len(references) diff --git a/opencompass/datasets/NPHardEval/p_BSP.py b/opencompass/datasets/NPHardEval/p_BSP.py index 2331ca50..f59c6c5d 100644 --- a/opencompass/datasets/NPHardEval/p_BSP.py +++ b/opencompass/datasets/NPHardEval/p_BSP.py @@ -5,6 +5,7 @@ from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .prompts import bspPrompts @@ -24,10 +25,11 @@ def q2text(q, p=bspPrompts): @LOAD_DATASET.register_module(force=True) -class p_BSP_Dataset(BaseDataset): +class P_BSP_Dataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) raw_data = [] data_path = path all_data, newdata = [], [] @@ -49,7 +51,7 @@ class p_BSP_Dataset(BaseDataset): @ICL_EVALUATORS.register_module(force=True) -class p_BSP_Evaluator(BaseEvaluator): +class P_BSP_Evaluator(BaseEvaluator): def score(self, predictions, references): assert len(predictions) == len(references) diff --git a/opencompass/datasets/NPHardEval/p_EDP.py b/opencompass/datasets/NPHardEval/p_EDP.py index 1085b113..f94116aa 100644 --- a/opencompass/datasets/NPHardEval/p_EDP.py +++ b/opencompass/datasets/NPHardEval/p_EDP.py @@ -5,6 +5,7 @@ from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .prompts import edpPrompts @@ -21,10 +22,11 @@ def q2text(q, p=edpPrompts): @LOAD_DATASET.register_module(force=True) -class p_EDP_Dataset(BaseDataset): +class P_EDP_Dataset(BaseDataset): @staticmethod def load(path: str): + 
path = get_data_path(path, local_mode=True) raw_data = [] data_path = path all_data = [] @@ -46,7 +48,7 @@ class p_EDP_Dataset(BaseDataset): @ICL_EVALUATORS.register_module(force=True) -class p_EDP_Evaluator(BaseEvaluator): +class P_EDP_Evaluator(BaseEvaluator): def score(self, predictions, references): assert len(predictions) == len(references) diff --git a/opencompass/datasets/NPHardEval/p_SPP.py b/opencompass/datasets/NPHardEval/p_SPP.py index 6ade3d82..d9b073e9 100644 --- a/opencompass/datasets/NPHardEval/p_SPP.py +++ b/opencompass/datasets/NPHardEval/p_SPP.py @@ -10,6 +10,7 @@ from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .prompts import sppPrompts @@ -34,10 +35,11 @@ def q2text(q, p=sppPrompts): @LOAD_DATASET.register_module(force=True) -class p_SPP_Dataset(BaseDataset): +class P_SPP_Dataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) raw_data = [] data_path = path all_data = [] @@ -56,7 +58,7 @@ class p_SPP_Dataset(BaseDataset): @ICL_EVALUATORS.register_module(force=True) -class p_SPP_Evaluator(BaseEvaluator): +class P_SPP_Evaluator(BaseEvaluator): def score(self, predictions, references): assert len(predictions) == len(references) diff --git a/opencompass/datasets/OpenFinData.py b/opencompass/datasets/OpenFinData.py index 7320eaf0..4777b17f 100644 --- a/opencompass/datasets/OpenFinData.py +++ b/opencompass/datasets/OpenFinData.py @@ -5,6 +5,7 @@ from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -14,6 +15,7 @@ class OpenFinDataDataset(BaseDataset): @staticmethod def load(path: str, name: str): + path = get_data_path(path, local_mode=True) with 
open(osp.join(path, f'{name}.json'), 'r') as f: data = json.load(f) return Dataset.from_list(data) diff --git a/opencompass/datasets/QuALITY.py b/opencompass/datasets/QuALITY.py index 23cc550b..a9f53435 100644 --- a/opencompass/datasets/QuALITY.py +++ b/opencompass/datasets/QuALITY.py @@ -4,6 +4,7 @@ from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -13,6 +14,7 @@ class QuALITYDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) dataset_list = [] with open(path, 'r', encoding='utf-8') as f: for line in f: diff --git a/opencompass/datasets/TheoremQA/legacy.py b/opencompass/datasets/TheoremQA/legacy.py index 4d30b779..5a4b2563 100644 --- a/opencompass/datasets/TheoremQA/legacy.py +++ b/opencompass/datasets/TheoremQA/legacy.py @@ -3,6 +3,7 @@ import re from datasets import load_dataset from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -12,6 +13,7 @@ class TheoremQADataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) return load_dataset('csv', data_files={'test': path}) diff --git a/opencompass/datasets/TheoremQA/main.py b/opencompass/datasets/TheoremQA/main.py index 9e574ddd..4500d09d 100644 --- a/opencompass/datasets/TheoremQA/main.py +++ b/opencompass/datasets/TheoremQA/main.py @@ -4,6 +4,7 @@ import json from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS, ICL_EVALUATORS +from opencompass.utils import get_data_path from opencompass.openicl.icl_evaluator import BaseEvaluator from ..base import BaseDataset @@ -16,6 +17,7 @@ class TheoremQADatasetV3(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) with open(path, 
'r') as f: data = json.load(f) for item in data: diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 936ba45f..e8bb6a2e 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -50,6 +50,7 @@ from .gsm_hard import * # noqa: F401, F403 from .hellaswag import * # noqa: F401, F403 from .huggingface import * # noqa: F401, F403 from .humaneval import * # noqa: F401, F403 +from .humaneval_multi import * # noqa: F401, F403 from .humanevalx import * # noqa: F401, F403 from .hungarian_math import * # noqa: F401, F403 from .IFEval.ifeval import IFEvalDataset, IFEvaluator # noqa: F401, F403 diff --git a/opencompass/datasets/advglue.py b/opencompass/datasets/advglue.py index 43303ba4..db884d0b 100644 --- a/opencompass/datasets/advglue.py +++ b/opencompass/datasets/advglue.py @@ -4,6 +4,7 @@ from typing import List, Union from datasets import Dataset, concatenate_datasets from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.utils import get_data_path from .base import BaseDataset @@ -51,6 +52,7 @@ class AdvDataset(BaseDataset): def load(self, path): """Load dataset and aug with original dataset.""" + path = get_data_path(path) with open(path, 'r') as f: raw_data = json.load(f) subset = raw_data[self.subset] diff --git a/opencompass/datasets/afqmcd.py b/opencompass/datasets/afqmcd.py index f23ae6c3..bb50bede 100644 --- a/opencompass/datasets/afqmcd.py +++ b/opencompass/datasets/afqmcd.py @@ -1,21 +1,33 @@ import json +from os import environ from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils.datasets import get_data_path from .base import BaseDataset @LOAD_DATASET.register_module() -class AFQMCDataset_V2(BaseDataset): +class AFQMCDatasetV2(BaseDataset): @staticmethod - def load(path): - data = [] - with open(path, 'r', encoding='utf-8') as f: - for line in f: - line = json.loads(line) - line['label'] = 'AB'[int(line['label'])] + def load(path, 
local_mode=False): + path = get_data_path(path, local_mode=local_mode) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, split='dev') + data = [] + for line in ms_dataset: + row = line + row['label'] = 'AB'[int(line['label'])] data.append(line) + else: + data = [] + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line) + line['label'] = 'AB'[int(line['label'])] + data.append(line) return Dataset.from_list(data) diff --git a/opencompass/datasets/agieval/agieval.py b/opencompass/datasets/agieval/agieval.py index e10a17cc..40db5845 100644 --- a/opencompass/datasets/agieval/agieval.py +++ b/opencompass/datasets/agieval/agieval.py @@ -1,10 +1,12 @@ import json import os.path as osp +from os import environ from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .math_equivalence import is_equiv @@ -16,6 +18,7 @@ class AGIEvalDataset(BaseDataset): @staticmethod def load(path: str, name: str, setting_name: str): + path = get_data_path(path) from .dataset_loader import load_dataset, load_dataset_as_result_schema assert setting_name in 'zero-shot', 'only support zero-shot setting' @@ -37,25 +40,48 @@ class AGIEvalDataset_v2(BaseDataset): @staticmethod def load(path: str, name: str, setting_name: str): + path = get_data_path(path) assert setting_name in 'zero-shot', 'only support zero-shot setting' - filename = osp.join(path, name + '.jsonl') - with open(filename, encoding='utf-8') as f: - data = [json.loads(line.strip()) for line in f] - dataset = [] - for item in data: - passage = item['passage'] if item['passage'] else '' - question = passage + item['question'] - options = '\n'.join(item['options']) if item['options'] else '' - if item['label']: - if isinstance(item['label'], list): - label = 
''.join(item['label']) + + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, subset_name=name, split='test') + dataset = [] + for item in ms_dataset: + passage = item['passage'] if item['passage'] else '' + question = passage + item['question'] + options = '\n'.join(item['options']) if item['options'] else '' + if item['label']: + try: + label = eval(item['label']) + except Exception: + label = item['label'] + if isinstance(label, list): + label = ''.join(label) else: - label = item['label'] - else: - label = item['answer'] - d = {'question': question, 'options': options, 'label': label} - dataset.append(d) - dataset = Dataset.from_list(dataset) + label = item['answer'] + d = {'question': question, 'options': options, 'label': label} + dataset.append(d) + dataset = Dataset.from_list(dataset) + else: + filename = osp.join(path, name + '.jsonl') + with open(filename, encoding='utf-8') as f: + data = [json.loads(line.strip()) for line in f] + dataset = [] + for item in data: + passage = item['passage'] if item['passage'] else '' + question = passage + item['question'] + options = '\n'.join(item['options']) if item['options'] else '' + if item['label']: + if isinstance(item['label'], list): + label = ''.join(item['label']) + else: + label = item['label'] + else: + label = item['answer'] + d = {'question': question, 'options': options, 'label': label} + dataset.append(d) + dataset = Dataset.from_list(dataset) return dataset diff --git a/opencompass/datasets/agieval/dataset_loader.py b/opencompass/datasets/agieval/dataset_loader.py index dda23238..75d90599 100644 --- a/opencompass/datasets/agieval/dataset_loader.py +++ b/opencompass/datasets/agieval/dataset_loader.py @@ -2,6 +2,7 @@ import ast import json import os +from os import environ import pandas as pd import tiktoken @@ -279,8 +280,15 @@ def load_dataset(dataset_name, end_of_example='\n', chat_mode=False, verbose=False): - test_path = 
os.path.join(parent_path, dataset_name + '.jsonl') - loaded_jsonl = read_jsonl(test_path) + + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + loaded_jsonl = MsDataset.load(parent_path, + subset_name=dataset_name, + split='test') + else: + test_path = os.path.join(parent_path, dataset_name + '.jsonl') + loaded_jsonl = read_jsonl(test_path) processed = [] if setting_name == 'few-shot-CoT' or setting_name == 'few-shot': # process demo once if it is few-shot-CoT @@ -356,8 +364,15 @@ def generate_second_stage_input(dataset_name, def load_dataset_as_result_schema(dataset_name, parent_path): - test_path = os.path.join(parent_path, dataset_name + '.jsonl') - loaded_jsonl = read_jsonl(test_path) + + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + loaded_jsonl = MsDataset.load(parent_path, + subset_name=dataset_name, + split='test') + else: + test_path = os.path.join(parent_path, dataset_name + '.jsonl') + loaded_jsonl = read_jsonl(test_path) processed = [] for i, line in enumerate(loaded_jsonl): diff --git a/opencompass/datasets/arc.py b/opencompass/datasets/arc.py index 9b512114..97b4dafc 100644 --- a/opencompass/datasets/arc.py +++ b/opencompass/datasets/arc.py @@ -1,9 +1,11 @@ import json import os.path as osp +from os import environ from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -12,25 +14,51 @@ from .base import BaseDataset class ARCDataset(BaseDataset): @staticmethod - def load(path: str): - with open(path, 'r', errors='ignore') as in_f: + def load(path: str, name: str): + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(path, + split='validation', + subset_name=name) rows = [] - for line in in_f: - item = json.loads(line.strip()) - question = item['question'] - if len(question['choices']) != 4: + 
for row in dataset: + answerKey = row['answerKey'] + question = row['question'] + choices = row['choices'] + if len(choices['text']) != 4: continue - labels = [c['label'] for c in question['choices']] - answerKey = 'ABCD'[labels.index(item['answerKey'])] + labels = row['choices']['label'] + answerKey = 'ABCD'[labels.index(answerKey)] + rows.append({ - 'question': question['stem'], + 'question': question, 'answerKey': answerKey, - 'textA': question['choices'][0]['text'], - 'textB': question['choices'][1]['text'], - 'textC': question['choices'][2]['text'], - 'textD': question['choices'][3]['text'], + 'textA': choices['text'][0], + 'textB': choices['text'][1], + 'textC': choices['text'][2], + 'textD': choices['text'][3], }) - return Dataset.from_list(rows) + else: + with open(path, 'r', errors='ignore') as in_f: + rows = [] + for line in in_f: + item = json.loads(line.strip()) + question = item['question'] + if len(question['choices']) != 4: + continue + labels = [c['label'] for c in question['choices']] + answerKey = 'ABCD'[labels.index(item['answerKey'])] + rows.append({ + 'question': question['stem'], + 'answerKey': answerKey, + 'textA': question['choices'][0]['text'], + 'textB': question['choices'][1]['text'], + 'textC': question['choices'][2]['text'], + 'textD': question['choices'][3]['text'], + }) + dataset = Dataset.from_list(rows) + return dataset class ARCDatasetClean(BaseDataset): @@ -42,43 +70,80 @@ class ARCDatasetClean(BaseDataset): import requests assert split == 'test', 'We only have test set annotation for ARC' - annotation_cache_path = osp.join( - path, f'ARC_c_{split}_contamination_annotations.json') + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope.utils.config_ds import MS_DATASETS_CACHE + annotation_cache_path = osp.join( + MS_DATASETS_CACHE, + f'ARC_c_{split}_contamination_annotations.json') + link_of_annotations = 'https://modelscope.cn/datasets/opencompass/Contamination_Detector/resolve/master/ARC_annotations.json' # noqa + 
else: + annotation_cache_path = osp.join( + path, f'ARC_c_{split}_contamination_annotations.json') + link_of_annotations = 'https://github.com/liyucheng09/Contamination_Detector/releases/download/v0.1.1rc/ARC_annotations.json' # noqa + if osp.exists(annotation_cache_path): with open(annotation_cache_path, 'r') as f: annotations = json.load(f) return annotations - link_of_annotations = 'https://github.com/liyucheng09/Contamination_Detector/releases/download/v0.1.1rc/ARC_annotations.json' # noqa + annotations = json.loads(requests.get(link_of_annotations).text) with open(annotation_cache_path, 'w') as f: json.dump(annotations, f) return annotations @staticmethod - def load(path: str): + def load(path: str, name: str): + path = get_data_path(path) annotations = ARCDatasetClean.load_contamination_annotations( osp.dirname(path), 'test') - with open(path, 'r', errors='ignore') as in_f: + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(path, split='test', subset_name=name) rows = [] - for line in in_f: - item = json.loads(line.strip()) - id_ = item['id'] - question = item['question'] + for row in dataset: + answerKey = row['answerKey'] + question = row['question'] + choices = row['choices'] + if len(choices['text']) != 4: + continue + labels = row['choices']['label'] + answerKey = 'ABCD'[labels.index(answerKey)] + id_ = row['id'] if id_ in annotations: is_clean = annotations[id_][0] else: is_clean = 'not labeled' - if len(question['choices']) != 4: - continue - labels = [c['label'] for c in question['choices']] - answerKey = 'ABCD'[labels.index(item['answerKey'])] rows.append({ - 'question': question['stem'], + 'question': question, 'answerKey': answerKey, - 'textA': question['choices'][0]['text'], - 'textB': question['choices'][1]['text'], - 'textC': question['choices'][2]['text'], - 'textD': question['choices'][3]['text'], + 'textA': choices['text'][0], + 'textB': choices['text'][1], + 'textC': 
choices['text'][2], + 'textD': choices['text'][3], 'is_clean': is_clean, }) - return Dataset.from_list(rows) + else: + with open(path, 'r', errors='ignore') as in_f: + rows = [] + for line in in_f: + item = json.loads(line.strip()) + id_ = item['id'] + question = item['question'] + if id_ in annotations: + is_clean = annotations[id_][0] + else: + is_clean = 'not labeled' + if len(question['choices']) != 4: + continue + labels = [c['label'] for c in question['choices']] + answerKey = 'ABCD'[labels.index(item['answerKey'])] + rows.append({ + 'question': question['stem'], + 'answerKey': answerKey, + 'textA': question['choices'][0]['text'], + 'textB': question['choices'][1]['text'], + 'textC': question['choices'][2]['text'], + 'textD': question['choices'][3]['text'], + 'is_clean': is_clean, + }) + return Dataset.from_list(rows) diff --git a/opencompass/datasets/ax.py b/opencompass/datasets/ax.py index 38656864..0dc542ed 100644 --- a/opencompass/datasets/ax.py +++ b/opencompass/datasets/ax.py @@ -1,24 +1,37 @@ import json +from os import environ from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @LOAD_DATASET.register_module() -class AXDataset_V2(BaseDataset): +class AXDatasetV2(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path) dataset = [] - with open(path, 'r') as f: - for line in f: - line = json.loads(line) - line['label'] = { - 'entailment': 'A', - 'not_entailment': 'B' - }[line['label']] - dataset.append(line) - return Dataset.from_list(dataset) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load('opencompass/super_glue', + subset_name='axb')['test'] + for data in ms_dataset: + row = data + row['label'] = {0: 'A', 1: 'B'}[data['label']] + dataset.append(row) + else: + with open(path, 'r') as f: + for line in f: + line = json.loads(line) + line['label'] = { + 'entailment': 
'A', + 'not_entailment': 'B' + }[line['label']] + dataset.append(line) + dataset = Dataset.from_list(dataset) + return dataset diff --git a/opencompass/datasets/bbh.py b/opencompass/datasets/bbh.py index 38f3de39..7950056f 100644 --- a/opencompass/datasets/bbh.py +++ b/opencompass/datasets/bbh.py @@ -1,12 +1,14 @@ import json import os.path as osp import re +from os import environ from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET, TEXT_POSTPROCESSORS) +from opencompass.utils import get_data_path from .base import BaseDataset @@ -16,9 +18,14 @@ class BBHDataset(BaseDataset): @staticmethod def load(path: str, name: str): - with open(osp.join(path, f'{name}.json'), 'r') as f: - data = json.load(f)['examples'] - dataset = Dataset.from_list(data) + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(path, subset_name=name, split='test') + else: + with open(osp.join(path, f'{name}.json'), 'r') as f: + data = json.load(f)['examples'] + dataset = Dataset.from_list(data) return dataset diff --git a/opencompass/datasets/boolq.py b/opencompass/datasets/boolq.py index 860d0067..318b13da 100644 --- a/opencompass/datasets/boolq.py +++ b/opencompass/datasets/boolq.py @@ -3,6 +3,7 @@ import json from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -26,10 +27,11 @@ class BoolQDataset(BaseDataset): @LOAD_DATASET.register_module() -class BoolQDataset_V2(BaseDataset): +class BoolQDatasetV2(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) dataset = [] with open(path, 'r') as f: for line in f: @@ -40,10 +42,11 @@ class BoolQDataset_V2(BaseDataset): @LOAD_DATASET.register_module() -class BoolQDataset_V3(BaseDataset): +class 
BoolQDatasetV3(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) dataset = [] with open(path, 'r') as f: for line in f: diff --git a/opencompass/datasets/cb.py b/opencompass/datasets/cb.py index 3027183d..e5da6970 100644 --- a/opencompass/datasets/cb.py +++ b/opencompass/datasets/cb.py @@ -3,15 +3,17 @@ import json from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @LOAD_DATASET.register_module() -class CBDataset_V2(BaseDataset): +class CBDatasetV2(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) dataset = [] with open(path, 'r') as f: for line in f: diff --git a/opencompass/datasets/ceval.py b/opencompass/datasets/ceval.py index 2ac43796..2ae6d8a2 100644 --- a/opencompass/datasets/ceval.py +++ b/opencompass/datasets/ceval.py @@ -1,10 +1,12 @@ import csv import json import os.path as osp +from os import environ from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -14,19 +16,28 @@ class CEvalDataset(BaseDataset): @staticmethod def load(path: str, name: str): + path = get_data_path(path) dataset = {} - for split in ['dev', 'val', 'test']: - filename = osp.join(path, split, f'{name}_{split}.csv') - with open(filename, encoding='utf-8') as f: - reader = csv.reader(f) - header = next(reader) - for row in reader: - item = dict(zip(header, row)) - item.setdefault('explanation', '') - item.setdefault('answer', '') - dataset.setdefault(split, []).append(item) - dataset = {i: Dataset.from_list(dataset[i]) for i in dataset} - return DatasetDict(dataset) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(dataset_name=path, + subset_name=name, + trust_remote_code=True) + else: + for split in ['dev', 'val', 'test']: + filename = 
osp.join(path, split, f'{name}_{split}.csv') + with open(filename, encoding='utf-8') as f: + reader = csv.reader(f) + header = next(reader) + for row in reader: + item = dict(zip(header, row)) + item.setdefault('explanation', '') + item.setdefault('answer', '') + dataset.setdefault(split, []).append(item) + dataset = DatasetDict( + {i: Dataset.from_list(dataset[i]) + for i in dataset}) + return dataset class CEvalDatasetClean(BaseDataset): @@ -38,13 +49,20 @@ class CEvalDatasetClean(BaseDataset): import requests assert split == 'val', 'Now we only have annotations for val set' - annotation_cache_path = osp.join( - path, split, 'ceval_contamination_annotations.json') + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope.utils.config_ds import MS_DATASETS_CACHE + annotation_cache_path = osp.join( + MS_DATASETS_CACHE, 'ceval_contamination_annotations.json') + link_of_annotations = 'https://modelscope.cn/datasets/opencompass/Contamination_Detector/resolve/master/ceval_annotations.json' # noqa + else: + annotation_cache_path = osp.join( + path, split, 'ceval_contamination_annotations.json') + link_of_annotations = 'https://github.com/liyucheng09/Contamination_Detector/releases/download/v0.1.1rc/ceval_annotations.json' # noqa + if osp.exists(annotation_cache_path): with open(annotation_cache_path, 'r') as f: annotations = json.load(f) return annotations - link_of_annotations = 'https://github.com/liyucheng09/Contamination_Detector/releases/download/v0.1.1rc/ceval_annotations.json' # noqa annotations = json.loads(requests.get(link_of_annotations).text) with open(annotation_cache_path, 'w') as f: json.dump(annotations, f) @@ -52,25 +70,48 @@ class CEvalDatasetClean(BaseDataset): @staticmethod def load(path: str, name: str): + path = get_data_path(path) dataset = {} - for split in ['dev', 'val', 'test']: - if split == 'val': - annotations = CEvalDatasetClean.load_contamination_annotations( - path, split) - filename = osp.join(path, split, 
f'{name}_{split}.csv') - with open(filename, encoding='utf-8') as f: - reader = csv.reader(f) - header = next(reader) - for row_index, row in enumerate(reader): - item = dict(zip(header, row)) - item.setdefault('explanation', '') - item.setdefault('answer', '') - if split == 'val': - row_id = f'{name}-{row_index}' - if row_id in annotations: - item['is_clean'] = annotations[row_id][0] - else: - item['is_clean'] = 'not labeled' - dataset.setdefault(split, []).append(item) - dataset = {i: Dataset.from_list(dataset[i]) for i in dataset} - return DatasetDict(dataset) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(dataset_name=path, subset_name=name) + # 向该数据添加 'is_clean' 字段 + annotations = CEvalDatasetClean.load_contamination_annotations( + path, 'val') + val = dataset['val'] + val_data = [] + for index in range(val.num_rows): + row = val[index] + row_id = f'{name}-{index}' + row.update({ + 'is_clean': + annotations[row_id][0] + if row_id in annotations else 'not labeled' + }) + val_data.append(row) + dataset['val'] = Dataset.from_list(val_data) + else: + for split in ['dev', 'val', 'test']: + if split == 'val': + annotations = \ + CEvalDatasetClean.load_contamination_annotations( + path, split) + filename = osp.join(path, split, f'{name}_{split}.csv') + with open(filename, encoding='utf-8') as f: + reader = csv.reader(f) + header = next(reader) + for row_index, row in enumerate(reader): + item = dict(zip(header, row)) + item.setdefault('explanation', '') + item.setdefault('answer', '') + if split == 'val': + row_id = f'{name}-{row_index}' + if row_id in annotations: + item['is_clean'] = annotations[row_id][0] + else: + item['is_clean'] = 'not labeled' + dataset.setdefault(split, []).append(item) + dataset = DatasetDict( + {i: Dataset.from_list(dataset[i]) + for i in dataset}) + return dataset diff --git a/opencompass/datasets/chid.py b/opencompass/datasets/chid.py index a7a4ae5c..26fe9e2e 100644 --- 
a/opencompass/datasets/chid.py +++ b/opencompass/datasets/chid.py @@ -3,6 +3,7 @@ import json from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -13,6 +14,9 @@ class CHIDDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) def preprocess(example): @@ -26,10 +30,11 @@ class CHIDDataset(BaseDataset): @LOAD_DATASET.register_module() -class CHIDDataset_V2(BaseDataset): +class CHIDDatasetV2(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) data = [] with open(path, 'r', encoding='utf-8') as f: for line in f: diff --git a/opencompass/datasets/circular.py b/opencompass/datasets/circular.py index 13025bb8..282c6331 100644 --- a/opencompass/datasets/circular.py +++ b/opencompass/datasets/circular.py @@ -10,12 +10,12 @@ from .arc import ARCDataset from .ceval import CEvalDataset from .cmmlu import CMMLUDataset from .commonsenseqa import commonsenseqaDataset -from .hellaswag import hellaswagDataset_V2 +from .hellaswag import HellaswagDataset_V2 from .mmlu import MMLUDataset from .obqa import OBQADataset -from .piqa import piqaDataset_V2 +from .piqa import PIQADatasetV2 from .race import RaceDataset -from .siqa import siqaDataset_V3 +from .siqa import SiqaDatasetV3 from .xiezhi import XiezhiDataset @@ -247,8 +247,8 @@ class CircularARCDataset(ARCDataset, metaclass=CircularDatasetMeta): return item -class CircularHSWAGDataset(hellaswagDataset_V2, metaclass=CircularDatasetMeta): - dataset_class = hellaswagDataset_V2 +class CircularHSWAGDataset(HellaswagDataset_V2, metaclass=CircularDatasetMeta): + dataset_class = HellaswagDataset_V2 default_circular_splits = None default_option_keys = ['A', 'B', 'C', 'D'] default_answer_key = 'label' @@ -275,15 +275,15 @@ class 
CircularXiezhiDataset(XiezhiDataset, metaclass=CircularDatasetMeta): default_answer_key = 'answer' -class CircularsiqaDataset(siqaDataset_V3, metaclass=CircularDatasetMeta): - dataset_class = siqaDataset_V3 +class CircularsiqaDataset(SiqaDatasetV3, metaclass=CircularDatasetMeta): + dataset_class = SiqaDatasetV3 default_circular_splits = ['validation'] default_option_keys = ['A', 'B', 'C'] default_answer_key = 'answer' -class CircularpiqaDataset(piqaDataset_V2, metaclass=CircularDatasetMeta): - dataset_class = piqaDataset_V2 +class CircularPIQADataset(PIQADatasetV2, metaclass=CircularDatasetMeta): + dataset_class = PIQADatasetV2 default_circular_splits = ['validation'] default_option_keys = ['sol1', 'sol2'] diff --git a/opencompass/datasets/clozeTest_maxmin.py b/opencompass/datasets/clozeTest_maxmin.py index 93288bf2..51888ec1 100644 --- a/opencompass/datasets/clozeTest_maxmin.py +++ b/opencompass/datasets/clozeTest_maxmin.py @@ -3,6 +3,7 @@ import json from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -12,7 +13,9 @@ class MaxminDataset(BaseDataset): @staticmethod def load(test_path, answer_path=None): + test_path = get_data_path(test_path) if answer_path is not None: + answer_path = get_data_path(answer_path) with open(answer_path, 'r', encoding='utf-8') as answer_f: answers = {} for line in answer_f.readlines(): diff --git a/opencompass/datasets/cluewsc.py b/opencompass/datasets/cluewsc.py index 8f62b344..a3c4d847 100644 --- a/opencompass/datasets/cluewsc.py +++ b/opencompass/datasets/cluewsc.py @@ -3,6 +3,7 @@ import json from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -13,6 +14,9 @@ class CluewscDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + 
local_mode=True) dataset = load_dataset(**kwargs) def preprocess(example): @@ -36,10 +40,11 @@ class CluewscDataset(BaseDataset): @LOAD_DATASET.register_module() -class CluewscDataset_V2(BaseDataset): +class CluewscDatasetV2(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) data = [] with open(path, 'r', encoding='utf-8') as f: for line in f: diff --git a/opencompass/datasets/cmb.py b/opencompass/datasets/cmb.py index d09f0f8b..da854556 100644 --- a/opencompass/datasets/cmb.py +++ b/opencompass/datasets/cmb.py @@ -4,6 +4,7 @@ import os.path as osp from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -13,6 +14,7 @@ class CMBDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) with open(osp.join(path, 'val.json'), 'r', encoding='utf-8') as f: val_data = json.load(f) for d in val_data: diff --git a/opencompass/datasets/cmmlu.py b/opencompass/datasets/cmmlu.py index 634cc929..72341e78 100644 --- a/opencompass/datasets/cmmlu.py +++ b/opencompass/datasets/cmmlu.py @@ -1,9 +1,11 @@ import csv import os.path as osp +from os import environ from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -13,22 +15,43 @@ class CMMLUDataset(BaseDataset): @staticmethod def load(path: str, name: str): - dataset = DatasetDict() - for split in ['dev', 'test']: - raw_data = [] - filename = osp.join(path, split, f'{name}.csv') - with open(filename, encoding='utf-8') as f: - reader = csv.reader(f) - _ = next(reader) # skip the header - for row in reader: - assert len(row) == 7 + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(path, + subset_name=name, + trust_remote_code=True) + 
modified_dataset = DatasetDict() + for split in dataset.keys(): + raw_data = [] + for data in dataset[split]: raw_data.append({ - 'question': row[1], - 'A': row[2], - 'B': row[3], - 'C': row[4], - 'D': row[5], - 'answer': row[6], + 'question': data['Question'], # 修改字段 + 'A': data['A'], + 'B': data['B'], + 'C': data['C'], + 'D': data['D'], + 'answer': data['Answer'] # 修改字段 }) - dataset[split] = Dataset.from_list(raw_data) + modified_dataset[split] = Dataset.from_list(raw_data) + dataset = modified_dataset + else: + dataset = DatasetDict() + for split in ['dev', 'test']: + raw_data = [] + filename = osp.join(path, split, f'{name}.csv') + with open(filename, encoding='utf-8') as f: + reader = csv.reader(f) + _ = next(reader) # skip the header + for row in reader: + assert len(row) == 7 + raw_data.append({ + 'question': row[1], + 'A': row[2], + 'B': row[3], + 'C': row[4], + 'D': row[5], + 'answer': row[6], + }) + dataset[split] = Dataset.from_list(raw_data) return dataset diff --git a/opencompass/datasets/cmnli.py b/opencompass/datasets/cmnli.py index 2e431d76..2d043fc9 100644 --- a/opencompass/datasets/cmnli.py +++ b/opencompass/datasets/cmnli.py @@ -1,42 +1,71 @@ import json +from os import environ from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @LOAD_DATASET.register_module() -class cmnliDataset(BaseDataset): +class CMNLIDataset(BaseDataset): @staticmethod - def load(path): - data = [] - with open(path, 'r', encoding='utf-8') as f: - for line in f: - line = json.loads(line) + def load(path, local_mode: bool = False): + path = get_data_path(path, local_mode=local_mode) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, split='dev') + data = [] + for line in ms_dataset: + row = line if line['label'] == '-': continue - data.append(line) + data.append(row) + else: + data = [] + with open(path, 'r', 
encoding='utf-8') as f: + for line in f: + line = json.loads(line) + if line['label'] == '-': + continue + data.append(line) return Dataset.from_list(data) @LOAD_DATASET.register_module() -class cmnliDataset_V2(BaseDataset): +class CMNLIDatasetV2(BaseDataset): @staticmethod - def load(path): - data = [] - with open(path, 'r', encoding='utf-8') as f: - for line in f: - line = json.loads(line) + def load(path, local_mode: bool = False): + path = get_data_path(path, local_mode=local_mode) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, split='dev') + data = [] + for line in ms_dataset: + row = line if line['label'] == '-': continue - line['label'] = { + row['label'] = { 'entailment': 'A', 'contradiction': 'B', 'neutral': 'C', }[line['label']] - data.append(line) + data.append(row) + else: + data = [] + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line) + if line['label'] == '-': + continue + line['label'] = { + 'entailment': 'A', + 'contradiction': 'B', + 'neutral': 'C', + }[line['label']] + data.append(line) return Dataset.from_list(data) diff --git a/opencompass/datasets/cmrc.py b/opencompass/datasets/cmrc.py index fcb0a847..1b9e1979 100644 --- a/opencompass/datasets/cmrc.py +++ b/opencompass/datasets/cmrc.py @@ -3,6 +3,7 @@ import json from datasets import Dataset from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path from .base import BaseDataset @@ -12,6 +13,7 @@ class CMRCDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path) with open(path, 'r', encoding='utf-8') as f: data = json.load(f) # 将原始数据转换为所需的格式 diff --git a/opencompass/datasets/commonsenseqa.py b/opencompass/datasets/commonsenseqa.py index a78601da..723f5d45 100644 --- a/opencompass/datasets/commonsenseqa.py +++ b/opencompass/datasets/commonsenseqa.py @@ -1,9 +1,11 @@ import json import os +from os import 
environ from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -13,32 +15,51 @@ class commonsenseqaDataset(BaseDataset): @staticmethod def load(path): - dataset = {} - for split, stub in [ - ['train', 'train_rand_split.jsonl'], - ['validation', 'dev_rand_split.jsonl'], - ]: - data_path = os.path.join(path, stub) - dataset_list = [] - with open(data_path, 'r', encoding='utf-8') as f: - for line in f: - line = json.loads(line) + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = {} + for split in ['train', 'validation']: + ms_dataset = MsDataset.load(path, split=split) + dataset_list = [] + for line in ms_dataset: + choices = line['choices'] dataset_list.append({ - 'question': - line['question']['stem'], - 'A': - line['question']['choices'][0]['text'], - 'B': - line['question']['choices'][1]['text'], - 'C': - line['question']['choices'][2]['text'], - 'D': - line['question']['choices'][3]['text'], - 'E': - line['question']['choices'][4]['text'], - 'answerKey': - line['answerKey'], + 'question': line['question'], + 'A': choices['text'][0], + 'B': choices['text'][1], + 'C': choices['text'][2], + 'D': choices['text'][3], + 'E': choices['text'][4], + 'answerKey': line['answerKey'], }) - dataset[split] = Dataset.from_list(dataset_list) - + dataset[split] = Dataset.from_list(dataset_list) + else: + dataset = {} + for split, stub in [ + ['train', 'train_rand_split.jsonl'], + ['validation', 'dev_rand_split.jsonl'], + ]: + data_path = os.path.join(path, stub) + dataset_list = [] + with open(data_path, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line) + dataset_list.append({ + 'question': + line['question']['stem'], + 'A': + line['question']['choices'][0]['text'], + 'B': + line['question']['choices'][1]['text'], + 'C': + line['question']['choices'][2]['text'], + 'D': + 
line['question']['choices'][3]['text'], + 'E': + line['question']['choices'][4]['text'], + 'answerKey': + line['answerKey'], + }) + dataset[split] = Dataset.from_list(dataset_list) return DatasetDict(dataset) diff --git a/opencompass/datasets/commonsenseqa_cn.py b/opencompass/datasets/commonsenseqa_cn.py index d764f30e..77ed84f7 100644 --- a/opencompass/datasets/commonsenseqa_cn.py +++ b/opencompass/datasets/commonsenseqa_cn.py @@ -2,6 +2,8 @@ import json from datasets import Dataset, DatasetDict +from opencompass.utils import get_data_path + from .base import BaseDataset @@ -9,6 +11,7 @@ class CommonsenseQADataset_CN(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) datasetdict = DatasetDict() for split in ['train', 'validation']: data = [] diff --git a/opencompass/datasets/copa.py b/opencompass/datasets/copa.py index 3aaa195e..e6d04f70 100644 --- a/opencompass/datasets/copa.py +++ b/opencompass/datasets/copa.py @@ -3,15 +3,17 @@ import json from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @LOAD_DATASET.register_module() -class COPADataset_V2(BaseDataset): +class COPADatasetV2(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) dataset = [] with open(path, 'r', encoding='utf-8') as f: for line in f: diff --git a/opencompass/datasets/crowspairs.py b/opencompass/datasets/crowspairs.py index 5c095a43..44955345 100644 --- a/opencompass/datasets/crowspairs.py +++ b/opencompass/datasets/crowspairs.py @@ -10,7 +10,7 @@ from .base import BaseDataset @LOAD_DATASET.register_module() -class crowspairsDataset(BaseDataset): +class CrowspairsDataset(BaseDataset): @staticmethod def load(**kwargs): @@ -25,7 +25,7 @@ class crowspairsDataset(BaseDataset): @LOAD_DATASET.register_module() -class crowspairsDataset_V2(BaseDataset): +class CrowspairsDatasetV2(BaseDataset): @staticmethod def load(**kwargs): diff 
--git a/opencompass/datasets/crowspairs_cn.py b/opencompass/datasets/crowspairs_cn.py index fabbdf5e..bc053497 100644 --- a/opencompass/datasets/crowspairs_cn.py +++ b/opencompass/datasets/crowspairs_cn.py @@ -2,13 +2,17 @@ import json from datasets import Dataset, DatasetDict +from opencompass.utils import get_data_path + from .base import BaseDataset -class CrowspairsDataset_CN(BaseDataset): +class CrowspairsDatasetCN(BaseDataset): + """Chinese version of Crowspairs dataset.""" @staticmethod def load(path): + path = get_data_path(path, local_mode=True) data = [] with open(path, 'r') as f: for line in f: diff --git a/opencompass/datasets/csl.py b/opencompass/datasets/csl.py index 1994b44c..7d74364f 100644 --- a/opencompass/datasets/csl.py +++ b/opencompass/datasets/csl.py @@ -3,6 +3,7 @@ import json from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -13,6 +14,9 @@ class CslDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) def preprocess(example): @@ -26,10 +30,11 @@ class CslDataset(BaseDataset): @LOAD_DATASET.register_module() -class CslDataset_V2(BaseDataset): +class CslDatasetV2(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) data = [] with open(path, 'r', encoding='utf-8') as f: for line in f: diff --git a/opencompass/datasets/cvalues.py b/opencompass/datasets/cvalues.py index 672ae45e..10ecb172 100644 --- a/opencompass/datasets/cvalues.py +++ b/opencompass/datasets/cvalues.py @@ -3,6 +3,7 @@ import re from datasets import load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -12,7 +13,7 @@ class CValuesDataset(BaseDataset): @staticmethod def load(path): - + path = 
get_data_path(path, local_mode=True) dataset = load_dataset('json', data_files=path) def preprocess(example): diff --git a/opencompass/datasets/drcd.py b/opencompass/datasets/drcd.py index 66bd0ca9..47a8b17c 100644 --- a/opencompass/datasets/drcd.py +++ b/opencompass/datasets/drcd.py @@ -3,6 +3,7 @@ import json from datasets import Dataset from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path from .base import BaseDataset @@ -12,6 +13,7 @@ class DRCDDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path) with open(path, 'r', encoding='utf-8') as f: data = json.load(f) # 将原始数据转换为所需的格式 diff --git a/opencompass/datasets/drop_simple_eval.py b/opencompass/datasets/drop_simple_eval.py index 94eee156..ad2863e3 100644 --- a/opencompass/datasets/drop_simple_eval.py +++ b/opencompass/datasets/drop_simple_eval.py @@ -6,6 +6,7 @@ from datasets import Dataset, DatasetDict from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -39,6 +40,7 @@ class DropOpenAIDataset(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) dataset_list = [] with open(path, 'r', encoding='utf-8') as f: for line in f: diff --git a/opencompass/datasets/ds1000.py b/opencompass/datasets/ds1000.py index 653e087f..ed55216b 100644 --- a/opencompass/datasets/ds1000.py +++ b/opencompass/datasets/ds1000.py @@ -21,6 +21,7 @@ from datasets import Dataset, DatasetDict from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path from .base import BaseDataset @@ -99,6 +100,7 @@ class DS1000Dataset(BaseDataset): are supported. Defaults to `Insertion`. 
""" + path = get_data_path(path, local_mode=True) if isinstance(libs, str): libs = [libs] diff --git a/opencompass/datasets/eprstmt.py b/opencompass/datasets/eprstmt.py index d333b3cf..91cb73f5 100644 --- a/opencompass/datasets/eprstmt.py +++ b/opencompass/datasets/eprstmt.py @@ -3,15 +3,17 @@ import json from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @LOAD_DATASET.register_module() -class eprstmtDataset_V2(BaseDataset): +class EprstmtDatasetV2(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) data = [] with open(path, 'r', encoding='utf-8') as f: for line in f: diff --git a/opencompass/datasets/flames.py b/opencompass/datasets/flames.py index 872172e8..de2202e9 100644 --- a/opencompass/datasets/flames.py +++ b/opencompass/datasets/flames.py @@ -7,6 +7,7 @@ from typing import Optional from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .subjective.subjective_cmp import SubjectiveCmpDataset @@ -40,6 +41,7 @@ class FlamesDataset(SubjectiveCmpDataset): path: str, name: str, ): + path = get_data_path(path, local_mode=True) config = Config(path, f'{name}_config.txt') dataset = [] diff --git a/opencompass/datasets/flores.py b/opencompass/datasets/flores.py index b4d69f8d..c33cfaf3 100644 --- a/opencompass/datasets/flores.py +++ b/opencompass/datasets/flores.py @@ -1,9 +1,11 @@ import os import re +from os import environ from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path from .base import BaseDataset @@ -27,14 +29,40 @@ class FloresFirst100Dataset(BaseDataset): @staticmethod def load(path, name): + path = get_data_path(path) src_lang, tgt_lang = name.split('-') - dev_dataset = FloresFirst100Dataset.load_single( - os.path.join(path, 'dev', 
f'{src_lang}.dev'), - os.path.join(path, 'dev', f'{tgt_lang}.dev'), src_lang, tgt_lang) - devtest_dataset = FloresFirst100Dataset.load_single( - os.path.join(path, 'devtest', f'{src_lang}.devtest'), - os.path.join(path, 'devtest', f'{tgt_lang}.devtest'), src_lang, - tgt_lang) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + src_dev = MsDataset.load(path, subset_name=src_lang, split='dev') + src_devtest = MsDataset.load(path, + subset_name=src_lang, + split='devtest') + tgt_dev = MsDataset.load(path, subset_name=tgt_lang, split='dev') + tgt_devtest = MsDataset.load(path, + subset_name=tgt_lang, + split='devtest') + + dev_data_list = [{ + f'sentence_{src_lang}': src_dev[i]['sentence'], + f'sentence_{tgt_lang}': tgt_dev[i]['sentence'], + } for i in range(len(src_dev))] + devtest_data_list = [{ + f'sentence_{src_lang}': + src_devtest[i]['sentence'], + f'sentence_{tgt_lang}': + tgt_devtest[i]['sentence'], + } for i in range(len(src_devtest))] + dev_dataset = Dataset.from_list(dev_data_list) + devtest_dataset = Dataset.from_list(devtest_data_list) + else: + dev_dataset = FloresFirst100Dataset.load_single( + os.path.join(path, 'dev', f'{src_lang}.dev'), + os.path.join(path, 'dev', f'{tgt_lang}.dev'), src_lang, + tgt_lang) + devtest_dataset = FloresFirst100Dataset.load_single( + os.path.join(path, 'devtest', f'{src_lang}.devtest'), + os.path.join(path, 'devtest', f'{tgt_lang}.devtest'), src_lang, + tgt_lang) return DatasetDict({'dev': dev_dataset, 'devtest': devtest_dataset}) diff --git a/opencompass/datasets/game24.py b/opencompass/datasets/game24.py index acfb14ab..5da5ba4a 100644 --- a/opencompass/datasets/game24.py +++ b/opencompass/datasets/game24.py @@ -7,6 +7,7 @@ import pandas as pd from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.utils import get_data_path from .base import BaseDataset @@ -162,6 +163,7 @@ class Game24Dataset(BaseDataset): @staticmethod def load(path: 
str): + path = get_data_path(path, local_mode=True) data = list(pd.read_csv(path)['Puzzles']) data = [{'input': i, 'output': i} for i in data] return Dataset.from_list(data[900:905]) diff --git a/opencompass/datasets/govrepcrs.py b/opencompass/datasets/govrepcrs.py index 4356eda6..513456d6 100644 --- a/opencompass/datasets/govrepcrs.py +++ b/opencompass/datasets/govrepcrs.py @@ -1,6 +1,10 @@ +import json +import os + from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -10,8 +14,8 @@ class GovRepcrsDataset(BaseDataset): @staticmethod def load(path: str): - import json - import os + path = get_data_path(path, local_mode=True) + dataset_dict = DatasetDict() splits = ['train', 'valid', 'test'] dataset_lists = {x: [] for x in splits} diff --git a/opencompass/datasets/gpqa.py b/opencompass/datasets/gpqa.py index a4c88f37..8599a3cc 100644 --- a/opencompass/datasets/gpqa.py +++ b/opencompass/datasets/gpqa.py @@ -7,6 +7,7 @@ from datasets import Dataset from opencompass.openicl import BaseEvaluator from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path from .base import BaseDataset @@ -16,6 +17,7 @@ class GPQADataset(BaseDataset): @staticmethod def load(path: str, name: str): + path = get_data_path(path, local_mode=True) cnt = 0 data = [] with open(os.path.join(path, name), 'r', encoding='utf-8') as f: @@ -62,10 +64,12 @@ class GPQAEvaluator(BaseEvaluator): @LOAD_DATASET.register_module() -class GPQADataset_Simple_Eval(BaseDataset): +class GPQASimpleEvalDataset(BaseDataset): + """GPQA dataset compatible with simple-eval.""" @staticmethod def load(path: str, name: str): + path = get_data_path(path, local_mode=True) n_repeats = 4 data = [] with open(os.path.join(path, name), 'r', encoding='utf-8') as f: diff --git a/opencompass/datasets/gsm8k.py b/opencompass/datasets/gsm8k.py index a3baaff8..e90f47e2 100644 
--- a/opencompass/datasets/gsm8k.py +++ b/opencompass/datasets/gsm8k.py @@ -1,11 +1,13 @@ import json import os import re +from os import environ from datasets import Dataset, DatasetDict from opencompass.openicl import BaseEvaluator from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path from .base import BaseDataset @@ -15,16 +17,22 @@ class GSM8KDataset(BaseDataset): @staticmethod def load(path): - datasets = {} - for split in ['train', 'test']: - split_path = os.path.join(path, split + '.jsonl') - dataset = [] - with open(split_path, 'r', encoding='utf-8') as f: - for line in f: - line = json.loads(line.strip()) - dataset.append(line) - datasets[split] = Dataset.from_list(dataset) - return DatasetDict(datasets) + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(dataset_name=path, trust_remote_code=True) + else: + datasets = {} + for split in ['train', 'test']: + split_path = os.path.join(path, split + '.jsonl') + dataset = [] + with open(split_path, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line.strip()) + dataset.append(line) + datasets[split] = Dataset.from_list(dataset) + dataset = DatasetDict(datasets) + return dataset @TEXT_POSTPROCESSORS.register_module('gsm8k_dataset') diff --git a/opencompass/datasets/gsm_hard.py b/opencompass/datasets/gsm_hard.py index 5a3f31e8..51b972c8 100644 --- a/opencompass/datasets/gsm_hard.py +++ b/opencompass/datasets/gsm_hard.py @@ -3,6 +3,7 @@ import json from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -12,6 +13,7 @@ class GSMHardDataset(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) dataset = [] with open(path, 'r', encoding='utf-8') as f: for line in f: diff --git a/opencompass/datasets/hellaswag.py 
b/opencompass/datasets/hellaswag.py index f6a89c49..df40489c 100644 --- a/opencompass/datasets/hellaswag.py +++ b/opencompass/datasets/hellaswag.py @@ -1,22 +1,26 @@ import json import os.path as osp +from os import environ from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @LOAD_DATASET.register_module() -class hellaswagDataset(BaseDataset): +class HellaswagDataset(BaseDataset): @staticmethod def load(path): + path = get_data_path(path) dataset = [] - with open(path, 'r', encoding='utf-8') as f: - for line in f: - data = json.loads(line) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, split='validation') + for data in ms_dataset: dataset.append({ 'ctx': data['query'].split(': ', 2)[-1], 'A': data['choices'][0], @@ -25,19 +29,33 @@ class hellaswagDataset(BaseDataset): 'D': data['choices'][3], 'label': data['gold'], }) + else: + with open(path, 'r', encoding='utf-8') as f: + for line in f: + data = json.loads(line) + dataset.append({ + 'ctx': data['query'].split(': ', 2)[-1], + 'A': data['choices'][0], + 'B': data['choices'][1], + 'C': data['choices'][2], + 'D': data['choices'][3], + 'label': data['gold'], + }) dataset = Dataset.from_list(dataset) return dataset @LOAD_DATASET.register_module() -class hellaswagDataset_V2(BaseDataset): +class HellaswagDataset_V2(BaseDataset): @staticmethod def load(path): + path = get_data_path(path) dataset = [] - with open(path, 'r', encoding='utf-8') as f: - for line in f: - data = json.loads(line) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, split='validation') + for data in ms_dataset: dataset.append({ 'ctx': data['query'].split(': ', 1)[-1], 'A': data['choices'][0], @@ -46,43 +64,8 @@ class hellaswagDataset_V2(BaseDataset): 'D': data['choices'][3], 'label': 
'ABCD'[data['gold']], }) - dataset = Dataset.from_list(dataset) - return dataset - - -@LOAD_DATASET.register_module() -class hellaswagDataset_V3(BaseDataset): - - @staticmethod - def load(path): - dataset = [] - with open(path, 'r', encoding='utf-8') as f: - for line in f: - data = json.loads(line) - dataset.append({ - 'query': data['query'], - 'A': data['choices'][0], - 'B': data['choices'][1], - 'C': data['choices'][2], - 'D': data['choices'][3], - 'gold': data['gold'], - }) - dataset = Dataset.from_list(dataset) - return dataset - - -@LOAD_DATASET.register_module() -class hellaswagDatasetwithICE(BaseDataset): - - @staticmethod - def load(path): - dataset_dict = DatasetDict() - for split, filename in [ - ['train', 'hellaswag_train_sampled25.jsonl'], - ['val', 'hellaswag.jsonl'], - ]: - dataset = [] - with open(osp.join(path, filename), 'r', encoding='utf-8') as f: + else: + with open(path, 'r', encoding='utf-8') as f: for line in f: data = json.loads(line) dataset.append({ @@ -93,11 +76,88 @@ class hellaswagDatasetwithICE(BaseDataset): 'D': data['choices'][3], 'label': 'ABCD'[data['gold']], }) + dataset = Dataset.from_list(dataset) + return dataset + + +@LOAD_DATASET.register_module() +class HellaswagDataset_V3(BaseDataset): + + @staticmethod + def load(path): + path = get_data_path(path) + dataset = [] + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, split='validation') + for data in ms_dataset: + dataset.append({ + 'query': data['query'], + 'A': data['choices'][0], + 'B': data['choices'][1], + 'C': data['choices'][2], + 'D': data['choices'][3], + 'gold': data['gold'], + }) + else: + with open(path, 'r', encoding='utf-8') as f: + for line in f: + data = json.loads(line) + dataset.append({ + 'query': data['query'], + 'A': data['choices'][0], + 'B': data['choices'][1], + 'C': data['choices'][2], + 'D': data['choices'][3], + 'gold': data['gold'], + }) + dataset = Dataset.from_list(dataset) + 
return dataset + + +@LOAD_DATASET.register_module() +class HellaswagDatasetwithICE(BaseDataset): + + @staticmethod + def load(path): + path = get_data_path(path) + dataset_dict = DatasetDict() + for split, filename in [ + ['train', 'hellaswag_train_sampled25.jsonl'], + ['val', 'hellaswag.jsonl'], + ]: + dataset = [] + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load( + path, split=split if split == 'train' else 'validation') + for data in ms_dataset: + dataset.append({ + 'ctx': data['query'].split(': ', 1)[-1], + 'A': data['choices'][0], + 'B': data['choices'][1], + 'C': data['choices'][2], + 'D': data['choices'][3], + 'label': 'ABCD'[data['gold']], + }) + else: + with open(osp.join(path, filename), 'r', + encoding='utf-8') as f: + for line in f: + data = json.loads(line) + dataset.append({ + 'ctx': data['query'].split(': ', 1)[-1], + 'A': data['choices'][0], + 'B': data['choices'][1], + 'C': data['choices'][2], + 'D': data['choices'][3], + 'label': 'ABCD'[data['gold']], + }) dataset_dict[split] = Dataset.from_list(dataset) return dataset_dict -class hellaswagDatasetClean(BaseDataset): +class HellaswagDatasetClean(BaseDataset): # load the contamination annotations of CEval from # https://github.com/liyucheng09/Contamination_Detector @@ -106,13 +166,22 @@ class hellaswagDatasetClean(BaseDataset): import requests assert split == 'val', 'We only use val set of hellaswag' - annotation_cache_path = osp.join( - path, f'hellaswag_{split}_contamination_annotations.json') + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope.utils.config_ds import MS_DATASETS_CACHE + annotation_cache_path = osp.join( + MS_DATASETS_CACHE, + f'hellaswag_{split}_contamination_annotations.json') + link_of_annotations = 'https://modelscope.cn/datasets/opencompass/Contamination_Detector/resolve/master/hellaswag_annotations_with_line_index.json' # noqa + else: + annotation_cache_path = osp.join( + path, 
f'hellaswag_{split}_contamination_annotations.json') + link_of_annotations = 'https://github.com/liyucheng09/Contamination_Detector/releases/download/v0.1.1rc2/hellaswag_annotations_with_line_index.json' # noqa + if osp.exists(annotation_cache_path): with open(annotation_cache_path, 'r') as f: annotations = json.load(f) return annotations - link_of_annotations = 'https://github.com/liyucheng09/Contamination_Detector/releases/download/v0.1.1rc2/hellaswag_annotations_with_line_index.json' # noqa + annotations = json.loads(requests.get(link_of_annotations).text) with open(annotation_cache_path, 'w') as f: json.dump(annotations, f) @@ -120,12 +189,15 @@ class hellaswagDatasetClean(BaseDataset): @staticmethod def load(path): + path = get_data_path(path) dataset = [] - annotations = hellaswagDatasetClean.load_contamination_annotations( + annotations = HellaswagDatasetClean.load_contamination_annotations( osp.dirname(path)) - with open(path, 'r', encoding='utf-8') as f: - for rwo_index, line in enumerate(f): - data = json.loads(line) + + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, split='validation') + for rwo_index, data in enumerate(ms_dataset): rwo_index = f'{rwo_index}' if rwo_index in annotations: is_clean = annotations[rwo_index][0] @@ -140,5 +212,23 @@ class hellaswagDatasetClean(BaseDataset): 'label': data['gold'], 'is_clean': is_clean, }) + else: + with open(path, 'r', encoding='utf-8') as f: + for rwo_index, line in enumerate(f): + data = json.loads(line) + rwo_index = f'{rwo_index}' + if rwo_index in annotations: + is_clean = annotations[rwo_index][0] + else: + is_clean = 'not labeled' + dataset.append({ + 'ctx': data['query'].split(': ', 2)[-1], + 'A': data['choices'][0], + 'B': data['choices'][1], + 'C': data['choices'][2], + 'D': data['choices'][3], + 'label': data['gold'], + 'is_clean': is_clean, + }) dataset = Dataset.from_list(dataset) return dataset diff --git 
a/opencompass/datasets/huggingface.py b/opencompass/datasets/huggingface.py index 2ae23e3f..3798ba15 100644 --- a/opencompass/datasets/huggingface.py +++ b/opencompass/datasets/huggingface.py @@ -1,6 +1,7 @@ from datasets import load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -10,4 +11,7 @@ class HFDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) return load_dataset(**kwargs) diff --git a/opencompass/datasets/humaneval.py b/opencompass/datasets/humaneval.py index 2002e469..e5f602e4 100644 --- a/opencompass/datasets/humaneval.py +++ b/opencompass/datasets/humaneval.py @@ -5,12 +5,14 @@ import json import os.path as osp import re import tempfile +from os import environ from typing import List from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -26,6 +28,7 @@ cd human-eval pip install -e . pip install -e evalplus''' + @LOAD_DATASET.register_module() class HumanevalDataset(BaseDataset): @@ -46,12 +49,22 @@ class HumanevalDataset(BaseDataset): num_repeats(int): Number of repetition for this dataset to get multiple responses in special cases. 
""" - dataset = [] - with open(path, 'r', encoding='utf-8') as f: - for line in f: - line = json.loads(line) - dataset.extend([copy.deepcopy(line) for _ in range(num_repeats)]) - return Dataset.from_list(dataset) + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(path, subset_name='openai_humaneval', split='test') + dataset_list = [] + for example in dataset: + dataset_list.extend([example] * num_repeats) + dataset = Dataset.from_list(dataset_list) + else: + dataset = [] + with open(path, 'r', encoding='utf-8') as f: + for line in f: + dataset.extend( + [json.loads(line.strip()) for _ in range(num_repeats)]) + dataset = Dataset.from_list(dataset) + return dataset class HumanEvalEvaluator(BaseEvaluator): diff --git a/opencompass/datasets/humaneval_multi.py b/opencompass/datasets/humaneval_multi.py index f8f27fe1..3c37e2b2 100644 --- a/opencompass/datasets/humaneval_multi.py +++ b/opencompass/datasets/humaneval_multi.py @@ -13,6 +13,7 @@ from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -60,6 +61,7 @@ class HumanevalMultiDataset(BaseDataset): num_repeats(int): Number of repetition for this dataset to get multiple responses in special cases. 
""" + path = get_data_path(path, local_mode=True) assert language in _LANGUAGE_NAME_DICT.keys(), ( f'language must be in {list(_LANGUAGE_NAME_DICT.keys())}') assert version in [ diff --git a/opencompass/datasets/humanevalx.py b/opencompass/datasets/humanevalx.py index 6e9c3dff..03901503 100644 --- a/opencompass/datasets/humanevalx.py +++ b/opencompass/datasets/humanevalx.py @@ -12,6 +12,7 @@ from typing import Dict, Iterable from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.utils import get_data_path from .base import BaseDataset from .humaneval import humaneval_postprocess_v2 @@ -30,6 +31,7 @@ class HumanevalXDataset(BaseDataset): @staticmethod def load(path, language, **kwargs): + path = get_data_path(path, local_mode=True) assert language in _LANGUAGE_NAME_DICT.keys(), ( f'language must be in {list(_LANGUAGE_NAME_DICT.keys())}') file_path = osp.join(path, f'humanevalx_{language}.jsonl.gz') diff --git a/opencompass/datasets/hungarian_math.py b/opencompass/datasets/hungarian_math.py index 0a07ef34..2735cfa0 100644 --- a/opencompass/datasets/hungarian_math.py +++ b/opencompass/datasets/hungarian_math.py @@ -2,6 +2,7 @@ import pandas as pd from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -11,6 +12,7 @@ class HungarianExamMathDataset(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) df = pd.read_csv(path) df.columns = ['question'] outputs = [{ diff --git a/opencompass/datasets/inference_ppl.py b/opencompass/datasets/inference_ppl.py index 251bb682..80344cd5 100644 --- a/opencompass/datasets/inference_ppl.py +++ b/opencompass/datasets/inference_ppl.py @@ -4,6 +4,7 @@ from typing import List from datasets import load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -13,6 +14,7 @@ class 
InferencePPLDataset(BaseDataset): @staticmethod def load(path: str, name: List[str] = None, samples: int = None): + path = get_data_path(path, local_mode=True) # Check if file exists in the given path supported_extensions = ['jsonl'] diff --git a/opencompass/datasets/infinitebench/infinitebench_codedebug.py b/opencompass/datasets/infinitebench/infinitebench_codedebug.py index 90041c0e..a0e5fcca 100644 --- a/opencompass/datasets/infinitebench/infinitebench_codedebug.py +++ b/opencompass/datasets/infinitebench/infinitebench_codedebug.py @@ -1,6 +1,7 @@ from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .utils import iter_jsonl @@ -11,6 +12,7 @@ class InfiniteBenchcodedebugDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) dataset = list(iter_jsonl(path)) diff --git a/opencompass/datasets/infinitebench/infinitebench_coderun.py b/opencompass/datasets/infinitebench/infinitebench_coderun.py index 9a6a8a2c..965ef913 100644 --- a/opencompass/datasets/infinitebench/infinitebench_coderun.py +++ b/opencompass/datasets/infinitebench/infinitebench_coderun.py @@ -3,6 +3,7 @@ import re from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .utils import iter_jsonl @@ -13,6 +14,7 @@ class InfiniteBenchcoderunDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) dataset = list(iter_jsonl(path)) diff --git a/opencompass/datasets/infinitebench/infinitebench_endia.py b/opencompass/datasets/infinitebench/infinitebench_endia.py index a41fa942..6c13a503 100644 --- a/opencompass/datasets/infinitebench/infinitebench_endia.py +++ b/opencompass/datasets/infinitebench/infinitebench_endia.py @@ -4,6 +4,7 @@ from datasets import Dataset from opencompass.openicl import BaseEvaluator from 
opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .utils import iter_jsonl @@ -14,6 +15,7 @@ class InfiniteBenchendiaDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) dataset = list(iter_jsonl(path)) diff --git a/opencompass/datasets/infinitebench/infinitebench_enmc.py b/opencompass/datasets/infinitebench/infinitebench_enmc.py index 0843425b..5603edaf 100644 --- a/opencompass/datasets/infinitebench/infinitebench_enmc.py +++ b/opencompass/datasets/infinitebench/infinitebench_enmc.py @@ -1,6 +1,7 @@ from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .utils import iter_jsonl @@ -11,6 +12,7 @@ class InfiniteBenchenmcDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) dataset = list(iter_jsonl(path)) diff --git a/opencompass/datasets/infinitebench/infinitebench_enqa.py b/opencompass/datasets/infinitebench/infinitebench_enqa.py index 6201497c..3e5b1261 100644 --- a/opencompass/datasets/infinitebench/infinitebench_enqa.py +++ b/opencompass/datasets/infinitebench/infinitebench_enqa.py @@ -1,6 +1,7 @@ from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .utils import iter_jsonl @@ -11,6 +12,7 @@ class InfiniteBenchenqaDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) dataset = list(iter_jsonl(path)) diff --git a/opencompass/datasets/infinitebench/infinitebench_ensum.py b/opencompass/datasets/infinitebench/infinitebench_ensum.py index 1d892d16..3e432f6d 100644 --- a/opencompass/datasets/infinitebench/infinitebench_ensum.py +++ b/opencompass/datasets/infinitebench/infinitebench_ensum.py @@ -1,6 +1,7 @@ from datasets import Dataset 
from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .utils import iter_jsonl @@ -11,6 +12,7 @@ class InfiniteBenchensumDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) dataset = list(iter_jsonl(path)) diff --git a/opencompass/datasets/infinitebench/infinitebench_mathcalc.py b/opencompass/datasets/infinitebench/infinitebench_mathcalc.py index 6dc65156..9b06c75b 100644 --- a/opencompass/datasets/infinitebench/infinitebench_mathcalc.py +++ b/opencompass/datasets/infinitebench/infinitebench_mathcalc.py @@ -5,6 +5,7 @@ from datasets import Dataset from opencompass.openicl import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .utils import iter_jsonl @@ -15,6 +16,7 @@ class InfiniteBenchmathcalcDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) dataset = list(iter_jsonl(path)) diff --git a/opencompass/datasets/infinitebench/infinitebench_mathfind.py b/opencompass/datasets/infinitebench/infinitebench_mathfind.py index 177ffd91..2c636ab9 100644 --- a/opencompass/datasets/infinitebench/infinitebench_mathfind.py +++ b/opencompass/datasets/infinitebench/infinitebench_mathfind.py @@ -3,6 +3,7 @@ import re from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .utils import iter_jsonl @@ -13,6 +14,7 @@ class InfiniteBenchmathfindDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) dataset = list(iter_jsonl(path)) diff --git a/opencompass/datasets/infinitebench/infinitebench_retrievekv.py b/opencompass/datasets/infinitebench/infinitebench_retrievekv.py index 6ae67806..bcc121f1 100644 --- 
a/opencompass/datasets/infinitebench/infinitebench_retrievekv.py +++ b/opencompass/datasets/infinitebench/infinitebench_retrievekv.py @@ -4,6 +4,7 @@ from datasets import Dataset from opencompass.openicl import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .utils import iter_jsonl @@ -14,6 +15,7 @@ class InfiniteBenchretrievekvDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) dataset = list(iter_jsonl(path)) diff --git a/opencompass/datasets/infinitebench/infinitebench_retrievenumber.py b/opencompass/datasets/infinitebench/infinitebench_retrievenumber.py index 5742f951..44f44f2e 100644 --- a/opencompass/datasets/infinitebench/infinitebench_retrievenumber.py +++ b/opencompass/datasets/infinitebench/infinitebench_retrievenumber.py @@ -1,6 +1,7 @@ from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .utils import iter_jsonl @@ -11,6 +12,7 @@ class InfiniteBenchretrievenumberDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) dataset = list(iter_jsonl(path)) diff --git a/opencompass/datasets/infinitebench/infinitebench_retrievepasskey.py b/opencompass/datasets/infinitebench/infinitebench_retrievepasskey.py index 3cefa118..a94a875d 100644 --- a/opencompass/datasets/infinitebench/infinitebench_retrievepasskey.py +++ b/opencompass/datasets/infinitebench/infinitebench_retrievepasskey.py @@ -1,6 +1,7 @@ from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .utils import iter_jsonl @@ -11,6 +12,7 @@ class InfiniteBenchretrievepasskeyDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) dataset = 
list(iter_jsonl(path)) diff --git a/opencompass/datasets/infinitebench/infinitebench_zhqa.py b/opencompass/datasets/infinitebench/infinitebench_zhqa.py index c2ba296a..51958ec1 100644 --- a/opencompass/datasets/infinitebench/infinitebench_zhqa.py +++ b/opencompass/datasets/infinitebench/infinitebench_zhqa.py @@ -1,6 +1,7 @@ from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .utils import iter_jsonl @@ -11,6 +12,7 @@ class InfiniteBenchzhqaDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) dataset = list(iter_jsonl(path)) diff --git a/opencompass/datasets/jigsawmultilingual.py b/opencompass/datasets/jigsawmultilingual.py index 69b18165..9866809f 100644 --- a/opencompass/datasets/jigsawmultilingual.py +++ b/opencompass/datasets/jigsawmultilingual.py @@ -3,6 +3,7 @@ import csv from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -12,6 +13,9 @@ class JigsawMultilingualDataset(BaseDataset): @staticmethod def load(path, label, lang): + path = get_data_path(path, local_mode=True) + label = get_data_path(label, local_mode=True) + assert lang in ['es', 'fr', 'it', 'pt', 'ru', 'tr'] dataset = DatasetDict() diff --git a/opencompass/datasets/jsonl.py b/opencompass/datasets/jsonl.py index 74f9d5c0..d9437488 100644 --- a/opencompass/datasets/jsonl.py +++ b/opencompass/datasets/jsonl.py @@ -3,6 +3,7 @@ import json from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -12,6 +13,7 @@ class JsonlDataset(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) data = [] with open(path, 'r', encoding='utf-8') as f: diff --git a/opencompass/datasets/kaoshi.py b/opencompass/datasets/kaoshi.py 
index 96a7d083..458611ae 100644 --- a/opencompass/datasets/kaoshi.py +++ b/opencompass/datasets/kaoshi.py @@ -4,6 +4,7 @@ import re from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.utils import get_data_path from .base import BaseDataset @@ -20,6 +21,7 @@ class KaoshiDataset(BaseDataset): @staticmethod def load(path: str, name: str): + path = get_data_path(path, local_mode=True) data_list = [] with open(path, encoding='utf-8') as f: for line in f: diff --git a/opencompass/datasets/lambada.py b/opencompass/datasets/lambada.py index cf8266f5..1f3dcea2 100644 --- a/opencompass/datasets/lambada.py +++ b/opencompass/datasets/lambada.py @@ -1,11 +1,13 @@ import json import re import string +from os import environ from datasets import Dataset, DatasetDict from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from opencompass.utils.text_postprocessors import general_postprocess from .base import BaseDataset @@ -16,12 +18,18 @@ class lambadaDataset(BaseDataset): @staticmethod def load(path): - dataset = [] - with open(path, 'r', encoding='utf-8') as f: - for line in f: - dataset.append(json.loads(line)) - dataset = Dataset.from_list(dataset) - return DatasetDict({'test': dataset}) + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(path) + return dataset + else: + dataset = [] + with open(path, 'r', encoding='utf-8') as f: + for line in f: + dataset.append(json.loads(line)) + dataset = Dataset.from_list(dataset) + return DatasetDict({'test': dataset}) @ICL_EVALUATORS.register_module() diff --git a/opencompass/datasets/lawbench/lawbench.py b/opencompass/datasets/lawbench/lawbench.py index 2dcc10fd..f4e95fb1 100644 --- a/opencompass/datasets/lawbench/lawbench.py +++ b/opencompass/datasets/lawbench/lawbench.py @@ -5,6 
+5,7 @@ from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .evaluation_functions import (cjft, flzx, ftcs, jdzy, jec_ac, jec_kd, @@ -18,6 +19,7 @@ class LawBenchDataset(BaseDataset): @staticmethod def load(path: str, index: str) -> Dataset: + path = get_data_path(path, local_mode=True) path = os.path.join(path, index + '.json') with open(path, 'r') as f: data = json.load(f) diff --git a/opencompass/datasets/lcsts.py b/opencompass/datasets/lcsts.py index 92648f97..afe42af7 100644 --- a/opencompass/datasets/lcsts.py +++ b/opencompass/datasets/lcsts.py @@ -1,8 +1,10 @@ import os.path as osp +from os import environ from datasets import Dataset from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path from .base import BaseDataset @@ -12,22 +14,35 @@ class LCSTSDataset(BaseDataset): @staticmethod def load(path: str): - src_path = osp.join(path, 'test.src.txt') - tgt_path = osp.join(path, 'test.tgt.txt') + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, split='test') + dataset = [] + for row in ms_dataset: + new_row = {} + new_row['content'] = row['text'] + new_row['abst'] = row['summary'] + dataset.append(new_row) + dataset = Dataset.from_list(dataset) + else: + src_path = osp.join(path, 'test.src.txt') + tgt_path = osp.join(path, 'test.tgt.txt') - src_lines = open(src_path, 'r', encoding='utf-8').readlines() - tgt_lines = open(tgt_path, 'r', encoding='utf-8').readlines() + src_lines = open(src_path, 'r', encoding='utf-8').readlines() + tgt_lines = open(tgt_path, 'r', encoding='utf-8').readlines() - data = {'content': [], 'abst': []} + data = {'content': [], 'abst': []} - for _, (src_text, tgt_text) in enumerate(zip(src_lines, tgt_lines)): 
- data['content'].append(src_text.strip()) - data['abst'].append(tgt_text.strip()) + for _, (src_text, tgt_text) in enumerate(zip(src_lines, + tgt_lines)): + data['content'].append(src_text.strip()) + data['abst'].append(tgt_text.strip()) - dataset = Dataset.from_dict({ - 'content': data['content'], - 'abst': data['abst'] - }) + dataset = Dataset.from_dict({ + 'content': data['content'], + 'abst': data['abst'] + }) return dataset diff --git a/opencompass/datasets/leval/leval_coursera.py b/opencompass/datasets/leval/leval_coursera.py index ebfab22f..7e20ded5 100644 --- a/opencompass/datasets/leval/leval_coursera.py +++ b/opencompass/datasets/leval/leval_coursera.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LEvalCourseraDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/leval/leval_financial_qa.py b/opencompass/datasets/leval/leval_financial_qa.py index 658f7961..626b8f73 100644 --- a/opencompass/datasets/leval/leval_financial_qa.py +++ b/opencompass/datasets/leval/leval_financial_qa.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LEvalFinancialQADataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/leval/leval_gov_report_summ.py b/opencompass/datasets/leval/leval_gov_report_summ.py index cd0a46f5..9910a5e0 100644 --- 
a/opencompass/datasets/leval/leval_gov_report_summ.py +++ b/opencompass/datasets/leval/leval_gov_report_summ.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LEvalGovReportSummDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/leval/leval_gsm100.py b/opencompass/datasets/leval/leval_gsm100.py index 0ddb7b04..356230a0 100644 --- a/opencompass/datasets/leval/leval_gsm100.py +++ b/opencompass/datasets/leval/leval_gsm100.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -41,6 +42,9 @@ class LEvalGSM100Dataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/leval/leval_legal_contract_qa.py b/opencompass/datasets/leval/leval_legal_contract_qa.py index fbb30cc2..45ce1bba 100644 --- a/opencompass/datasets/leval/leval_legal_contract_qa.py +++ b/opencompass/datasets/leval/leval_legal_contract_qa.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LEvalLegalContractQADataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git 
a/opencompass/datasets/leval/leval_meeting_summ.py b/opencompass/datasets/leval/leval_meeting_summ.py index ae0f4463..1d5bf6e9 100644 --- a/opencompass/datasets/leval/leval_meeting_summ.py +++ b/opencompass/datasets/leval/leval_meeting_summ.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LEvalMeetingSummDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/leval/leval_multidoc_qa.py b/opencompass/datasets/leval/leval_multidoc_qa.py index f8ff4b88..f5c99f6f 100644 --- a/opencompass/datasets/leval/leval_multidoc_qa.py +++ b/opencompass/datasets/leval/leval_multidoc_qa.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LEvalMultidocQADataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/leval/leval_narrattive_qa.py b/opencompass/datasets/leval/leval_narrattive_qa.py index 46d12a7f..13fdc3f3 100644 --- a/opencompass/datasets/leval/leval_narrattive_qa.py +++ b/opencompass/datasets/leval/leval_narrattive_qa.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LEvalNarrativeQADataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = 
get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/leval/leval_natural_question.py b/opencompass/datasets/leval/leval_natural_question.py index 5771fd81..a569d7cb 100644 --- a/opencompass/datasets/leval/leval_natural_question.py +++ b/opencompass/datasets/leval/leval_natural_question.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LEvalNaturalQuestionDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/leval/leval_news_summ.py b/opencompass/datasets/leval/leval_news_summ.py index e7b1ec6d..96cdb1f0 100644 --- a/opencompass/datasets/leval/leval_news_summ.py +++ b/opencompass/datasets/leval/leval_news_summ.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LEvalNewsSummDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/leval/leval_paper_assistant.py b/opencompass/datasets/leval/leval_paper_assistant.py index 16292846..26c48fde 100644 --- a/opencompass/datasets/leval/leval_paper_assistant.py +++ b/opencompass/datasets/leval/leval_paper_assistant.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class 
LEvalPaperAssistantDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/leval/leval_patent_summ.py b/opencompass/datasets/leval/leval_patent_summ.py index 3811d91d..1de1e6e0 100644 --- a/opencompass/datasets/leval/leval_patent_summ.py +++ b/opencompass/datasets/leval/leval_patent_summ.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LEvalPatentSummDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/leval/leval_quality.py b/opencompass/datasets/leval/leval_quality.py index ab1517a1..6abc77cf 100644 --- a/opencompass/datasets/leval/leval_quality.py +++ b/opencompass/datasets/leval/leval_quality.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LEvalQualityDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/leval/leval_review_summ.py b/opencompass/datasets/leval/leval_review_summ.py index 7c610fdb..bf3e3e0e 100644 --- a/opencompass/datasets/leval/leval_review_summ.py +++ b/opencompass/datasets/leval/leval_review_summ.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from 
opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LEvalReviewSummDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/leval/leval_scientific_qa.py b/opencompass/datasets/leval/leval_scientific_qa.py index a24e94df..1b782bed 100644 --- a/opencompass/datasets/leval/leval_scientific_qa.py +++ b/opencompass/datasets/leval/leval_scientific_qa.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LEvalScientificQADataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/leval/leval_topic_retrieval.py b/opencompass/datasets/leval/leval_topic_retrieval.py index f077048e..fe1705ca 100644 --- a/opencompass/datasets/leval/leval_topic_retrieval.py +++ b/opencompass/datasets/leval/leval_topic_retrieval.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LEvalTopicRetrievalDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/leval/leval_tpo.py b/opencompass/datasets/leval/leval_tpo.py index 7ffecdeb..8e5577c1 100644 --- a/opencompass/datasets/leval/leval_tpo.py +++ b/opencompass/datasets/leval/leval_tpo.py @@ -1,6 +1,7 @@ 
from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LEvalTPODataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/leval/leval_tvshow_summ.py b/opencompass/datasets/leval/leval_tvshow_summ.py index c0364ab8..0c829df6 100644 --- a/opencompass/datasets/leval/leval_tvshow_summ.py +++ b/opencompass/datasets/leval/leval_tvshow_summ.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LEvalTVShowSummDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/llm_compression.py b/opencompass/datasets/llm_compression.py index 6462c641..e23924fe 100644 --- a/opencompass/datasets/llm_compression.py +++ b/opencompass/datasets/llm_compression.py @@ -4,6 +4,7 @@ from typing import List from datasets import load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -13,6 +14,7 @@ class LLMCompressionDataset(BaseDataset): @staticmethod def load(path: str, name: List[str] = None, samples: int = None): + path = get_data_path(path, local_mode=True) # Check if file exists in the given path supported_extensions = ['json', 'jsonl'] diff --git a/opencompass/datasets/longbench/longbench_2wikim_qa.py b/opencompass/datasets/longbench/longbench_2wikim_qa.py index e23c031d..d435c41e 100644 --- 
a/opencompass/datasets/longbench/longbench_2wikim_qa.py +++ b/opencompass/datasets/longbench/longbench_2wikim_qa.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBench2wikimqaDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_dureader.py b/opencompass/datasets/longbench/longbench_dureader.py index 80b94a52..28d013ce 100644 --- a/opencompass/datasets/longbench/longbench_dureader.py +++ b/opencompass/datasets/longbench/longbench_dureader.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBenchdureaderDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_gov_report.py b/opencompass/datasets/longbench/longbench_gov_report.py index cf754bf0..41a3cf73 100644 --- a/opencompass/datasets/longbench/longbench_gov_report.py +++ b/opencompass/datasets/longbench/longbench_gov_report.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBenchgov_reportDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 
'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_hotpot_qa.py b/opencompass/datasets/longbench/longbench_hotpot_qa.py index 62747a52..45734df6 100644 --- a/opencompass/datasets/longbench/longbench_hotpot_qa.py +++ b/opencompass/datasets/longbench/longbench_hotpot_qa.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBenchhotpotqaDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_lcc.py b/opencompass/datasets/longbench/longbench_lcc.py index c7c40122..f7f94b6f 100644 --- a/opencompass/datasets/longbench/longbench_lcc.py +++ b/opencompass/datasets/longbench/longbench_lcc.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBenchlccDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_lsht.py b/opencompass/datasets/longbench/longbench_lsht.py index 99cb4127..7916b046 100644 --- a/opencompass/datasets/longbench/longbench_lsht.py +++ b/opencompass/datasets/longbench/longbench_lsht.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBenchlshtDataset(BaseDataset): @staticmethod def load(**kwargs): + if 
'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_multi_news.py b/opencompass/datasets/longbench/longbench_multi_news.py index 0ade0f54..fe746f37 100644 --- a/opencompass/datasets/longbench/longbench_multi_news.py +++ b/opencompass/datasets/longbench/longbench_multi_news.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBenchmulti_newsDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_multifieldqa_en.py b/opencompass/datasets/longbench/longbench_multifieldqa_en.py index f81628a2..9272d5c2 100644 --- a/opencompass/datasets/longbench/longbench_multifieldqa_en.py +++ b/opencompass/datasets/longbench/longbench_multifieldqa_en.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBenchmultifieldqa_enDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_multifieldqa_zh.py b/opencompass/datasets/longbench/longbench_multifieldqa_zh.py index f1efc593..f1b9f6e6 100644 --- a/opencompass/datasets/longbench/longbench_multifieldqa_zh.py +++ b/opencompass/datasets/longbench/longbench_multifieldqa_zh.py @@ -1,6 +1,7 @@ from datasets import Dataset, 
load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBenchmultifieldqa_zhDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_musique.py b/opencompass/datasets/longbench/longbench_musique.py index bb9d0176..836777c1 100644 --- a/opencompass/datasets/longbench/longbench_musique.py +++ b/opencompass/datasets/longbench/longbench_musique.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBenchmusiqueDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_narrative_qa.py b/opencompass/datasets/longbench/longbench_narrative_qa.py index fd64860a..54378da2 100644 --- a/opencompass/datasets/longbench/longbench_narrative_qa.py +++ b/opencompass/datasets/longbench/longbench_narrative_qa.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBenchnarrativeqaDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_passage_count.py b/opencompass/datasets/longbench/longbench_passage_count.py 
index 10096d49..d72ea97a 100644 --- a/opencompass/datasets/longbench/longbench_passage_count.py +++ b/opencompass/datasets/longbench/longbench_passage_count.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBenchpassage_countDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_passage_retrieval_en.py b/opencompass/datasets/longbench/longbench_passage_retrieval_en.py index 56cd9942..1dc0612f 100644 --- a/opencompass/datasets/longbench/longbench_passage_retrieval_en.py +++ b/opencompass/datasets/longbench/longbench_passage_retrieval_en.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBenchpassage_retrieval_enDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_passage_retrieval_zh.py b/opencompass/datasets/longbench/longbench_passage_retrieval_zh.py index 02568ff2..50b3ba0d 100644 --- a/opencompass/datasets/longbench/longbench_passage_retrieval_zh.py +++ b/opencompass/datasets/longbench/longbench_passage_retrieval_zh.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBenchpassage_retrieval_zhDataset(BaseDataset): @staticmethod def 
load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_qasper.py b/opencompass/datasets/longbench/longbench_qasper.py index b783585c..e1518daa 100644 --- a/opencompass/datasets/longbench/longbench_qasper.py +++ b/opencompass/datasets/longbench/longbench_qasper.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBenchqasperDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_qmsum.py b/opencompass/datasets/longbench/longbench_qmsum.py index 74cfd858..06fd4310 100644 --- a/opencompass/datasets/longbench/longbench_qmsum.py +++ b/opencompass/datasets/longbench/longbench_qmsum.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBenchqmsumDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_repobench.py b/opencompass/datasets/longbench/longbench_repobench.py index e2fcaa85..1fed2331 100644 --- a/opencompass/datasets/longbench/longbench_repobench.py +++ b/opencompass/datasets/longbench/longbench_repobench.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from 
opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBenchrepobenchDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_samsum.py b/opencompass/datasets/longbench/longbench_samsum.py index 096f9a0f..2cd5b808 100644 --- a/opencompass/datasets/longbench/longbench_samsum.py +++ b/opencompass/datasets/longbench/longbench_samsum.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBenchsamsumDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_trec.py b/opencompass/datasets/longbench/longbench_trec.py index c70d0008..b0b0fffc 100644 --- a/opencompass/datasets/longbench/longbench_trec.py +++ b/opencompass/datasets/longbench/longbench_trec.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBenchtrecDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_trivia_qa.py b/opencompass/datasets/longbench/longbench_trivia_qa.py index de52d7e0..19a84a03 100644 --- 
a/opencompass/datasets/longbench/longbench_trivia_qa.py +++ b/opencompass/datasets/longbench/longbench_trivia_qa.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LongBenchtriviaqaDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/longbench/longbench_vcsum.py b/opencompass/datasets/longbench/longbench_vcsum.py index f14fe9de..2aef8fe9 100644 --- a/opencompass/datasets/longbench/longbench_vcsum.py +++ b/opencompass/datasets/longbench/longbench_vcsum.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,8 @@ class LongBenchvcsumDataset(BaseDataset): @staticmethod def load(**kwargs): + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/lveval/lveval_cmrc_mixup.py b/opencompass/datasets/lveval/lveval_cmrc_mixup.py index 280e007b..f30fc44d 100644 --- a/opencompass/datasets/lveval/lveval_cmrc_mixup.py +++ b/opencompass/datasets/lveval/lveval_cmrc_mixup.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LVEvalcmrcDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git 
a/opencompass/datasets/lveval/lveval_dureader_mixup.py b/opencompass/datasets/lveval/lveval_dureader_mixup.py index 4b2a6627..6b48ae80 100644 --- a/opencompass/datasets/lveval/lveval_dureader_mixup.py +++ b/opencompass/datasets/lveval/lveval_dureader_mixup.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LVEvaldureaderDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/lveval/lveval_factrecall_en.py b/opencompass/datasets/lveval/lveval_factrecall_en.py index 7194da6f..b864a209 100644 --- a/opencompass/datasets/lveval/lveval_factrecall_en.py +++ b/opencompass/datasets/lveval/lveval_factrecall_en.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LVEvalfactrecallenDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/lveval/lveval_factrecall_zh.py b/opencompass/datasets/lveval/lveval_factrecall_zh.py index 3ffdd7d9..f91c7748 100644 --- a/opencompass/datasets/lveval/lveval_factrecall_zh.py +++ b/opencompass/datasets/lveval/lveval_factrecall_zh.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LVEvalfactrecallzhDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + 
kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/lveval/lveval_hotpotwikiqa_mixup.py b/opencompass/datasets/lveval/lveval_hotpotwikiqa_mixup.py index 2fa5509b..b2b4191d 100644 --- a/opencompass/datasets/lveval/lveval_hotpotwikiqa_mixup.py +++ b/opencompass/datasets/lveval/lveval_hotpotwikiqa_mixup.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LVEvalhotpotwikiqaDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/lveval/lveval_lic_mixup.py b/opencompass/datasets/lveval/lveval_lic_mixup.py index 6d806643..04e78d41 100644 --- a/opencompass/datasets/lveval/lveval_lic_mixup.py +++ b/opencompass/datasets/lveval/lveval_lic_mixup.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LVEvallicDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/lveval/lveval_loogle_CR_mixup.py b/opencompass/datasets/lveval/lveval_loogle_CR_mixup.py index cec01a53..8ae74172 100644 --- a/opencompass/datasets/lveval/lveval_loogle_CR_mixup.py +++ b/opencompass/datasets/lveval/lveval_loogle_CR_mixup.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from 
..base import BaseDataset @@ -10,6 +11,9 @@ class LVEvallooglecrDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/lveval/lveval_loogle_MIR_mixup.py b/opencompass/datasets/lveval/lveval_loogle_MIR_mixup.py index e7307a08..71f17a5f 100644 --- a/opencompass/datasets/lveval/lveval_loogle_MIR_mixup.py +++ b/opencompass/datasets/lveval/lveval_loogle_MIR_mixup.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LVEvallooglemirDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/lveval/lveval_loogle_SD_mixup.py b/opencompass/datasets/lveval/lveval_loogle_SD_mixup.py index 7a5e75fc..b30f00ef 100644 --- a/opencompass/datasets/lveval/lveval_loogle_SD_mixup.py +++ b/opencompass/datasets/lveval/lveval_loogle_SD_mixup.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LVEvallooglesdDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/lveval/lveval_multifieldqa_en_mixup.py b/opencompass/datasets/lveval/lveval_multifieldqa_en_mixup.py index 8fc38927..c045c364 100644 --- a/opencompass/datasets/lveval/lveval_multifieldqa_en_mixup.py +++ 
b/opencompass/datasets/lveval/lveval_multifieldqa_en_mixup.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LVEvalmultifieldqaenDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/lveval/lveval_multifieldqa_zh_mixup.py b/opencompass/datasets/lveval/lveval_multifieldqa_zh_mixup.py index e2fc2d9d..3a8abaef 100644 --- a/opencompass/datasets/lveval/lveval_multifieldqa_zh_mixup.py +++ b/opencompass/datasets/lveval/lveval_multifieldqa_zh_mixup.py @@ -1,6 +1,7 @@ from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -10,6 +11,9 @@ class LVEvalmultifieldqazhDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) split = 'test' raw_data = [] diff --git a/opencompass/datasets/math.py b/opencompass/datasets/math.py index 19e38baf..a2cf55a1 100644 --- a/opencompass/datasets/math.py +++ b/opencompass/datasets/math.py @@ -1,11 +1,13 @@ import json import re +from os import environ from datasets import Dataset, DatasetDict from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET, TEXT_POSTPROCESSORS) +from opencompass.utils import get_data_path from .base import BaseDataset @@ -139,16 +141,28 @@ class MATHDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path) dataset = DatasetDict() - data = json.load(open(path)) raw_data = [] - for i in data.keys(): - raw_data.append({ - 
'problem': - data[i]['problem'], - 'solution': - extract_boxed_answer(data[i]['solution']) - }) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, split='train') + for item in ms_dataset: + raw_data.append({ + 'problem': + item['problem'], + 'solution': + extract_boxed_answer(item['solution']) + }) + else: + data = json.load(open(path)) + for i in data.keys(): + raw_data.append({ + 'problem': + data[i]['problem'], + 'solution': + extract_boxed_answer(data[i]['solution']) + }) dataset['test'] = Dataset.from_list(raw_data) dataset['train'] = Dataset.from_list(raw_data) return dataset diff --git a/opencompass/datasets/mathbench.py b/opencompass/datasets/mathbench.py index 7d9c250a..cafedec8 100644 --- a/opencompass/datasets/mathbench.py +++ b/opencompass/datasets/mathbench.py @@ -6,6 +6,7 @@ import re from datasets import Dataset from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path from .base import BaseDataset @@ -55,6 +56,7 @@ class MathBenchDataset(BaseDataset): with_circular (bool): Whether to create circular dataset for single choice question. Defaults to True. 
""" + path = get_data_path(path, local_mode=True) data = [] filename = osp.join(path, f'{name}.jsonl') with open(filename, 'r', encoding='utf-8') as infile: diff --git a/opencompass/datasets/mbpp.py b/opencompass/datasets/mbpp.py index 35ea6d30..fca83b31 100644 --- a/opencompass/datasets/mbpp.py +++ b/opencompass/datasets/mbpp.py @@ -9,6 +9,7 @@ import signal import tempfile from collections import defaultdict from concurrent.futures import ProcessPoolExecutor, as_completed +from os import environ from typing import List, Sequence, Union import numpy as np @@ -16,6 +17,7 @@ from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -24,7 +26,8 @@ from .base import BaseDataset class MBPPDataset(BaseDataset): @staticmethod - def load(path: str): + def load(path: str, local_mode: bool = False): + path = get_data_path(path, local_mode=local_mode) def processing_test(example): example['test_case'] = example['test_list'] @@ -32,14 +35,23 @@ class MBPPDataset(BaseDataset): example['test_list_2'] = example['test_list'] return example - train = load_dataset('json', data_files=path, - split='train[:10]').map(processing_test) - test = load_dataset('json', data_files=path, - split='train[10:510]').map(processing_test) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + train = MsDataset.load(path, + subset_name='full', + split='train[:10]').map(processing_test) + test = MsDataset.load(path, + subset_name='full', + split='train[10:510]').map(processing_test) + else: + train = load_dataset('json', data_files=path, + split='train[:10]').map(processing_test) + test = load_dataset('json', data_files=path, + split='train[10:510]').map(processing_test) return DatasetDict({'train': train, 'test': test}) -class 
MBPPDataset_V2(BaseDataset): +class MBPPDatasetV2(BaseDataset): @staticmethod def load(path: str, num_repeats: int = 1): @@ -59,6 +71,8 @@ class MBPPDataset_V2(BaseDataset): multiple responses in special cases. """ + path = get_data_path(path) + def processing_test(example): example['test_case'] = example['test_list'] example['test_list'] = '\n'.join(example['test_list']) @@ -66,10 +80,19 @@ class MBPPDataset_V2(BaseDataset): task_id=example['task_id']) return example - train = load_dataset('json', data_files=path, - split='train[:10]').map(processing_test) - test = load_dataset('json', data_files=path, - split='train[10:510]').map(processing_test) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + train = MsDataset.load(path, + subset_name='full', + split='train[:10]').map(processing_test) + test = MsDataset.load(path, + subset_name='full', + split='train[10:510]').map(processing_test) + else: + train = load_dataset('json', data_files=path, + split='train[:10]').map(processing_test) + test = load_dataset('json', data_files=path, + split='train[10:510]').map(processing_test) test = concatenate_datasets([test] * num_repeats) return DatasetDict({'train': train, 'test': test}) @@ -93,6 +116,7 @@ class SanitizedMBPPDataset(BaseDataset): num_repeats(int): Number of repetition for this dataset to get multiple responses in special cases. 
""" + path = get_data_path(path) def processing_test(example): example['text'] = example.pop('prompt') @@ -105,10 +129,19 @@ class SanitizedMBPPDataset(BaseDataset): return example # train : test = 7 : 257 - train = load_dataset('json', data_files=path, - split='train[:7]').map(processing_test) - test = load_dataset('json', data_files=path, - split='train[7:264]').map(processing_test) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + train = MsDataset.load(path, + subset_name='sanitized', + split='train[:7]').map(processing_test) + test = MsDataset.load(path, + subset_name='sanitized', + split='train[7:264]').map(processing_test) + else: + train = load_dataset('json', data_files=path, + split='train[:7]').map(processing_test) + test = load_dataset('json', data_files=path, + split='train[7:264]').map(processing_test) test = concatenate_datasets([test] * num_repeats) return DatasetDict({'train': train, 'test': test}) @@ -134,6 +167,8 @@ class MBPPPlusDataset(BaseDataset): multiple responses in special cases. 
""" + path = get_data_path(path) + def processing_test(example): example['test_case'] = example['test_list'] example['test_list'] = '\n'.join(example['test_list']) diff --git a/opencompass/datasets/medbench/medbench.py b/opencompass/datasets/medbench/medbench.py index dfaaced1..54690947 100644 --- a/opencompass/datasets/medbench/medbench.py +++ b/opencompass/datasets/medbench/medbench.py @@ -5,6 +5,7 @@ from datasets import Dataset from sklearn.metrics import classification_report from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset from .math_equivalence import is_equiv @@ -23,6 +24,7 @@ class MedBenchDataset(BaseDataset): @staticmethod def load(path: str, name: str, setting_name: str): + path = get_data_path(path, local_mode=True) from .dataset_loader import load_dataset, load_dataset_as_result_schema assert setting_name in 'zero-shot', 'only support zero-shot setting' diff --git a/opencompass/datasets/mgsm.py b/opencompass/datasets/mgsm.py index a9d751a4..2d9e1eb3 100644 --- a/opencompass/datasets/mgsm.py +++ b/opencompass/datasets/mgsm.py @@ -4,6 +4,7 @@ from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -13,6 +14,7 @@ class MGSMSDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) src_lines = open(path, 'r', encoding='utf-8').readlines() data = {'question': [], 'answer': []} for lines in src_lines: diff --git a/opencompass/datasets/mmlu.py b/opencompass/datasets/mmlu.py index 76cd57f2..cac904d7 100644 --- a/opencompass/datasets/mmlu.py +++ b/opencompass/datasets/mmlu.py @@ -1,10 +1,12 @@ import csv import json import os.path as osp +from os import environ from datasets import Dataset, DatasetDict from 
opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -14,23 +16,43 @@ class MMLUDataset(BaseDataset): @staticmethod def load(path: str, name: str): + path = get_data_path(path) dataset = DatasetDict() - for split in ['dev', 'test']: - raw_data = [] - filename = osp.join(path, split, f'{name}_{split}.csv') - with open(filename, encoding='utf-8') as f: - reader = csv.reader(f) - for row in reader: - assert len(row) == 6 - raw_data.append({ - 'input': row[0], - 'A': row[1], - 'B': row[2], - 'C': row[3], - 'D': row[4], - 'target': row[5], + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + for split in ['dev', 'test']: + # 从 ModelScope 加载数据 + ms_dataset = MsDataset.load(path, + subset_name=name, + split=split) + dataset_list = [] + for line in ms_dataset: + dataset_list.append({ + 'input': line['question'], + 'A': line['choices'][0], + 'B': line['choices'][1], + 'C': line['choices'][2], + 'D': line['choices'][3], + 'target': 'ABCD'[line['answer']], }) - dataset[split] = Dataset.from_list(raw_data) + dataset[split] = Dataset.from_list(dataset_list) + else: + for split in ['dev', 'test']: + raw_data = [] + filename = osp.join(path, split, f'{name}_{split}.csv') + with open(filename, encoding='utf-8') as f: + reader = csv.reader(f) + for row in reader: + assert len(row) == 6 + raw_data.append({ + 'input': row[0], + 'A': row[1], + 'B': row[2], + 'C': row[3], + 'D': row[4], + 'target': row[5], + }) + dataset[split] = Dataset.from_list(raw_data) return dataset @@ -43,13 +65,22 @@ class MMLUDatasetClean(BaseDataset): import requests assert split == 'test', 'We only use test set for MMLU' - annotation_cache_path = osp.join( - path, split, f'MMLU_{split}_contamination_annotations.json') + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope.utils.config_ds import MS_DATASETS_CACHE + annotation_cache_path = osp.join( + MS_DATASETS_CACHE, + 
f'MMLU_{split}_contamination_annotations.json') + link_of_annotations = 'https://modelscope.cn/datasets/opencompass/Contamination_Detector/resolve/master/mmlu_annotations.json' # noqa + else: + annotation_cache_path = osp.join( + path, split, f'MMLU_{split}_contamination_annotations.json') + link_of_annotations = 'https://github.com/liyucheng09/Contamination_Detector/releases/download/v0.1.1rc2/mmlu_annotations.json' # noqa + if osp.exists(annotation_cache_path): with open(annotation_cache_path, 'r') as f: annotations = json.load(f) return annotations - link_of_annotations = 'https://github.com/liyucheng09/Contamination_Detector/releases/download/v0.1.1rc2/mmlu_annotations.json' # noqa + annotations = json.loads(requests.get(link_of_annotations).text) with open(annotation_cache_path, 'w') as f: json.dump(annotations, f) @@ -57,24 +88,29 @@ class MMLUDatasetClean(BaseDataset): @staticmethod def load(path: str, name: str): + path = get_data_path(path) dataset = DatasetDict() - for split in ['dev', 'test']: - raw_data = [] - filename = osp.join(path, split, f'{name}_{split}.csv') - if split == 'test': - annotations = MMLUDatasetClean.load_contamination_annotations( - path, split) - with open(filename, encoding='utf-8') as f: - reader = csv.reader(f) - for row_index, row in enumerate(reader): - assert len(row) == 6 + if environ.get('DATASET_SOURCE') == 'ModelScope': + for split in ['dev', 'test']: + from modelscope import MsDataset + + # 从 ModelScope 加载数据 + ms_dataset = MsDataset.load(path, + subset_name=name, + split=split) + if split == 'test': + annotations = \ + MMLUDatasetClean.load_contamination_annotations( + path, split) + dataset_list = [] + for row_index, line in enumerate(ms_dataset): item = { - 'input': row[0], - 'A': row[1], - 'B': row[2], - 'C': row[3], - 'D': row[4], - 'target': row[5], + 'input': line['question'], + 'A': line['choices'][0], + 'B': line['choices'][1], + 'C': line['choices'][2], + 'D': line['choices'][3], + 'target': 
'ABCD'[line['answer']], } if split == 'test': row_id = f'{name} {row_index}' @@ -83,6 +119,35 @@ class MMLUDatasetClean(BaseDataset): else: is_clean = 'not labeled' item['is_clean'] = is_clean - raw_data.append(item) - dataset[split] = Dataset.from_list(raw_data) + dataset_list.append(item) + dataset[split] = Dataset.from_list(dataset_list) + else: + for split in ['dev', 'test']: + raw_data = [] + filename = osp.join(path, split, f'{name}_{split}.csv') + if split == 'test': + annotations = \ + MMLUDatasetClean.load_contamination_annotations( + path, split) + with open(filename, encoding='utf-8') as f: + reader = csv.reader(f) + for row_index, row in enumerate(reader): + assert len(row) == 6 + item = { + 'input': row[0], + 'A': row[1], + 'B': row[2], + 'C': row[3], + 'D': row[4], + 'target': row[5], + } + if split == 'test': + row_id = f'{name} {row_index}' + if row_id in annotations: + is_clean = annotations[row_id][0] + else: + is_clean = 'not labeled' + item['is_clean'] = is_clean + raw_data.append(item) + dataset[split] = Dataset.from_list(raw_data) return dataset diff --git a/opencompass/datasets/multirc.py b/opencompass/datasets/multirc.py index 4f607b8a..7c8261e1 100644 --- a/opencompass/datasets/multirc.py +++ b/opencompass/datasets/multirc.py @@ -3,6 +3,7 @@ import json from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -12,6 +13,7 @@ class MultiRCDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) with open(path, 'r', errors='ignore') as in_f: rows = [] for line in in_f: @@ -41,10 +43,11 @@ class MultiRCDataset(BaseDataset): @LOAD_DATASET.register_module() -class MultiRCDataset_V2(BaseDataset): +class MultiRCDatasetV2(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) with open(path, 'r', errors='ignore') as in_f: rows = [] for line in in_f: diff --git 
a/opencompass/datasets/narrativeqa.py b/opencompass/datasets/narrativeqa.py index 4cdbe1c6..93a59cdd 100644 --- a/opencompass/datasets/narrativeqa.py +++ b/opencompass/datasets/narrativeqa.py @@ -1,6 +1,7 @@ from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -10,6 +11,7 @@ class NarrativeQADataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) import csv import os dataset_dict = DatasetDict() diff --git a/opencompass/datasets/natural_question.py b/opencompass/datasets/natural_question.py index 8ca61aa7..ab8356cd 100644 --- a/opencompass/datasets/natural_question.py +++ b/opencompass/datasets/natural_question.py @@ -1,11 +1,13 @@ import csv import json import os.path as osp +from os import environ from datasets import Dataset, DatasetDict from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from opencompass.utils.text_postprocessors import general_postprocess from .base import BaseDataset @@ -16,21 +18,35 @@ class NaturalQuestionDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path) dataset = DatasetDict() for split in ['dev', 'test']: - filename = osp.join(path, f'nq-{split}.qa.csv') - with open(filename, 'r', encoding='utf-8') as f: - reader = csv.reader(f, delimiter='\t') + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope.msdatasets import MsDataset + ms_dataset = MsDataset.load(path, split=split) raw_data = [] - for row in reader: - assert len(row) == 2 - question = row[0] - answers = eval(row[1]) + for row in ms_dataset: + question = row['question'] + answers = eval(row['answer']) if split == 'dev': answers = answers[0] raw_data.append({'question': question, 'answer': answers}) - dataset[split] = Dataset.from_list(raw_data) - + else: + 
filename = osp.join(path, f'nq-{split}.qa.csv') + with open(filename, 'r', encoding='utf-8') as f: + reader = csv.reader(f, delimiter='\t') + raw_data = [] + for row in reader: + assert len(row) == 2 + question = row[0] + answers = eval(row[1]) + if split == 'dev': + answers = answers[0] + raw_data.append({ + 'question': question, + 'answer': answers + }) + dataset[split] = Dataset.from_list(raw_data) return dataset @@ -39,6 +55,7 @@ class NQOpenDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) dataset = DatasetDict() for split in ['validation', 'train']: filename = osp.join(path, f'nq-open-{split}.jsonl') diff --git a/opencompass/datasets/natural_question_cn.py b/opencompass/datasets/natural_question_cn.py index 82e13f35..bbdc6f27 100644 --- a/opencompass/datasets/natural_question_cn.py +++ b/opencompass/datasets/natural_question_cn.py @@ -4,15 +4,17 @@ import os.path as osp from datasets import Dataset, DatasetDict from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.utils import get_data_path from opencompass.utils.text_postprocessors import general_postprocess from .base import BaseDataset -class NaturalQuestionDataset_CN(BaseDataset): +class NaturalQuestionDatasetCN(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) dataset = DatasetDict() for split in ['dev', 'test']: filename = osp.join(path, f'{split}.jsonl') @@ -28,7 +30,7 @@ class NaturalQuestionDataset_CN(BaseDataset): return dataset -class NQEvaluator_CN(BaseEvaluator): +class NQEvaluatorCN(BaseEvaluator): def score(self, predictions, references): if len(predictions) != len(references): diff --git a/opencompass/datasets/obqa.py b/opencompass/datasets/obqa.py index cd9cb4bc..66354e2c 100644 --- a/opencompass/datasets/obqa.py +++ b/opencompass/datasets/obqa.py @@ -1,8 +1,10 @@ import json +from os import environ from datasets import Dataset from opencompass.registry import LOAD_DATASET 
+from opencompass.utils import get_data_path from .base import BaseDataset @@ -11,46 +13,83 @@ from .base import BaseDataset class OBQADataset(BaseDataset): @staticmethod - def load(path): + def load(path, name='main'): + path = get_data_path(path) dataset_list = [] - with open(path, 'r') as f: - for line in f: - line = json.loads(line) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, subset_name=name, split='test') + for line in ms_dataset: item = { - 'A': line['question']['choices'][0]['text'], - 'B': line['question']['choices'][1]['text'], - 'C': line['question']['choices'][2]['text'], - 'D': line['question']['choices'][3]['text'], - 'question_stem': line['question']['stem'], + 'A': line['choices']['text'][0], + 'B': line['choices']['text'][1], + 'C': line['choices']['text'][2], + 'D': line['choices']['text'][3], + 'question_stem': line['question_stem'], 'answerKey': line['answerKey'], } if 'fact1' in line: item['fact1'] = line['fact1'] dataset_list.append(item) + else: + with open(path, 'r') as f: + for line in f: + line = json.loads(line) + item = { + 'A': line['question']['choices'][0]['text'], + 'B': line['question']['choices'][1]['text'], + 'C': line['question']['choices'][2]['text'], + 'D': line['question']['choices'][3]['text'], + 'question_stem': line['question']['stem'], + 'answerKey': line['answerKey'], + } + if 'fact1' in line: + item['fact1'] = line['fact1'] + dataset_list.append(item) return Dataset.from_list(dataset_list) @LOAD_DATASET.register_module() -class OBQADataset_V2(BaseDataset): +class OBQADatasetV2(BaseDataset): @staticmethod - def load(path): + def load(path, name='main'): + path = get_data_path(path) dataset_list = [] - with open(path, 'r') as f: - for line in f: - line = json.loads(line) - question = line['question']['stem'] + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, 
subset_name=name, split='test') + for line in ms_dataset: + question = line['question_stem'] if not question.endswith('?'): question += ' what?' item = { - 'A': line['question']['choices'][0]['text'], - 'B': line['question']['choices'][1]['text'], - 'C': line['question']['choices'][2]['text'], - 'D': line['question']['choices'][3]['text'], + 'A': line['choices']['text'][0], + 'B': line['choices']['text'][1], + 'C': line['choices']['text'][2], + 'D': line['choices']['text'][3], 'question_stem': question, 'answerKey': line['answerKey'], } if 'fact1' in line: item['fact1'] = line['fact1'] dataset_list.append(item) + else: + with open(path, 'r') as f: + for line in f: + line = json.loads(line) + question = line['question']['stem'] + if not question.endswith('?'): + question += ' what?' + item = { + 'A': line['question']['choices'][0]['text'], + 'B': line['question']['choices'][1]['text'], + 'C': line['question']['choices'][2]['text'], + 'D': line['question']['choices'][3]['text'], + 'question_stem': question, + 'answerKey': line['answerKey'], + } + if 'fact1' in line: + item['fact1'] = line['fact1'] + dataset_list.append(item) return Dataset.from_list(dataset_list) diff --git a/opencompass/datasets/piqa.py b/opencompass/datasets/piqa.py index 89ac5ec2..456eb464 100644 --- a/opencompass/datasets/piqa.py +++ b/opencompass/datasets/piqa.py @@ -1,15 +1,17 @@ import json import os +from os import environ from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @LOAD_DATASET.register_module() -class piqaDataset(BaseDataset): +class PIQADataset(BaseDataset): @staticmethod def load_single(path, data_filename, label_filename): @@ -24,21 +26,35 @@ class piqaDataset(BaseDataset): for data, label in zip(data_lines, label_lines): i = json.loads(data.strip()) i['label'] = int(label.strip()) + del i['id'] dataset.append(i) return Dataset.from_list(dataset) @staticmethod def 
load(path): - train_dataset = piqaDataset.load_single(path, 'train.jsonl', - 'train-labels.lst') - val_dataset = piqaDataset.load_single(path, 'dev.jsonl', - 'dev-labels.lst') - return DatasetDict({'train': train_dataset, 'validation': val_dataset}) + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path) + dataset = DatasetDict({ + 'train': ms_dataset['train'], + 'validation': ms_dataset['validation'] + }) + else: + train_dataset = PIQADataset.load_single(path, 'train.jsonl', + 'train-labels.lst') + val_dataset = PIQADataset.load_single(path, 'dev.jsonl', + 'dev-labels.lst') + dataset = DatasetDict({ + 'train': train_dataset, + 'validation': val_dataset + }) + return dataset @LOAD_DATASET.register_module() -class piqaDataset_V2(BaseDataset): +class PIQADatasetV2(BaseDataset): @staticmethod def load_single(path, data_filename, label_filename): @@ -57,21 +73,47 @@ class piqaDataset_V2(BaseDataset): i['answer'] = 'NULL' else: i['answer'] = 'AB'[label] + del i['id'] dataset.append(i) return Dataset.from_list(dataset) @staticmethod def load(path): - train_dataset = piqaDataset_V2.load_single(path, 'train.jsonl', - 'train-labels.lst') - val_dataset = piqaDataset_V2.load_single(path, 'dev.jsonl', - 'dev-labels.lst') - return DatasetDict({'train': train_dataset, 'validation': val_dataset}) + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = DatasetDict() + for split in ['train', 'validation']: + ms_dataset = MsDataset.load(path, split=split) + dataset_list = [] + for item in ms_dataset: + label = item['label'] + dataset_list.append({ + 'goal': + item['goal'], + 'sol1': + item['sol1'], + 'sol2': + item['sol2'], + 'answer': + 'NULL' if label < 0 else 'AB'[label] + }) + dataset[split] = Dataset.from_list(dataset_list) + else: + train_dataset = PIQADatasetV2.load_single(path, 'train.jsonl', + 
'train-labels.lst') + val_dataset = PIQADatasetV2.load_single(path, 'dev.jsonl', + 'dev-labels.lst') + dataset = DatasetDict({ + 'train': train_dataset, + 'validation': val_dataset + }) + return dataset @LOAD_DATASET.register_module() -class piqaDataset_V3(BaseDataset): +class PIQADatasetV3(BaseDataset): @staticmethod def load_single(path, data_filename, label_filename): @@ -94,15 +136,43 @@ class piqaDataset_V3(BaseDataset): else: i['sol1'] = i['sol1'][0].lower() + i['sol1'][1:] i['sol2'] = i['sol2'][0].lower() + i['sol2'][1:] - + del i['id'] dataset.append(i) return Dataset.from_list(dataset) @staticmethod def load(path): - train_dataset = piqaDataset_V3.load_single(path, 'train.jsonl', - 'train-labels.lst') - val_dataset = piqaDataset_V3.load_single(path, 'dev.jsonl', - 'dev-labels.lst') - return DatasetDict({'train': train_dataset, 'validation': val_dataset}) + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = DatasetDict() + for split in ['train', 'validation']: + ms_dataset = MsDataset.load(path, split=split) + dataset_list = [] + for item in ms_dataset: + label = item['label'] + goal = item['goal'][0].upper() + item['goal'][1:] + if goal.endswith('?') or goal.endswith('.'): + sol1 = item['sol1'][0].upper() + item['sol1'][1:] + sol2 = item['sol2'][0].upper() + item['sol2'][1:] + else: + sol1 = item['sol1'][0].lower() + item['sol1'][1:] + sol2 = item['sol2'][0].lower() + item['sol2'][1:] + dataset_list.append({ + 'goal': goal, + 'sol1': sol1, + 'sol2': sol2, + 'label': label + }) + dataset[split] = Dataset.from_list(dataset_list) + else: + train_dataset = PIQADatasetV3.load_single(path, 'train.jsonl', + 'train-labels.lst') + val_dataset = PIQADatasetV3.load_single(path, 'dev.jsonl', + 'dev-labels.lst') + dataset = DatasetDict({ + 'train': train_dataset, + 'validation': val_dataset + }) + return dataset diff --git a/opencompass/datasets/py150.py b/opencompass/datasets/py150.py index 
d478357d..2e977396 100644 --- a/opencompass/datasets/py150.py +++ b/opencompass/datasets/py150.py @@ -4,6 +4,7 @@ import re from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -30,6 +31,7 @@ class Py150Dataset(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) lines = open(path, 'r').readlines() rows = [] for line in lines: diff --git a/opencompass/datasets/qasper.py b/opencompass/datasets/qasper.py index b860c612..9e26d227 100644 --- a/opencompass/datasets/qasper.py +++ b/opencompass/datasets/qasper.py @@ -1,6 +1,7 @@ from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -10,6 +11,7 @@ class QASPERDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) import json import os dataset_dict = DatasetDict() diff --git a/opencompass/datasets/qaspercut.py b/opencompass/datasets/qaspercut.py index d892dea9..5ab4ff9b 100644 --- a/opencompass/datasets/qaspercut.py +++ b/opencompass/datasets/qaspercut.py @@ -1,6 +1,7 @@ from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -10,6 +11,7 @@ class QASPERCUTDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) import json import os dataset_dict = DatasetDict() diff --git a/opencompass/datasets/race.py b/opencompass/datasets/race.py index f5be41cf..6940c11c 100644 --- a/opencompass/datasets/race.py +++ b/opencompass/datasets/race.py @@ -1,9 +1,11 @@ import json import os +from os import environ from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -13,13 +15,18 @@ class 
RaceDataset(BaseDataset): @staticmethod def load(path: str, name: str): + path = get_data_path(path) dataset = {} - for split in ['validation', 'test']: - jsonl_path = os.path.join(path, split, f'{name}.jsonl') - dataset_list = [] - with open(jsonl_path, 'r', encoding='utf-8') as f: - for line in f: - line = json.loads(line) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + for split in ['validation', 'test']: + # 从 ModelScope 加载数据 + ms_dataset = MsDataset.load(path, + subset_name=name, + split=split) + + dataset_list = [] + for line in ms_dataset: dataset_list.append({ 'article': line['article'], 'question': line['question'], @@ -29,5 +36,22 @@ class RaceDataset(BaseDataset): 'D': line['options'][3], 'answer': line['answer'], }) - dataset[split] = Dataset.from_list(dataset_list) + dataset[split] = Dataset.from_list(dataset_list) + else: + for split in ['validation', 'test']: + jsonl_path = os.path.join(path, split, f'{name}.jsonl') + dataset_list = [] + with open(jsonl_path, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line) + dataset_list.append({ + 'article': line['article'], + 'question': line['question'], + 'A': line['options'][0], + 'B': line['options'][1], + 'C': line['options'][2], + 'D': line['options'][3], + 'answer': line['answer'], + }) + dataset[split] = Dataset.from_list(dataset_list) return DatasetDict(dataset) diff --git a/opencompass/datasets/realtoxicprompts.py b/opencompass/datasets/realtoxicprompts.py index 9e27834f..6d5929da 100644 --- a/opencompass/datasets/realtoxicprompts.py +++ b/opencompass/datasets/realtoxicprompts.py @@ -1,6 +1,7 @@ from datasets import Dataset, DatasetDict, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -20,7 +21,9 @@ class RealToxicPromptsDataset(BaseDataset): 'cannot track it online or use offline mode, ' 'please set local file path directly.') else: - dataset = 
Dataset.from_file(kwargs.pop('path')) + path = kwargs.pop('path') + path = get_data_path(path, local_mode=True) + dataset = Dataset.from_file(path) dataset = DatasetDict(train=dataset) def preprocess(example): diff --git a/opencompass/datasets/record.py b/opencompass/datasets/record.py index 7f90c535..f44276cc 100644 --- a/opencompass/datasets/record.py +++ b/opencompass/datasets/record.py @@ -3,6 +3,7 @@ import json from datasets import Dataset from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path from .base import BaseDataset @@ -12,6 +13,7 @@ class ReCoRDDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) with open(path, 'r', errors='ignore') as in_f: rows = [] for i, line in enumerate(in_f): @@ -43,10 +45,11 @@ class ReCoRDDataset(BaseDataset): return dataset -class ReCoRDDataset_V2(BaseDataset): +class ReCoRDDatasetV2(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) with open(path, 'r', errors='ignore') as in_f: rows = [] for i, line in enumerate(in_f): diff --git a/opencompass/datasets/rolebench.py b/opencompass/datasets/rolebench.py index 22e77220..42b505c1 100644 --- a/opencompass/datasets/rolebench.py +++ b/opencompass/datasets/rolebench.py @@ -4,6 +4,7 @@ import os from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -25,6 +26,7 @@ class RoleBenchBaseDataset(BaseDataset): @staticmethod def load_desc(path): + path = get_data_path(path, local_mode=True) with open(path, 'r', encoding='utf-8') as f: desc_list = json.load(f) return desc_list @@ -50,6 +52,7 @@ class InstructionGeneralizationEnglishDataset(RoleBenchBaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) desc_list = RoleBenchBaseDataset.load_desc( os.path.join(path, 'profiles-eng/desc.json')) path = 
os.path.join(path, 'rolebench-eng/instruction-generalization') @@ -63,6 +66,7 @@ class RoleGeneralizationEnglishDataset(RoleBenchBaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) desc_list = RoleBenchBaseDataset.load_desc( os.path.join(path, 'profiles-eng/desc.json')) path = os.path.join(path, 'rolebench-eng/role-generalization') diff --git a/opencompass/datasets/safety.py b/opencompass/datasets/safety.py index 1cd9550e..6608fd32 100644 --- a/opencompass/datasets/safety.py +++ b/opencompass/datasets/safety.py @@ -1,6 +1,7 @@ from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -10,6 +11,7 @@ class SafetyDataset(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) dataset = DatasetDict() data_list = list() diff --git a/opencompass/datasets/scibench.py b/opencompass/datasets/scibench.py index 2403b632..d7a9e875 100644 --- a/opencompass/datasets/scibench.py +++ b/opencompass/datasets/scibench.py @@ -5,6 +5,7 @@ import re from datasets import Dataset from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path from .base import BaseDataset @@ -14,6 +15,7 @@ class ScibenchDataset(BaseDataset): @staticmethod def load(path: str, name: str): + path = get_data_path(path, local_mode=True) train_data = [] filename = osp.join(path, f'{name}.json') diff --git a/opencompass/datasets/siqa.py b/opencompass/datasets/siqa.py index 8df45bda..152e8ce1 100644 --- a/opencompass/datasets/siqa.py +++ b/opencompass/datasets/siqa.py @@ -1,9 +1,11 @@ import json import os +from os import environ from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -31,11 +33,28 @@ class siqaDataset(BaseDataset): @staticmethod def load(path): - train_dataset = 
siqaDataset.load_single(path, 'train.jsonl', - 'train-labels.lst') - val_dataset = siqaDataset.load_single(path, 'dev.jsonl', - 'dev-labels.lst') - return DatasetDict({'train': train_dataset, 'validation': val_dataset}) + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = DatasetDict() + for split in ['train', 'validation']: + data_list = [] + ms_dataset = MsDataset.load(path, split=split) + for item in ms_dataset: + row = item + row['label'] = int(item['label']) + data_list.append(row) + dataset[split] = Dataset.from_list(data_list) + return dataset + else: + train_dataset = siqaDataset.load_single(path, 'train.jsonl', + 'train-labels.lst') + val_dataset = siqaDataset.load_single(path, 'dev.jsonl', + 'dev-labels.lst') + return DatasetDict({ + 'train': train_dataset, + 'validation': val_dataset + }) @LOAD_DATASET.register_module() @@ -73,15 +92,44 @@ class siqaDataset_V2(BaseDataset): @staticmethod def load(path): - train_dataset = siqaDataset_V2.load_single(path, 'train.jsonl', - 'train-labels.lst') - val_dataset = siqaDataset_V2.load_single(path, 'dev.jsonl', - 'dev-labels.lst') - return DatasetDict({'train': train_dataset, 'validation': val_dataset}) + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = DatasetDict() + for split in ['train', 'validation']: + data_list = [] + ms_dataset = MsDataset.load(path, split=split) + for item in ms_dataset: + row = item + label = item['label'] + # some preprocessing + row['all_labels'] = { + 'candidates': [ + [f'A. {item["answerA"]}', 'A', item['answerA']], + [f'B. {item["answerB"]}', 'B', item['answerB']], + [f'C. 
{item["answerC"]}', 'C', item['answerC']], + ], + 'label': + int(label) - 1 + } + row['label'] = ' ABC'[int(label)] + + data_list.append(row) + dataset[split] = Dataset.from_list(data_list) + else: + train_dataset = siqaDataset_V2.load_single(path, 'train.jsonl', + 'train-labels.lst') + val_dataset = siqaDataset_V2.load_single(path, 'dev.jsonl', + 'dev-labels.lst') + dataset = DatasetDict({ + 'train': train_dataset, + 'validation': val_dataset + }) + return dataset @LOAD_DATASET.register_module() -class siqaDataset_V3(BaseDataset): +class SiqaDatasetV3(BaseDataset): """Disconnect from HuggingFace version of HFDataset.""" @staticmethod @@ -106,9 +154,32 @@ class siqaDataset_V3(BaseDataset): @staticmethod def load(path): - train_dataset = siqaDataset_V3.load_single(path, 'train.jsonl', - 'train-labels.lst') - val_dataset = siqaDataset_V3.load_single(path, 'dev.jsonl', - 'dev-labels.lst') - - return DatasetDict({'train': train_dataset, 'validation': val_dataset}) + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = DatasetDict() + for split in ['train', 'validation']: + data_list = [] + ms_dataset = MsDataset.load(path, split=split) + for item in ms_dataset: + row = item + label = item['label'] + # some preprocessing + row['A'] = item['answerA'] + row['B'] = item['answerB'] + row['C'] = item['answerC'] + row['answer'] = 'ABC'[int(label) - 1] + del row['answerA'], row['answerB'], row['answerC'], row[ + 'label'] + data_list.append(row) + dataset[split] = Dataset.from_list(data_list) + else: + train_dataset = SiqaDatasetV3.load_single(path, 'train.jsonl', + 'train-labels.lst') + val_dataset = SiqaDatasetV3.load_single(path, 'dev.jsonl', + 'dev-labels.lst') + dataset = DatasetDict({ + 'train': train_dataset, + 'validation': val_dataset + }) + return dataset diff --git a/opencompass/datasets/squad20.py b/opencompass/datasets/squad20.py index a5f81966..3e2f5a2c 100644 --- 
a/opencompass/datasets/squad20.py +++ b/opencompass/datasets/squad20.py @@ -3,6 +3,7 @@ import json from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.utils import get_data_path from opencompass.utils.text_postprocessors import general_postprocess from .base import BaseDataset @@ -12,6 +13,7 @@ class SQuAD20Dataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) with open(path, 'r') as f: data = json.load(f) data = data['data'] diff --git a/opencompass/datasets/storycloze.py b/opencompass/datasets/storycloze.py index a0e2ec6b..517d6fd7 100644 --- a/opencompass/datasets/storycloze.py +++ b/opencompass/datasets/storycloze.py @@ -1,44 +1,62 @@ import json import os +from os import environ from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @LOAD_DATASET.register_module() -class storyclozeDataset(BaseDataset): +class StoryClozeDataset(BaseDataset): @staticmethod def load(path, lang): + path = get_data_path(path) dataset_list = [] for split in ['train', 'eval']: - split_path = os.path.join(path, f'{lang}_{split}.jsonl') - with open(split_path, 'r', encoding='utf-8') as f: - for line in f: - line = json.loads(line) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, + subset_name=lang, + split=split) + for line in ms_dataset: line['context'] = ' '.join([ line['input_sentence_1'], line['input_sentence_2'], line['input_sentence_3'], line['input_sentence_4'] ]) dataset_list.append(line) + else: + split_path = os.path.join(path, f'{lang}_{split}.jsonl') + with open(split_path, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line) + line['context'] = ' '.join([ + line['input_sentence_1'], line['input_sentence_2'], + line['input_sentence_3'], line['input_sentence_4'] + ]) + 
dataset_list.append(line) dataset_list = Dataset.from_list(dataset_list) return DatasetDict({'test': dataset_list}) @LOAD_DATASET.register_module() -class storyclozeDataset_V2(BaseDataset): +class StoryClozeDatasetV2(BaseDataset): @staticmethod def load(path, lang): + path = get_data_path(path) dataset_list = [] for split in ['train', 'eval']: - split_path = os.path.join(path, f'{lang}_{split}.jsonl') - with open(split_path, 'r', encoding='utf-8') as f: - for line in f: - line = json.loads(line) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, + subset_name=lang, + split=split) + for line in ms_dataset: line['context'] = ' '.join([ line['input_sentence_1'], line['input_sentence_2'], line['input_sentence_3'], line['input_sentence_4'] @@ -46,5 +64,17 @@ class storyclozeDataset_V2(BaseDataset): line['answer_right_ending'] = ' AB'[ line['answer_right_ending']] dataset_list.append(line) + else: + split_path = os.path.join(path, f'{lang}_{split}.jsonl') + with open(split_path, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line) + line['context'] = ' '.join([ + line['input_sentence_1'], line['input_sentence_2'], + line['input_sentence_3'], line['input_sentence_4'] + ]) + line['answer_right_ending'] = ' AB'[ + line['answer_right_ending']] + dataset_list.append(line) dataset_list = Dataset.from_list(dataset_list) return dataset_list diff --git a/opencompass/datasets/strategyqa.py b/opencompass/datasets/strategyqa.py index 5e0117f3..47f70cf3 100644 --- a/opencompass/datasets/strategyqa.py +++ b/opencompass/datasets/strategyqa.py @@ -1,9 +1,11 @@ import json import re +from os import environ from datasets import Dataset from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path from .base import BaseDataset @@ -28,6 +30,15 @@ class StrategyQADataset(BaseDataset): @staticmethod def load(path): - with open(path, 'r', 
encoding='utf-8') as f: - dataset = json.load(f) - return Dataset.from_list(dataset) + path = get_data_path(path) + + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load('opencompass/strategy_qa', + split='train', + trust_remote_code=True) + else: + with open(path, 'r', encoding='utf-8') as f: + dataset = json.load(f) + dataset = Dataset.from_list(dataset) + return dataset diff --git a/opencompass/datasets/summedits.py b/opencompass/datasets/summedits.py index 37927726..fdc3c653 100644 --- a/opencompass/datasets/summedits.py +++ b/opencompass/datasets/summedits.py @@ -1,8 +1,10 @@ import json +from os import environ from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -12,10 +14,19 @@ class SummeditsDataset_V2(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path) dataset = [] - with open(path, 'r') as f: - for line in f: - line = json.loads(line) - line['label'] = 'BA'[line['label']] - dataset.append(line) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, split='train') + for line in ms_dataset: + row = line + row['label'] = 'BA'[line['label']] + dataset.append(row) + else: + with open(path, 'r') as f: + for line in f: + line = json.loads(line) + line['label'] = 'BA'[line['label']] + dataset.append(line) return Dataset.from_list(dataset) diff --git a/opencompass/datasets/summscreen.py b/opencompass/datasets/summscreen.py index fb3707e2..9023847b 100644 --- a/opencompass/datasets/summscreen.py +++ b/opencompass/datasets/summscreen.py @@ -1,6 +1,7 @@ from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -10,6 +11,7 @@ class SummScreenDataset(BaseDataset): @staticmethod def load(path: str): + path = 
get_data_path(path, local_mode=True) import json import os dataset_dict = DatasetDict() diff --git a/opencompass/datasets/svamp.py b/opencompass/datasets/svamp.py index 801c26db..082a8ba7 100644 --- a/opencompass/datasets/svamp.py +++ b/opencompass/datasets/svamp.py @@ -3,6 +3,7 @@ import json from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -12,6 +13,7 @@ class SVAMPDataset(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) dataset = [] with open(path, 'r', encoding='utf-8') as f: for line in f: diff --git a/opencompass/datasets/tabmwp.py b/opencompass/datasets/tabmwp.py index ac5952ea..b861d753 100644 --- a/opencompass/datasets/tabmwp.py +++ b/opencompass/datasets/tabmwp.py @@ -9,6 +9,7 @@ from datasets import Dataset, DatasetDict from opencompass.openicl.icl_evaluator.icl_hf_evaluator import AccEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -208,6 +209,7 @@ class TabMWPDataset(BaseDataset): # https://github.com/lupantech/PromptPG/tree/main @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) dataset = DatasetDict() for split in ['dev', 'test', 'train']: raw_data = [] diff --git a/opencompass/datasets/taco.py b/opencompass/datasets/taco.py index fc4e89be..e48cd260 100644 --- a/opencompass/datasets/taco.py +++ b/opencompass/datasets/taco.py @@ -27,6 +27,7 @@ except ImportError: from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -38,6 +39,7 @@ class TACODataset(BaseDataset): @staticmethod def load(path: str, num_repeats: int = 1, difficulty='ALL'): + path = get_data_path(path, local_mode=True) dataset = load_from_disk(path) new_dataset = 
DatasetDict() # add new column "starter" in the prompt diff --git a/opencompass/datasets/teval/__init__.py b/opencompass/datasets/teval/__init__.py index fc498d8a..631a9ae5 100644 --- a/opencompass/datasets/teval/__init__.py +++ b/opencompass/datasets/teval/__init__.py @@ -6,6 +6,7 @@ import mmengine from datasets import Dataset, DatasetDict from opencompass.registry import TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -16,6 +17,7 @@ class TEvalDataset(BaseDataset): super().__init__(reader_cfg=reader_cfg, **kwargs) def load(self, path: str, name: str): + path = get_data_path(path, local_mode=True) dataset = DatasetDict() data = mmengine.load(osp.join(path, f'{name}.json')) diff --git a/opencompass/datasets/tnews.py b/opencompass/datasets/tnews.py index 606ea40c..bfc1e5eb 100644 --- a/opencompass/datasets/tnews.py +++ b/opencompass/datasets/tnews.py @@ -3,6 +3,7 @@ import json from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -30,7 +31,9 @@ class TNewsDataset(BaseDataset): 'news_story': '故事类别新闻', 'news_stock': '股票市场类别新闻', } - + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) def preprocess(example): @@ -44,7 +47,7 @@ class TNewsDataset(BaseDataset): @LOAD_DATASET.register_module() -class TNewsDataset_V2(BaseDataset): +class TNewsDatasetV2(BaseDataset): @staticmethod def load(path): @@ -65,7 +68,7 @@ class TNewsDataset_V2(BaseDataset): 'news_story': 'N', 'news_stock': 'O', } - + path = get_data_path(path, local_mode=True) data = [] with open(path, 'r', encoding='utf-8') as f: for line in f: diff --git a/opencompass/datasets/triviaqa.py b/opencompass/datasets/triviaqa.py index 0c799071..01d5592b 100644 --- a/opencompass/datasets/triviaqa.py +++ b/opencompass/datasets/triviaqa.py @@ -1,11 +1,13 @@ import csv 
import json import os.path as osp +from os import environ from datasets import Dataset, DatasetDict from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path from opencompass.utils.text_postprocessors import general_postprocess from .base import BaseDataset @@ -16,46 +18,85 @@ class TriviaQADataset(BaseDataset): @staticmethod def load(path: str): - dataset = DatasetDict() - for split in ['dev', 'test']: - filename = osp.join(path, f'trivia-{split}.qa.csv') - with open(filename, 'r', encoding='utf-8') as f: - reader = csv.reader(f, delimiter='\t') + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = DatasetDict() + for split in ['dev', 'test']: + ms_dataset = MsDataset.load(path, + subset_name='v1', + split=split) raw_data = [] - for row in reader: - assert len(row) == 2 - question = row[0] - answers = eval(row[1]) + for row in ms_dataset: + question = row['question'] + answers = eval(row['answer']) if split == 'test': answers = answers[0] raw_data.append({'question': question, 'answer': answers}) dataset[split] = Dataset.from_list(raw_data) + else: + dataset = DatasetDict() + for split in ['dev', 'test']: + filename = osp.join(path, f'trivia-{split}.qa.csv') + with open(filename, 'r', encoding='utf-8') as f: + reader = csv.reader(f, delimiter='\t') + raw_data = [] + for row in reader: + assert len(row) == 2 + question = row[0] + answers = eval(row[1]) + if split == 'test': + answers = answers[0] + raw_data.append({ + 'question': question, + 'answer': answers + }) + dataset[split] = Dataset.from_list(raw_data) return dataset @LOAD_DATASET.register_module() -class TriviaQADataset_V2(BaseDataset): +class TriviaQADatasetV2(BaseDataset): @staticmethod def load(path: str): - dataset = DatasetDict() - for split in ['validation', 'train']: - filename = osp.join(path, 
f'triviaqa-{split}.jsonl') + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = DatasetDict() + dataset['train'] = MsDataset.load(path, + subset_name='v2', + split='train') + # validation + ms_dataset = MsDataset.load(path, + subset_name='v2', + split='validation') raw_data = [] - with open(filename, 'r', encoding='utf-8') as f: - for doc in f: - doc = json.loads(doc) - raw_data.append(doc) - dataset[split] = Dataset.from_list(raw_data) + for row in ms_dataset: + question = row['question'] + answers = eval(row['answer']) + raw_data.append({'question': question, 'answer': answers}) + dataset['validation'] = Dataset.from_list(raw_data) + else: + dataset = DatasetDict() + for split in ['validation', 'train']: + filename = osp.join(path, f'triviaqa-{split}.jsonl') + raw_data = [] + with open(filename, 'r', encoding='utf-8') as f: + for doc in f: + doc = json.loads(doc) + raw_data.append(doc) + dataset[split] = Dataset.from_list(raw_data) return dataset @LOAD_DATASET.register_module() -class TriviaQADataset_V3(BaseDataset): +class TriviaQADatasetV3(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path) data_list = [] with open(path, 'r', encoding='utf-8') as f: for doc in f: diff --git a/opencompass/datasets/triviaqarc.py b/opencompass/datasets/triviaqarc.py index ccdf69d8..6f0389ce 100644 --- a/opencompass/datasets/triviaqarc.py +++ b/opencompass/datasets/triviaqarc.py @@ -1,6 +1,7 @@ from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -10,6 +11,7 @@ class TriviaQArcDataset(BaseDataset): @staticmethod def load(path: str): + path = get_data_path(path, local_mode=True) import json import os dataset_dict = DatasetDict() diff --git a/opencompass/datasets/tydiqa.py b/opencompass/datasets/tydiqa.py index eebbab29..c27e738d 100644 --- a/opencompass/datasets/tydiqa.py 
+++ b/opencompass/datasets/tydiqa.py @@ -2,10 +2,12 @@ import json import os import re from collections import Counter +from os import environ from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.utils import get_data_path from opencompass.utils.text_postprocessors import general_postprocess from .base import BaseDataset @@ -15,14 +17,25 @@ class TydiQADataset(BaseDataset): @staticmethod def load(path, lang): - path = os.path.join(path, 'dev', f'{lang}-dev.jsonl') - dataset_list = [] - with open(path, 'r', encoding='utf-8') as f: - for line in f: - line = json.loads(line) + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, subset_name=lang, split='dev') + dataset_list = [] + for line in ms_dataset: + row = line answer = list(set([i['text'] for i in line['answers']])) - line['answer'] = answer - dataset_list.append(line) + row['answer'] = answer + dataset_list.append(row) + else: + path = os.path.join(path, 'dev', f'{lang}-dev.jsonl') + dataset_list = [] + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line) + answer = list(set([i['text'] for i in line['answers']])) + line['answer'] = answer + dataset_list.append(line) return Dataset.from_list(dataset_list) diff --git a/opencompass/datasets/wic.py b/opencompass/datasets/wic.py index 7b36161b..068a482b 100644 --- a/opencompass/datasets/wic.py +++ b/opencompass/datasets/wic.py @@ -3,6 +3,7 @@ import json from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -13,6 +14,9 @@ class WiCDataset(BaseDataset): @staticmethod def load(**kwargs): + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) def preprocess(example): @@ -28,10 +32,11 @@ 
class WiCDataset(BaseDataset): @LOAD_DATASET.register_module() -class WiCDataset_V2(BaseDataset): +class WiCDatasetV2(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) dataset = [] with open(path, 'r') as f: for line in f: diff --git a/opencompass/datasets/wikibench.py b/opencompass/datasets/wikibench.py index ecebf5f0..9e0e5743 100644 --- a/opencompass/datasets/wikibench.py +++ b/opencompass/datasets/wikibench.py @@ -4,6 +4,7 @@ import json from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -21,6 +22,7 @@ class WikiBenchDataset(BaseDataset): @staticmethod def load(path: str, name: str): + path = get_data_path(path, local_mode=True) circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC'] diff --git a/opencompass/datasets/winograd.py b/opencompass/datasets/winograd.py index 0a3337c7..dc3f0b09 100644 --- a/opencompass/datasets/winograd.py +++ b/opencompass/datasets/winograd.py @@ -6,7 +6,7 @@ from .base import BaseDataset @LOAD_DATASET.register_module() -class winogradDataset(BaseDataset): +class WinogradDataset(BaseDataset): @staticmethod def load(**kwargs): diff --git a/opencompass/datasets/winogrande.py b/opencompass/datasets/winogrande.py index 8ea8d27b..57097e3f 100644 --- a/opencompass/datasets/winogrande.py +++ b/opencompass/datasets/winogrande.py @@ -1,24 +1,30 @@ import json import os +from os import environ from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @LOAD_DATASET.register_module() -class winograndeDataset(BaseDataset): - """Disconnect from Huggingface, winograndeDataset.""" +class WinograndeDataset(BaseDataset): + """Disconnect from Huggingface, WinograndeDataset.""" @staticmethod def load(path): - path = os.path.join(path, 'dev.jsonl') - dataset_list = [] - with open(path, 'r', encoding='utf-8') as f: 
- for line in f: - line = json.loads(line) + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, + subset_name='winogrande_xs', + split='validation', + trust_remote_code=True) + dataset_list = [] + for line in ms_dataset: prompt = line['sentence'] continue_prompt = prompt.split('_')[1] data_item = { @@ -31,21 +37,44 @@ class winograndeDataset(BaseDataset): 'cont': continue_prompt, } dataset_list.append(data_item) - dataset_list = Dataset.from_list(dataset_list) + dataset_list = Dataset.from_list(dataset_list) + else: + path = os.path.join(path, 'dev.jsonl') + dataset_list = [] + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line) + prompt = line['sentence'] + continue_prompt = prompt.split('_')[1] + data_item = { + 'prompt': prompt, + 'only_option1': line['option1'], + 'only_option2': line['option2'], + 'opt1': prompt.replace('_', line['option1']), + 'opt2': prompt.replace('_', line['option2']), + 'answer': line['answer'], + 'cont': continue_prompt, + } + dataset_list.append(data_item) + dataset_list = Dataset.from_list(dataset_list) return dataset_list @LOAD_DATASET.register_module() -class winograndeDataset_V2(BaseDataset): - """Disconnect from Huggingface, winograndeDataset_V2.""" +class WinograndeDatasetV2(BaseDataset): + """Disconnect from Huggingface, WinograndeDatasetV2.""" @staticmethod def load(path): - path = os.path.join(path, 'dev.jsonl') - dataset_list = [] - with open(path, 'r', encoding='utf-8') as f: - for line in f: - line = json.loads(line) + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, + subset_name='winogrande_xs', + split='validation', + trust_remote_code=True) + dataset_list = [] + for line in ms_dataset: prompt = line['sentence'] continue_prompt = prompt.split('_')[1] answer = line['answer'] @@ -60,21 +89,11 
@@ class winograndeDataset_V2(BaseDataset): 'cont': continue_prompt, } dataset_list.append(data_item) - dataset_list = Dataset.from_list(dataset_list) - return dataset_list - - -@LOAD_DATASET.register_module() -class winograndeDataset_V3(BaseDataset): - """Disconnect from Huggingface, winograndeDataset_V2.""" - - @staticmethod - def load(path): - dataset_dict = DatasetDict() - for split in ['train_xs', 'dev']: - filename = os.path.join(path, f'{split}.jsonl') + dataset_list = Dataset.from_list(dataset_list) + else: + path = os.path.join(path, 'dev.jsonl') dataset_list = [] - with open(filename, 'r', encoding='utf-8') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: line = json.loads(line) prompt = line['sentence'] @@ -91,5 +110,65 @@ class winograndeDataset_V3(BaseDataset): 'cont': continue_prompt, } dataset_list.append(data_item) - dataset_dict[split] = Dataset.from_list(dataset_list) + dataset_list = Dataset.from_list(dataset_list) + return dataset_list + + +@LOAD_DATASET.register_module() +class WinograndeDatasetV3(BaseDataset): + """Disconnect from Huggingface, WinograndeDatasetV2.""" + + @staticmethod + def load(path): + path = get_data_path(path) + dataset_dict = DatasetDict() + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + for split in ['train', 'validation']: + ms_dataset = MsDataset.load(path, + subset_name='winogrande_xs', + split=split, + trust_remote_code=True) + dataset_list = [] + for line in ms_dataset: + prompt = line['sentence'] + continue_prompt = prompt.split('_')[1] + answer = line['answer'] + answer = ' AB'[int(answer)] if answer != '' else 'NULL' + data_item = { + 'prompt': prompt, + 'only_option1': line['option1'], + 'only_option2': line['option2'], + 'opt1': prompt.replace('_', line['option1']), + 'opt2': prompt.replace('_', line['option2']), + 'answer': answer, + 'cont': continue_prompt, + } + dataset_list.append(data_item) + if split == 'train': + dataset_dict['train_xs'] = 
Dataset.from_list(dataset_list) + elif split == 'validation': + dataset_dict['dev'] = Dataset.from_list(dataset_list) + else: + for split in ['train_xs', 'dev']: + filename = os.path.join(path, f'{split}.jsonl') + dataset_list = [] + with open(filename, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line) + prompt = line['sentence'] + continue_prompt = prompt.split('_')[1] + answer = line['answer'] + answer = ' AB'[int(answer)] if answer != '' else 'NULL' + data_item = { + 'prompt': prompt, + 'only_option1': line['option1'], + 'only_option2': line['option2'], + 'opt1': prompt.replace('_', line['option1']), + 'opt2': prompt.replace('_', line['option2']), + 'answer': answer, + 'cont': continue_prompt, + } + dataset_list.append(data_item) + dataset_dict[split] = Dataset.from_list(dataset_list) return dataset_dict diff --git a/opencompass/datasets/wsc.py b/opencompass/datasets/wsc.py index 31f54948..c4d406a4 100644 --- a/opencompass/datasets/wsc.py +++ b/opencompass/datasets/wsc.py @@ -3,6 +3,7 @@ import json from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -12,7 +13,9 @@ class WSCDataset(BaseDataset): @staticmethod def load(**kwargs): - + if 'data_files' in kwargs: + kwargs['data_files'] = get_data_path(kwargs['data_files'], + local_mode=True) dataset = load_dataset(**kwargs) def preprocess(example): @@ -37,10 +40,11 @@ class WSCDataset(BaseDataset): @LOAD_DATASET.register_module() -class WSCDataset_V2(BaseDataset): +class WSCDatasetV2(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) data = [] with open(path, 'r') as f: for line in f: @@ -59,10 +63,11 @@ class WSCDataset_V2(BaseDataset): @LOAD_DATASET.register_module() -class WSCDataset_V3(BaseDataset): +class WSCDatasetV3(BaseDataset): @staticmethod def load(path): + path = get_data_path(path, local_mode=True) data = [] with open(path, 'r') 
as f: for line in f: diff --git a/opencompass/datasets/xiezhi.py b/opencompass/datasets/xiezhi.py index 34104be4..878e34ce 100644 --- a/opencompass/datasets/xiezhi.py +++ b/opencompass/datasets/xiezhi.py @@ -6,6 +6,7 @@ from datasets import Dataset, DatasetDict from tqdm import trange from opencompass.openicl.icl_retriever import BaseRetriever +from opencompass.utils import get_data_path from .base import BaseDataset @@ -14,6 +15,7 @@ class XiezhiDataset(BaseDataset): @staticmethod def load(path: str, name: str): + path = get_data_path(path, local_mode=True) dataset = DatasetDict() filename = osp.join(path, name, 'xiezhi.v1.json') if 'chn' in name: diff --git a/opencompass/datasets/xsum.py b/opencompass/datasets/xsum.py index 4ece9132..8f9c1a10 100644 --- a/opencompass/datasets/xsum.py +++ b/opencompass/datasets/xsum.py @@ -1,8 +1,10 @@ import json +from os import environ from datasets import Dataset from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path from .base import BaseDataset @@ -12,22 +14,38 @@ class XsumDataset(BaseDataset): @staticmethod def load(path: str): - with open(path, 'r', errors='ignore') as in_f: + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, split='validation') rows = [] - for i, line in enumerate(in_f): + for i, line in enumerate(ms_dataset): if i == 1000: break - sample = json.loads(line.strip()) - dialogue = sample['dialogue'] - summary = sample['summary'] - if isinstance(dialogue, float) or isinstance(summary, float): + dialogue = line['document'] + summary = line['summary'] + if not dialogue or not summary: continue rows.append({'dialogue': dialogue, 'summary': summary}) - dataset = Dataset.from_dict({ - 'dialogue': [row['dialogue'] for row in rows], - 'summary': [row['summary'] for row in rows] - }) - return dataset + dataset = Dataset.from_list(rows) + else: + with open(path, 
'r', errors='ignore') as in_f: + rows = [] + for i, line in enumerate(in_f): + if i == 1000: + break + sample = json.loads(line.strip()) + dialogue = sample['dialogue'] + summary = sample['summary'] + if isinstance(dialogue, float) or isinstance( + summary, float): + continue + rows.append({'dialogue': dialogue, 'summary': summary}) + dataset = Dataset.from_dict({ + 'dialogue': [row['dialogue'] for row in rows], + 'summary': [row['summary'] for row in rows] + }) + return dataset @TEXT_POSTPROCESSORS.register_module('Xsum') diff --git a/opencompass/utils/__init__.py b/opencompass/utils/__init__.py index 60834ec2..466e6222 100644 --- a/opencompass/utils/__init__.py +++ b/opencompass/utils/__init__.py @@ -2,6 +2,7 @@ from .abbr import * # noqa from .auxiliary import * # noqa from .build import * # noqa from .collect_env import * # noqa +from .datasets import * # noqa from .dependency import * # noqa from .file import * # noqa from .fileio import * # noqa diff --git a/opencompass/utils/datasets.py b/opencompass/utils/datasets.py new file mode 100644 index 00000000..14ea7d74 --- /dev/null +++ b/opencompass/utils/datasets.py @@ -0,0 +1,301 @@ +import os + +DATASETS_MAPPING = { + # ADVGLUE Datasets + 'opencompass/advglue-dev': { + 'ms_id': None, + 'hf_id': None, + 'local': './data/adv_glue/dev_ann.json', + }, + # AGIEval Datasets + 'opencompass/agieval': { + 'ms_id': 'opencompass/agieval', + 'hf_id': 'opencompass/agieval', + 'local': './data/AGIEval/data/v1/', + }, + + # ARC Datasets(Test) + 'opencompass/ai2_arc-test': { + 'ms_id': 'opencompass/ai2_arc', + 'hf_id': 'opencompass/ai2_arc', + 'local': './data/ARC/ARC-c/ARC-Challenge-Test.jsonl', + }, + 'opencompass/ai2_arc-dev': { + 'ms_id': 'opencompass/ai2_arc', + 'hf_id': 'opencompass/ai2_arc', + 'local': './data/ARC/ARC-c/ARC-Challenge-Dev.jsonl', + }, + 'opencompass/ai2_arc-easy-dev': { + 'ms_id': 'opencompass/ai2_arc', + 'hf_id': 'opencompass/ai2_arc', + 'local': './data/ARC/ARC-e/ARC-Easy-Dev.jsonl', + }, + # BBH + 
'opencompass/bbh': { + 'ms_id': 'opencompass/bbh', + 'hf_id': 'opencompass/bbh', + 'local': './data/BBH/data', + }, + # C-Eval + 'opencompass/ceval-exam': { + 'ms_id': 'opencompass/ceval-exam', + 'hf_id': 'opencompass/ceval-exam', + 'local': './data/ceval/formal_ceval', + }, + # AFQMC + 'opencompass/afqmc-dev': { + 'ms_id': 'opencompass/afqmc', + 'hf_id': 'opencompass/afqmc', + 'local': './data/CLUE/AFQMC/dev.json', + }, + # CMNLI + 'opencompass/cmnli-dev': { + 'ms_id': 'opencompass/cmnli', + 'hf_id': 'opencompass/cmnli', + 'local': './data/CLUE/cmnli/cmnli_public/dev.json', + }, + # OCNLI + 'opencompass/OCNLI-dev': { + 'ms_id': 'opencompass/OCNLI', + 'hf_id': 'opencompass/OCNLI', + 'local': './data/CLUE/OCNLI/dev.json', + }, + # ChemBench + 'opencompass/ChemBench': { + 'ms_id': 'opencompass/ChemBench', + 'hf_id': 'opencompass/ChemBench', + 'local': './data/ChemBench/', + }, + # CMMLU + 'opencompass/cmmlu': { + 'ms_id': 'opencompass/cmmlu', + 'hf_id': 'opencompass/cmmlu', + 'local': './data/cmmlu/', + }, + # CommonsenseQA + 'opencompass/commonsense_qa': { + 'ms_id': 'opencompass/commonsense_qa', + 'hf_id': 'opencompass/commonsense_qa', + 'local': './data/commonsenseqa', + }, + # CMRC + 'opencompass/cmrc_dev': { + 'ms_id': 'opencompass/cmrc_dev', + 'hf_id': 'opencompass/cmrc_dev', + 'local': './data/CLUE/CMRC/dev.json' + }, + # DRCD_dev + 'opencompass/drcd_dev': { + 'ms_id': 'opencompass/drcd_dev', + 'hf_id': 'opencompass/drcd_dev', + 'local': './data/CLUE/DRCD/dev.json' + }, + # clozeTest_maxmin + 'opencompass/clozeTest_maxmin': { + 'ms_id': None, + 'hf_id': None, + 'local': './data/clozeTest-maxmin/python/clozeTest.json', + }, + # clozeTest_maxmin + 'opencompass/clozeTest_maxmin_answers': { + 'ms_id': None, + 'hf_id': None, + 'local': './data/clozeTest-maxmin/python/answers.txt', + }, + # Flores + 'opencompass/flores': { + 'ms_id': 'opencompass/flores', + 'hf_id': 'opencompass/flores', + 'local': './data/flores_first100', + }, + # MBPP + 'opencompass/mbpp': { + 
'ms_id': 'opencompass/mbpp', + 'hf_id': 'opencompass/mbpp', + 'local': './data/mbpp/mbpp.jsonl', + }, + # 'opencompass/mbpp': { + # 'ms_id': 'opencompass/mbpp', + # 'hf_id': 'opencompass/mbpp', + # 'local': './data/mbpp/mbpp.jsonl', + # }, + 'opencompass/sanitized_mbpp': { + 'ms_id': 'opencompass/mbpp', + 'hf_id': 'opencompass/mbpp', + 'local': './data/mbpp/sanitized-mbpp.jsonl', + }, + # GSM + 'opencompass/gsm8k': { + 'ms_id': 'opencompass/gsm8k', + 'hf_id': 'opencompass/gsm8k', + 'local': './data/gsm8k/', + }, + # HellaSwag + 'opencompass/hellaswag': { + 'ms_id': 'opencompass/hellaswag', + 'hf_id': 'opencompass/hellaswag', + 'local': './data/hellaswag/hellaswag.jsonl', + }, + # HellaSwagICE + 'opencompass/hellaswag_ice': { + 'ms_id': 'opencompass/hellaswag', + 'hf_id': 'opencompass/hellaswag', + 'local': './data/hellaswag/', + }, + # HumanEval + 'opencompass/humaneval': { + 'ms_id': 'opencompass/humaneval', + 'hf_id': 'opencompass/humaneval', + 'local': './data/humaneval/human-eval-v2-20210705.jsonl', + }, + # HumanEvalCN + 'opencompass/humaneval_cn': { + 'ms_id': 'opencompass/humaneval', + 'hf_id': 'opencompass/humaneval', + 'local': './data/humaneval_cn/human-eval-cn-v2-20210705.jsonl', + }, + # Lambada + 'opencompass/lambada': { + 'ms_id': 'opencompass/lambada', + 'hf_id': 'opencompass/lambada', + 'local': './data/lambada/test.jsonl', + }, + # LCSTS + 'opencompass/LCSTS': { + 'ms_id': 'opencompass/LCSTS', + 'hf_id': 'opencompass/LCSTS', + 'local': './data/LCSTS', + }, + # MATH + 'opencompass/math': { + 'ms_id': 'opencompass/math', + 'hf_id': 'opencompass/math', + 'local': './data/math/math.json', + }, + # MMLU + 'opencompass/mmlu': { + 'ms_id': 'opencompass/mmlu', + 'hf_id': 'opencompass/mmlu', + 'local': './data/mmlu/', + }, + # NQ + 'opencompass/natural_question': { + 'ms_id': 'opencompass/natural_question', + 'hf_id': 'opencompass/natural_question', + 'local': './data/nq/', + }, + # OpenBook QA-test + 'opencompass/openbookqa_test': { + 'ms_id': 
'opencompass/openbookqa', + 'hf_id': 'opencompass/openbookqa', + 'local': './data/openbookqa/Main/test.jsonl', + }, + # OpenBook QA-fact + 'opencompass/openbookqa_fact': { + 'ms_id': 'opencompass/openbookqa', + 'hf_id': 'opencompass/openbookqa', + 'local': './data/openbookqa/Additional/test_complete.jsonl', + }, + # PIQA + 'opencompass/piqa': { + 'ms_id': 'opencompass/piqa', + 'hf_id': 'opencompass/piqa', + 'local': './data/piqa', + }, + # RACE + 'opencompass/race': { + 'ms_id': 'opencompass/race', + 'hf_id': 'opencompass/race', + 'local': './data/race', + }, + # SIQA + 'opencompass/siqa': { + 'ms_id': 'opencompass/siqa', + 'hf_id': 'opencompass/siqa', + 'local': './data/siqa', + }, + # XStoryCloze + 'opencompass/xstory_cloze': { + 'ms_id': 'opencompass/xstory_cloze', + 'hf_id': 'opencompass/xstory_cloze', + 'local': './data/xstory_cloze', + }, + # StrategyQA + 'opencompass/strategy_qa': { + 'ms_id': 'opencompass/strategy_qa', + 'hf_id': 'opencompass/strategy_qa', + 'local': './data/strategyqa/strategyQA_train.json', + }, + # SummEdits + 'opencompass/summedits': { + 'ms_id': 'opencompass/summedits', + 'hf_id': 'opencompass/summedits', + 'local': './data/summedits/summedits.jsonl', + }, + # TriviaQA + 'opencompass/trivia_qa': { + 'ms_id': 'opencompass/trivia_qa', + 'hf_id': 'opencompass/trivia_qa', + 'local': './data/triviaqa/', + }, + # TydiQA + 'opencompass/tydiqa': { + 'ms_id': 'opencompass/tydiqa', + 'hf_id': 'opencompass/tydiqa', + 'local': './data/tydiqa/', + }, + # Winogrande + 'opencompass/winogrande': { + 'ms_id': 'opencompass/winogrande', + 'hf_id': 'opencompass/winogrande', + 'local': './data/winogrande/', + }, + # XSum + 'opencompass/xsum': { + 'ms_id': 'opencompass/xsum', + 'hf_id': 'opencompass/xsum', + 'local': './data/Xsum/dev.jsonl', + } +} + + +def get_data_path(dataset_id: str, local_mode: bool = False): + """return dataset id when getting data from ModelScope repo, otherwise just + return local path as is. 
+ + Args: + dataset_id (str): dataset id or data path + local_mode (bool): whether to use local path or + ModelScope/HuggingFace repo + """ + # update the path with CACHE_DIR + cache_dir = os.environ.get('COMPASS_DATA_CACHE', '') + + # For absolute path customized by the users + if dataset_id.startswith('/'): + return dataset_id + + # For relative path, with CACHE_DIR + if local_mode: + local_path = os.path.join(cache_dir, dataset_id) + assert os.path.exists(local_path), f'{local_path} does not exist!' + return local_path + + dataset_source = os.environ.get('DATASET_SOURCE', None) + if dataset_source == 'ModelScope': + ms_id = DATASETS_MAPPING[dataset_id]['ms_id'] + assert ms_id is not None, \ + f'{dataset_id} is not supported in ModelScope' + return ms_id + elif dataset_source == 'HF': + # TODO: HuggingFace mode is currently not supported! + hf_id = DATASETS_MAPPING[dataset_id]['hf_id'] + assert hf_id is not None, \ + f'{dataset_id} is not supported in HuggingFace' + return hf_id + else: + # for the local path + local_path = DATASETS_MAPPING[dataset_id]['local'] + local_path = os.path.join(cache_dir, local_path) + assert os.path.exists(local_path), f'{local_path} does not exist!' 
+ return local_path diff --git a/tests/dataset/test_local_datasets.py b/tests/dataset/test_local_datasets.py new file mode 100644 index 00000000..1178749f --- /dev/null +++ b/tests/dataset/test_local_datasets.py @@ -0,0 +1,230 @@ +import random +import sys +import unittest +import warnings +from os import environ + +from datasets import Dataset, DatasetDict +from mmengine.config import read_base +from tqdm import tqdm + +from concurrent.futures import ThreadPoolExecutor, as_completed + +warnings.filterwarnings('ignore', category=DeprecationWarning) + + +def reload_datasets(): + modules_to_remove = [ + module_name for module_name in sys.modules + if module_name.startswith('configs.datasets') + ] + + for module_name in modules_to_remove: + del sys.modules[module_name] + + with read_base(): + from configs.datasets.ceval.ceval_gen import ceval_datasets + from configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets + from configs.datasets.cmmlu.cmmlu_gen import cmmlu_datasets + from configs.datasets.ARC_c.ARC_c_gen import ARC_c_datasets + from configs.datasets.ARC_e.ARC_e_gen import ARC_e_datasets + from configs.datasets.humaneval.humaneval_gen import humaneval_datasets + from configs.datasets.humaneval.humaneval_repeat10_gen_8e312c import humaneval_datasets as humaneval_repeat10_datasets + from configs.datasets.race.race_ppl import race_datasets + from configs.datasets.commonsenseqa.commonsenseqa_gen import commonsenseqa_datasets + + from configs.datasets.mmlu.mmlu_gen import mmlu_datasets + from configs.datasets.strategyqa.strategyqa_gen import strategyqa_datasets + from configs.datasets.bbh.bbh_gen import bbh_datasets + from configs.datasets.Xsum.Xsum_gen import Xsum_datasets + from configs.datasets.winogrande.winogrande_gen import winogrande_datasets + from configs.datasets.winogrande.winogrande_ll import winogrande_datasets as winogrande_ll_datasets + from configs.datasets.winogrande.winogrande_5shot_ll_252f01 import winogrande_datasets as 
winogrande_5shot_ll_datasets + from configs.datasets.obqa.obqa_gen import obqa_datasets + from configs.datasets.obqa.obqa_ppl_6aac9e import obqa_datasets as obqa_ppl_datasets + from configs.datasets.agieval.agieval_gen import agieval_datasets as agieval_v2_datasets + # from configs.datasets.agieval.agieval_gen_a0c741 import agieval_datasets as agieval_v1_datasets + from configs.datasets.siqa.siqa_gen import siqa_datasets as siqa_v2_datasets + from configs.datasets.siqa.siqa_gen_18632c import siqa_datasets as siqa_v3_datasets + from configs.datasets.siqa.siqa_ppl_42bc6e import siqa_datasets as siqa_ppl_datasets + from configs.datasets.storycloze.storycloze_gen import storycloze_datasets + from configs.datasets.storycloze.storycloze_ppl import storycloze_datasets as storycloze_ppl_datasets + from configs.datasets.summedits.summedits_gen import summedits_datasets as summedits_v2_datasets + + from configs.datasets.hellaswag.hellaswag_gen import hellaswag_datasets as hellaswag_v2_datasets + from configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets as hellaswag_ice_datasets + from configs.datasets.hellaswag.hellaswag_ppl_9dbb12 import hellaswag_datasets as hellaswag_v1_datasets + from configs.datasets.hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets as hellaswag_v3_datasets + from configs.datasets.mbpp.mbpp_gen import mbpp_datasets as mbpp_v1_datasets + from configs.datasets.mbpp.mbpp_passk_gen_830460 import mbpp_datasets as mbpp_v2_datasets + from configs.datasets.mbpp.sanitized_mbpp_gen_830460 import sanitized_mbpp_datasets + from configs.datasets.nq.nq_gen import nq_datasets + from configs.datasets.lcsts.lcsts_gen import lcsts_datasets + from configs.datasets.math.math_gen import math_datasets + from configs.datasets.piqa.piqa_gen import piqa_datasets as piqa_v2_datasets + from configs.datasets.piqa.piqa_ppl import piqa_datasets as piqa_v1_datasets + from configs.datasets.piqa.piqa_ppl_0cfff2 import piqa_datasets as piqa_v3_datasets 
+ from configs.datasets.lambada.lambada_gen import lambada_datasets + from configs.datasets.tydiqa.tydiqa_gen import tydiqa_datasets + from configs.datasets.GaokaoBench.GaokaoBench_gen import GaokaoBench_datasets + from configs.datasets.GaokaoBench.GaokaoBench_mixed import GaokaoBench_datasets as GaokaoBench_mixed_datasets + from configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets as GaokaoBench_no_subjective_datasets + from configs.datasets.triviaqa.triviaqa_gen import triviaqa_datasets + from configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import triviaqa_datasets as triviaqa_wiki_1shot_datasets + + from configs.datasets.CLUE_cmnli.CLUE_cmnli_gen import cmnli_datasets + from configs.datasets.CLUE_cmnli.CLUE_cmnli_ppl import cmnli_datasets as cmnli_ppl_datasets + from configs.datasets.CLUE_ocnli.CLUE_ocnli_gen import ocnli_datasets + + from configs.datasets.ceval.ceval_clean_ppl import ceval_datasets as ceval_clean_datasets + from configs.datasets.ARC_c.ARC_c_clean_ppl import ARC_c_datasets as ARC_c_clean_datasets + from configs.datasets.mmlu.mmlu_clean_ppl import mmlu_datasets as mmlu_clean_datasets + from configs.datasets.hellaswag.hellaswag_clean_ppl import hellaswag_datasets as hellaswag_clean_datasets + from configs.datasets.FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_gen import ocnli_fc_datasets + + return sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + + +def load_datasets_conf(source): + environ['DATASET_SOURCE'] = source + datasets_conf = reload_datasets() + return datasets_conf + + +def load_datasets(source, conf): + environ['DATASET_SOURCE'] = source + if 'lang' in conf: + dataset = conf['type'].load(path=conf['path'], lang=conf['lang']) + return dataset + if 'setting_name' in conf: + dataset = conf['type'].load(path=conf['path'], + name=conf['name'], + setting_name=conf['setting_name']) + return dataset + if 'name' in conf: + dataset = conf['type'].load(path=conf['path'], 
name=conf['name']) + return dataset + + if 'local_mode' in conf: + dataset = conf['type'].load(path=conf['path'], local_mode=conf['local_mode']) + return dataset + try: + dataset = conf['type'].load(path=conf['path']) + except Exception: + dataset = conf['type'].load(**conf) + return dataset + + +def clean_string(value): + """Helper function to clean and normalize string data. + + It strips leading and trailing whitespace and replaces multiple whitespace + characters with a single space. + """ + if isinstance(value, str): + return ' '.join(value.split()) + return value + + +class TestingLocalDatasets(unittest.TestCase): + + def test_datasets(self): + # 加载 ModelScope 和 Local 数据集配置 + # ms_datasets_conf = load_datasets_conf('ModelScope') + local_datasets_conf = load_datasets_conf('Local') + + # 初始化成功和失败的数据集列表 + successful_comparisons = [] + failed_comparisons = [] + + def compare_datasets(local_conf): + # local_dataset = load_datasets(local_conf) + local_dataset = load_datasets('Local', local_conf) + # modelscope_path_name = f"{ms_conf.get('path')}/{ms_conf.get('name', '')}\t{ms_conf.get('lang', '')}" + local_path_name = f"{local_conf.get('path')}/{local_conf.get('name', '')}\t{local_conf.get('lang', '')}" + # # 断言类型一致 + # assert ms_conf['type'] == local_conf['type'], "Data types do not match" + # print(modelscope_path_name, local_path_name) + try: + # ms_dataset = load_datasets('ModelScope', ms_conf) + local_dataset = load_datasets('Local', local_conf) + # _check_data(ms_dataset, local_dataset, sample_size=sample_size) + return 'success', f'{local_path_name}' + except Exception as exception: + # print(exception) + return 'failure', f'can\'t load {local_path_name}' + + with ThreadPoolExecutor(16) as executor: + futures = { + executor.submit(compare_datasets, local_conf): local_conf + for local_conf in local_datasets_conf + } + + for future in tqdm(as_completed(futures), total=len(futures)): + result, message = future.result() + if result == 'success': + 
successful_comparisons.append(message) + else: + failed_comparisons.append(message) + + # 输出测试总结 + total_datasets = len(local_datasets_conf) + print(f"All {total_datasets} datasets") + print(f"OK {len(successful_comparisons)} datasets") + for success in successful_comparisons: + print(f" {success}") + print(f"Fail {len(failed_comparisons)} datasets") + for failure in failed_comparisons: + print(f" {failure}") + + +def _check_data(ms_dataset: Dataset | DatasetDict, + oc_dataset: Dataset | DatasetDict, + sample_size): + assert type(ms_dataset) == type( + oc_dataset + ), f'Dataset type not match: {type(ms_dataset)} != {type(oc_dataset)}' + + # match DatasetDict + if isinstance(oc_dataset, DatasetDict): + assert ms_dataset.keys() == oc_dataset.keys( + ), f'DatasetDict not match: {ms_dataset.keys()} != {oc_dataset.keys()}' + + for key in ms_dataset.keys(): + _check_data(ms_dataset[key], oc_dataset[key], sample_size=sample_size) + + elif isinstance(oc_dataset, Dataset): + # match by cols + assert set(ms_dataset.column_names) == set( + oc_dataset.column_names + ), f'Column names do not match: {ms_dataset.column_names} != {oc_dataset.column_names}' + + # Check that the number of rows is the same + assert len(ms_dataset) == len( + oc_dataset + ), f'Number of rows do not match: {len(ms_dataset)} != {len(oc_dataset)}' + + # Randomly sample indices + sample_indices = random.sample(range(len(ms_dataset)), + min(sample_size, len(ms_dataset))) + + for i, idx in enumerate(sample_indices): + for col in ms_dataset.column_names: + ms_value = clean_string(str(ms_dataset[col][idx])) + oc_value = clean_string(str(oc_dataset[col][idx])) + try: + assert ms_value == oc_value, f"Value mismatch in column '{col}', index {idx}: {ms_value} != {oc_value}" + except AssertionError as e: + print(f"Assertion failed for column '{col}', index {idx}") + print(f"ms_data: {ms_dataset[idx]}") + print(f'oc_data: {oc_dataset[idx]}') + print(f'ms_value: {ms_value} ({type(ms_value)})') + print(f'oc_value: 
{oc_value} ({type(oc_value)})') + raise e + else: + raise ValueError(f'Datasets type not supported {type(ms_dataset)}') + + +if __name__ == '__main__': + sample_size = 100 + unittest.main() diff --git a/tests/dataset/test_ms_datasets.py b/tests/dataset/test_ms_datasets.py new file mode 100644 index 00000000..41b106b9 --- /dev/null +++ b/tests/dataset/test_ms_datasets.py @@ -0,0 +1,223 @@ +import random +import sys +import unittest +import warnings +from os import environ + +from datasets import Dataset, DatasetDict +from mmengine.config import read_base +from tqdm import tqdm + +from concurrent.futures import ThreadPoolExecutor, as_completed + +warnings.filterwarnings('ignore', category=DeprecationWarning) + + +def reload_datasets(): + modules_to_remove = [ + module_name for module_name in sys.modules + if module_name.startswith('configs.datasets') + ] + + for module_name in modules_to_remove: + del sys.modules[module_name] + + with read_base(): + from configs.datasets.ceval.ceval_gen import ceval_datasets + from configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets + from configs.datasets.cmmlu.cmmlu_gen import cmmlu_datasets + from configs.datasets.ARC_c.ARC_c_gen import ARC_c_datasets + from configs.datasets.ARC_e.ARC_e_gen import ARC_e_datasets + from configs.datasets.humaneval.humaneval_gen import humaneval_datasets + from configs.datasets.humaneval.humaneval_repeat10_gen_8e312c import humaneval_datasets as humaneval_repeat10_datasets + from configs.datasets.race.race_ppl import race_datasets + from configs.datasets.commonsenseqa.commonsenseqa_gen import commonsenseqa_datasets + + from configs.datasets.mmlu.mmlu_gen import mmlu_datasets + from configs.datasets.strategyqa.strategyqa_gen import strategyqa_datasets + from configs.datasets.bbh.bbh_gen import bbh_datasets + from configs.datasets.Xsum.Xsum_gen import Xsum_datasets + from configs.datasets.winogrande.winogrande_gen import winogrande_datasets + from configs.datasets.winogrande.winogrande_ll import 
winogrande_datasets as winogrande_ll_datasets + from configs.datasets.winogrande.winogrande_5shot_ll_252f01 import winogrande_datasets as winogrande_5shot_ll_datasets + from configs.datasets.obqa.obqa_gen import obqa_datasets + from configs.datasets.obqa.obqa_ppl_6aac9e import obqa_datasets as obqa_ppl_datasets + from configs.datasets.agieval.agieval_gen import agieval_datasets as agieval_v2_datasets + from configs.datasets.agieval.agieval_gen_a0c741 import agieval_datasets as agieval_v1_datasets + from configs.datasets.siqa.siqa_gen import siqa_datasets as siqa_v2_datasets + from configs.datasets.siqa.siqa_gen_18632c import siqa_datasets as siqa_v3_datasets + from configs.datasets.siqa.siqa_ppl_42bc6e import siqa_datasets as siqa_ppl_datasets + from configs.datasets.storycloze.storycloze_gen import storycloze_datasets + from configs.datasets.storycloze.storycloze_ppl import storycloze_datasets as storycloze_ppl_datasets + from configs.datasets.summedits.summedits_gen import summedits_datasets as summedits_v2_datasets + + from configs.datasets.hellaswag.hellaswag_gen import hellaswag_datasets as hellaswag_v2_datasets + from configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets as hellaswag_ice_datasets + from configs.datasets.hellaswag.hellaswag_ppl_9dbb12 import hellaswag_datasets as hellaswag_v1_datasets + from configs.datasets.hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets as hellaswag_v3_datasets + from configs.datasets.mbpp.mbpp_gen import mbpp_datasets as mbpp_v1_datasets + from configs.datasets.mbpp.mbpp_passk_gen_830460 import mbpp_datasets as mbpp_v2_datasets + from configs.datasets.mbpp.sanitized_mbpp_gen_830460 import sanitized_mbpp_datasets + from configs.datasets.nq.nq_gen import nq_datasets + from configs.datasets.lcsts.lcsts_gen import lcsts_datasets + from configs.datasets.math.math_gen import math_datasets + from configs.datasets.piqa.piqa_gen import piqa_datasets as piqa_v2_datasets + from 
configs.datasets.piqa.piqa_ppl import piqa_datasets as piqa_v1_datasets + from configs.datasets.piqa.piqa_ppl_0cfff2 import piqa_datasets as piqa_v3_datasets + from configs.datasets.lambada.lambada_gen import lambada_datasets + from configs.datasets.tydiqa.tydiqa_gen import tydiqa_datasets + from configs.datasets.GaokaoBench.GaokaoBench_gen import GaokaoBench_datasets + from configs.datasets.GaokaoBench.GaokaoBench_mixed import GaokaoBench_datasets as GaokaoBench_mixed_datasets + from configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets as GaokaoBench_no_subjective_datasets + from configs.datasets.triviaqa.triviaqa_gen import triviaqa_datasets + from configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import triviaqa_datasets as triviaqa_wiki_1shot_datasets + + from configs.datasets.CLUE_cmnli.CLUE_cmnli_gen import cmnli_datasets + from configs.datasets.CLUE_cmnli.CLUE_cmnli_ppl import cmnli_datasets as cmnli_ppl_datasets + from configs.datasets.CLUE_ocnli.CLUE_ocnli_gen import ocnli_datasets + + from configs.datasets.ceval.ceval_clean_ppl import ceval_datasets as ceval_clean_datasets + from configs.datasets.ARC_c.ARC_c_clean_ppl import ARC_c_datasets as ARC_c_clean_datasets + from configs.datasets.mmlu.mmlu_clean_ppl import mmlu_datasets as mmlu_clean_datasets + from configs.datasets.hellaswag.hellaswag_clean_ppl import hellaswag_datasets as hellaswag_clean_datasets + + return sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + + +def load_datasets_conf(source): + environ['DATASET_SOURCE'] = source + datasets_conf = reload_datasets() + return datasets_conf + + +def load_datasets(source, conf): + environ['DATASET_SOURCE'] = source + if 'lang' in conf: + dataset = conf['type'].load(path=conf['path'], lang=conf['lang']) + return dataset + if 'setting_name' in conf: + dataset = conf['type'].load(path=conf['path'], + name=conf['name'], + setting_name=conf['setting_name']) + return dataset + if 'name' in 
conf: + dataset = conf['type'].load(path=conf['path'], name=conf['name']) + return dataset + try: + dataset = conf['type'].load(path=conf['path']) + except Exception: + dataset = conf['type'].load(**conf) + return dataset + + +def clean_string(value): + """Helper function to clean and normalize string data. + + It strips leading and trailing whitespace and replaces multiple whitespace + characters with a single space. + """ + if isinstance(value, str): + return ' '.join(value.split()) + return value + + +class TestingMsDatasets(unittest.TestCase): + + def test_datasets(self): + # 加载 ModelScope 和 Local 数据集配置 + ms_datasets_conf = load_datasets_conf('ModelScope') + local_datasets_conf = load_datasets_conf('Local') + + # 初始化成功和失败的数据集列表 + successful_comparisons = [] + failed_comparisons = [] + + def compare_datasets(ms_conf, local_conf): + modelscope_path_name = f"{ms_conf.get('path')}/{ms_conf.get('name', '')}\t{ms_conf.get('lang', '')}" + local_path_name = f"{local_conf.get('path')}/{local_conf.get('name', '')}\t{local_conf.get('lang', '')}" + # 断言类型一致 + assert ms_conf['type'] == local_conf['type'], "Data types do not match" + print(modelscope_path_name, local_path_name) + try: + ms_dataset = load_datasets('ModelScope', ms_conf) + local_dataset = load_datasets('Local', local_conf) + _check_data(ms_dataset, local_dataset, sample_size=sample_size) + return 'success', f'{modelscope_path_name} | {local_path_name}' + except Exception as exception: + print(exception) + return 'failure', f'{modelscope_path_name} is not the same as {local_path_name}' + + with ThreadPoolExecutor(16) as executor: + futures = { + executor.submit(compare_datasets, ms_conf, local_conf): (ms_conf, local_conf) + for ms_conf, local_conf in zip(ms_datasets_conf, local_datasets_conf) + } + + for future in tqdm(as_completed(futures), total=len(futures)): + result, message = future.result() + if result == 'success': + successful_comparisons.append(message) + else: + failed_comparisons.append(message) + + 
# 输出测试总结 + total_datasets = len(ms_datasets_conf) + print(f"All {total_datasets} datasets") + print(f"OK {len(successful_comparisons)} datasets") + for success in successful_comparisons: + print(f" {success}") + print(f"Fail {len(failed_comparisons)} datasets") + for failure in failed_comparisons: + print(f" {failure}") + + +def _check_data(ms_dataset: Dataset | DatasetDict, + oc_dataset: Dataset | DatasetDict, + sample_size): + assert type(ms_dataset) == type( + oc_dataset + ), f'Dataset type not match: {type(ms_dataset)} != {type(oc_dataset)}' + + # match DatasetDict + if isinstance(oc_dataset, DatasetDict): + assert ms_dataset.keys() == oc_dataset.keys( + ), f'DatasetDict not match: {ms_dataset.keys()} != {oc_dataset.keys()}' + + for key in ms_dataset.keys(): + _check_data(ms_dataset[key], oc_dataset[key], sample_size=sample_size) + + elif isinstance(oc_dataset, Dataset): + # match by cols + assert set(ms_dataset.column_names) == set( + oc_dataset.column_names + ), f'Column names do not match: {ms_dataset.column_names} != {oc_dataset.column_names}' + + # Check that the number of rows is the same + assert len(ms_dataset) == len( + oc_dataset + ), f'Number of rows do not match: {len(ms_dataset)} != {len(oc_dataset)}' + + # Randomly sample indices + sample_indices = random.sample(range(len(ms_dataset)), + min(sample_size, len(ms_dataset))) + + for i, idx in enumerate(sample_indices): + for col in ms_dataset.column_names: + ms_value = clean_string(str(ms_dataset[col][idx])) + oc_value = clean_string(str(oc_dataset[col][idx])) + try: + assert ms_value == oc_value, f"Value mismatch in column '{col}', index {idx}: {ms_value} != {oc_value}" + except AssertionError as e: + print(f"Assertion failed for column '{col}', index {idx}") + print(f"ms_data: {ms_dataset[idx]}") + print(f'oc_data: {oc_dataset[idx]}') + print(f'ms_value: {ms_value} ({type(ms_value)})') + print(f'oc_value: {oc_value} ({type(oc_value)})') + raise e + else: + raise ValueError(f'Datasets type not 
supported {type(ms_dataset)}') + + +if __name__ == '__main__': + sample_size = 100 + unittest.main()