[Feature] Fullbench v0.1 language update (#1463)

* update * update * update * update
2025-05-30 16:03:24 +08:00 · 2024-08-28 14:01:05 +08:00 · 2024-08-28 14:01:05 +08:00 · 245664f4c0
commit 245664f4c0
parent 463231c651
35 changed files with 861 additions and 20 deletions
--- a/configs/datasets/ARC_c/ARC_c_cot_gen_926652.py
+++ b/configs/datasets/ARC_c/ARC_c_cot_gen_926652.py
@ -0,0 +1,53 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import ARCDataset
 from opencompass.utils.text_postprocessors import first_option_postprocess, match_answer_pattern
 # Zero-shot chain-of-thought prompt for ARC-Challenge. It asks the model to
 # reason step by step and to finish with a line of the form 'ANSWER: $LETTER'.
 # The {question} and {textA}..{textD} placeholders use the same names as the
 # input_columns of ARC_c_reader_cfg; .strip() drops the surrounding newlines.
 QUERY_TEMPLATE = """
 Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
 {question}
 A. {textA}
 B. {textB}
 C. {textC}
 D. {textD}
 """.strip()
 # Reader settings: the question and the four option texts are the input
 # columns; 'answerKey' is declared as the output column.
 ARC_c_reader_cfg = {
    'input_columns': ['question', 'textA', 'textB', 'textC', 'textD'],
    'output_column': 'answerKey',
 }
 # Inference settings: the prompt is a single HUMAN turn whose text is
 # QUERY_TEMPLATE; examples are retrieved with ZeroRetriever and the model is
 # run through GenInferencer (no max_out_len is set here).
 ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=QUERY_TEMPLATE)
            ], ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
 )
 # Evaluation settings: AccEvaluator as the metric, pred_role 'BOT', and
 # first_option_postprocess restricted to the option letters 'ABCD'.
 ARC_c_eval_cfg = {
    'evaluator': {'type': AccEvaluator},
    'pred_role': 'BOT',
    'pred_postprocessor': {
        'type': first_option_postprocess,
        'options': 'ABCD',
    },
 }
 # Exported dataset list: one ARC-Challenge entry that ties together the reader,
 # inference and evaluation configs defined earlier in this file.
 ARC_c_datasets = [
    {
        'abbr': 'ARC-c',
        'type': ARCDataset,
        'path': 'opencompass/ai2_arc-dev',
        'name': 'ARC-Challenge',
        'reader_cfg': ARC_c_reader_cfg,
        'infer_cfg': ARC_c_infer_cfg,
        'eval_cfg': ARC_c_eval_cfg,
    },
 ]
--- a/configs/datasets/ARC_c/ARC_c_few_shot_gen_e9b043.py
+++ b/configs/datasets/ARC_c/ARC_c_few_shot_gen_e9b043.py
@ -0,0 +1,48 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import ARCDataset
 from opencompass.utils.text_postprocessors import first_capital_postprocess
 # Reader settings: the question and the four option texts are the input
 # columns; 'answerKey' is declared as the output column.
 ARC_c_reader_cfg = {
    'input_columns': ['question', 'textA', 'textB', 'textC', 'textD'],
    'output_column': 'answerKey',
 }
 # Few-shot inference settings. Only an ice_template is given: one HUMAN/BOT
 # round (question + options, then the answer key). '</E>' is both the
 # template's `begin` value and the ice_token.
 ARC_c_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template=dict(
            begin='</E>',
            round=[
                dict(
                    role='HUMAN',
                    prompt='Question: {question}\nA. {textA}\nB. {textB}\nC. {textC}\nD. {textD}\nAnswer:',
                ),
                dict(role='BOT', prompt='{answerKey}'),
            ],
        ),
        ice_token='</E>',
    ),
    # Five fixed example ids.
    # NOTE(review): the reader config in this file sets no train_split; confirm
    # which split these ids index into.
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
    inferencer=dict(type=GenInferencer, max_out_len=50),
 )
 # Evaluation settings: AccEvaluator as the metric, pred_role 'BOT', and
 # first_capital_postprocess applied to the prediction.
 ARC_c_eval_cfg = {
    'evaluator': {'type': AccEvaluator},
    'pred_role': 'BOT',
    'pred_postprocessor': {'type': first_capital_postprocess},
 }
 # Exported dataset list: one ARC-Challenge entry that ties together the reader,
 # inference and evaluation configs defined earlier in this file.
 ARC_c_datasets = [
    {
        'abbr': 'ARC-c',
        'type': ARCDataset,
        'path': 'opencompass/ai2_arc-dev',
        'name': 'ARC-Challenge',
        'reader_cfg': ARC_c_reader_cfg,
        'infer_cfg': ARC_c_infer_cfg,
        'eval_cfg': ARC_c_eval_cfg,
    },
 ]
--- a/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_cot_gen_1d56df.py
+++ b/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_cot_gen_1d56df.py
@ -0,0 +1,55 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import BoolQDatasetV2
 from opencompass.utils.text_postprocessors import (
    first_option_postprocess,
 )
 # Zero-shot chain-of-thought prompt for BoolQ. It asks the model to reason step
 # by step and to finish with 'ANSWER: $LETTER', where A stands for yes and B
 # for no. {passage} and {question} use the names listed in BoolQ_reader_cfg.
 # Option B is spelled 'No' so both options share the same casing.
 # NOTE(review): if the hash suffix in this config's filename is derived from the
 # prompt text, regenerate it after this wording change — TODO confirm.
 QUERY_TEMPLATE = """
 Answer the following question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of AB. Think step by step before answering.
 Passage: {passage}
 Question: {question}
 A. Yes
 B. No
 """.strip()
 # Reader settings: 'question' and 'passage' are the input columns; 'label' is
 # declared as the output column.
 BoolQ_reader_cfg = {
    'input_columns': ['question', 'passage'],
    'output_column': 'label',
 }
 # Inference settings: the prompt is a single HUMAN turn whose text is
 # QUERY_TEMPLATE; examples are retrieved with ZeroRetriever and the model is
 # run through GenInferencer (no max_out_len is set here).
 BoolQ_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
 )
 # Evaluation settings: AccEvaluator as the metric, pred_role 'BOT', and
 # first_option_postprocess restricted to the option letters 'AB'.
 BoolQ_eval_cfg = {
    'evaluator': {'type': AccEvaluator},
    'pred_role': 'BOT',
    'pred_postprocessor': {
        'type': first_option_postprocess,
        'options': 'AB',
    },
 }
 # Exported dataset list: one BoolQ entry (BoolQDatasetV2) that ties together
 # the reader, inference and evaluation configs defined earlier in this file.
 BoolQ_datasets = [
    {
        'abbr': 'BoolQ',
        'type': BoolQDatasetV2,
        'path': 'opencompass/boolq',
        'reader_cfg': BoolQ_reader_cfg,
        'infer_cfg': BoolQ_infer_cfg,
        'eval_cfg': BoolQ_eval_cfg,
    },
 ]
--- a/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_few_shot_gen_ba58ea.py
+++ b/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_few_shot_gen_ba58ea.py
@ -0,0 +1,47 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import BoolQDatasetV2
 from opencompass.utils.text_postprocessors import first_capital_postprocess
 # Reader settings: 'question' and 'passage' are the input columns; 'label' is
 # declared as the output column.
 BoolQ_reader_cfg = {
    'input_columns': ['question', 'passage'],
    'output_column': 'label',
 }
 # Few-shot inference settings. Only an ice_template is given: one HUMAN/BOT
 # round (passage, question and the Yes/No options, then the label). '</E>' is
 # both the template's `begin` value and the ice_token.
 BoolQ_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template=dict(
            begin='</E>',
            round=[
                dict(
                    role='HUMAN',
                    prompt='{passage}\nQuestion: {question}\nA. Yes\nB. No\nAnswer:',
                ),
                dict(role='BOT', prompt='{label}'),
            ],
        ),
        ice_token='</E>',
    ),
    # Five fixed example ids.
    # NOTE(review): the reader config in this file sets no train_split; confirm
    # which split these ids index into.
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
    inferencer=dict(type=GenInferencer, max_out_len=50),
 )
 # Evaluation settings: AccEvaluator as the metric, pred_role 'BOT', and
 # first_capital_postprocess applied to the prediction.
 BoolQ_eval_cfg = {
    'evaluator': {'type': AccEvaluator},
    'pred_role': 'BOT',
    'pred_postprocessor': {'type': first_capital_postprocess},
 }
 # Exported dataset list: one BoolQ entry (BoolQDatasetV2) that ties together
 # the reader, inference and evaluation configs defined earlier in this file.
 BoolQ_datasets = [
    {
        'abbr': 'BoolQ',
        'type': BoolQDatasetV2,
        'path': 'opencompass/boolq',
        'reader_cfg': BoolQ_reader_cfg,
        'infer_cfg': BoolQ_infer_cfg,
        'eval_cfg': BoolQ_eval_cfg,
    },
 ]
--- a/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen_883d50.py
+++ b/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen_883d50.py
@ -33,7 +33,7 @@ BoolQ_datasets = [
    dict(
        abbr='BoolQ',
        type=BoolQDatasetV2,
-        path='./data/SuperGLUE/BoolQ/val.jsonl',
+        path='opencompass/boolq',
        reader_cfg=BoolQ_reader_cfg,
        infer_cfg=BoolQ_infer_cfg,
        eval_cfg=BoolQ_eval_cfg,
--- a/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_16b1d9.py
+++ b/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_16b1d9.py
@ -0,0 +1,43 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import PPLInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import BoolQDatasetV2
 # Reader settings: 'question' and 'passage' are the input columns; 'label' is
 # declared as the output column.
 BoolQ_reader_cfg = {
    'input_columns': ['question', 'passage'],
    'output_column': 'label',
 }
 # Perplexity-style inference settings. The template maps each candidate label
 # to a full dialogue: the same HUMAN prompt followed by a BOT reply of 'Yes'
 # (label 'A') or 'No' (label 'B').
 # NOTE(review): presumably PPLInferencer scores each candidate and the best
 # scoring label becomes the prediction — confirm against the inferencer.
 BoolQ_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'A':
            dict(round=[
                dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
                dict(role='BOT', prompt='Yes'),
            ]),
            'B':
            dict(round=[
                dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
                dict(role='BOT', prompt='No'),
            ]),
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer),
 )
 # Evaluation settings: AccEvaluator only; no post-processing is configured.
 BoolQ_eval_cfg = {'evaluator': {'type': AccEvaluator}}
 # Exported dataset list: one BoolQ entry (BoolQDatasetV2) that ties together
 # the reader, inference and evaluation configs defined earlier in this file.
 BoolQ_datasets = [
    {
        'abbr': 'BoolQ',
        'type': BoolQDatasetV2,
        'path': 'opencompass/boolq',
        'reader_cfg': BoolQ_reader_cfg,
        'infer_cfg': BoolQ_infer_cfg,
        'eval_cfg': BoolQ_eval_cfg,
    },
 ]
--- a/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314797.py
+++ b/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314797.py
@ -35,7 +35,7 @@ BoolQ_datasets = [
    dict(
        abbr='BoolQ',
        type=BoolQDatasetV3,
-        path='./data/SuperGLUE/BoolQ/val.jsonl',
+        path='opencompass/boolq',
        reader_cfg=BoolQ_reader_cfg,
        infer_cfg=BoolQ_infer_cfg,
        eval_cfg=BoolQ_eval_cfg,
--- a/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314b96.py
+++ b/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314b96.py
@ -36,7 +36,7 @@ BoolQ_datasets = [
        type=BoolQDataset,
        abbr='BoolQ',
        path='json',
-        data_files='./data/SuperGLUE/BoolQ/val.jsonl',
+        data_files='opencompass/boolq',
        split='train',
        reader_cfg=BoolQ_reader_cfg,
        infer_cfg=BoolQ_infer_cfg,
--- a/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_4da4db.py
+++ b/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_4da4db.py
@ -36,7 +36,7 @@ BoolQ_datasets = [
        type=BoolQDataset,
        abbr='BoolQ',
        path='json',
-        data_files='./data/SuperGLUE/BoolQ/val.jsonl',
+        data_files='opencompass/boolq',
        split='train',
        reader_cfg=BoolQ_reader_cfg,
        infer_cfg=BoolQ_infer_cfg,
--- a/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_9619db.py
+++ b/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_9619db.py
@ -26,7 +26,7 @@ BoolQ_datasets = [
        type=BoolQDataset,
        abbr='BoolQ',
        path='json',
-        data_files='./data/SuperGLUE/BoolQ/val.jsonl',
+        data_files='opencompass/boolq',
        split='train',
        reader_cfg=BoolQ_reader_cfg,
        infer_cfg=BoolQ_infer_cfg,
--- a/configs/datasets/race/race_cot_gen_d95929.py
+++ b/configs/datasets/race/race_cot_gen_d95929.py
@ -0,0 +1,68 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import RaceDataset
 from opencompass.utils.text_postprocessors import (
    first_option_postprocess,
 )
 # Zero-shot chain-of-thought prompt for RACE. It asks the model to reason step
 # by step and to finish with a line of the form 'ANSWER: $LETTER'. The
 # {article}, {question} and {A}..{D} placeholders use the same names as the
 # input_columns of race_reader_cfg; .strip() drops the surrounding newlines.
 QUERY_TEMPLATE = """
 Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
 Article: {article}
 Q: {question}
 A. {A}
 B. {B}
 C. {C}
 D. {D}
 """.strip()
 # Reader settings: article, question and the four options are the input
 # columns; 'answer' is declared as the output column. The 'validation' split
 # is named as train_split and 'test' as test_split.
 race_reader_cfg = {
    'input_columns': ['article', 'question', 'A', 'B', 'C', 'D'],
    'output_column': 'answer',
    'train_split': 'validation',
    'test_split': 'test',
 }
 # Inference settings: the prompt is a single HUMAN turn whose text is
 # QUERY_TEMPLATE; examples are retrieved with ZeroRetriever and the model is
 # run through GenInferencer (no max_out_len is set here).
 race_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
 )
 # Evaluation settings: AccEvaluator as the metric, first_option_postprocess
 # restricted to the option letters 'ABCD', and pred_role 'BOT'.
 race_eval_cfg = {
    'evaluator': {'type': AccEvaluator},
    'pred_postprocessor': {
        'type': first_option_postprocess,
        'options': 'ABCD',
    },
    'pred_role': 'BOT',
 }
 # Exported dataset list: the 'middle' and 'high' RACE subsets. Both entries
 # share the reader, inference and evaluation configs defined earlier in this
 # file and differ only in abbr and name.
 race_datasets = [
    {
        'abbr': 'race-middle',
        'type': RaceDataset,
        'path': 'opencompass/race',
        'name': 'middle',
        'reader_cfg': race_reader_cfg,
        'infer_cfg': race_infer_cfg,
        'eval_cfg': race_eval_cfg,
    },
    {
        'abbr': 'race-high',
        'type': RaceDataset,
        'path': 'opencompass/race',
        'name': 'high',
        'reader_cfg': race_reader_cfg,
        'infer_cfg': race_infer_cfg,
        'eval_cfg': race_eval_cfg,
    },
 ]
--- a/configs/datasets/race/race_few_shot_gen_a498ed.py
+++ b/configs/datasets/race/race_few_shot_gen_a498ed.py
@ -0,0 +1,53 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import RaceDataset
 from opencompass.utils.text_postprocessors import first_capital_postprocess
 # Reader settings: article, question and the four options are the input
 # columns; 'answer' is declared as the output column. The 'validation' split
 # is named as train_split and 'test' as test_split.
 race_reader_cfg = {
    'input_columns': ['article', 'question', 'A', 'B', 'C', 'D'],
    'output_column': 'answer',
    'train_split': 'validation',
    'test_split': 'test',
 }
 # Few-shot inference settings. Only an ice_template is given: one HUMAN/BOT
 # round (instruction, article, question and options, then the answer). '</E>'
 # is both the template's `begin` value and the ice_token.
 race_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template=dict(
            begin='</E>',
            round=[
                dict(role='HUMAN', prompt='Read the article, and answer the question by replying A, B, C or D.\n\nArticle:\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer:'),
                dict(role='BOT', prompt='{answer}'),
            ]
        ),
        ice_token='</E>',
    ),
    # Three fixed example ids; race_reader_cfg names 'validation' as train_split.
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4]),
    inferencer=dict(type=GenInferencer, max_out_len=50),
 )
 # Evaluation settings: AccEvaluator as the metric, first_capital_postprocess
 # applied to the prediction, and pred_role 'BOT'.
 race_eval_cfg = {
    'evaluator': {'type': AccEvaluator},
    'pred_postprocessor': {'type': first_capital_postprocess},
    'pred_role': 'BOT',
 }
 # Exported dataset list: the 'middle' and 'high' RACE subsets. Both entries
 # share the reader, inference and evaluation configs defined earlier in this
 # file and differ only in abbr and name.
 race_datasets = [
    {
        'abbr': 'race-middle',
        'type': RaceDataset,
        'path': 'opencompass/race',
        'name': 'middle',
        'reader_cfg': race_reader_cfg,
        'infer_cfg': race_infer_cfg,
        'eval_cfg': race_eval_cfg,
    },
    {
        'abbr': 'race-high',
        'type': RaceDataset,
        'path': 'opencompass/race',
        'name': 'high',
        'reader_cfg': race_reader_cfg,
        'infer_cfg': race_infer_cfg,
        'eval_cfg': race_eval_cfg,
    },
 ]
--- a/configs/models/chatglm/lmdeploy_glm4_9b_chat.py
+++ b/configs/models/chatglm/lmdeploy_glm4_9b_chat.py
@ -0,0 +1,15 @@
 from opencompass.models import TurboMindModelwithChatTemplate
 # Model list with a single TurboMindModelwithChatTemplate entry for
 # 'THUDM/glm-4-9b-chat'. gen_config pairs top_k=1 with a near-zero temperature
 # (presumably to make decoding effectively greedy).
 models = [
    {
        'type': TurboMindModelwithChatTemplate,
        'abbr': 'glm-4-9b-chat-turbomind',
        'path': 'THUDM/glm-4-9b-chat',
        'engine_config': {'max_batch_size': 16, 'tp': 1},
        'gen_config': {
            'top_k': 1,
            'temperature': 1e-6,
            'top_p': 0.9,
            'max_new_tokens': 1024,
        },
        'max_seq_len': 8192,
        'max_out_len': 1024,
        'batch_size': 16,
        'run_cfg': {'num_gpus': 1},
    },
 ]
--- a/configs/models/hf_llama/lmdeploy_llama3_1_8b.py
+++ b/configs/models/hf_llama/lmdeploy_llama3_1_8b.py
@ -0,0 +1,15 @@
 from opencompass.models import TurboMindModel
 # Model list with a single TurboMindModel entry for the base checkpoint
 # 'meta-llama/Meta-Llama-3.1-8B'. session_len in engine_config equals
 # max_seq_len (7168). gen_config pairs top_k=1 with a near-zero temperature
 # (presumably to make decoding effectively greedy).
 models = [
    {
        'type': TurboMindModel,
        'abbr': 'llama-3.1-8b-turbomind',
        'path': 'meta-llama/Meta-Llama-3.1-8B',
        'engine_config': {
            'session_len': 7168,
            'max_batch_size': 16,
            'tp': 1,
        },
        'gen_config': {
            'top_k': 1,
            'temperature': 1e-6,
            'top_p': 0.9,
            'max_new_tokens': 1024,
        },
        'max_seq_len': 7168,
        'max_out_len': 1024,
        'batch_size': 16,
        'run_cfg': {'num_gpus': 1},
    },
 ]
--- a/configs/models/hf_llama/lmdeploy_llama3_1_8b_instruct.py
+++ b/configs/models/hf_llama/lmdeploy_llama3_1_8b_instruct.py
@ -0,0 +1,16 @@
 from opencompass.models import TurboMindModelwithChatTemplate
 # Model list with a single TurboMindModelwithChatTemplate entry for
 # 'meta-llama/Meta-Llama-3.1-8B-Instruct'. Two stop words are configured.
 # gen_config pairs top_k=1 with a near-zero temperature (presumably to make
 # decoding effectively greedy).
 models = [
    {
        'type': TurboMindModelwithChatTemplate,
        'abbr': 'llama-3.1-8b-instruct-turbomind',
        'path': 'meta-llama/Meta-Llama-3.1-8B-Instruct',
        'engine_config': {'max_batch_size': 16, 'tp': 1},
        'gen_config': {
            'top_k': 1,
            'temperature': 1e-6,
            'top_p': 0.9,
            'max_new_tokens': 1024,
        },
        'max_seq_len': 7168,
        'max_out_len': 1024,
        'batch_size': 16,
        'run_cfg': {'num_gpus': 1},
        'stop_words': ['<|end_of_text|>', '<|eot_id|>'],
    },
 ]
--- a/opencompass/configs/datasets/ARC_c/ARC_c_cot_gen_926652.py
+++ b/opencompass/configs/datasets/ARC_c/ARC_c_cot_gen_926652.py
@ -0,0 +1,53 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import ARCDataset
 from opencompass.utils.text_postprocessors import first_option_postprocess, match_answer_pattern
 # Zero-shot chain-of-thought prompt for ARC-Challenge. It asks the model to
 # reason step by step and to finish with a line of the form 'ANSWER: $LETTER'.
 # The {question} and {textA}..{textD} placeholders use the same names as the
 # input_columns of ARC_c_reader_cfg; .strip() drops the surrounding newlines.
 QUERY_TEMPLATE = """
 Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
 {question}
 A. {textA}
 B. {textB}
 C. {textC}
 D. {textD}
 """.strip()
 # Reader settings: the question and the four option texts are the input
 # columns; 'answerKey' is declared as the output column.
 ARC_c_reader_cfg = {
    'input_columns': ['question', 'textA', 'textB', 'textC', 'textD'],
    'output_column': 'answerKey',
 }
 # Inference settings: the prompt is a single HUMAN turn whose text is
 # QUERY_TEMPLATE; examples are retrieved with ZeroRetriever and the model is
 # run through GenInferencer (no max_out_len is set here).
 ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=QUERY_TEMPLATE)
            ], ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
 )
 # Evaluation settings: AccEvaluator as the metric, pred_role 'BOT', and
 # first_option_postprocess restricted to the option letters 'ABCD'.
 ARC_c_eval_cfg = {
    'evaluator': {'type': AccEvaluator},
    'pred_role': 'BOT',
    'pred_postprocessor': {
        'type': first_option_postprocess,
        'options': 'ABCD',
    },
 }
 # Exported dataset list: one ARC-Challenge entry that ties together the reader,
 # inference and evaluation configs defined earlier in this file.
 ARC_c_datasets = [
    {
        'abbr': 'ARC-c',
        'type': ARCDataset,
        'path': 'opencompass/ai2_arc-dev',
        'name': 'ARC-Challenge',
        'reader_cfg': ARC_c_reader_cfg,
        'infer_cfg': ARC_c_infer_cfg,
        'eval_cfg': ARC_c_eval_cfg,
    },
 ]
--- a/opencompass/configs/datasets/ARC_c/ARC_c_few_shot_gen_e9b043.py
+++ b/opencompass/configs/datasets/ARC_c/ARC_c_few_shot_gen_e9b043.py
@ -0,0 +1,48 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import ARCDataset
 from opencompass.utils.text_postprocessors import first_capital_postprocess
 # Reader settings: the question and the four option texts are the input
 # columns; 'answerKey' is declared as the output column.
 ARC_c_reader_cfg = {
    'input_columns': ['question', 'textA', 'textB', 'textC', 'textD'],
    'output_column': 'answerKey',
 }
 # Few-shot inference settings. Only an ice_template is given: one HUMAN/BOT
 # round (question + options, then the answer key). '</E>' is both the
 # template's `begin` value and the ice_token.
 ARC_c_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template=dict(
            begin='</E>',
            round=[
                dict(
                    role='HUMAN',
                    prompt='Question: {question}\nA. {textA}\nB. {textB}\nC. {textC}\nD. {textD}\nAnswer:',
                ),
                dict(role='BOT', prompt='{answerKey}'),
            ],
        ),
        ice_token='</E>',
    ),
    # Five fixed example ids.
    # NOTE(review): the reader config in this file sets no train_split; confirm
    # which split these ids index into.
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
    inferencer=dict(type=GenInferencer, max_out_len=50),
 )
 # Evaluation settings: AccEvaluator as the metric, pred_role 'BOT', and
 # first_capital_postprocess applied to the prediction.
 ARC_c_eval_cfg = {
    'evaluator': {'type': AccEvaluator},
    'pred_role': 'BOT',
    'pred_postprocessor': {'type': first_capital_postprocess},
 }
 # Exported dataset list: one ARC-Challenge entry that ties together the reader,
 # inference and evaluation configs defined earlier in this file.
 ARC_c_datasets = [
    {
        'abbr': 'ARC-c',
        'type': ARCDataset,
        'path': 'opencompass/ai2_arc-dev',
        'name': 'ARC-Challenge',
        'reader_cfg': ARC_c_reader_cfg,
        'infer_cfg': ARC_c_infer_cfg,
        'eval_cfg': ARC_c_eval_cfg,
    },
 ]
--- a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_cot_gen_1d56df.py
+++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_cot_gen_1d56df.py
@ -0,0 +1,55 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import BoolQDatasetV2
 from opencompass.utils.text_postprocessors import (
    first_option_postprocess,
 )
 # Zero-shot chain-of-thought prompt for BoolQ. It asks the model to reason step
 # by step and to finish with 'ANSWER: $LETTER', where A stands for yes and B
 # for no. {passage} and {question} use the names listed in BoolQ_reader_cfg.
 # Option B is spelled 'No' so both options share the same casing.
 # NOTE(review): if the hash suffix in this config's filename is derived from the
 # prompt text, regenerate it after this wording change — TODO confirm.
 QUERY_TEMPLATE = """
 Answer the following question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of AB. Think step by step before answering.
 Passage: {passage}
 Question: {question}
 A. Yes
 B. No
 """.strip()
 # Reader settings: 'question' and 'passage' are the input columns; 'label' is
 # declared as the output column.
 BoolQ_reader_cfg = {
    'input_columns': ['question', 'passage'],
    'output_column': 'label',
 }
 # Inference settings: the prompt is a single HUMAN turn whose text is
 # QUERY_TEMPLATE; examples are retrieved with ZeroRetriever and the model is
 # run through GenInferencer (no max_out_len is set here).
 BoolQ_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
 )
 # Evaluation settings: AccEvaluator as the metric, pred_role 'BOT', and
 # first_option_postprocess restricted to the option letters 'AB'.
 BoolQ_eval_cfg = {
    'evaluator': {'type': AccEvaluator},
    'pred_role': 'BOT',
    'pred_postprocessor': {
        'type': first_option_postprocess,
        'options': 'AB',
    },
 }
 # Exported dataset list: one BoolQ entry (BoolQDatasetV2) that ties together
 # the reader, inference and evaluation configs defined earlier in this file.
 BoolQ_datasets = [
    {
        'abbr': 'BoolQ',
        'type': BoolQDatasetV2,
        'path': 'opencompass/boolq',
        'reader_cfg': BoolQ_reader_cfg,
        'infer_cfg': BoolQ_infer_cfg,
        'eval_cfg': BoolQ_eval_cfg,
    },
 ]
--- a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_few_shot_gen_ba58ea.py
+++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_few_shot_gen_ba58ea.py
@ -0,0 +1,47 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import BoolQDatasetV2
 from opencompass.utils.text_postprocessors import first_capital_postprocess
 # Reader settings: 'question' and 'passage' are the input columns; 'label' is
 # declared as the output column.
 BoolQ_reader_cfg = {
    'input_columns': ['question', 'passage'],
    'output_column': 'label',
 }
 # Few-shot inference settings. Only an ice_template is given: one HUMAN/BOT
 # round (passage, question and the Yes/No options, then the label). '</E>' is
 # both the template's `begin` value and the ice_token.
 BoolQ_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template=dict(
            begin='</E>',
            round=[
                dict(
                    role='HUMAN',
                    prompt='{passage}\nQuestion: {question}\nA. Yes\nB. No\nAnswer:',
                ),
                dict(role='BOT', prompt='{label}'),
            ],
        ),
        ice_token='</E>',
    ),
    # Five fixed example ids.
    # NOTE(review): the reader config in this file sets no train_split; confirm
    # which split these ids index into.
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
    inferencer=dict(type=GenInferencer, max_out_len=50),
 )
 # Evaluation settings: AccEvaluator as the metric, pred_role 'BOT', and
 # first_capital_postprocess applied to the prediction.
 BoolQ_eval_cfg = {
    'evaluator': {'type': AccEvaluator},
    'pred_role': 'BOT',
    'pred_postprocessor': {'type': first_capital_postprocess},
 }
 # Exported dataset list: one BoolQ entry (BoolQDatasetV2) that ties together
 # the reader, inference and evaluation configs defined earlier in this file.
 BoolQ_datasets = [
    {
        'abbr': 'BoolQ',
        'type': BoolQDatasetV2,
        'path': 'opencompass/boolq',
        'reader_cfg': BoolQ_reader_cfg,
        'infer_cfg': BoolQ_infer_cfg,
        'eval_cfg': BoolQ_eval_cfg,
    },
 ]
--- a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen_883d50.py
+++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen_883d50.py
@ -33,7 +33,7 @@ BoolQ_datasets = [
    dict(
        abbr='BoolQ',
        type=BoolQDatasetV2,
-        path='./data/SuperGLUE/BoolQ/val.jsonl',
+        path='opencompass/boolq',
        reader_cfg=BoolQ_reader_cfg,
        infer_cfg=BoolQ_infer_cfg,
        eval_cfg=BoolQ_eval_cfg,
--- a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_16b1d9.py
+++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_16b1d9.py
@ -0,0 +1,43 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import PPLInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import BoolQDatasetV2
 # Reader settings: 'question' and 'passage' are the input columns; 'label' is
 # declared as the output column.
 BoolQ_reader_cfg = {
    'input_columns': ['question', 'passage'],
    'output_column': 'label',
 }
 # Perplexity-style inference settings. The template maps each candidate label
 # to a full dialogue: the same HUMAN prompt followed by a BOT reply of 'Yes'
 # (label 'A') or 'No' (label 'B').
 # NOTE(review): presumably PPLInferencer scores each candidate and the best
 # scoring label becomes the prediction — confirm against the inferencer.
 BoolQ_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'A':
            dict(round=[
                dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
                dict(role='BOT', prompt='Yes'),
            ]),
            'B':
            dict(round=[
                dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
                dict(role='BOT', prompt='No'),
            ]),
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer),
 )
 # Evaluation settings: AccEvaluator only; no post-processing is configured.
 BoolQ_eval_cfg = {'evaluator': {'type': AccEvaluator}}
 # Exported dataset list: one BoolQ entry (BoolQDatasetV2) that ties together
 # the reader, inference and evaluation configs defined earlier in this file.
 BoolQ_datasets = [
    {
        'abbr': 'BoolQ',
        'type': BoolQDatasetV2,
        'path': 'opencompass/boolq',
        'reader_cfg': BoolQ_reader_cfg,
        'infer_cfg': BoolQ_infer_cfg,
        'eval_cfg': BoolQ_eval_cfg,
    },
 ]
--- a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314797.py
+++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314797.py
@ -35,7 +35,7 @@ BoolQ_datasets = [
    dict(
        abbr='BoolQ',
        type=BoolQDatasetV3,
-        path='./data/SuperGLUE/BoolQ/val.jsonl',
+        path='opencompass/boolq',
        reader_cfg=BoolQ_reader_cfg,
        infer_cfg=BoolQ_infer_cfg,
        eval_cfg=BoolQ_eval_cfg,
--- a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314b96.py
+++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314b96.py
@ -36,7 +36,7 @@ BoolQ_datasets = [
        type=BoolQDataset,
        abbr='BoolQ',
        path='json',
-        data_files='./data/SuperGLUE/BoolQ/val.jsonl',
+        data_files='opencompass/boolq',
        split='train',
        reader_cfg=BoolQ_reader_cfg,
        infer_cfg=BoolQ_infer_cfg,
--- a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_4da4db.py
+++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_4da4db.py
@ -36,7 +36,7 @@ BoolQ_datasets = [
        type=BoolQDataset,
        abbr='BoolQ',
        path='json',
-        data_files='./data/SuperGLUE/BoolQ/val.jsonl',
+        data_files='opencompass/boolq',
        split='train',
        reader_cfg=BoolQ_reader_cfg,
        infer_cfg=BoolQ_infer_cfg,
--- a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_9619db.py
+++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_9619db.py
@ -26,7 +26,7 @@ BoolQ_datasets = [
        type=BoolQDataset,
        abbr='BoolQ',
        path='json',
-        data_files='./data/SuperGLUE/BoolQ/val.jsonl',
+        data_files='opencompass/boolq',
        split='train',
        reader_cfg=BoolQ_reader_cfg,
        infer_cfg=BoolQ_infer_cfg,
--- a/opencompass/configs/datasets/race/race_cot_gen_d95929.py
+++ b/opencompass/configs/datasets/race/race_cot_gen_d95929.py
@ -0,0 +1,68 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import RaceDataset
 from opencompass.utils.text_postprocessors import (
    first_option_postprocess,
 )
 # Zero-shot chain-of-thought prompt for RACE. It asks the model to reason step
 # by step and to finish with a line of the form 'ANSWER: $LETTER'. The
 # {article}, {question} and {A}..{D} placeholders use the same names as the
 # input_columns of race_reader_cfg; .strip() drops the surrounding newlines.
 QUERY_TEMPLATE = """
 Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
 Article: {article}
 Q: {question}
 A. {A}
 B. {B}
 C. {C}
 D. {D}
 """.strip()
 # Reader settings: article, question and the four options are the input
 # columns; 'answer' is declared as the output column. The 'validation' split
 # is named as train_split and 'test' as test_split.
 race_reader_cfg = {
    'input_columns': ['article', 'question', 'A', 'B', 'C', 'D'],
    'output_column': 'answer',
    'train_split': 'validation',
    'test_split': 'test',
 }
 # Inference settings: the prompt is a single HUMAN turn whose text is
 # QUERY_TEMPLATE; examples are retrieved with ZeroRetriever and the model is
 # run through GenInferencer (no max_out_len is set here).
 race_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
 )
 # Evaluation settings: AccEvaluator as the metric, first_option_postprocess
 # restricted to the option letters 'ABCD', and pred_role 'BOT'.
 race_eval_cfg = {
    'evaluator': {'type': AccEvaluator},
    'pred_postprocessor': {
        'type': first_option_postprocess,
        'options': 'ABCD',
    },
    'pred_role': 'BOT',
 }
 # Exported dataset list: the 'middle' and 'high' RACE subsets. Both entries
 # share the reader, inference and evaluation configs defined earlier in this
 # file and differ only in abbr and name.
 race_datasets = [
    {
        'abbr': 'race-middle',
        'type': RaceDataset,
        'path': 'opencompass/race',
        'name': 'middle',
        'reader_cfg': race_reader_cfg,
        'infer_cfg': race_infer_cfg,
        'eval_cfg': race_eval_cfg,
    },
    {
        'abbr': 'race-high',
        'type': RaceDataset,
        'path': 'opencompass/race',
        'name': 'high',
        'reader_cfg': race_reader_cfg,
        'infer_cfg': race_infer_cfg,
        'eval_cfg': race_eval_cfg,
    },
 ]
--- a/opencompass/configs/datasets/race/race_few_shot_gen_a498ed.py
+++ b/opencompass/configs/datasets/race/race_few_shot_gen_a498ed.py
@ -0,0 +1,53 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import RaceDataset
 from opencompass.utils.text_postprocessors import first_capital_postprocess
 # Reader settings: article, question and the four options are the input
 # columns; 'answer' is declared as the output column. The 'validation' split
 # is named as train_split and 'test' as test_split.
 race_reader_cfg = {
    'input_columns': ['article', 'question', 'A', 'B', 'C', 'D'],
    'output_column': 'answer',
    'train_split': 'validation',
    'test_split': 'test',
 }
 # Few-shot inference settings. Only an ice_template is given: one HUMAN/BOT
 # round (instruction, article, question and options, then the answer). '</E>'
 # is both the template's `begin` value and the ice_token.
 race_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template=dict(
            begin='</E>',
            round=[
                dict(role='HUMAN', prompt='Read the article, and answer the question by replying A, B, C or D.\n\nArticle:\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer:'),
                dict(role='BOT', prompt='{answer}'),
            ]
        ),
        ice_token='</E>',
    ),
    # Three fixed example ids; race_reader_cfg names 'validation' as train_split.
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4]),
    inferencer=dict(type=GenInferencer, max_out_len=50),
 )
 # Evaluation settings: AccEvaluator as the metric, first_capital_postprocess
 # applied to the prediction, and pred_role 'BOT'.
 race_eval_cfg = {
    'evaluator': {'type': AccEvaluator},
    'pred_postprocessor': {'type': first_capital_postprocess},
    'pred_role': 'BOT',
 }
 # Exported dataset list: the 'middle' and 'high' RACE subsets. Both entries
 # share the reader, inference and evaluation configs defined earlier in this
 # file and differ only in abbr and name.
 race_datasets = [
    {
        'abbr': 'race-middle',
        'type': RaceDataset,
        'path': 'opencompass/race',
        'name': 'middle',
        'reader_cfg': race_reader_cfg,
        'infer_cfg': race_infer_cfg,
        'eval_cfg': race_eval_cfg,
    },
    {
        'abbr': 'race-high',
        'type': RaceDataset,
        'path': 'opencompass/race',
        'name': 'high',
        'reader_cfg': race_reader_cfg,
        'infer_cfg': race_infer_cfg,
        'eval_cfg': race_eval_cfg,
    },
 ]
--- a/opencompass/configs/models/chatglm/lmdeploy_glm4_9b_chat.py
+++ b/opencompass/configs/models/chatglm/lmdeploy_glm4_9b_chat.py
@ -0,0 +1,15 @@
 from opencompass.models import TurboMindModelwithChatTemplate
 # Model list with a single TurboMindModelwithChatTemplate entry for
 # 'THUDM/glm-4-9b-chat'. gen_config pairs top_k=1 with a near-zero temperature
 # (presumably to make decoding effectively greedy).
 models = [
    {
        'type': TurboMindModelwithChatTemplate,
        'abbr': 'glm-4-9b-chat-turbomind',
        'path': 'THUDM/glm-4-9b-chat',
        'engine_config': {'max_batch_size': 16, 'tp': 1},
        'gen_config': {
            'top_k': 1,
            'temperature': 1e-6,
            'top_p': 0.9,
            'max_new_tokens': 1024,
        },
        'max_seq_len': 8192,
        'max_out_len': 1024,
        'batch_size': 16,
        'run_cfg': {'num_gpus': 1},
    },
 ]
--- a/opencompass/configs/models/hf_llama/lmdeploy_llama3_1_8b.py
+++ b/opencompass/configs/models/hf_llama/lmdeploy_llama3_1_8b.py
@ -0,0 +1,15 @@
 from opencompass.models import TurboMindModel
 # Model list with a single TurboMindModel entry for the base checkpoint
 # 'meta-llama/Meta-Llama-3.1-8B'. session_len in engine_config equals
 # max_seq_len (7168). gen_config pairs top_k=1 with a near-zero temperature
 # (presumably to make decoding effectively greedy).
 models = [
    {
        'type': TurboMindModel,
        'abbr': 'llama-3.1-8b-turbomind',
        'path': 'meta-llama/Meta-Llama-3.1-8B',
        'engine_config': {
            'session_len': 7168,
            'max_batch_size': 16,
            'tp': 1,
        },
        'gen_config': {
            'top_k': 1,
            'temperature': 1e-6,
            'top_p': 0.9,
            'max_new_tokens': 1024,
        },
        'max_seq_len': 7168,
        'max_out_len': 1024,
        'batch_size': 16,
        'run_cfg': {'num_gpus': 1},
    },
 ]
--- a/opencompass/configs/models/hf_llama/lmdeploy_llama3_1_8b_instruct.py
+++ b/opencompass/configs/models/hf_llama/lmdeploy_llama3_1_8b_instruct.py
@ -0,0 +1,16 @@
 from opencompass.models import TurboMindModelwithChatTemplate
 # Model list with a single TurboMindModelwithChatTemplate entry for
 # 'meta-llama/Meta-Llama-3.1-8B-Instruct'. Two stop words are configured.
 # gen_config pairs top_k=1 with a near-zero temperature (presumably to make
 # decoding effectively greedy).
 models = [
    {
        'type': TurboMindModelwithChatTemplate,
        'abbr': 'llama-3.1-8b-instruct-turbomind',
        'path': 'meta-llama/Meta-Llama-3.1-8B-Instruct',
        'engine_config': {'max_batch_size': 16, 'tp': 1},
        'gen_config': {
            'top_k': 1,
            'temperature': 1e-6,
            'top_p': 0.9,
            'max_new_tokens': 1024,
        },
        'max_seq_len': 7168,
        'max_out_len': 1024,
        'batch_size': 16,
        'run_cfg': {'num_gpus': 1},
        'stop_words': ['<|end_of_text|>', '<|eot_id|>'],
    },
 ]
--- a/opencompass/datasets/boolq.py
+++ b/opencompass/datasets/boolq.py
@ -31,7 +31,7 @@ class BoolQDatasetV2(BaseDataset):
    @staticmethod
    def load(path):
-        path = get_data_path(path, local_mode=True)
+        path = get_data_path(path)
        dataset = []
        with open(path, 'r') as f:
            for line in f:
--- a/opencompass/openicl/icl_retriever/init.py
+++ b/opencompass/openicl/icl_retriever/init.py
@ -4,7 +4,7 @@ from .icl_dpp_retriever import DPPRetriever  # noqa
 from .icl_fix_k_retriever import FixKRetriever  # noqa
 from .icl_mdl_retriever import MDLRetriever  # noqa
 from .icl_random_retriever import RandomRetriever  # noqa
+from .icl_sliding_k_retriever import SlidingWindowRetriever  # noqa
 from .icl_topk_retriever import TopkRetriever  # noqa
 from .icl_votek_retriever import VotekRetriever  # noqa
 from .icl_zero_retriever import ZeroRetriever  # noqa
-from .icl_sliding_k_retriever import SlidingWindowRetriever  # noqa
--- a/opencompass/openicl/icl_retriever/icl_sliding_k_retriever.py
+++ b/opencompass/openicl/icl_retriever/icl_sliding_k_retriever.py
@ -51,8 +51,8 @@ class SlidingWindowRetriever(BaseRetriever):
        for current_index in trange(len(self.test_ds),
                                    disable=not self.is_main_process):
            if current_index < self.k:
-                """For the first few examples,
+                """For the first few examples, get the previous ones and pad
-                get the previous ones and pad with the last ones"""
+                with the last ones."""
                start_index = max(0, current_index - self.k)
                previous_shots = list(range(start_index, current_index))
                if len(previous_shots) < self.k:
--- a/opencompass/utils/datasets_info.py
+++ b/opencompass/utils/datasets_info.py
@ -203,7 +203,7 @@ DATASETS_MAPPING = {
    "opencompass/race": {
        "ms_id": "opencompass/race",
        "hf_id": "opencompass/race",
-        "local": "./data/race",
+        "local": "./data/race/",
    },
    # SIQA
    "opencompass/siqa": {
@ -229,6 +229,12 @@ DATASETS_MAPPING = {
        "hf_id": "opencompass/summedits",
        "local": "./data/summedits/summedits.jsonl",
    },
    # SuperGLUE
    "opencompass/boolq": {
        "ms_id": "opencompass/boolq",
        "hf_id": "opencompass/boolq",
        "local": "./data/SuperGLUE/BoolQ/val.jsonl",
    },
    # TriviaQA
    "opencompass/trivia_qa": {
        "ms_id": "opencompass/trivia_qa",
@ -292,10 +298,6 @@ DATASETS_URL = {
        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/BBH.zip",
        "md5": "60c49f9bef5148aa7e1941328e96a554",
    },
-    "/mmlu/": {
-        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu.zip",
-        "md5": "761310671509a239e41c4b717f7fab9c",
-    },
    "/compass_arena/": {
        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/compass_arena.zip",
        "md5": "cd59b54a179d16f2a858b359b60588f6",
@ -367,5 +369,17 @@ DATASETS_URL = {
    "FewCLUE": {
        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/FewCLUE.zip",
        "md5": "7976e2bb0e9d885ffd3c55f7c5d4021e",
-    }
+    },
    "/race": {
        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/race.zip",
        "md5": "b758251764a264746cf45749c02363f9",
    },
    "/ARC": {
        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ARC.zip",
        "md5": "d720629b69f1a51cfe78bf65b00b44f6",
    },
    "/SuperGLUE": {
        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/SuperGLUE.zip",
        "md5": "b60904915b0b61d1a04ea52280169936",
    },
 }
--- a/opencompass/utils/text_postprocessors.py
+++ b/opencompass/utils/text_postprocessors.py
@ -98,6 +98,7 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
        f'答案是\s?(\S+)(?:。|$)',
        f'答案应该是\s?(\S+)(?:。|$)',
        f'答案为\s?(\S+)(?:。|$)',
        f'(?i)ANSWER\s*:\s*([{options}])',
        f'[Tt]he answer is:?\s+\(?([{options}])\)?',
        f'[Tt]he answer is option:?\s+\(?([{options}])\)?',
        f'[Tt]he correct answer is:?\s+\(?([{options}])\)?',