mirror of https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00

[Sync] minor test (#683)

This commit is contained in:
parent dd4318f6ab
commit e78857ac36
.gitignore (vendored, 1 change)
@@ -11,6 +11,7 @@ configs/eval_debug*.py
 configs/viz_*.py
 data
 work_dirs
+models
 configs/internal/
 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .CIBench_gen_eb42f9 import ci_datasets  # noqa: F401, F403
+    from .CIBench_gen_8ab0dc import ci_datasets  # noqa: F401, F403
@@ -16,28 +16,20 @@ cibench_infer_cfg = dict(
         template="""{questions}""",
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=AgentInferencer),
+    inferencer=dict(type=AgentInferencer, infer_mode='every'),
 )


 libs = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
-cibench_eval_cfg = {
-    lib: dict(
-        evaluator=dict(
-            type=CIBenchEvaluator,
-            output_dir=f'output_data/cibench/{lib}'),
-        pred_role="BOT",
-    )
-    for lib in libs
-}
+cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")

 cibench_datasets = [
     dict(
-        abbr=f"cibench_{lib}",
+        abbr=f"cibench_generation_{lib}",
         type=CIBenchDataset,
         path=f"./data/cibench/{lib}",
         reader_cfg=cibench_reader_cfg,
         infer_cfg=cibench_infer_cfg,
-        eval_cfg=cibench_eval_cfg[lib],
+        eval_cfg=cibench_eval_cfg,
     ) for lib in libs
 ]
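For reference, a minimal sketch (not part of this commit) of what a single entry of `cibench_datasets` expands to under the new shared `cibench_eval_cfg`, assuming the reader/infer configs defined above:

# Hypothetical illustration only: the entry generated for lib == 'Pandas'.
dict(
    abbr="cibench_generation_Pandas",
    type=CIBenchDataset,
    path="./data/cibench/Pandas",
    reader_cfg=cibench_reader_cfg,
    infer_cfg=cibench_infer_cfg,   # inferencer now runs with infer_mode='every'
    eval_cfg=cibench_eval_cfg,     # one shared evaluator, no per-lib output_dir
)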
@@ -95,7 +95,7 @@ mathbench_sets = {
 # Use circular evaluation or not
 with_circular_eval = True

-mathbench_code_datasets = []
+mathbench_agent_datasets = []

 for _split in list(mathbench_sets.keys()):
     for _name in mathbench_sets[_split]:
@@ -112,13 +112,13 @@ for _split in list(mathbench_sets.keys()):
             evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
             pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))

-        mathbench_code_datasets.append(
+        mathbench_agent_datasets.append(
             dict(
+                abbr="mathbench-" + _split + '-' + _name + '-agent',
                 type=MathBenchDataset,
                 path=f"./data/mathbench/{_split}",
                 name=_name,
                 with_circular=with_circular_eval,
-                abbr="mathbench-interpreter-" + _split + '-' + _name,
                 reader_cfg=dict(
                     input_columns=["question"],
                     output_column="answer"
@@ -6,17 +6,17 @@ from opencompass.datasets import MathBenchDataset, mathbench_postprocess

 cloze_prompts ={
     "cloze_arith_en": [
         dict(role='HUMAN', prompt='Q: Calculate (341/11)/(9/(-6)*(-2)/3).'),
         dict(role='BOT', prompt='A: First, (9/(-6)*(-2)/3) can be simplified by : 9/(-6) = -1.5, -1.5 * (-2) = 3, 3 / 3 = 1. So, (9/(-6)*(-2)/3) is equal to 1. Now, we have `(341/11)/1` equals `341/11`. Finally, calculate `341/11 = 31`. The answer is 31.\n'),
         dict(role='HUMAN', prompt='Q: In base 14, what is 5 - 638d8d?'),
         dict(role='BOT', prompt='A: 5 - 638d8d = -638d88. The answer is -638d88.\n'),
         dict(role='HUMAN', prompt='Q: What is -491354 times -0.34?'),
         dict(role='BOT', prompt='A: The product of -491354 and -0.34 is 167060.36. The answer is 167060.36.\n'),
         dict(role='HUMAN', prompt='Q: What is the value of (-55)/(6930/(-382)) + (0 - 3)?.'),
         dict(role='BOT', prompt='A: First, (-55)/(6930/(-382)) = (-55)/(-(6930/382)) = 55*382/6930 = 21010/6930 = 2101/693. Then, 2101/693 + (0 - 3) = 2101/693 - 3 = 2101/693 - 3*693/693 = (2101-2079)/693 = 22/693 = 2/63. The answer is 2/63.\n'),
         dict(role='HUMAN', prompt='Q: {question}'),
         dict(role='BOT', prompt='A: {answer}\n'),
     ]
 }

 mathbench_sets = {
@@ -94,11 +94,11 @@ for _split in list(mathbench_sets.keys()):

         mathbench_datasets.append(
             dict(
+                abbr="mathbench-" + _split + '-' + _name,
                 type=MathBenchDataset,
                 path=f"./data/mathbench/{_split}",
                 name=_name,
                 with_circular=with_circular_eval,
-                abbr="mathbench-" + _split + '-' + _name,
                 reader_cfg=dict(
                     input_columns=["question"],
                     output_column="answer"
configs/datasets/ds1000/ds1000_compl_gen_cbc84f.py (new file, 69 lines)
@@ -0,0 +1,69 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (DS1000Dataset, ds1000_completion_postprocess,
+                                  ds1000_matplotlib_postprocess,
+                                  DS1000Evaluator)
+
+ds1000_reader_cfg = dict(
+    input_columns=["prompt"],
+    output_column="test_column",
+    train_split='test',
+    test_split='test')
+
+ds1000_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role="HUMAN",
+                prompt="{prompt}",
+            ),
+        ]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+ds1000_eval_cfg = dict(
+    evaluator=dict(type=DS1000Evaluator),
+    pred_role="BOT",
+    pred_postprocessor=dict(type=ds1000_completion_postprocess),
+)
+
+# The DS-1000 dataset can be downloaded from
+# https://github.com/HKUNLP/DS-1000/blob/main/ds1000_data.zip
+ds1000_datasets = [
+    dict(
+        abbr=f"ds1000_{lib}",
+        type=DS1000Dataset,
+        path="./data/ds1000_data/",
+        libs=f"{lib}",
+        mode="Completion",
+        reader_cfg=ds1000_reader_cfg,
+        infer_cfg=ds1000_infer_cfg,
+        eval_cfg=ds1000_eval_cfg,
+    ) for lib in [
+        'Pandas',
+        'Numpy',
+        'Tensorflow',
+        'Scipy',
+        'Sklearn',
+        'Pytorch',
+    ]
+]
+ds1000_datasets.append(
+    dict(
+        abbr="ds1000_Matplotlib",
+        type=DS1000Dataset,
+        path="./data/ds1000_data/",
+        libs="Matplotlib",
+        mode="Completion",
+        reader_cfg=ds1000_reader_cfg,
+        infer_cfg=ds1000_infer_cfg,
+        eval_cfg=dict(
+            evaluator=dict(type=DS1000Evaluator),
+            pred_role="BOT",
+            pred_postprocessor=dict(type=ds1000_matplotlib_postprocess),
+        ),
+    ))
@@ -0,0 +1,68 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import DS1000Dataset, DS1000ServiceEvaluator
+
+ds1000_reader_cfg = dict(
+    input_columns=["prompt"],
+    output_column="test_column",
+    train_split='test',
+    test_split='test')
+
+ds1000_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role="HUMAN",
+                prompt="{prompt}",
+            ),
+        ]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+ds1000_eval_cfg_dict = {
+    lib: dict(
+        evaluator=dict(
+            type=DS1000ServiceEvaluator,
+            lib=lib,
+            ip_address=
+            "localhost",  # replace to your code_eval_server ip_address, port
+            port=5000
+        ),
+        pred_role="BOT")
+    for lib in [
+        'Pandas',
+        'Numpy',
+        'Tensorflow',
+        'Scipy',
+        'Sklearn',
+        'Pytorch',
+        'Matplotlib',
+    ]
+}
+
+# The DS-1000 dataset can be downloaded from
+# https://github.com/HKUNLP/DS-1000/blob/main/ds1000_data.zip
+ds1000_datasets = [
+    dict(
+        abbr=f"ds1000_{lib}",
+        type=DS1000Dataset,
+        path="./data/ds1000_data/",
+        libs=f"{lib}",
+        mode="Completion",
+        reader_cfg=ds1000_reader_cfg,
+        infer_cfg=ds1000_infer_cfg,
+        eval_cfg=ds1000_eval_cfg_dict[lib],
+    ) for lib in [
+        'Pandas',
+        'Numpy',
+        'Tensorflow',
+        'Scipy',
+        'Sklearn',
+        'Pytorch',
+        'Matplotlib',
+    ]
+]
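The `DS1000ServiceEvaluator` entries above expect a running code evaluation service; a minimal sketch (not part of this commit, the host address is a placeholder) of pointing every per-library evaluator at a remote code_eval_server:

# Hypothetical override: redirect the evaluators to a deployed service.
for _cfg in ds1000_eval_cfg_dict.values():
    _cfg['evaluator']['ip_address'] = '10.0.0.5'  # placeholder: your code_eval_server host
    _cfg['evaluator']['port'] = 5000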
@@ -45,7 +45,7 @@ gsm8k_eval_cfg = dict(

 gsm8k_datasets = [
     dict(
-        abbr='gsm8k',
+        abbr='gsm8k-agent',
         type=GSM8KDataset,
         path='./data/gsm8k',
         reader_cfg=gsm8k_reader_cfg,
configs/datasets/gsm8k/gsm8k_gen_3309bd.py (new file, 39 lines)
@@ -0,0 +1,39 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
+
+gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
+
+gsm8k_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt="Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\nAnswer:"),
+                dict(role='BOT', prompt="Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n"),
+                dict(role='HUMAN', prompt="Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?\nLet's think step by step\nAnswer:"),
+                dict(role='BOT', prompt="Mark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\nHis team also scores 6 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\nThey scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\nAll together his team scored 50+24+10= 84 points\nMark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\nHis opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\nThey also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\nAll together Mark's opponents scored 100+12+5=117 points\nThe total score for the game is both team's scores added together, so it is 84+117=201 points\nThe answer is 201\n"),
+                dict(role='HUMAN', prompt="Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nLet's think step by step\nAnswer:"),
+                dict(role='BOT', prompt="When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n"),
+                dict(role='HUMAN', prompt="Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\nLet's think step by step\nAnswer:"),
+                dict(role='BOT', prompt="For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n"),
+                dict(role='HUMAN', prompt="Question: {question}\nLet's think step by step\nAnswer:"),
+            ],
+        )),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=[":", "Question:", "Question"]))
+
+gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
+                      pred_postprocessor=dict(type=gsm8k_postprocess),
+                      dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))
+
+gsm8k_datasets = [
+    dict(
+        abbr='gsm8k',
+        type=GSM8KDataset,
+        path='./data/gsm8k',
+        reader_cfg=gsm8k_reader_cfg,
+        infer_cfg=gsm8k_infer_cfg,
+        eval_cfg=gsm8k_eval_cfg)
+]
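Like the other dataset configs in this repository, the new file is meant to be pulled in through `read_base()`; a minimal usage sketch (not part of this commit):

from mmengine.config import read_base

with read_base():
    # module path follows the new file's location under configs/
    from .datasets.gsm8k.gsm8k_gen_3309bd import gsm8k_datasets

datasets = [*gsm8k_datasets]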
@@ -0,0 +1,57 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLOnlyInferencer
+from opencompass.openicl.icl_evaluator import AveragePPLEvaluator
+from opencompass.datasets import GSM8KDataset, GSM8KReferenceSkywork
+
+gsm8k_datasets = []
+
+gsm8k_infer_cfg = dict(
+    prompt_template=dict(type=PromptTemplate, template="{question} {answer}"),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLOnlyInferencer),
+)
+
+gsm8k_eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator))
+
+for split in ['train', 'test']:
+    gsm8k_reader_cfg = dict(
+        input_columns=['question', 'answer'],
+        output_column=None,
+        train_split=split,
+        test_split=split,
+    )
+    gsm8k_datasets.append(
+        dict(
+            abbr=f'gsm8k-{split}-ppl',
+            type=GSM8KDataset,
+            path='./data/gsm8k',
+            reader_cfg=gsm8k_reader_cfg,
+            infer_cfg=gsm8k_infer_cfg,
+            eval_cfg=gsm8k_eval_cfg)
+    )
+
+
+gsm8k_infer_cfg = dict(
+    prompt_template=dict(type=PromptTemplate, template="{text}"),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLOnlyInferencer),
+)
+
+gsm8k_eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator))
+
+gsm8k_reader_cfg = dict(
+    input_columns=['text'],
+    output_column=None,
+)
+
+gsm8k_datasets.append(
+    dict(
+        abbr=f'gsm8k-ref-ppl',
+        type=GSM8KReferenceSkywork,
+        path='./data/gsm8k-extra/mock_gsm8k_test.jsonl',
+        reader_cfg=gsm8k_reader_cfg,
+        infer_cfg=gsm8k_infer_cfg,
+        eval_cfg=gsm8k_eval_cfg
+    )
+)
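The `PPLOnlyInferencer` / `AveragePPLEvaluator` pair reports a perplexity-style number rather than accuracy; a rough sketch of the quantity being averaged (an assumed definition for illustration, not code from this commit):

import math

# Assumed metric: mean perplexity over samples, each computed from per-token NLLs.
def average_ppl(nlls_per_sample):
    ppls = [math.exp(sum(nll) / len(nll)) for nll in nlls_per_sample]
    return sum(ppls) / len(ppls)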
@@ -79,7 +79,7 @@ math_eval_cfg = dict(

 math_datasets = [
     dict(
-        abbr='math',
+        abbr='math-agent',
         type=MATHDataset,
         path='./data/math/math.json',
         reader_cfg=math_reader_cfg,
@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .winogrande_ppl_55a66e import winogrande_datasets  # noqa: F401, F403
+    from .winogrande_ppl_8be6c3 import winogrande_datasets  # noqa: F401, F403
@@ -4,6 +4,10 @@ from opencompass.openicl.icl_inferencer import PPLInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import winograndeDataset

+# WARNING: This config cannot reproduce results in the paper.
+# e.g. LLAMA2-7B Winogrande 69.2 (paper) -> 62.27 (this config)
+# Please try winogrande_ppl_8be6c3
+
 winogrande_reader_cfg = dict(
     input_columns=['opt1', 'opt2'],
     output_column='answer',
configs/datasets/winogrande/winogrande_ppl_8be6c3.py (new file, 33 lines)
@@ -0,0 +1,33 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import LoglikelihoodInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import winograndeDataset
+
+winogrande_reader_cfg = dict(
+    input_columns=['opt1', 'opt2'],
+    output_column='answer',
+)
+
+winogrande_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            1: "{opt1}",
+            2: "{opt2}",
+        }
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=LoglikelihoodInferencer))
+
+winogrande_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+winogrande_datasets = [
+    dict(
+        abbr='winogrande',
+        type=winograndeDataset,
+        path='./data/winogrande',
+        reader_cfg=winogrande_reader_cfg,
+        infer_cfg=winogrande_infer_cfg,
+        eval_cfg=winogrande_eval_cfg)
+]
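The replacement config scores each Winogrande item by comparing the likelihood the model assigns to the two filled-in sentences (`opt1` vs `opt2`); a toy sketch of that comparison (illustration only, not the inferencer's actual implementation):

# Toy illustration: pick the option whose completed sentence the model finds more likely.
def pick_option(loglik_opt1: float, loglik_opt2: float) -> str:
    return '1' if loglik_opt1 >= loglik_opt2 else '2'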
@@ -4,6 +4,10 @@ from opencompass.openicl.icl_inferencer import PPLInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import winograndeDataset

+# WARNING: This config cannot reproduce results in the paper.
+# e.g. LLAMA2-7B Winogrande 69.2 (paper) -> 62.27 (this config)
+# Please try winogrande_ppl_8be6c3
+
 winogrande_reader_cfg = dict(
     input_columns=['opt1', 'opt2'],
     output_column='answer',
@@ -4,11 +4,20 @@ from opencompass.partitioners import SizePartitioner
 from opencompass.runners import LocalRunner
 from opencompass.tasks import OpenICLInferTask
 from opencompass.models.lagent import LagentAgent
-from lagent import PythonInterpreter, ReAct
+from opencompass.lagent.actions.python_interpreter import PythonInterpreter
+from lagent import ReAct
 from lagent.agents.react import ReActProtocol

 with read_base():
-    from .datasets.gsm8k.gsm8k_agent_gen_3ac57d import gsm8k_datasets as datasets
+    from .datasets.gsm8k.gsm8k_agent_gen_3ac57d import gsm8k_datasets
+    from .datasets.math.math_agent_gen_861b4f import math_datasets
+    from .datasets.MathBench.mathbench_agent_gen_568903 import mathbench_agent_datasets
+    from .summarizers.math_agent import summarizer
+
+datasets = []
+datasets += gsm8k_datasets
+datasets += math_datasets
+datasets += mathbench_agent_datasets

 system_prompt = """You are a helpful assistant which use tools to solve mathematical reasoning questions. The code must be a function, and the function name must be 'solution'. For mathematics, please use code tool to calculate. The example format is as follows:
 ```
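Because each dataset config is a plain dict with an `abbr` key, the merged `datasets` list built above can be filtered before a run; a minimal sketch (not part of this commit) of restricting a smoke test to the GSM8K agent split:

# Hypothetical smoke-test filter applied on top of the merged agent datasets.
datasets = [d for d in datasets if d['abbr'].startswith('gsm8k')]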
@@ -10,7 +10,7 @@ from opencompass.runners import LocalRunner
 from opencompass.tasks import OpenICLInferTask

 with read_base():
-    from .datasets.CIBench.CIBench_gen_eb42f9 import \
+    from .datasets.CIBench.CIBench_gen_8ab0dc import \
         cibench_datasets as datasets

 FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
@@ -36,7 +36,21 @@ Also please follow the guidelines:
 3. The generated codes will be executed in an ipython manner and the results will be cached.
 4. Your responded code should always be simple and only solves the problem in current step.

-Begin!
+For example:
+
+File url: `xxxx`
+### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
+
+{thought} We should use `pandas` to solve this step.
+{action} IPythonInterpreter
+{action_input} ```python
+import pandas as pd
+url = "xxxx"
+data = pd.read_csv(url)
+```
+{response} The code is succeed without any outputs.
+
+Let us begin from here!
 """

 IPYTHON_INTERPRETER_DESCRIPTION = '''\
@@ -69,9 +83,6 @@ models = [
     ),
 ]

-for dataset in datasets:
-    # Evaluate on every assistant response
-    dataset['infer_cfg']['inferencer']['infer_mode'] = 'every'

 infer = dict(
     partitioner=dict(type=SizePartitioner, max_task_size=1000),
@@ -1,56 +0,0 @@
-from mmengine.config import read_base
-from opencompass.models.openai_api import OpenAI
-from opencompass.partitioners import SizePartitioner
-from opencompass.runners import LocalRunner
-from opencompass.tasks import OpenICLInferTask
-from opencompass.models.lagent import LagentAgent
-from lagent import PythonInterpreter, ReAct
-from lagent.agents.react import ReActProtocol
-
-system_prompt = """You are a helpful assistant which use tools to solve mathematical reasoning questions. The code must be a function, and the function name must be 'solution'. For mathematics, please use code tool to calculate. The example format is as follows:
-```
-def solution():
-    variable_names_with_real_meaning = func(variable)
-    return variable_names_with_real_meaning
-```"""
-
-protocol = dict(
-    type=ReActProtocol,
-    action=dict(role="ACTION", begin="Tool:", end="\n"),
-    action_input=dict(role="ARGS", begin="Tool Input:", end="\n"),
-    finish=dict(role="FINISH", begin="FinalAnswer:", end="\n"),
-    call_protocol=system_prompt,
-)
-
-with read_base():
-    from .datasets.MathBench.mathbench_code_gen_568903 import mathbench_code_datasets as datasets
-    from .summarizers.mathbench import summarizer
-
-models = [
-    dict(
-        abbr='gpt-3.5-react',
-        type=LagentAgent,
-        agent_type=ReAct,
-        max_turn=3,
-        llm=dict(
-            type=OpenAI,
-            path='gpt-3.5-turbo',
-            key='ENV',
-            query_per_second=1,
-            max_seq_len=4096,
-        ),
-        actions=[
-            dict(type=PythonInterpreter),
-        ],
-        protocol=protocol,
-        batch_size=1,
-    ),
-]
-
-infer = dict(
-    partitioner=dict(type=SizePartitioner, max_task_size=1000),
-    runner=dict(
-        type=LocalRunner,
-        max_num_workers=16,
-        task=dict(type=OpenICLInferTask)),
-)
configs/eval_with_model_dataset_combinations.py (new file, 43 lines)
@@ -0,0 +1,43 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .models.qwen.hf_qwen_7b import models as hf_qwen_7b_base_models
+    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat_models
+
+    from .datasets.ceval.ceval_ppl_578f8d import ceval_datasets as base_ceval_datasets
+    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets as chat_ceval_datasets
+
+    from .internal.clusters.slurm import infer, eval
+    # from .clusters.slurm import infer_split as infer, eval
+    # from .clusters.slurm import infer_size as infer, eval
+    # from .clusters.slurm import infer_size_split as infer, eval
+
+base_ceval_datasets = base_ceval_datasets[:1]
+chat_ceval_datasets = chat_ceval_datasets[-1:]
+
+# If you do not want to run all the combinations of models and datasets, you
+# can specify the combinations you want to run here. This is useful when you
+# deleberately want to skip some subset of the combinations.
+# Models and datasets in different combinations are recommended to be disjoint
+# (different `abbr` in model & dataset configs), as we haven't tested this case
+# throughly.
+model_dataset_combinations = [
+    dict(models=hf_qwen_7b_base_models, datasets=base_ceval_datasets),
+    dict(models=hf_qwen_7b_chat_models, datasets=chat_ceval_datasets),
+    # dict(models=[model_cfg1, ...], datasets=[dataset_cfg1, ...]),
+]
+
+# This union of models and datasets in model_dataset_combinations should be
+# stored in the `models` and `datasets` variables below. Otherwise, modules
+# like summarizer will miss out some information.
+models = [*hf_qwen_7b_base_models, *hf_qwen_7b_chat_models]
+datasets = [*base_ceval_datasets, *chat_ceval_datasets]
+
+work_dir = './outputs/default/mdcomb/'
+
+"""
+dataset                  version    metric    mode    qwen-7b-hf    qwen-7b-chat-hf
+----------------------  ---------  --------  ------  ------------  -----------------
+ceval-computer_network  9b9417     accuracy  ppl     52.63         -
+ceval-physician          6e277d     accuracy  gen     -             59.18
+"""
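A minimal sketch (not part of this commit) of adding a third combination; it reuses the already-imported model and dataset configs so the `models`/`datasets` unions declared above stay consistent, as the comments in the file require:

# Hypothetical extra combination: run the chat model on the base (ppl) ceval subset too.
model_dataset_combinations.append(
    dict(models=hf_qwen_7b_chat_models, datasets=base_ceval_datasets))
# models and datasets already contain these configs, so no union update is needed.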
@@ -29,5 +29,6 @@ models = [
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=2, num_procs=1),
+        end_str='<eoa>',
     )
 ]
@@ -29,5 +29,6 @@ models = [
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=1, num_procs=1),
+        end_str='<eoa>',
     )
 ]
@@ -29,5 +29,6 @@ models = [
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=1, num_procs=1),
+        end_str='<eoa>',
     )
 ]
@@ -22,12 +22,14 @@ models = [
             padding_side='left',
             truncation_side='left',
             trust_remote_code=True,
-            use_fast=False,),
+            use_fast=False,
+        ),
         pad_token_id=151643,
         max_out_len=100,
         max_seq_len=2048,
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=1, num_procs=1),
+        end_str='<|im_end|>',
     )
 ]
@@ -22,12 +22,14 @@ models = [
             padding_side='left',
             truncation_side='left',
             trust_remote_code=True,
-            use_fast=False,),
+            use_fast=False,
+        ),
         pad_token_id=151643,
         max_out_len=100,
         max_seq_len=2048,
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=1, num_procs=1),
+        end_str='<|im_end|>',
     )
 ]
configs/summarizers/groups/cibench.py (new file, 4 lines)
@@ -0,0 +1,4 @@
+
+_cibench = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
+_cibench = ['cibench_' + i for i in _cibench]
+cibench_summary_groups = [{'name': 'cibench', 'subsets': _cibench}]
configs/summarizers/groups/mathbench.py (new file, 75 lines)
@@ -0,0 +1,75 @@
+
+mathbench_summary_groups = [
+    {
+        'name': 'mathbench-college',
+        'subsets': [
+            ['mathbench-college-single_choice_cn', 'acc_1'],
+            ['mathbench-college-cloze_en', 'accuracy'],
+        ]
+    },
+    {
+        'name': 'mathbench-high',
+        'subsets': [
+            ['mathbench-high-single_choice_cn', 'acc_1'],
+            ['mathbench-high-single_choice_en', 'acc_1'],
+        ]
+    },
+    {
+        'name': 'mathbench-middle',
+        'subsets': [
+            ['mathbench-middle-single_choice_cn', 'acc_1'],
+        ]
+    },
+    {
+        'name': 'mathbench-primary',
+        'subsets': [
+            ['mathbench-primary-cloze_cn', 'accuracy'],
+        ]
+    },
+    {
+        'name': 'mathbench',
+        'subsets': [
+            'mathbench-college',
+            'mathbench-high',
+            'mathbench-middle',
+            'mathbench-primary',
+        ],
+    },
+    {
+        'name': 'mathbench-college-circular',
+        'subsets': [
+            ['mathbench-college-single_choice_cn', 'perf_4'],
+        ]
+    },
+    {
+        'name': 'mathbench-high-circular',
+        'subsets': [
+            ['mathbench-high-single_choice_cn', 'perf_4'],
+            ['mathbench-high-single_choice_en', 'perf_4'],
+        ]
+    },
+    {
+        'name': 'mathbench-middle-circular',
+        'subsets': [
+            ['mathbench-middle-single_choice_cn', 'perf_4'],
+        ]
+    },
+    {
+        'name': 'mathbench-circular',
+        'subsets': [
+            'mathbench-college-circular',
+            'mathbench-high-circular',
+            'mathbench-middle-circular',
+        ],
+    },
+    {
+        'name': 'mathbench-circular-and-cloze',
+        'subsets': [
+            'mathbench-high-circular',
+            'mathbench-middle-circular',
+            'mathbench-circular',
+            'mathbench-college-cloze_en',
+            'mathbench-primary-cloze_cn',
+        ],
+    }
+]
configs/summarizers/math_agent.py (new file, 28 lines)
@@ -0,0 +1,28 @@
+summarizer = dict(
+    dataset_abbrs=[
+        '######## GSM8K-Agent Accuracy ########', # category
+        ['gsm8k-agent', 'follow_acc'],
+        ['gsm8k-agent', 'reasoning_acc'],
+        ['gsm8k-agent', 'code_acc'],
+        ['gsm8k-agent', 'action_pct'],
+        '######## MATH-Agent Accuracy ########', # category
+        ['math-agent', 'follow_acc'],
+        ['math-agent', 'reasoning_acc'],
+        ['math-agent', 'code_acc'],
+        ['math-agent', 'action_pct'],
+        '######## MathBench-Agent Accuracy ########', # category
+        ['mathbench-college-single_choice_cn-agent', 'acc_1'],
+        ['mathbench-college-cloze_en-agent', 'accuracy'],
+        ['mathbench-high-single_choice_cn-agent', 'acc_1'],
+        ['mathbench-high-single_choice_en-agent', 'acc_1'],
+        ['mathbench-middle-single_choice_cn-agent', 'acc_1'],
+        ['mathbench-primary-cloze_cn-agent', 'accuracy'],
+        '######## MathBench-Agent CircularEval ########', # category
+        ['mathbench-college-single_choice_cn-agent', 'perf_4'],
+        ['mathbench-high-single_choice_cn-agent', 'perf_4'],
+        ['mathbench-high-single_choice_en-agent', 'perf_4'],
+        ['mathbench-middle-single_choice_cn-agent', 'perf_4'],
+    ],
+    summary_groups=sum(
+        [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
+)
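The `summary_groups=sum([...])` expression simply concatenates every `*_summary_groups` list present in the config module's namespace (for example the new cibench and mathbench group files once they are imported via `read_base()`); a small sketch of the same idiom in isolation (not part of this commit):

# Toy module-level namespace: two group lists plus an unrelated variable.
mathbench_summary_groups = [{'name': 'mathbench', 'subsets': ['mathbench-college']}]
cibench_summary_groups = [{'name': 'cibench', 'subsets': ['cibench_Pandas']}]
other_var = 42

summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
# -> the two group dicts concatenated into one flat list; other_var is ignored.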
@@ -2,13 +2,15 @@ import json
 import os
 import os.path as osp
 import re
+import subprocess
+from collections import defaultdict
 from typing import List, Optional

 import numpy as np
 from datasets import Dataset

 from opencompass.openicl.icl_evaluator import BaseEvaluator
-from opencompass.registry import LOAD_DATASET
+from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET

 from .base import BaseDataset

|
|||||||
with open(file, 'r') as f:
|
with open(file, 'r') as f:
|
||||||
notebook = json.load(f)
|
notebook = json.load(f)
|
||||||
example = notebook['cells']
|
example = notebook['cells']
|
||||||
|
metadata = notebook['metadata']
|
||||||
|
modules = metadata.get('modules', [])
|
||||||
|
if modules:
|
||||||
|
# these two annotations should be the same
|
||||||
|
assert len(modules) == len(metadata.get('step_types'))
|
||||||
|
# reformat annotations
|
||||||
|
modules = [[_m.strip() for _m in _modules.split('&')]
|
||||||
|
for _modules in modules]
|
||||||
questions = []
|
questions = []
|
||||||
|
source_codes = []
|
||||||
outputs = []
|
outputs = []
|
||||||
tags = []
|
tags = []
|
||||||
for cell in example:
|
for cell in example:
|
||||||
if cell['cell_type'] == 'markdown':
|
if cell['cell_type'] == 'markdown':
|
||||||
text = ''.join(cell['source'])
|
text = ''.join(cell['source']).strip()
|
||||||
|
if modules:
|
||||||
|
_modules = modules.pop(0)
|
||||||
|
text += f"Please use {' and '.join(_modules)} modules."
|
||||||
|
text = text.strip() + '\n'
|
||||||
# append the formatted text
|
# append the formatted text
|
||||||
questions.append(text)
|
questions.append(text)
|
||||||
elif cell['cell_type'] == 'code':
|
elif cell['cell_type'] == 'code':
|
||||||
|
source_codes.append(''.join(cell['source']))
|
||||||
if cell['outputs'] and 'data' in cell['outputs'][-1]:
|
if cell['outputs'] and 'data' in cell['outputs'][-1]:
|
||||||
if 'image/png' in cell['outputs'][-1]['data']:
|
if 'image/png' in cell['outputs'][-1]['data']:
|
||||||
# skip vis temporarily due to lack of evaluation
|
# skip vis temporarily due to lack of evaluation
|
||||||
@@ -39,15 +54,18 @@ def load_experiment(file: str) -> dict:
                 outputs.append(''.join(
                     cell['outputs'][-1]['data']['text/plain']))
             else:
-                tags.append('executable')
+                tags.append('exec')
                 outputs.append(None)
     return dict(
         experiment=file,
         questions=sum(([
             dict(role='user', content=question),
-            dict(role='assistant', content=output)
-        ] for question, output in zip(questions, outputs)), []),
-        references=dict(outputs=outputs, tags=tags, experiment=file),
+            dict(role='assistant', content=source_code)
+        ] for question, source_code in zip(questions, source_codes)), []),
+        references=dict(outputs=outputs,
+                        tags=tags,
+                        metadata=metadata,
+                        experiment=file),
     )

@@ -58,6 +76,7 @@ class CIBenchDataset(BaseDataset):
     @staticmethod
     def load(path: str):
         """Load whole dataset."""
+        assert os.path.exists(path), f'Path {path} does not exist.'
         data_list = []
         for cwd, dirs, files in os.walk(path):
             dirs.sort()
|
|||||||
"""Evaluator for CI dataset.
|
"""Evaluator for CI dataset.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
text_evaluator (optional, dict): The text evaluator for text result
|
||||||
|
comparison[]. Defaults to None, which use Rouge as defaults.
|
||||||
|
Please notice that a extra key for `metric_name` should be set
|
||||||
|
to get the exact metric result, such as `rouge1`.
|
||||||
output_dir (optional, str): The directory to save experiment
|
output_dir (optional, str): The directory to save experiment
|
||||||
files in a markdown or notebook format.
|
files in a markdown or notebook format.
|
||||||
|
with_ipynb (bool): Generate ipynb correspondingly.
|
||||||
|
Defaults to False.
|
||||||
user_data_dir (str): The directory to load local files.
|
user_data_dir (str): The directory to load local files.
|
||||||
Defaults to 'ENV', which means use environment variable
|
Defaults to 'ENV', which means use environment variable
|
||||||
`USER_DATA_DIR` to get the data dir.
|
`USER_DATA_DIR` to get the data dir.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
|
text_evaluator: Optional[dict] = None,
|
||||||
output_dir: Optional[str] = None,
|
output_dir: Optional[str] = None,
|
||||||
|
with_ipynb: bool = False,
|
||||||
user_data_dir: str = 'ENV') -> None:
|
user_data_dir: str = 'ENV') -> None:
|
||||||
|
if text_evaluator is None:
|
||||||
|
from opencompass.openicl.icl_evaluator import RougeEvaluator
|
||||||
|
self.text_evaluator = ICL_EVALUATORS.build(
|
||||||
|
dict(type=RougeEvaluator))
|
||||||
|
self.text_eval_metric = 'rouge1'
|
||||||
|
else:
|
||||||
|
self.text_eval_metric = text_evaluator.pop('metric_name')
|
||||||
|
self.text_evaluator = ICL_EVALUATORS.build(text_evaluator)
|
||||||
# TODO: should use work dir for this task.
|
# TODO: should use work dir for this task.
|
||||||
self.output_dir = output_dir
|
self.output_dir = output_dir
|
||||||
|
self.user_data_dir = self.check_user_data_dir(user_data_dir)
|
||||||
|
self.with_ipynb = with_ipynb
|
||||||
|
self.TAG_MAPPING = {
|
||||||
|
'exec': ('executable', self.valid_step),
|
||||||
|
'general': ('general_correct', self.correct_step),
|
||||||
|
'num': ('numeric_correct', self.correct_step),
|
||||||
|
'text': ('text_score', self.text_step),
|
||||||
|
'vis': ('vis_sim', self.vis_similarity_step),
|
||||||
|
}
|
||||||
|
|
||||||
|
def check_user_data_dir(self, user_data_dir):
|
||||||
if user_data_dir == 'ENV':
|
if user_data_dir == 'ENV':
|
||||||
user_data_dir = os.environ.get('USER_DATA_DIR', '')
|
user_data_dir = os.environ.get('USER_DATA_DIR', '')
|
||||||
self.user_data_dir = user_data_dir
|
user_data_dir = user_data_dir.rstrip('/')
|
||||||
|
basename = osp.basename(user_data_dir)
|
||||||
|
if basename and basename != 'data':
|
||||||
|
user_data_dir = osp.join(user_data_dir, 'data')
|
||||||
|
assert osp.exists(user_data_dir), \
|
||||||
|
f'a subfolder named `data` should exist under {user_data_dir}.'
|
||||||
|
elif basename:
|
||||||
|
assert osp.exists(user_data_dir), \
|
||||||
|
f'{user_data_dir} does not exist.'
|
||||||
|
return user_data_dir
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def valid_step(step):
|
def valid_step(step):
|
||||||
@@ -126,6 +181,24 @@ class CIBenchEvaluator(BaseEvaluator):
             # Fall back to False
             return False

+    def text_step(self, step, target):
+        """Whether the step output is correct."""
+        # Found the latest code interpreter to determine correct
+        for action in step[::-1]:
+            if action['type'] == 'IPythonInterpreter':
+                if action['result']:
+                    try:
+                        pred = action['result']['text']
+                        match = re.search('```\n(.*?)\n```', pred, re.DOTALL)
+                        if match:
+                            out = match.group(1)
+                            score = self.text_evaluator.score([out], [target])
+                            return score[self.text_eval_metric] / 100
+                    except Exception:
+                        return False
+        # Fall back to False
+        return False
+
     @staticmethod
     def vis_similarity_step(step, target):
         """Whether the step output image has the same structure similarity with
@@ -174,6 +247,7 @@ class CIBenchEvaluator(BaseEvaluator):
                 'the conversion processes.')

         check_jupytext()
+        p_list = []
         from opencompass.lagent.actions.ipython_interpreter import extract_code
         for idx, (example_origin_prompt,
                   example_steps) in enumerate(zip(origin_prompt, steps)):
@@ -198,20 +272,25 @@ class CIBenchEvaluator(BaseEvaluator):
                 f.writelines(markdown_lines)

             # TODO: be careful for this
+            # The result might be different with infer process
+            # please check carefully
             # convert markdown to ipynb and exectue with error tolerance
-            # subprocess.Popen(
-            #     "jupytext --to ipynb --pipe-fmt ipynb "
-            #     "--pipe 'jupyter nbconvert --to ipynb --execute "
-            #     f"--allow-errors --stdin --stdout' {md_file}",
-            #     shell=True)
+            if self.with_ipynb:
+                p = subprocess.Popen(
+                    'jupytext --to ipynb --pipe-fmt ipynb '
+                    "--pipe 'jupyter nbconvert --to ipynb --execute "
+                    f"--allow-errors --stdin --stdout' {md_file}",
+                    shell=True)
+                p_list.append(p)
+        # TODO: async wait
+        for p in p_list:
+            p.wait()

     def set_data_dir(self, work_dir):
         """Set work directory and link data files for save notebook results."""
         if self.user_data_dir:
-            if self.user_data_dir.endswith('/'):
-                basename = osp.basename(osp.split(self.user_data_dir)[0])
-            else:
-                basename = osp.basename(self.user_data_dir)
+            basename = osp.basename(self.user_data_dir)
             if not osp.exists(osp.join(self.output_dir, basename)):
                 os.symlink(self.user_data_dir,
                            osp.join(self.output_dir, basename))
@@ -221,10 +300,54 @@ class CIBenchEvaluator(BaseEvaluator):
         """Change work directory and keep the symlink."""
         os.chdir(work_dir)

+    def single_exp(self, gold, steps):
+        tags = gold['tags']
+        outputs = gold['outputs']
+        metadata = gold['metadata']
+        hard_tags = metadata.get('step_types', [])
+        if hard_tags:
+            tags = hard_tags
+
+        # executable: exec succeed
+        # general_correct: general correct
+        # numeric_correct: numerical correct
+        # text_score: text score
+        # vis_sim: visual similarity
+        result = defaultdict(list)
+        for tag, step, output in zip(tags, steps, outputs):
+            # check whether this step is valid
+            result['executable'].append(self.valid_step(step))
+            if tag != 'exec':
+                key, func = self.TAG_MAPPING[tag]
+                result[key].append(func(step, output))
+
+        # add missing metric for better analyse if not exists
+        if hard_tags:
+            check_tags = ['exec', 'num', 'text', 'vis']
+        else:
+            check_tags = ['exec', 'general', 'vis']
+        for tag in check_tags:
+            key = self.TAG_MAPPING[tag][0]
+            if key not in result:
+                result[key] = []
+
+        return result
+
+    def get_output_dir(self):
+        """Get output dir from eval task.
+
+        Notice: output dir should be in format xxx/data.
+        All the needed files should be
+        """
+        # hard hack for get output dir from eval task
+        if hasattr(self, '_out_dir') and self.output_dir is None:
+            self.output_dir = self._out_dir
+
     def score(self, predictions: List, references: List, steps: List,
               origin_prompt: List):
         """Calculate accuracy."""
         cwd = os.getcwd()
+        self.get_output_dir()
         if self.output_dir:
             if not osp.exists(self.output_dir):
                 os.makedirs(self.output_dir)
@@ -232,56 +355,20 @@ class CIBenchEvaluator(BaseEvaluator):
             self.save_results(origin_prompt, steps)
             self.unset_data_dir(cwd)

-        num_cells_list = []
-        num_general_list = []
-        passed_list = []
-        correct_list = []
-        vis_list = []
+        total_results = defaultdict(float)
+        total_scores = defaultdict(float)
+        total_nums = defaultdict(int)
         for gold, single_steps in zip(references, steps):
-            tags = gold['tags']
-            outputs = gold['outputs']
-            num_cells = len(tags)
-            num_general = sum([tag == 'general' for tag in tags])
-
-            passed = sum([self.valid_step(step) for step in single_steps])
-            correct = 0
-            vis_sim = []
-            for tag, step, output in zip(tags, single_steps, outputs):
-                if tag == 'general':
-                    correct += self.correct_step(step, output)
-                elif tag == 'vis':
-                    vis_sim.append(self.vis_similarity_step(step, output))
-
-            num_cells_list.append(num_cells)
-            num_general_list.append(num_general)
-            passed_list.append(passed)
-            correct_list.append(correct)
-            if vis_sim:
-                vis_list.append(sum(vis_sim) / len(vis_sim))
-            else:
-                vis_list.append(-1)
-
-        if len([v for v in vis_list if v >= 0]) > 0:
-            visualize_similarity = sum([v for v in vis_list if v >= 0]) / len(
-                [v for v in vis_list if v >= 0])
-        else:
-            # not valid
-            visualize_similarity = -1
-
-        if sum(num_general_list) > 0:
-            general_accuracy = sum(correct_list) / sum(num_general_list)
-        else:
-            # not valid
-            general_accuracy = -1
-
-        result = dict(
-            executable_rate=sum(passed_list) / sum(num_cells_list) * 100,
-            general_accuracy=general_accuracy * 100,
-            visualize_similarity=visualize_similarity * 100,
-            num_cells_list=num_cells_list,
-            num_general_list=num_general_list,
-            passed_list=passed_list,
-            correct_list=correct_list,
-            vis_list=vis_list,
-        )
-        return result
+            result = self.single_exp(gold, single_steps)
+
+            for k, v in result.items():
+                total_scores[k] += sum(v)
+                total_nums[k] += len(v)
+
+        for k, v in total_scores.items():
+            if total_nums[k] > 0:
+                total_results[k] = total_scores[k] / total_nums[k] * 100
+            else:
+                total_results[k] = -1
+
+        return total_results
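With the rewrite above, `score()` just averages, per metric key, the boolean/float lists that `single_exp()` collects for each experiment; a small worked sketch of that aggregation (illustration only, using made-up step results):

from collections import defaultdict

# Two pretend experiments: per-step scores keyed by metric name.
results = [
    {'executable': [True, True, False], 'general_correct': [True]},
    {'executable': [True], 'general_correct': [], 'vis_sim': [0.8]},
]

total_scores, total_nums = defaultdict(float), defaultdict(int)
for result in results:
    for k, v in result.items():
        total_scores[k] += sum(v)
        total_nums[k] += len(v)

total_results = {k: (total_scores[k] / total_nums[k] * 100 if total_nums[k] else -1)
                 for k in total_scores}
# e.g. executable -> 75.0, general_correct -> 100.0, vis_sim -> 80.0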
@@ -16,6 +16,8 @@ class cmnliDataset(BaseDataset):
         with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
+                if line['label'] == '-':
+                    continue
                 data.append(line)
         return Dataset.from_list(data)

@@ -143,6 +143,17 @@ def ds1000_postprocess(text: str) -> str:
     return text


+@TEXT_POSTPROCESSORS.register_module('ds1000_completion')
+def ds1000_completion_postprocess(text: str) -> str:
+    text += '</code>'
+
+    match = re.search('(.*?)</code>', text, re.DOTALL)
+    if match:
+        text = match.group(1)
+
+    return text
+
+
 @TEXT_POSTPROCESSORS.register_module('ds1000_matplotlib')
 def ds1000_matplotlib_postprocess(text: str) -> str:
     text = ds1000_postprocess(text)
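A short usage sketch (illustration only, with a made-up completion string) of the new postprocessor: it appends a closing tag and keeps everything before the first `</code>`:

raw = "df = df.dropna()\n</code>\nEND SOLUTION\n<code>\nprint(df)"
clean = ds1000_completion_postprocess(raw)
# clean == "df = df.dropna()\n"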
@@ -142,6 +142,6 @@ class Gsm8kAgentEvaluator(BaseEvaluator):
             reasoning_acc=100 *
             (reasoning_scope + final_scope + row_reasoning_scope) / total,
             code_acc=100 * (code_scope + final_scope) / total,
-            action_acc=100 * (action_scope + final_scope) / total,
+            action_pct=100 * (action_scope + final_scope) / total,
         )
         return result
@ -25,7 +25,7 @@ class WikiBenchDataset(BaseDataset):
|
|||||||
circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC']
|
circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC']
|
||||||
|
|
||||||
data = []
|
data = []
|
||||||
with open(path, 'r') as infile:
|
with open(path, 'r', encoding='utf-8') as infile:
|
||||||
for id, line in enumerate(infile):
|
for id, line in enumerate(infile):
|
||||||
entry = json.loads(line)
|
entry = json.loads(line)
|
||||||
if 'cloze' in name:
|
if 'cloze' in name:
|
||||||
|
@ -20,14 +20,14 @@ class winograndeDataset(BaseDataset):
|
|||||||
for line in f:
|
for line in f:
|
||||||
line = json.loads(line)
|
line = json.loads(line)
|
||||||
prompt = line['sentence']
|
prompt = line['sentence']
|
||||||
-                dataset_list.append({
-                    'opt1':
-                    prompt.replace('_', line['option1']),
-                    'opt2':
-                    prompt.replace('_', line['option2']),
-                    'answer':
-                    line['answer']
-                })
+                continue_prompt = prompt.split('_')
+                data_item = {
+                    'opt1': prompt.replace('_', line['option1']),
+                    'opt2': prompt.replace('_', line['option2']),
+                    'answer': line['answer'],
+                    'cont': continue_prompt[1]
+                }
+                dataset_list.append(data_item)
dataset_list = Dataset.from_list(dataset_list)
|
dataset_list = Dataset.from_list(dataset_list)
|
||||||
return dataset_list
|
return dataset_list
|
||||||
|
|
||||||
@ -46,13 +46,11 @@ class winograndeDataset_V2(BaseDataset):
|
|||||||
prompt = line['sentence']
|
prompt = line['sentence']
|
||||||
answer = line['answer']
|
answer = line['answer']
|
||||||
answer = ' AB'[int(answer)] if answer != '' else 'NULL'
|
answer = ' AB'[int(answer)] if answer != '' else 'NULL'
|
||||||
dataset_list.append({
|
data_item = {
|
||||||
'opt1':
|
'opt1': prompt.replace('_', line['option1']),
|
||||||
prompt.replace('_', line['option1']),
|
'opt2': prompt.replace('_', line['option2']),
|
||||||
'opt2':
|
'answer': answer,
|
||||||
prompt.replace('_', line['option2']),
|
}
|
||||||
'answer':
|
dataset_list.append(data_item)
|
||||||
answer
|
|
||||||
})
|
|
||||||
dataset_list = Dataset.from_list(dataset_list)
|
dataset_list = Dataset.from_list(dataset_list)
|
||||||
return dataset_list
|
return dataset_list
|
||||||
|
@ -47,6 +47,10 @@ class IPythonInterpreter(BaseAction):
|
|||||||
it is disabled. Defaults to None.
|
it is disabled. Defaults to None.
|
||||||
timeout (int): Upper bound of waiting time for Python script execution.
|
timeout (int): Upper bound of waiting time for Python script execution.
|
||||||
Defaults to 20.
|
Defaults to 20.
|
||||||
|
trim_output (int, optional): Max characters restriction of ipython
|
||||||
|
outputs. If None, do not perform any trim.
|
||||||
|
TODO: Notice that this is not token length. More trim strategies
|
||||||
|
might be added later. Defaults to 1024.
|
||||||
user_data_dir (str): Specified the user data directory for files
|
user_data_dir (str): Specified the user data directory for files
|
||||||
loading. If set to `ENV`, use `USER_DATA_DIR` environment variable.
|
loading. If set to `ENV`, use `USER_DATA_DIR` environment variable.
|
||||||
Defaults to `ENV`.
|
Defaults to `ENV`.
|
||||||
@ -60,6 +64,7 @@ class IPythonInterpreter(BaseAction):
|
|||||||
enable: bool = True,
|
enable: bool = True,
|
||||||
disable_description: Optional[str] = None,
|
disable_description: Optional[str] = None,
|
||||||
timeout: int = 20,
|
timeout: int = 20,
|
||||||
|
trim_output: Optional[int] = 1024,
|
||||||
user_data_dir: str = 'ENV') -> None:
|
user_data_dir: str = 'ENV') -> None:
|
||||||
super().__init__(description, name, enable, disable_description)
|
super().__init__(description, name, enable, disable_description)
|
||||||
|
|
||||||
@ -68,10 +73,11 @@ class IPythonInterpreter(BaseAction):
|
|||||||
user_data_dir = os.environ.get('USER_DATA_DIR', '')
|
user_data_dir = os.environ.get('USER_DATA_DIR', '')
|
||||||
|
|
||||||
if user_data_dir:
|
if user_data_dir:
|
||||||
user_data_dir = os.path.dirname(user_data_dir)
|
# user_data_dir = os.path.dirname(user_data_dir)
|
||||||
user_data_dir = f"import os\nos.chdir('{user_data_dir}')"
|
user_data_dir = f"import os\nos.chdir('{user_data_dir}')"
|
||||||
self.user_data_dir = user_data_dir
|
self.user_data_dir = user_data_dir
|
||||||
self._initialized = False
|
self._initialized = False
|
||||||
|
self.trim_output = trim_output
|
||||||
if not os.path.exists(WORK_DIR):
|
if not os.path.exists(WORK_DIR):
|
||||||
os.mkdir(WORK_DIR)
|
os.mkdir(WORK_DIR)
|
||||||
|
|
||||||
@ -178,6 +184,12 @@ class IPythonInterpreter(BaseAction):
|
|||||||
if image:
|
if image:
|
||||||
result += f'\n\n{image}'
|
result += f'\n\n{image}'
|
||||||
if finished:
|
if finished:
|
||||||
|
# in case output text too long
|
||||||
|
# might need better design later
|
||||||
|
if self.trim_output and len(result) > self.trim_output:
|
||||||
|
ellip = '......'
|
||||||
|
half_len = int((self.trim_output - len(ellip)) / 2)
|
||||||
|
result = result[:half_len] + ellip + result[-half_len:]
|
||||||
return succeed, result
|
return succeed, result
|
||||||
|
|
||||||
try:
|
try:
|
||||||
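The trimming added above keeps the head and tail of an over-long output and drops the middle; a self-contained sketch of the same idea:

def trim_middle(result: str, trim_output: int = 1024) -> str:
    # keep the first and last halves and elide the middle, as in the hunk above
    if trim_output and len(result) > trim_output:
        ellip = '......'
        half_len = int((trim_output - len(ellip)) / 2)
        result = result[:half_len] + ellip + result[-half_len:]
    return result

print(trim_middle('x' * 50, trim_output=20))  # 7 chars + '......' + 7 chars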
@ -204,13 +216,20 @@ class IPythonInterpreter(BaseAction):
|
|||||||
command: str,
|
command: str,
|
||||||
timeout: Optional[int] = None) -> ActionReturn:
|
timeout: Optional[int] = None) -> ActionReturn:
|
||||||
tool_return = ActionReturn(url=None, args=None, type=self.name)
|
tool_return = ActionReturn(url=None, args=None, type=self.name)
|
||||||
tool_return.args = dict(text=command)
|
extracted_command = extract_code(command)
|
||||||
succeed, result = self._call(command, timeout)
|
tool_return.args = dict(text=command, extract_code=extracted_command)
|
||||||
if succeed:
|
if extracted_command:
|
||||||
tool_return.result = dict(text=result)
|
succeed, result = self._call(extracted_command, timeout)
|
||||||
tool_return.state = ActionStatusCode.SUCCESS
|
if succeed:
|
||||||
|
if not result:
|
||||||
|
result = 'The code ran successfully without any outputs.'
|
||||||
|
tool_return.result = dict(text=result)
|
||||||
|
tool_return.state = ActionStatusCode.SUCCESS
|
||||||
|
else:
|
||||||
|
tool_return.errmsg = repr(result)
|
||||||
|
tool_return.state = ActionStatusCode.API_ERROR
|
||||||
else:
|
else:
|
||||||
tool_return.errmsg = repr(result)
|
tool_return.errmsg = 'The input code is empty. Please follow the format.' # noqa
|
||||||
tool_return.state = ActionStatusCode.API_ERROR
|
tool_return.state = ActionStatusCode.API_ERROR
|
||||||
return tool_return
|
return tool_return
|
||||||
|
|
||||||
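`extract_code` is imported elsewhere in this file; a plausible stand-in is sketched below so the control flow above is easier to follow. The regex and fallback behaviour here are assumptions, not the actual implementation.

import re

def extract_code(text: str) -> str:
    # return the body of the first fenced code block, or '' when none is found
    match = re.search(r'```(?:python)?\n(.*?)```', text, re.DOTALL)
    return match.group(1).strip() if match else ''

print(extract_code('Here is the code:\n```python\nprint(1 + 1)\n```'))
# -> 'print(1 + 1)'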
|
@ -115,6 +115,20 @@ class BaseModel:
|
|||||||
inputs = self.parse_template(templates, mode='ppl')
|
inputs = self.parse_template(templates, mode='ppl')
|
||||||
return self.get_ppl(inputs, mask_length)
|
return self.get_ppl(inputs, mask_length)
|
||||||
|
|
||||||
|
def get_loglikelihood_from_template(self,
|
||||||
|
templates: List[PromptType],
|
||||||
|
conts: List[str],
|
||||||
|
mask_length=None):
|
||||||
|
"""Get perplexity given a list of templates.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
templates (List[PromptType]): A list of templates.
|
||||||
|
mask_length (List[int]): A list of mask lengths. If provided, the
|
||||||
|
perplexity will be calculated only on the unmasked tokens.
|
||||||
|
"""
|
||||||
|
inputs = self.parse_template(templates, mode='ppl')
|
||||||
|
return self.get_loglikelihood(inputs, conts, mask_length)
|
||||||
|
|
||||||
def generate_from_template(self, templates: List[PromptType],
|
def generate_from_template(self, templates: List[PromptType],
|
||||||
max_out_len: int, **kwargs):
|
max_out_len: int, **kwargs):
|
||||||
"""Generate completion from a list of templates.
|
"""Generate completion from a list of templates.
|
||||||
|
@ -1,9 +1,11 @@
|
|||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import threading
|
import threading
|
||||||
|
import time
|
||||||
import warnings
|
import warnings
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
|
from queue import Queue
|
||||||
from time import sleep
|
from time import sleep
|
||||||
from typing import Dict, List, Optional, Tuple, Union
|
from typing import Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
@ -37,6 +39,7 @@ class BaseAPIModel(BaseModel):
|
|||||||
def __init__(self,
|
def __init__(self,
|
||||||
path: str,
|
path: str,
|
||||||
query_per_second: int = 1,
|
query_per_second: int = 1,
|
||||||
|
rpm_verbose: bool = False,
|
||||||
retry: int = 2,
|
retry: int = 2,
|
||||||
max_seq_len: int = 2048,
|
max_seq_len: int = 2048,
|
||||||
meta_template: Optional[Dict] = None,
|
meta_template: Optional[Dict] = None,
|
||||||
@ -46,7 +49,7 @@ class BaseAPIModel(BaseModel):
|
|||||||
self.meta_template = meta_template
|
self.meta_template = meta_template
|
||||||
self.retry = retry
|
self.retry = retry
|
||||||
self.query_per_second = query_per_second
|
self.query_per_second = query_per_second
|
||||||
self.token_bucket = TokenBucket(query_per_second)
|
self.token_bucket = TokenBucket(query_per_second, rpm_verbose)
|
||||||
self.template_parser = APITemplateParser(meta_template)
|
self.template_parser = APITemplateParser(meta_template)
|
||||||
self.logger = get_logger()
|
self.logger = get_logger()
|
||||||
self.generation_kwargs = generation_kwargs
|
self.generation_kwargs = generation_kwargs
|
||||||
@ -422,10 +425,13 @@ class TokenBucket:
|
|||||||
query_per_second (float): The rate of the token bucket.
|
query_per_second (float): The rate of the token bucket.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, rate):
|
def __init__(self, rate, verbose=False):
|
||||||
self._rate = rate
|
self._rate = rate
|
||||||
self._tokens = threading.Semaphore(0)
|
self._tokens = threading.Semaphore(0)
|
||||||
self.started = False
|
self.started = False
|
||||||
|
self._request_queue = Queue()
|
||||||
|
self.logger = get_logger()
|
||||||
|
self.verbose = verbose
|
||||||
|
|
||||||
def _add_tokens(self):
|
def _add_tokens(self):
|
||||||
"""Add tokens to the bucket."""
|
"""Add tokens to the bucket."""
|
||||||
@ -440,3 +446,12 @@ class TokenBucket:
|
|||||||
self.started = True
|
self.started = True
|
||||||
threading.Thread(target=self._add_tokens, daemon=True).start()
|
threading.Thread(target=self._add_tokens, daemon=True).start()
|
||||||
self._tokens.acquire()
|
self._tokens.acquire()
|
||||||
|
if self.verbose:
|
||||||
|
cur_time = time.time()
|
||||||
|
while not self._request_queue.empty():
|
||||||
|
if cur_time - self._request_queue.queue[0] > 60:
|
||||||
|
self._request_queue.get()
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
self._request_queue.put(cur_time)
|
||||||
|
self.logger.info(f'Current RPM {self._request_queue.qsize()}.')
|
||||||
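The verbose branch above keeps a 60-second sliding window of request timestamps to report requests per minute; the same bookkeeping in isolation (simplified, no threading):

import time
from queue import Queue

request_queue = Queue()

def record_request_and_report_rpm() -> int:
    cur_time = time.time()
    # drop timestamps older than one minute, then record the new request
    while not request_queue.empty():
        if cur_time - request_queue.queue[0] > 60:
            request_queue.get()
        else:
            break
    request_queue.put(cur_time)
    return request_queue.qsize()  # current requests-per-minute estimate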
|
@ -3,6 +3,7 @@ from typing import Dict, List, Optional, Union
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
import transformers
|
||||||
|
|
||||||
from opencompass.models.base import BaseModel
|
from opencompass.models.base import BaseModel
|
||||||
from opencompass.models.base_api import APITemplateParser
|
from opencompass.models.base_api import APITemplateParser
|
||||||
@ -13,6 +14,33 @@ from opencompass.utils.prompt import PromptList
|
|||||||
PromptType = Union[PromptList, str]
|
PromptType = Union[PromptList, str]
|
||||||
|
|
||||||
|
|
||||||
|
class MultiTokenEOSCriteria(transformers.StoppingCriteria):
|
||||||
|
"""Criteria to stop on the specified multi-token sequence."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
sequence: str,
|
||||||
|
tokenizer: transformers.PreTrainedTokenizer,
|
||||||
|
batch_size: int,
|
||||||
|
):
|
||||||
|
self.done_tracker = [False] * batch_size
|
||||||
|
self.sequence = sequence
|
||||||
|
self.sequence_ids = tokenizer.encode(sequence,
|
||||||
|
add_special_tokens=False)
|
||||||
|
self.sequence_id_len = len(self.sequence_ids)
|
||||||
|
self.tokenizer = tokenizer
|
||||||
|
|
||||||
|
def __call__(self, input_ids, scores, **kwargs) -> bool:
|
||||||
|
# compare the last len(stop) tokens
|
||||||
|
lookback_ids_batch = input_ids[:, -self.sequence_id_len:]
|
||||||
|
lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
|
||||||
|
for i, done in enumerate(self.done_tracker):
|
||||||
|
if done:
|
||||||
|
continue
|
||||||
|
self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
|
||||||
|
return False not in self.done_tracker
|
||||||
|
|
||||||
|
|
||||||
@MODELS.register_module()
|
@MODELS.register_module()
|
||||||
class HuggingFace(BaseModel):
|
class HuggingFace(BaseModel):
|
||||||
"""Model wrapper around HuggingFace models.
|
"""Model wrapper around HuggingFace models.
|
||||||
@ -194,7 +222,10 @@ class HuggingFace(BaseModel):
|
|||||||
self.model.config.eos_token_id = 2
|
self.model.config.eos_token_id = 2
|
||||||
self.model.config.pad_token_id = self.tokenizer.pad_token_id
|
self.model.config.pad_token_id = self.tokenizer.pad_token_id
|
||||||
|
|
||||||
def generate(self, inputs: List[str], max_out_len: int,
|
def generate(self,
|
||||||
|
inputs: List[str],
|
||||||
|
max_out_len: int,
|
||||||
|
stopping_criteria: List[str] = [],
|
||||||
**kwargs) -> List[str]:
|
**kwargs) -> List[str]:
|
||||||
"""Generate results given a list of inputs.
|
"""Generate results given a list of inputs.
|
||||||
|
|
||||||
@ -212,9 +243,12 @@ class HuggingFace(BaseModel):
|
|||||||
max_out_len=max_out_len,
|
max_out_len=max_out_len,
|
||||||
**generation_kwargs)
|
**generation_kwargs)
|
||||||
else:
|
else:
|
||||||
return sum((self._single_generate(
|
return sum(
|
||||||
inputs=[input_], max_out_len=max_out_len, **generation_kwargs)
|
(self._single_generate(inputs=[input_],
|
||||||
for input_ in inputs), [])
|
max_out_len=max_out_len,
|
||||||
|
stopping_criteria=stopping_criteria,
|
||||||
|
**generation_kwargs)
|
||||||
|
for input_ in inputs), [])
|
||||||
|
|
||||||
def _batch_generate(self, inputs: List[str], max_out_len: int,
|
def _batch_generate(self, inputs: List[str], max_out_len: int,
|
||||||
**kwargs) -> List[str]:
|
**kwargs) -> List[str]:
|
||||||
@ -275,7 +309,10 @@ class HuggingFace(BaseModel):
|
|||||||
decodeds = [token.split(self.end_str)[0] for token in decodeds]
|
decodeds = [token.split(self.end_str)[0] for token in decodeds]
|
||||||
return decodeds
|
return decodeds
|
||||||
|
|
||||||
def _single_generate(self, inputs: List[str], max_out_len: int,
|
def _single_generate(self,
|
||||||
|
inputs: List[str],
|
||||||
|
max_out_len: int,
|
||||||
|
stopping_criteria: List[str] = [],
|
||||||
**kwargs) -> List[str]:
|
**kwargs) -> List[str]:
|
||||||
"""Support for single prompt inference.
|
"""Support for single prompt inference.
|
||||||
|
|
||||||
@ -319,6 +356,19 @@ class HuggingFace(BaseModel):
|
|||||||
max_length=self.max_seq_len -
|
max_length=self.max_seq_len -
|
||||||
max_out_len)['input_ids']
|
max_out_len)['input_ids']
|
||||||
input_ids = torch.tensor(input_ids, device=self.model.device)
|
input_ids = torch.tensor(input_ids, device=self.model.device)
|
||||||
|
|
||||||
|
if stopping_criteria:
|
||||||
|
# Construct huggingface stopping criteria
|
||||||
|
stopping_criteria = stopping_criteria + [self.tokenizer.eos_token]
|
||||||
|
stopping_criteria = transformers.StoppingCriteriaList([
|
||||||
|
*[
|
||||||
|
MultiTokenEOSCriteria(sequence, self.tokenizer,
|
||||||
|
input_ids.shape[0])
|
||||||
|
for sequence in stopping_criteria
|
||||||
|
],
|
||||||
|
])
|
||||||
|
kwargs['stopping_criteria'] = stopping_criteria
|
||||||
|
|
||||||
# To accommodate the PeftModel, parameters should be passed in
|
# To accommodate the PeftModel, parameters should be passed in
|
||||||
# key-value format for generate.
|
# key-value format for generate.
|
||||||
outputs = self.model.generate(input_ids=input_ids,
|
outputs = self.model.generate(input_ids=input_ids,
|
||||||
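A sketch of how the new stopping-criteria plumbing is meant to be exercised from the outside. The model name and stop words are only examples; `MultiTokenEOSCriteria` is the class added earlier in this file.

import transformers

tok = transformers.AutoTokenizer.from_pretrained('gpt2')
model = transformers.AutoModelForCausalLM.from_pretrained('gpt2')

input_ids = tok('Question: 1+1=?\nAnswer:', return_tensors='pt').input_ids
stop_words = ['Question:', tok.eos_token]  # stop as soon as a new question starts
criteria = transformers.StoppingCriteriaList([
    MultiTokenEOSCriteria(seq, tok, batch_size=input_ids.shape[0])
    for seq in stop_words
])
out = model.generate(input_ids=input_ids, max_new_tokens=32,
                     stopping_criteria=criteria)
print(tok.decode(out[0][input_ids.shape[1]:]))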
@ -434,6 +484,71 @@ class HuggingFace(BaseModel):
|
|||||||
ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
|
ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
|
||||||
return ce_loss
|
return ce_loss
|
||||||
|
|
||||||
|
def get_loglikelihood(
|
||||||
|
self,
|
||||||
|
inputs: List[str],
|
||||||
|
conts: List[str],
|
||||||
|
mask_length: Optional[List[int]] = None) -> List[float]:
|
||||||
|
"""Get loglikelihood scores given a list of inputs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs (List[str]): A list of strings.
|
||||||
|
conts (List[str]): The continuation strings, i.e. the trailing slice of each input that should be scored.
|
||||||
|
NOT SUPPORT mask_length YET!
|
||||||
|
mask_length (Optional[List[int]]): A list of mask lengths. If
|
||||||
|
provided, the perplexity scores will be calculated with the
|
||||||
|
first mask_length[i] tokens masked out. It's okay to skip
|
||||||
|
its implementation if advanced features in PPLInferencer are
|
||||||
|
not needed.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List[float]: A list of loglikelihood scores.
|
||||||
|
"""
|
||||||
|
assert mask_length is None, 'Not support mask_length yet.'
|
||||||
|
if self.batch_padding and len(inputs) > 1:
|
||||||
|
raise NotImplementedError('Batch padding is not supported yet.')
|
||||||
|
# assert self.tokenizer.pad_token
|
||||||
|
# return self._get_loglikelihood(inputs, mask_length=mask_length)
|
||||||
|
return np.array([
|
||||||
|
self._get_loglikelihood(inputs=inputs[idx], conts=conts[idx])
|
||||||
|
for idx in range(len(inputs))
|
||||||
|
])
|
||||||
|
|
||||||
|
def _get_loglikelihood(self, inputs: str, conts: str) -> float:
|
||||||
|
"""Get loglikelihood scores given input string and continuation string.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs (str): string.
|
||||||
|
conts (str): strings: slices after the space.
|
||||||
|
Returns:
|
||||||
|
float: loglikelihood scores.
|
||||||
|
"""
|
||||||
|
|
||||||
|
input_ids = self.tokenizer(inputs,
|
||||||
|
padding=False,
|
||||||
|
truncation=True,
|
||||||
|
max_length=self.max_seq_len)['input_ids']
|
||||||
|
input_ids = torch.tensor(input_ids, device=self.model.device)
|
||||||
|
context_ids = self.tokenizer(inputs.replace(conts, ''),
|
||||||
|
padding=False,
|
||||||
|
truncation=True,
|
||||||
|
max_length=self.max_seq_len)['input_ids']
|
||||||
|
cont_ids = input_ids[len(context_ids):]
|
||||||
|
|
||||||
|
output = self.model(input_ids.unsqueeze(0))
|
||||||
|
logits = output['logits'][:, :-1]
|
||||||
|
logits = torch.nn.functional.log_softmax(logits, dim=-1)
|
||||||
|
contlen = cont_ids.shape[0]
|
||||||
|
logits = logits[:, -contlen:, :]
|
||||||
|
# Reducing the dimension will lead to a wrong outcome
|
||||||
|
logits_gather = torch.gather(
|
||||||
|
logits, 2,
|
||||||
|
cont_ids.unsqueeze(0).unsqueeze(-1)) # [1, seq]
|
||||||
|
|
||||||
|
# Answer: sum the likelihood of each token in continuation
|
||||||
|
answer = float(logits_gather.detach().cpu().sum())
|
||||||
|
return answer
|
||||||
|
|
||||||
def get_token_len(self, prompt: str) -> int:
|
def get_token_len(self, prompt: str) -> int:
|
||||||
"""Get lengths of the tokenized strings.
|
"""Get lengths of the tokenized strings.
|
||||||
|
|
||||||
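The gather step above scores only the continuation tokens; a minimal numeric sketch of the same operation with invented shapes:

import torch

# fake logits over a 5-token vocabulary for a 4-token sequence
logits = torch.randn(1, 4, 5)
logprobs = torch.nn.functional.log_softmax(logits, dim=-1)

cont_ids = torch.tensor([2, 4])          # last two tokens are the continuation
contlen = cont_ids.shape[0]
logprobs = logprobs[:, -contlen:, :]     # keep positions predicting the continuation

# pick the log-prob assigned to each gold continuation token and sum them
gathered = torch.gather(logprobs, 2, cont_ids.unsqueeze(0).unsqueeze(-1))
loglikelihood = float(gathered.sum())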
@ -554,8 +669,8 @@ class HuggingFaceChatGLM3(HuggingFace):
|
|||||||
'role': {
|
'role': {
|
||||||
'HUMAN': 'user',
|
'HUMAN': 'user',
|
||||||
'BOT': 'assistant',
|
'BOT': 'assistant',
|
||||||
'SYSTEM': 'system'
|
'SYSTEM': 'system',
|
||||||
}[item['role']]
|
}[item['role'].upper()]
|
||||||
}
|
}
|
||||||
history.append(msg)
|
history.append(msg)
|
||||||
user_content = history[-1]['content']
|
user_content = history[-1]['content']
|
||||||
@ -578,6 +693,9 @@ class HuggingFaceChatGLM3(HuggingFace):
|
|||||||
response, history = self.model.chat(self.tokenizer,
|
response, history = self.model.chat(self.tokenizer,
|
||||||
user_content,
|
user_content,
|
||||||
history=history)
|
history=history)
|
||||||
|
# response will sometimes be a dict
|
||||||
|
if isinstance(response, dict):
|
||||||
|
response = response.get('content', '')
|
||||||
responses.append(response)
|
responses.append(response)
|
||||||
except Exception:
|
except Exception:
|
||||||
responses.append('')
|
responses.append('')
|
||||||
|
@ -52,7 +52,7 @@ class LagentAgent:
|
|||||||
|
|
||||||
def chat(self,
|
def chat(self,
|
||||||
user_input: str,
|
user_input: str,
|
||||||
history: List[dict] = None) -> Tuple[str, List[dict]]:
|
history: List[dict] = None) -> Tuple[str, List[dict], List[dict]]:
|
||||||
"""Chat with agent."""
|
"""Chat with agent."""
|
||||||
if history:
|
if history:
|
||||||
self.agent._session_history = history
|
self.agent._session_history = history
|
||||||
@ -60,6 +60,7 @@ class LagentAgent:
|
|||||||
from lagent.schema import ActionReturn, AgentReturn
|
from lagent.schema import ActionReturn, AgentReturn
|
||||||
generation: AgentReturn = self.agent.chat(user_input)
|
generation: AgentReturn = self.agent.chat(user_input)
|
||||||
|
|
||||||
|
inner_steps = generation.inner_steps
|
||||||
answer = generation.response
|
answer = generation.response
|
||||||
steps = []
|
steps = []
|
||||||
|
|
||||||
@ -76,7 +77,7 @@ class LagentAgent:
|
|||||||
valid=int(step.valid),
|
valid=int(step.valid),
|
||||||
))
|
))
|
||||||
|
|
||||||
return answer, steps
|
return answer, steps, inner_steps
|
||||||
|
|
||||||
|
|
||||||
FORCE_STOP_PROMPT_EN = (
|
FORCE_STOP_PROMPT_EN = (
|
||||||
|
@ -179,12 +179,14 @@ class Llama2Chat(BaseModel):
|
|||||||
dialog = []
|
dialog = []
|
||||||
for item in input:
|
for item in input:
|
||||||
msg = {'content': item['prompt']}
|
msg = {'content': item['prompt']}
|
||||||
if item['role'] == 'HUMAN':
|
if item['role'].upper() == 'HUMAN':
|
||||||
msg['role'] = 'user'
|
msg['role'] = 'user'
|
||||||
elif item['role'] == 'BOT':
|
elif item['role'].upper() == 'BOT':
|
||||||
msg['role'] = 'assistant'
|
msg['role'] = 'assistant'
|
||||||
elif item['role'] == 'SYSTEM':
|
elif item['role'].upper() == 'SYSTEM':
|
||||||
msg['role'] = 'system'
|
msg['role'] = 'system'
|
||||||
|
else:
|
||||||
|
raise ValueError(f'Unknown role: {item["role"]}')
|
||||||
dialog.append(msg)
|
dialog.append(msg)
|
||||||
dialogs.append(dialog)
|
dialogs.append(dialog)
|
||||||
|
|
||||||
|
@ -58,6 +58,7 @@ class OpenAI(BaseAPIModel):
|
|||||||
path: str = 'gpt-3.5-turbo',
|
path: str = 'gpt-3.5-turbo',
|
||||||
max_seq_len: int = 4096,
|
max_seq_len: int = 4096,
|
||||||
query_per_second: int = 1,
|
query_per_second: int = 1,
|
||||||
|
rpm_verbose: bool = False,
|
||||||
retry: int = 2,
|
retry: int = 2,
|
||||||
key: Union[str, List[str]] = 'ENV',
|
key: Union[str, List[str]] = 'ENV',
|
||||||
org: Optional[Union[str, List[str]]] = None,
|
org: Optional[Union[str, List[str]]] = None,
|
||||||
@ -70,6 +71,7 @@ class OpenAI(BaseAPIModel):
|
|||||||
max_seq_len=max_seq_len,
|
max_seq_len=max_seq_len,
|
||||||
meta_template=meta_template,
|
meta_template=meta_template,
|
||||||
query_per_second=query_per_second,
|
query_per_second=query_per_second,
|
||||||
|
rpm_verbose=rpm_verbose,
|
||||||
retry=retry)
|
retry=retry)
|
||||||
import tiktoken
|
import tiktoken
|
||||||
self.tiktoken = tiktoken
|
self.tiktoken = tiktoken
|
||||||
|
@ -5,5 +5,6 @@ from .icl_circular_evaluator import CircularEvaluator # noqa
|
|||||||
from .icl_em_evaluator import EMEvaluator # noqa
|
from .icl_em_evaluator import EMEvaluator # noqa
|
||||||
from .icl_hf_evaluator import * # noqa
|
from .icl_hf_evaluator import * # noqa
|
||||||
from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator # noqa
|
from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator # noqa
|
||||||
|
from .icl_misc_evaluator import AveragePPLEvaluator # noqa
|
||||||
from .icl_toxic_evaluator import ToxicEvaluator # noqa
|
from .icl_toxic_evaluator import ToxicEvaluator # noqa
|
||||||
from .lm_evaluator import LMEvaluator # noqa
|
from .lm_evaluator import LMEvaluator # noqa
|
||||||
|
11  opencompass/openicl/icl_evaluator/icl_misc_evaluator.py  Normal file
@ -0,0 +1,11 @@
|
|||||||
|
from opencompass.registry import ICL_EVALUATORS
|
||||||
|
|
||||||
|
from .icl_base_evaluator import BaseEvaluator
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_EVALUATORS.register_module()
|
||||||
|
class AveragePPLEvaluator(BaseEvaluator):
|
||||||
|
|
||||||
|
def score(self, ppl):
|
||||||
|
average_ppl = sum(ppl) / len(ppl)
|
||||||
|
return {'average_ppl': average_ppl}
|
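Usage of the new evaluator is as simple as it looks; a tiny example with made-up perplexities (the import path follows the `__init__.py` change below):

from opencompass.openicl.icl_evaluator import AveragePPLEvaluator

evaluator = AveragePPLEvaluator()
print(evaluator.score(ppl=[12.3, 9.8, 15.1]))  # {'average_ppl': 12.4}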
@ -4,6 +4,8 @@ from .icl_base_inferencer import BaseInferencer # noqa
|
|||||||
from .icl_chat_inferencer import ChatInferencer # noqa
|
from .icl_chat_inferencer import ChatInferencer # noqa
|
||||||
from .icl_clp_inferencer import CLPInferencer # noqa
|
from .icl_clp_inferencer import CLPInferencer # noqa
|
||||||
from .icl_gen_inferencer import GenInferencer # noqa
|
from .icl_gen_inferencer import GenInferencer # noqa
|
||||||
|
from .icl_loglikelihood_inferencer import LoglikelihoodInferencer # noqa
|
||||||
from .icl_ppl_inferencer import PPLInferencer # noqa
|
from .icl_ppl_inferencer import PPLInferencer # noqa
|
||||||
|
from .icl_ppl_only_inferencer import PPLOnlyInferencer # noqa
|
||||||
from .icl_sc_inferencer import SCInferencer # noqa
|
from .icl_sc_inferencer import SCInferencer # noqa
|
||||||
from .icl_tot_inferencer import ToTInferencer # noqa
|
from .icl_tot_inferencer import ToTInferencer # noqa
|
||||||
|
@ -89,7 +89,7 @@ class AgentInferencer(ChatInferencer):
|
|||||||
|
|
||||||
user_idx = assistant_indices[-1] - 1
|
user_idx = assistant_indices[-1] - 1
|
||||||
self.model.set_history(chat[:user_idx])
|
self.model.set_history(chat[:user_idx])
|
||||||
answer, steps = self.model.chat(chat[user_idx]['content'])
|
answer, steps, _ = self.model.chat(chat[user_idx]['content'])
|
||||||
output_handler.save_results(
|
output_handler.save_results(
|
||||||
origin_prompt=chat[user_idx]['content'],
|
origin_prompt=chat[user_idx]['content'],
|
||||||
prediction=answer,
|
prediction=answer,
|
||||||
@ -104,10 +104,11 @@ class AgentInferencer(ChatInferencer):
|
|||||||
i for i, item in enumerate(chat) if item['role'] == 'assistant'
|
i for i, item in enumerate(chat) if item['role'] == 'assistant'
|
||||||
]
|
]
|
||||||
|
|
||||||
self.model.set_history(chat[:assistant_indices[0] - 1])
|
history = chat[:assistant_indices[0] - 1]
|
||||||
|
|
||||||
for i in assistant_indices:
|
for i in assistant_indices:
|
||||||
answer, steps = self.model.chat(chat[i - 1]['content'])
|
answer, steps, inner_steps = self.model.chat(
|
||||||
|
chat[i - 1]['content'], history)
|
||||||
|
history += inner_steps
|
||||||
output_handler.save_multiround_results(
|
output_handler.save_multiround_results(
|
||||||
origin_prompt=chat[i - 1]['content'],
|
origin_prompt=chat[i - 1]['content'],
|
||||||
prediction=answer,
|
prediction=answer,
|
||||||
@ -125,7 +126,7 @@ class AgentInferencer(ChatInferencer):
|
|||||||
|
|
||||||
for i in assistant_indices:
|
for i in assistant_indices:
|
||||||
self.model.set_history(chat[:i - 1])
|
self.model.set_history(chat[:i - 1])
|
||||||
answer, steps = self.model.chat(chat[i - 1]['content'])
|
answer, steps, _ = self.model.chat(chat[i - 1]['content'])
|
||||||
output_handler.save_multiround_results(
|
output_handler.save_multiround_results(
|
||||||
origin_prompt=chat[i - 1]['content'],
|
origin_prompt=chat[i - 1]['content'],
|
||||||
prediction=answer,
|
prediction=answer,
|
||||||
|
@ -68,11 +68,11 @@ class LMTemplateParser:
|
|||||||
prompt = ''
|
prompt = ''
|
||||||
if self.roles:
|
if self.roles:
|
||||||
for dialog in chat:
|
for dialog in chat:
|
||||||
role_cfg = self.roles.get(dialog['role'])
|
role_cfg = self.roles.get(dialog['role'], {})
|
||||||
prompt += role_cfg['begin']
|
prompt += (role_cfg.get('begin') or '')
|
||||||
prompt += (dialog.get('content') or '')
|
prompt += (dialog.get('content') or '')
|
||||||
prompt += role_cfg['end']
|
prompt += (role_cfg.get('end') or '')
|
||||||
prompt += self.roles['assistant']['begin']
|
prompt += (self.roles['assistant'].get('begin') or '')
|
||||||
else:
|
else:
|
||||||
# in case the model does not have any meta template
|
# in case the model does not have any meta template
|
||||||
last_sep = ''
|
last_sep = ''
|
||||||
@ -227,9 +227,13 @@ class ChatInferencer(BaseInferencer):
|
|||||||
'tmp_' + output_json_filename)
|
'tmp_' + output_json_filename)
|
||||||
if osp.exists(tmp_json_filepath):
|
if osp.exists(tmp_json_filepath):
|
||||||
# TODO: move resume to output handler
|
# TODO: move resume to output handler
|
||||||
tmp_result_dict = mmengine.load(tmp_json_filepath)
|
try:
|
||||||
output_handler.results_dict = tmp_result_dict
|
tmp_result_dict = mmengine.load(tmp_json_filepath)
|
||||||
index = len(tmp_result_dict)
|
except Exception:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
output_handler.results_dict = tmp_result_dict
|
||||||
|
index = len(tmp_result_dict)
|
||||||
|
|
||||||
# 4. Wrap prompts with Dataloader
|
# 4. Wrap prompts with Dataloader
|
||||||
dataloader = self.get_dataloader(chat_list[index:], batch_size=1)
|
dataloader = self.get_dataloader(chat_list[index:], batch_size=1)
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
"""Direct Generation Inferencer."""
|
"""Direct Generation Inferencer."""
|
||||||
|
|
||||||
|
import inspect
|
||||||
import os
|
import os
|
||||||
import os.path as osp
|
import os.path as osp
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
@ -46,6 +47,7 @@ class GenInferencer(BaseInferencer):
|
|||||||
self,
|
self,
|
||||||
model: BaseModel,
|
model: BaseModel,
|
||||||
max_out_len: int,
|
max_out_len: int,
|
||||||
|
stopping_criteria: List[str] = [],
|
||||||
max_seq_len: Optional[int] = None,
|
max_seq_len: Optional[int] = None,
|
||||||
batch_size: Optional[int] = 1,
|
batch_size: Optional[int] = 1,
|
||||||
gen_field_replace_token: Optional[str] = '',
|
gen_field_replace_token: Optional[str] = '',
|
||||||
@ -64,6 +66,7 @@ class GenInferencer(BaseInferencer):
|
|||||||
|
|
||||||
self.gen_field_replace_token = gen_field_replace_token
|
self.gen_field_replace_token = gen_field_replace_token
|
||||||
self.max_out_len = max_out_len
|
self.max_out_len = max_out_len
|
||||||
|
self.stopping_criteria = stopping_criteria
|
||||||
|
|
||||||
if self.model.is_api and save_every is None:
|
if self.model.is_api and save_every is None:
|
||||||
save_every = 1
|
save_every = 1
|
||||||
@ -128,10 +131,14 @@ class GenInferencer(BaseInferencer):
|
|||||||
entry = datum
|
entry = datum
|
||||||
golds = [None for _ in range(len(entry))]
|
golds = [None for _ in range(len(entry))]
|
||||||
# 5-1. Inference with local model
|
# 5-1. Inference with local model
|
||||||
|
extra_gen_kwargs = {}
|
||||||
|
sig = inspect.signature(self.model.generate)
|
||||||
|
if 'stopping_criteria' in sig.parameters:
|
||||||
|
extra_gen_kwargs['stopping_criteria'] = self.stopping_criteria
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
parsed_entries = self.model.parse_template(entry, mode='gen')
|
parsed_entries = self.model.parse_template(entry, mode='gen')
|
||||||
results = self.model.generate_from_template(
|
results = self.model.generate_from_template(
|
||||||
entry, max_out_len=self.max_out_len)
|
entry, max_out_len=self.max_out_len, **extra_gen_kwargs)
|
||||||
generated = results
|
generated = results
|
||||||
|
|
||||||
num_return_sequences = getattr(self.model, 'generation_kwargs',
|
num_return_sequences = getattr(self.model, 'generation_kwargs',
|
||||||
|
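The signature check above lets GenInferencer forward stopping_criteria only to models whose generate() accepts it; the pattern in isolation with toy functions (names assumed):

import inspect

def generate_old(inputs, max_out_len):            # model without the new argument
    ...

def generate_new(inputs, max_out_len, stopping_criteria=[]):
    ...

for fn in (generate_old, generate_new):
    extra = {}
    if 'stopping_criteria' in inspect.signature(fn).parameters:
        extra['stopping_criteria'] = ['Question:']
    print(fn.__name__, extra)
# generate_old {}   /   generate_new {'stopping_criteria': ['Question:']}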
@ -0,0 +1,215 @@
|
|||||||
|
"""PPL Inferencer."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from tqdm import trange
|
||||||
|
|
||||||
|
from opencompass.models.base import BaseModel
|
||||||
|
from opencompass.registry import ICL_INFERENCERS
|
||||||
|
|
||||||
|
from ..icl_prompt_template import PromptTemplate
|
||||||
|
from ..icl_retriever import BaseRetriever
|
||||||
|
from ..utils import get_logger
|
||||||
|
from .icl_base_inferencer import BaseInferencer, dump_results_dict
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_INFERENCERS.register_module()
|
||||||
|
class LoglikelihoodInferencer(BaseInferencer):
|
||||||
|
"""Loglikelihood Inferencer class to evaluate by loglikelihood.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
model (:obj:`BaseModel`, optional): The module to inference.
|
||||||
|
max_seq_len (:obj:`int`): Maximum number of tokenized words allowed by
|
||||||
|
the LM.
|
||||||
|
batch_size (:obj:`int`, optional): Batch size for the :obj:`DataLoader`
|
||||||
|
output_json_filepath (:obj:`str`, optional): File path for output
|
||||||
|
`JSON` file.
|
||||||
|
output_json_filename (:obj:`str`, optional): File name for output
|
||||||
|
`JSON` file.
|
||||||
|
labels (:obj:`List`, optional): A list of labels for all classes.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
model: BaseModel,
|
||||||
|
max_seq_len: Optional[int] = None,
|
||||||
|
batch_size: Optional[int] = 1,
|
||||||
|
output_json_filepath: Optional[str] = './icl_inference_output',
|
||||||
|
output_json_filename: Optional[str] = 'predictions',
|
||||||
|
labels: Optional[List] = None,
|
||||||
|
**kwargs) -> None:
|
||||||
|
super().__init__(
|
||||||
|
model=model,
|
||||||
|
max_seq_len=max_seq_len,
|
||||||
|
batch_size=batch_size,
|
||||||
|
output_json_filename=output_json_filename,
|
||||||
|
output_json_filepath=output_json_filepath,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.labels = labels
|
||||||
|
|
||||||
|
def inference(self,
|
||||||
|
retriever: BaseRetriever,
|
||||||
|
ice_template: Optional[PromptTemplate] = None,
|
||||||
|
prompt_template: Optional[PromptTemplate] = None,
|
||||||
|
output_json_filepath: Optional[str] = None,
|
||||||
|
output_json_filename: Optional[str] = None) -> List:
|
||||||
|
# 1. Preparation for output logs
|
||||||
|
output_handler = LoglikelihoodInferencerOutputHandler()
|
||||||
|
|
||||||
|
sub_predictions = []
|
||||||
|
ppl = []
|
||||||
|
ice = []
|
||||||
|
|
||||||
|
if output_json_filepath is None:
|
||||||
|
output_json_filepath = self.output_json_filepath
|
||||||
|
if output_json_filename is None:
|
||||||
|
output_json_filename = self.output_json_filename
|
||||||
|
|
||||||
|
# 2. Get results of retrieval process
|
||||||
|
ice_idx_list = retriever.retrieve()
|
||||||
|
|
||||||
|
# 3. Get labels of all the classes
|
||||||
|
if self.labels is None:
|
||||||
|
labels = retriever.get_labels(ice_template=ice_template,
|
||||||
|
prompt_template=prompt_template)
|
||||||
|
else:
|
||||||
|
labels = self.labels
|
||||||
|
|
||||||
|
# 4. Generate in-context examples for testing inputs
|
||||||
|
for idx in range(len(ice_idx_list)):
|
||||||
|
ice.append(
|
||||||
|
retriever.generate_ice(ice_idx_list[idx],
|
||||||
|
ice_template=ice_template))
|
||||||
|
output_handler.save_ice(self.model.parse_template(ice, mode='ppl'))
|
||||||
|
|
||||||
|
# 5. Calculating loglikelihood for prompts in each label's class
|
||||||
|
for label in labels:
|
||||||
|
index = 0
|
||||||
|
prompt_list = []
|
||||||
|
sub_ppl_list = []
|
||||||
|
token_num_list = []
|
||||||
|
cont_list = []
|
||||||
|
|
||||||
|
# 5.1 Generate prompts of current label and truncate
|
||||||
|
# TODO: Refactor
|
||||||
|
for idx in range(len(ice_idx_list)):
|
||||||
|
prompt = retriever.generate_label_prompt(
|
||||||
|
idx,
|
||||||
|
ice[idx],
|
||||||
|
label,
|
||||||
|
ice_template=ice_template,
|
||||||
|
prompt_template=prompt_template)
|
||||||
|
if self.max_seq_len is not None:
|
||||||
|
prompt_token_num = self.model.get_token_len_from_template(
|
||||||
|
prompt, mode='ppl')
|
||||||
|
while len(ice_idx_list[idx]
|
||||||
|
) > 0 and prompt_token_num > self.max_seq_len:
|
||||||
|
ice_idx_list[idx] = ice_idx_list[idx][:-1]
|
||||||
|
ice[idx] = retriever.generate_ice(
|
||||||
|
ice_idx_list[idx], ice_template=ice_template)
|
||||||
|
prompt = retriever.generate_label_prompt(
|
||||||
|
idx,
|
||||||
|
ice[idx],
|
||||||
|
label,
|
||||||
|
ice_template=ice_template,
|
||||||
|
prompt_template=prompt_template)
|
||||||
|
prompt_token_num = self.model.get_token_len_from_template( # noqa
|
||||||
|
prompt, mode='ppl') # noqa
|
||||||
|
|
||||||
|
prompt_list.append(prompt)
|
||||||
|
token_num_list.append(prompt_token_num)
|
||||||
|
cont_list.append(retriever.test_ds[idx]['cont'])
|
||||||
|
|
||||||
|
# 5.2 Get PPL
|
||||||
|
logger.info(f"Calculating PPL for prompts labeled '{label}'")
|
||||||
|
for idx in trange(0,
|
||||||
|
len(prompt_list),
|
||||||
|
self.batch_size,
|
||||||
|
disable=not self.is_main_process):
|
||||||
|
sub_prompt_list = prompt_list[idx:idx + self.batch_size]
|
||||||
|
sub_cont_list = cont_list[idx:idx + self.batch_size]
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
# mainly modify compared to PPLInferencer
|
||||||
|
sub_res = self.model.get_loglikelihood_from_template(
|
||||||
|
sub_prompt_list, sub_cont_list).tolist()
|
||||||
|
for res, prompt in zip(
|
||||||
|
sub_res,
|
||||||
|
self.model.parse_template(sub_prompt_list,
|
||||||
|
mode='ppl')):
|
||||||
|
sub_ppl_list.append(res)
|
||||||
|
ice_str = self.model.parse_template(ice[idx], mode='ppl')
|
||||||
|
output_handler.save_prompt_and_loglikelihood(
|
||||||
|
label, prompt.replace(ice_str, ''), prompt, res, index)
|
||||||
|
index = index + 1
|
||||||
|
ppl.append(sub_ppl_list)
|
||||||
|
|
||||||
|
# 6. Take the class with the highest loglikelihood as the prediction
|
||||||
|
ppl = list(zip(*ppl))
|
||||||
|
for single_ppl in ppl:
|
||||||
|
sub_predictions.append(labels[single_ppl.index(max(single_ppl))])
|
||||||
|
output_handler.save_predictions(sub_predictions)
|
||||||
|
|
||||||
|
# 7. Fetch gold answers if exist
|
||||||
|
ds_reader = retriever.dataset_reader
|
||||||
|
if ds_reader.output_column:
|
||||||
|
golds = ds_reader.dataset['test'][ds_reader.output_column]
|
||||||
|
output_handler.save_golds(golds)
|
||||||
|
|
||||||
|
# 8. Output
|
||||||
|
if self.is_main_process:
|
||||||
|
os.makedirs(output_json_filepath, exist_ok=True)
|
||||||
|
output_handler.write_to_json(output_json_filepath,
|
||||||
|
output_json_filename)
|
||||||
|
|
||||||
|
return [
|
||||||
|
sample['prediction']
|
||||||
|
for sample in output_handler.results_dict.values()
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class LoglikelihoodInferencerOutputHandler:
|
||||||
|
results_dict = {}
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self.results_dict = {}
|
||||||
|
|
||||||
|
def write_to_json(self, save_dir: str, filename: str):
|
||||||
|
"""Dump the result to a json file."""
|
||||||
|
dump_results_dict(self.results_dict, os.path.join(save_dir, filename))
|
||||||
|
|
||||||
|
def save_ice(self, ice):
|
||||||
|
for idx, example in enumerate(ice):
|
||||||
|
if str(idx) not in self.results_dict.keys():
|
||||||
|
self.results_dict[str(idx)] = {}
|
||||||
|
self.results_dict[str(idx)]['in-context examples'] = example
|
||||||
|
|
||||||
|
def save_predictions(self, predictions):
|
||||||
|
for idx, prediction in enumerate(predictions):
|
||||||
|
if str(idx) not in self.results_dict.keys():
|
||||||
|
self.results_dict[str(idx)] = {}
|
||||||
|
self.results_dict[str(idx)]['prediction'] = prediction
|
||||||
|
|
||||||
|
def save_prompt_and_loglikelihood(self, label, input, prompt,
|
||||||
|
loglikelihood, idx):
|
||||||
|
if str(idx) not in self.results_dict.keys():
|
||||||
|
self.results_dict[str(idx)] = {}
|
||||||
|
if 'label: ' + str(label) not in self.results_dict[str(idx)].keys():
|
||||||
|
self.results_dict[str(idx)]['label: ' + str(label)] = {}
|
||||||
|
self.results_dict[str(idx)]['label: ' +
|
||||||
|
str(label)]['testing input'] = input
|
||||||
|
self.results_dict[str(idx)]['label: ' + str(label)]['prompt'] = prompt
|
||||||
|
self.results_dict[str(idx)][
|
||||||
|
'label: ' + str(label)]['Loglikelihood'] = loglikelihood
|
||||||
|
|
||||||
|
def save_golds(self, golds):
|
||||||
|
for idx, gold in enumerate(golds):
|
||||||
|
if str(idx) not in self.results_dict.keys():
|
||||||
|
self.results_dict[str(idx)] = {}
|
||||||
|
self.results_dict[str(idx)]['gold'] = gold
|
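To make the selection rule at step 6 concrete, a toy example of picking the label whose continuation scored the highest loglikelihood (the numbers are invented; the variable is still called ppl to match the code above):

labels = ['A', 'B']
# one loglikelihood list per label, one entry per test example
ppl = [[-3.2, -7.1], [-5.0, -2.4]]

predictions = []
for scores in zip(*ppl):                      # iterate per example
    predictions.append(labels[scores.index(max(scores))])
print(predictions)  # ['A', 'B']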
188  opencompass/openicl/icl_inferencer/icl_ppl_only_inferencer.py  Normal file
@ -0,0 +1,188 @@
|
|||||||
|
"""PPL Inferencer."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
import mmengine
|
||||||
|
import torch
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from opencompass.models.base import BaseModel
|
||||||
|
from opencompass.registry import ICL_INFERENCERS
|
||||||
|
|
||||||
|
from ..icl_prompt_template import PromptTemplate
|
||||||
|
from ..icl_retriever import BaseRetriever
|
||||||
|
from ..utils import get_logger
|
||||||
|
from .icl_base_inferencer import BaseInferencer, dump_results_dict
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_INFERENCERS.register_module()
|
||||||
|
class PPLOnlyInferencer(BaseInferencer):
|
||||||
|
"""PPLOnlyInferencer class to calculate PPL and PPL only, no choice is
|
||||||
|
made. This Inferencer is usually used along with AveragePPLEvaluator.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
model (:obj:`BaseModel`, optional): The module to inference.
|
||||||
|
max_seq_len (:obj:`int`): Maximum number of tokenized words allowed by
|
||||||
|
the LM.
|
||||||
|
batch_size (:obj:`int`, optional): Batch size for the :obj:`DataLoader`
|
||||||
|
output_json_filepath (:obj:`str`, optional): File path for output
|
||||||
|
`JSON` file.
|
||||||
|
output_json_filename (:obj:`str`, optional): File name for output
|
||||||
|
`JSON` file.
|
||||||
|
save_every (:obj:`int`, optional): Save intermediate results every
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
model: BaseModel,
|
||||||
|
max_seq_len: Optional[int] = None,
|
||||||
|
batch_size: Optional[int] = 1,
|
||||||
|
output_json_filepath: Optional[str] = './icl_inference_output',
|
||||||
|
output_json_filename: Optional[str] = 'predictions',
|
||||||
|
save_every: Optional[int] = 1,
|
||||||
|
**kwargs) -> None:
|
||||||
|
super().__init__(
|
||||||
|
model=model,
|
||||||
|
max_seq_len=max_seq_len,
|
||||||
|
batch_size=batch_size,
|
||||||
|
output_json_filename=output_json_filename,
|
||||||
|
output_json_filepath=output_json_filepath,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.save_every = save_every
|
||||||
|
|
||||||
|
def inference(self,
|
||||||
|
retriever: BaseRetriever,
|
||||||
|
ice_template: Optional[PromptTemplate] = None,
|
||||||
|
prompt_template: Optional[PromptTemplate] = None,
|
||||||
|
output_json_filepath: Optional[str] = None,
|
||||||
|
output_json_filename: Optional[str] = None) -> List:
|
||||||
|
# 1. Preparation for output logs
|
||||||
|
output_handler = PPLOnlyInferencerOutputHandler()
|
||||||
|
|
||||||
|
if output_json_filepath is None:
|
||||||
|
output_json_filepath = self.output_json_filepath
|
||||||
|
if output_json_filename is None:
|
||||||
|
output_json_filename = self.output_json_filename
|
||||||
|
|
||||||
|
# 2. Get results of retrieval process
|
||||||
|
ice_idx_list = retriever.retrieve()
|
||||||
|
|
||||||
|
# 3. Generate prompts for testing input
|
||||||
|
prompt_list = self.get_generation_prompt_list_from_retriever_indices(
|
||||||
|
ice_idx_list,
|
||||||
|
retriever,
|
||||||
|
max_seq_len=self.max_seq_len,
|
||||||
|
ice_template=ice_template,
|
||||||
|
prompt_template=prompt_template)
|
||||||
|
|
||||||
|
# 3.1 Fetch and zip prompt & gold answer if output column exists
|
||||||
|
ds_reader = retriever.dataset_reader
|
||||||
|
|
||||||
|
assert ds_reader.output_column is None, (
|
||||||
|
'PPLOnlyInferencer supports `output_column=None` only.')
|
||||||
|
|
||||||
|
# Create tmp json file for saving intermediate results and future
|
||||||
|
# resuming
|
||||||
|
index = 0
|
||||||
|
tmp_json_filepath = os.path.join(output_json_filepath,
|
||||||
|
'tmp_' + output_json_filename)
|
||||||
|
if os.path.exists(tmp_json_filepath):
|
||||||
|
# TODO: move resume to output handler
|
||||||
|
try:
|
||||||
|
tmp_result_dict = mmengine.load(tmp_json_filepath)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
output_handler.results_dict = tmp_result_dict
|
||||||
|
index = len(tmp_result_dict)
|
||||||
|
|
||||||
|
# 4. Wrap prompts with Dataloader
|
||||||
|
dataloader = self.get_dataloader(prompt_list[index:], self.batch_size)
|
||||||
|
|
||||||
|
# 5. Inference for prompts in each batch
|
||||||
|
logger.info('Starting inference process...')
|
||||||
|
for datum in tqdm(dataloader, disable=not self.is_main_process):
|
||||||
|
entry = datum
|
||||||
|
# 5-1. Inference with local model
|
||||||
|
with torch.no_grad():
|
||||||
|
ppls = self.model.get_ppl_from_template(entry).tolist()
|
||||||
|
|
||||||
|
parsed_entries = self.model.parse_template(entry, mode='gen')
|
||||||
|
# 5-3. Save current output
|
||||||
|
for prompt, ppl, in zip(parsed_entries, ppls):
|
||||||
|
output_handler.save_results(prompt, ppl, index)
|
||||||
|
index = index + 1
|
||||||
|
|
||||||
|
# 5-4. Save intermediate results
|
||||||
|
if (self.save_every is not None and index % self.save_every == 0
|
||||||
|
and self.is_main_process):
|
||||||
|
output_handler.write_to_json(output_json_filepath,
|
||||||
|
'tmp_' + output_json_filename)
|
||||||
|
|
||||||
|
# 6. Output
|
||||||
|
if self.is_main_process:
|
||||||
|
os.makedirs(output_json_filepath, exist_ok=True)
|
||||||
|
output_handler.write_to_json(output_json_filepath,
|
||||||
|
output_json_filename)
|
||||||
|
if os.path.exists(tmp_json_filepath):
|
||||||
|
os.remove(tmp_json_filepath)
|
||||||
|
|
||||||
|
return [
|
||||||
|
sample['ppl'] for sample in output_handler.results_dict.values()
|
||||||
|
]
|
||||||
|
|
||||||
|
def get_generation_prompt_list_from_retriever_indices(
|
||||||
|
self,
|
||||||
|
ice_idx_list: List[List[int]],
|
||||||
|
retriever: BaseRetriever,
|
||||||
|
max_seq_len: Optional[int] = None,
|
||||||
|
ice_template: Optional[PromptTemplate] = None,
|
||||||
|
prompt_template: Optional[PromptTemplate] = None):
|
||||||
|
prompt_list = []
|
||||||
|
for idx, ice_idx in enumerate(ice_idx_list):
|
||||||
|
ice = retriever.generate_ice(ice_idx, ice_template=ice_template)
|
||||||
|
prompt = retriever.generate_prompt_for_generate_task(
|
||||||
|
idx,
|
||||||
|
ice,
|
||||||
|
ice_template=ice_template,
|
||||||
|
prompt_template=prompt_template)
|
||||||
|
if max_seq_len is not None:
|
||||||
|
prompt_token_num = self.model.get_token_len_from_template(
|
||||||
|
prompt, mode='gen')
|
||||||
|
while len(ice_idx) > 0 and prompt_token_num > max_seq_len:
|
||||||
|
ice_idx = ice_idx[:-1]
|
||||||
|
ice = retriever.generate_ice(ice_idx,
|
||||||
|
ice_template=ice_template)
|
||||||
|
prompt = retriever.generate_prompt_for_generate_task(
|
||||||
|
idx,
|
||||||
|
ice,
|
||||||
|
ice_template=ice_template,
|
||||||
|
prompt_template=prompt_template)
|
||||||
|
prompt_token_num = self.model.get_token_len_from_template(
|
||||||
|
prompt, mode='gen')
|
||||||
|
prompt_list.append(prompt)
|
||||||
|
return prompt_list
|
||||||
|
|
||||||
|
|
||||||
|
class PPLOnlyInferencerOutputHandler:
|
||||||
|
origin_prompt_dict = {}
|
||||||
|
output_dict = {}
|
||||||
|
results_dict = {}
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self.results_dict = {}
|
||||||
|
|
||||||
|
def write_to_json(self, save_dir: str, filename: str):
|
||||||
|
"""Dump the result to a json file."""
|
||||||
|
dump_results_dict(self.results_dict, os.path.join(save_dir, filename))
|
||||||
|
|
||||||
|
def save_results(self, origin_prompt, ppl, idx):
|
||||||
|
self.results_dict[str(idx)] = {
|
||||||
|
'origin_prompt': origin_prompt,
|
||||||
|
'ppl': ppl,
|
||||||
|
}
|
@ -1,10 +1,12 @@
|
|||||||
|
import inspect
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from typing import Dict, List, Optional
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
from mmengine.config import ConfigDict
|
from mmengine.config import ConfigDict
|
||||||
|
|
||||||
from opencompass.utils import get_logger, task_abbr_from_cfg
|
from opencompass.utils import (dataset_abbr_from_cfg, get_logger,
|
||||||
|
model_abbr_from_cfg, task_abbr_from_cfg)
|
||||||
|
|
||||||
|
|
||||||
class BasePartitioner:
|
class BasePartitioner:
|
||||||
@ -54,8 +56,7 @@ class BasePartitioner:
|
|||||||
List[Dict]: A list of tasks.
|
List[Dict]: A list of tasks.
|
||||||
"""
|
"""
|
||||||
cfg = deepcopy(cfg)
|
cfg = deepcopy(cfg)
|
||||||
models = cfg['models']
|
|
||||||
datasets = cfg['datasets']
|
|
||||||
work_dir = cfg['work_dir']
|
work_dir = cfg['work_dir']
|
||||||
|
|
||||||
add_cfg = {}
|
add_cfg = {}
|
||||||
@ -74,10 +75,11 @@ class BasePartitioner:
|
|||||||
self.logger.debug(f'Key {k} not found in config, ignored.')
|
self.logger.debug(f'Key {k} not found in config, ignored.')
|
||||||
self.logger.debug(f'Additional config: {add_cfg}')
|
self.logger.debug(f'Additional config: {add_cfg}')
|
||||||
|
|
||||||
tasks = self.partition(models,
|
model_and_dataset_args = self.parse_model_dataset_args(cfg)
|
||||||
datasets,
|
|
||||||
work_dir,
|
tasks = self.partition(**model_and_dataset_args,
|
||||||
self.out_dir,
|
work_dir=work_dir,
|
||||||
|
out_dir=self.out_dir,
|
||||||
add_cfg=add_cfg)
|
add_cfg=add_cfg)
|
||||||
|
|
||||||
self.logger.info(f'Partitioned into {len(tasks)} tasks.')
|
self.logger.info(f'Partitioned into {len(tasks)} tasks.')
|
||||||
@ -86,6 +88,41 @@ class BasePartitioner:
|
|||||||
|
|
||||||
return tasks
|
return tasks
|
||||||
|
|
||||||
|
def parse_model_dataset_args(self, cfg: ConfigDict):
|
||||||
|
models = cfg['models']
|
||||||
|
datasets = cfg['datasets']
|
||||||
|
|
||||||
|
sig = inspect.signature(self.partition)
|
||||||
|
if 'model_dataset_combinations' in sig.parameters:
|
||||||
|
combs = cfg.get('model_dataset_combinations', None)
|
||||||
|
if combs is None:
|
||||||
|
combs = [{'models': models, 'datasets': datasets}]
|
||||||
|
else:
|
||||||
|
# sanity check
|
||||||
|
model_abbrs = [model_abbr_from_cfg(model) for model in models]
|
||||||
|
dataset_abbrs = [
|
||||||
|
dataset_abbr_from_cfg(dataset) for dataset in datasets
|
||||||
|
]
|
||||||
|
for comb in combs:
|
||||||
|
for model in comb['models']:
|
||||||
|
if model_abbr_from_cfg(model) not in model_abbrs:
|
||||||
|
raise ValueError(
|
||||||
|
f'Model {model_abbr_from_cfg(model)} '
|
||||||
|
'not found in config.')
|
||||||
|
for dataset in comb['datasets']:
|
||||||
|
if dataset_abbr_from_cfg(dataset) not in dataset_abbrs:
|
||||||
|
raise ValueError(
|
||||||
|
f'Dataset {dataset_abbr_from_cfg(dataset)} '
|
||||||
|
'not found in config.')
|
||||||
|
used_kwargs = {'model_dataset_combinations': combs}
|
||||||
|
else:
|
||||||
|
if cfg.get('model_dataset_combinations', None) is not None:
|
||||||
|
self.logger.warning(
|
||||||
|
'model_dataset_combinations is not supported by '
|
||||||
|
f'{self.__class__.__name__}. Ignored.')
|
||||||
|
used_kwargs = {'models': models, 'datasets': datasets}
|
||||||
|
return used_kwargs
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def partition(self,
|
def partition(self,
|
||||||
models: List[ConfigDict],
|
models: List[ConfigDict],
|
||||||
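A hedged sketch of what a model_dataset_combinations entry in an eval config might look like under the new partitioner logic; the model and dataset abbreviations below are placeholders, not configs from this repo.

model_dataset_combinations = [
    # run the chat model only on the agent-style benchmarks
    dict(models=[chatglm3_6b_model], datasets=[*agent_datasets]),
    # run every model on the plain generation benchmarks
    dict(models=[chatglm3_6b_model, llama2_7b_chat_model], datasets=[*gen_datasets]),
]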
|
@ -29,8 +29,8 @@ class NaivePartitioner(BasePartitioner):
|
|||||||
self.n = n
|
self.n = n
|
||||||
|
|
||||||
def partition(self,
|
def partition(self,
|
||||||
models: List[ConfigDict],
|
model_dataset_combinations: List[Dict[str,
|
||||||
datasets: List[ConfigDict],
|
List[ConfigDict]]],
|
||||||
work_dir: str,
|
work_dir: str,
|
||||||
out_dir: str,
|
out_dir: str,
|
||||||
add_cfg: Dict = {}) -> List[Dict]:
|
add_cfg: Dict = {}) -> List[Dict]:
|
||||||
@ -48,8 +48,9 @@ class NaivePartitioner(BasePartitioner):
|
|||||||
}
|
}
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
models (List[ConfigDict]): A list of model configs.
|
model_dataset_combinations (List[Dict]): List of
|
||||||
datasets (List[ConfigDict]): A list of dataset configs.
|
`{models: [...], datasets: [...]}` dicts. Each dict contains
|
||||||
|
a list of model configs and a list of dataset configs.
|
||||||
work_dir (str): The work dir for the task.
|
work_dir (str): The work dir for the task.
|
||||||
out_dir (str): The full output path for the task, intended for
|
out_dir (str): The full output path for the task, intended for
|
||||||
Partitioners to check whether the task is finished via the
|
Partitioners to check whether the task is finished via the
|
||||||
@ -60,20 +61,21 @@ class NaivePartitioner(BasePartitioner):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
tasks = []
|
tasks = []
|
||||||
for model in models:
|
for comb in model_dataset_combinations:
|
||||||
chunks = []
|
for model in comb['models']:
|
||||||
for dataset in datasets:
|
chunks = []
|
||||||
filename = get_infer_output_path(model, dataset, out_dir)
|
for dataset in comb['datasets']:
|
||||||
if osp.exists(filename):
|
filename = get_infer_output_path(model, dataset, out_dir)
|
||||||
continue
|
if osp.exists(filename):
|
||||||
chunks.append(dataset)
|
continue
|
||||||
|
chunks.append(dataset)
|
||||||
|
|
||||||
for i in range(0, len(chunks), self.n):
|
for i in range(0, len(chunks), self.n):
|
||||||
task = Config({
|
task = Config({
|
||||||
'models': [model],
|
'models': [model],
|
||||||
'datasets': [chunks[i:i + self.n]],
|
'datasets': [chunks[i:i + self.n]],
|
||||||
'work_dir': work_dir,
|
'work_dir': work_dir,
|
||||||
**add_cfg
|
**add_cfg
|
||||||
})
|
})
|
||||||
tasks.append(task)
|
tasks.append(task)
|
||||||
return tasks
|
return tasks
|
||||||
|
@ -51,8 +51,8 @@ class SizePartitioner(BasePartitioner):
|
|||||||
self.strategy = strategy
|
self.strategy = strategy
|
||||||
|
|
||||||
def partition(self,
|
def partition(self,
|
||||||
models: List[ConfigDict],
|
model_dataset_combinations: List[Dict[str,
|
||||||
datasets: List[ConfigDict],
|
List[ConfigDict]]],
|
||||||
work_dir: str,
|
work_dir: str,
|
||||||
out_dir: str,
|
out_dir: str,
|
||||||
add_cfg: Dict = {}) -> List[ConfigDict]:
|
add_cfg: Dict = {}) -> List[ConfigDict]:
|
||||||
@ -71,8 +71,9 @@ class SizePartitioner(BasePartitioner):
|
|||||||
}
|
}
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
models (List[ConfigDict]): A list of model configs.
|
model_dataset_combinations (List[Dict]): List of
|
||||||
datasets (List[ConfigDict]): A list of dataset configs.
|
`{models: [...], datasets: [...]}` dicts. Each dict contains
|
||||||
|
a list of model configs and a list of dataset configs.
|
||||||
work_dir (str): The work dir for the task.
|
work_dir (str): The work dir for the task.
|
||||||
out_dir (str): The full output path for the task, intended for
|
out_dir (str): The full output path for the task, intended for
|
||||||
Partitioners to check whether the task is finished via the
|
Partitioners to check whether the task is finished via the
|
||||||
@@ -84,52 +85,54 @@ class SizePartitioner(BasePartitioner):
             List[ConfigDict]: A list of tasks.
         """
-        datasets = sorted(datasets,
-                          key=lambda x: self.get_cost(x),
-                          reverse=True)
         tasks = []
-        for model in models:
-            chunks = []  # elements: tuple(size, dataset_chunk)
-            for dataset in datasets:
-                filename = get_infer_output_path(model, dataset, out_dir)
-                # skip the task if the task output exists
-                if osp.exists(filename):
-                    continue
-                dataset_size = self.get_cost(dataset)
-                if dataset_size > self.max_task_size:
-                    root, ext = osp.splitext(filename)
-                    dataset_splits = self.split_dataset(dataset)
-                    for i, dataset_split in enumerate(dataset_splits):
-                        if not osp.exists(f'{root}_{i}{ext}'):
-                            chunks.append((self.max_task_size, dataset_split))
-                else:
-                    chunks.append((dataset_size, dataset))
-
-            if self.strategy == 'heuristic':
-                chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
-                current_size, current_chunks = 0, []
-                for index in range(len(chunks)):
-                    current_size += chunks[index][0]
-                    current_chunks.append(chunks[index][1])
-                    if index == len(chunks) - 1 or current_size + chunks[
-                            index + 1][0] > self.max_task_size:
-                        tasks.append(
-                            Config({
-                                'models': [model],
-                                'datasets': [current_chunks],
-                                'work_dir': work_dir,
-                                **add_cfg
-                            }))
-                        current_size, current_chunks = 0, []
-            elif self.strategy == 'split':
-                for _, dataset in chunks:
-                    tasks.append(
-                        Config({
-                            'models': [model],
-                            'datasets': [[dataset]],
-                            'work_dir': work_dir,
-                            **add_cfg
-                        }))
+        for comb in model_dataset_combinations:
+            comb['datasets'] = sorted(comb['datasets'],
+                                      key=lambda x: self.get_cost(x),
+                                      reverse=True)
+            for model in comb['models']:
+                chunks = []  # elements: tuple(size, dataset_chunk)
+                for dataset in comb['datasets']:
+                    filename = get_infer_output_path(model, dataset, out_dir)
+                    # skip the task if the task output exists
+                    if osp.exists(filename):
+                        continue
+                    dataset_size = self.get_cost(dataset)
+                    if dataset_size > self.max_task_size:
+                        root, ext = osp.splitext(filename)
+                        dataset_splits = self.split_dataset(dataset)
+                        for i, dataset_split in enumerate(dataset_splits):
+                            if not osp.exists(f'{root}_{i}{ext}'):
+                                chunks.append(
+                                    (self.max_task_size, dataset_split))
+                    else:
+                        chunks.append((dataset_size, dataset))
+
+                if self.strategy == 'heuristic':
+                    chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
+                    current_size, current_chunks = 0, []
+                    for index in range(len(chunks)):
+                        current_size += chunks[index][0]
+                        current_chunks.append(chunks[index][1])
+                        if index == len(chunks) - 1 or current_size + chunks[
+                                index + 1][0] > self.max_task_size:
+                            tasks.append(
+                                Config({
+                                    'models': [model],
+                                    'datasets': [current_chunks],
+                                    'work_dir': work_dir,
+                                    **add_cfg
+                                }))
+                            current_size, current_chunks = 0, []
+                elif self.strategy == 'split':
+                    for _, dataset in chunks:
+                        tasks.append(
+                            Config({
+                                'models': [model],
+                                'datasets': [[dataset]],
+                                'work_dir': work_dir,
+                                **add_cfg
+                            }))
         return tasks
 
     @property
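For intuition, the 'heuristic' strategy above is a greedy packing over per-dataset costs: chunks are sorted by size, then accumulated until adding the next one would exceed max_task_size, whereas 'split' emits one task per dataset chunk. A self-contained sketch of the greedy branch over plain numbers (illustrative sizes, not real get_cost values):

# Greedy packing sketch: group (size, name) chunks so each group's total
# stays within max_task_size, mirroring the 'heuristic' branch above.
max_task_size = 100
chunks = [(80, 'mmlu'), (60, 'math'), (30, 'gsm8k'), (20, 'bbh')]

chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
groups, current_size, current = [], 0, []
for index in range(len(chunks)):
    current_size += chunks[index][0]
    current.append(chunks[index][1])
    if index == len(chunks) - 1 or \
            current_size + chunks[index + 1][0] > max_task_size:
        groups.append(current)
        current_size, current = 0, []

print(groups)  # [['mmlu'], ['math', 'gsm8k'], ['bbh']]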
@@ -13,7 +13,7 @@ from mmengine.config import ConfigDict
 from tqdm import tqdm
 
 from opencompass.registry import RUNNERS, TASKS
-from opencompass.utils import get_logger
+from opencompass.utils import batched, get_logger
 
 from .base import BaseRunner
 
@@ -131,15 +131,22 @@ class SlurmSequentialRunner(BaseRunner):
                     break
             parent_conn.close()
 
-        for job_id in tqdm(job_ids, desc='clear sruns'):
-            if job_id is None:
-                continue
-            cmd = f'scancel {job_id}'
-            p = subprocess.Popen(cmd,
-                                 shell=True,
-                                 stdout=subprocess.PIPE,
-                                 stderr=subprocess.STDOUT)
-            p.wait()
+        tbar = tqdm(total=len(job_ids), desc='clear sruns')
+        for batched_job_ids in batched(job_ids, 4):
+            ps = []
+            for job_id in batched_job_ids:
+                tbar.update()
+                if job_id is None:
+                    continue
+                cmd = f'scancel {job_id}'
+                p = subprocess.Popen(cmd,
+                                     shell=True,
+                                     stdout=subprocess.PIPE,
+                                     stderr=subprocess.STDOUT)
+                ps.append(p)
+            for p in ps:
+                p.wait()
+        tbar.close()
 
     def _launch(self, cfg: ConfigDict, child_conn: Pipe = None):
         logger = get_logger()
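The `batched` helper imported above is not shown in this diff; a common implementation of such a utility (an assumption here, not necessarily OpenCompass's exact code) groups an iterable into fixed-size tuples:

from itertools import islice
from typing import Iterable, Iterator, Tuple, TypeVar

T = TypeVar('T')


def batched(iterable: Iterable[T], n: int) -> Iterator[Tuple[T, ...]]:
    """Yield successive tuples of at most n items (sketch of assumed behaviour)."""
    it = iter(iterable)
    while batch := tuple(islice(it, n)):
        yield batch


print(list(batched(['a', 'b', 'c', 'd', 'e'], 2)))
# [('a', 'b'), ('c', 'd'), ('e',)]

With that behaviour, the runner launches up to four scancel processes at a time and waits on the whole batch, instead of cancelling jobs strictly one by one.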
@@ -121,8 +121,9 @@ class OpenICLEvalTask(BaseTask):
         pred_dicts = copy.deepcopy(preds)
         preds = {k: [pred.get(k) for pred in preds] for k in preds[0]}
 
-        pred_strs = preds.pop('prediction')
-        pred_list_flag = isinstance(pred_strs[0], list)
+        pred_strs = preds.pop('prediction', None)
+        pred_list_flag = pred_strs is not None and isinstance(
+            pred_strs[0], list)
         if ('pred_role' in self.eval_cfg
                 and 'meta_template' in self.model_cfg
                 and not MODELS.get(self.model_cfg['type']).is_api):
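The change makes the missing-key case explicit: when no 'prediction' field is present, pred_strs becomes None and the short-circuit keeps the isinstance check from raising. A tiny illustration with a hypothetical preds dict:

# Hypothetical preds dict without a 'prediction' key.
preds = {'origin_prompt': ['...']}

pred_strs = preds.pop('prediction', None)      # None instead of KeyError
pred_list_flag = pred_strs is not None and isinstance(pred_strs[0], list)

print(pred_strs, pred_list_flag)  # None False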
@@ -166,6 +167,12 @@ class OpenICLEvalTask(BaseTask):
             ]
 
         icl_evaluator = ICL_EVALUATORS.build(self.eval_cfg['evaluator'])
+        # need results dir to save other files
+        out_path = get_infer_output_path(
+            self.model_cfg, self.dataset_cfg,
+            osp.join(self.work_dir, 'results'))
+        icl_evaluator._out_dir = osp.splitext(out_path)[
+            0]  # strip extension
         preds['predictions'] = pred_strs
         preds['references'] = (test_set[self.output_column]
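The added lines hand the evaluator an extension-free prefix under the results directory so it can write auxiliary files next to its scores. A small sketch of the path handling, assuming (not confirmed by this diff) that get_infer_output_path lays results out as <results_dir>/<model_abbr>/<dataset_abbr>.json:

import os.path as osp

# Hypothetical paths; the exact layout produced by get_infer_output_path
# is an assumption here.
work_dir = 'outputs/demo'
out_path = osp.join(work_dir, 'results', 'llama-7b', 'gsm8k.json')

out_dir = osp.splitext(out_path)[0]   # strip the .json extension
print(out_dir)  # outputs/demo/results/llama-7b/gsm8k

# An evaluator needing extra artifacts could then write them under this
# prefix, e.g. f'{out_dir}_details.jsonl' (illustrative name only).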
@@ -49,6 +49,14 @@ def first_capital_postprocess(text: str) -> str:
     return ''
 
 
+@TEXT_POSTPROCESSORS.register_module('last-capital')
+def last_capital_postprocess(text: str) -> str:
+    for t in text[::-1]:
+        if t.isupper():
+            return t
+    return ''
+
+
 def first_option_postprocess(text: str, options: str) -> str:
     """Find first valid option for text."""
 
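As a quick check of the new postprocessor, scanning from the end returns the last uppercase letter in the prediction (run here standalone, without the registry decorator):

def last_capital_postprocess(text: str) -> str:
    for t in text[::-1]:
        if t.isupper():
            return t
    return ''


print(last_capital_postprocess('the answer should be: B'))  # B
print(last_capital_postprocess('no capitals here'))         # '' (empty string)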
7	requirements/agent.txt	Normal file
@@ -0,0 +1,7 @@
+json5
+jupyter
+jupyter_client
+jupytext
+lagent
+scikit-image
+sympy
@@ -1,4 +1 @@
 faiss_gpu==1.7.2
-jupyter
-lagent
-scikit-image