[Sync] minor test (#683)

Hubert 2023-12-11 17:42:53 +08:00 committed by GitHub
parent dd4318f6ab
commit e78857ac36
57 changed files with 1468 additions and 314 deletions

.gitignore (vendored)
View File

@ -11,6 +11,7 @@ configs/eval_debug*.py
configs/viz_*.py
data
work_dirs
models
configs/internal/
# Byte-compiled / optimized / DLL files
__pycache__/

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .CIBench_gen_eb42f9 import ci_datasets # noqa: F401, F403
from .CIBench_gen_8ab0dc import ci_datasets # noqa: F401, F403

View File

@ -16,28 +16,20 @@ cibench_infer_cfg = dict(
template="""{questions}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=AgentInferencer),
inferencer=dict(type=AgentInferencer, infer_mode='every'),
)
libs = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
cibench_eval_cfg = {
lib: dict(
evaluator=dict(
type=CIBenchEvaluator,
output_dir=f'output_data/cibench/{lib}'),
pred_role="BOT",
)
for lib in libs
}
cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")
cibench_datasets = [
dict(
abbr=f"cibench_{lib}",
abbr=f"cibench_generation_{lib}",
type=CIBenchDataset,
path=f"./data/cibench/{lib}",
reader_cfg=cibench_reader_cfg,
infer_cfg=cibench_infer_cfg,
eval_cfg=cibench_eval_cfg[lib],
eval_cfg=cibench_eval_cfg,
) for lib in libs
]

View File

@ -95,7 +95,7 @@ mathbench_sets = {
# Use circular evaluation or not
with_circular_eval = True
mathbench_code_datasets = []
mathbench_agent_datasets = []
for _split in list(mathbench_sets.keys()):
for _name in mathbench_sets[_split]:
@ -112,13 +112,13 @@ for _split in list(mathbench_sets.keys()):
evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
mathbench_code_datasets.append(
mathbench_agent_datasets.append(
dict(
abbr="mathbench-" + _split + '-' + _name + '-agent',
type=MathBenchDataset,
path=f"./data/mathbench/{_split}",
name=_name,
with_circular=with_circular_eval,
abbr="mathbench-interpreter-" + _split + '-' + _name,
reader_cfg=dict(
input_columns=["question"],
output_column="answer"

View File

@ -6,17 +6,17 @@ from opencompass.datasets import MathBenchDataset, mathbench_postprocess
cloze_prompts ={
"cloze_arith_en": [
dict(role='HUMAN', prompt='Q: Calculate (341/11)/(9/(-6)*(-2)/3).'),
dict(role='BOT', prompt='A: First, (9/(-6)*(-2)/3) can be simplified by : 9/(-6) = -1.5, -1.5 * (-2) = 3, 3 / 3 = 1. So, (9/(-6)*(-2)/3) is equal to 1. Now, we have `(341/11)/1` equals `341/11`. Finally, calculate `341/11 = 31`. The answer is 31.\n'),
dict(role='HUMAN', prompt='Q: In base 14, what is 5 - 638d8d?'),
dict(role='BOT', prompt='A: 5 - 638d8d = -638d88. The answer is -638d88.\n'),
dict(role='HUMAN', prompt='Q: What is -491354 times -0.34?'),
dict(role='BOT', prompt='A: The product of -491354 and -0.34 is 167060.36. The answer is 167060.36.\n'),
dict(role='HUMAN', prompt='Q: What is the value of (-55)/(6930/(-382)) + (0 - 3)?.'),
dict(role='BOT', prompt='A: First, (-55)/(6930/(-382)) = (-55)/(-(6930/382)) = 55*382/6930 = 21010/6930 = 2101/693. Then, 2101/693 + (0 - 3) = 2101/693 - 3 = 2101/693 - 3*693/693 = (2101-2079)/693 = 22/693 = 2/63. The answer is 2/63.\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}\n'),
]
dict(role='HUMAN', prompt='Q: Calculate (341/11)/(9/(-6)*(-2)/3).'),
dict(role='BOT', prompt='A: First, (9/(-6)*(-2)/3) can be simplified by : 9/(-6) = -1.5, -1.5 * (-2) = 3, 3 / 3 = 1. So, (9/(-6)*(-2)/3) is equal to 1. Now, we have `(341/11)/1` equals `341/11`. Finally, calculate `341/11 = 31`. The answer is 31.\n'),
dict(role='HUMAN', prompt='Q: In base 14, what is 5 - 638d8d?'),
dict(role='BOT', prompt='A: 5 - 638d8d = -638d88. The answer is -638d88.\n'),
dict(role='HUMAN', prompt='Q: What is -491354 times -0.34?'),
dict(role='BOT', prompt='A: The product of -491354 and -0.34 is 167060.36. The answer is 167060.36.\n'),
dict(role='HUMAN', prompt='Q: What is the value of (-55)/(6930/(-382)) + (0 - 3)?.'),
dict(role='BOT', prompt='A: First, (-55)/(6930/(-382)) = (-55)/(-(6930/382)) = 55*382/6930 = 21010/6930 = 2101/693. Then, 2101/693 + (0 - 3) = 2101/693 - 3 = 2101/693 - 3*693/693 = (2101-2079)/693 = 22/693 = 2/63. The answer is 2/63.\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}\n'),
]
}
mathbench_sets = {

View File

@ -94,11 +94,11 @@ for _split in list(mathbench_sets.keys()):
mathbench_datasets.append(
dict(
abbr="mathbench-" + _split + '-' + _name,
type=MathBenchDataset,
path=f"./data/mathbench/{_split}",
name=_name,
with_circular=with_circular_eval,
abbr="mathbench-" + _split + '-' + _name,
reader_cfg=dict(
input_columns=["question"],
output_column="answer"

View File

@ -0,0 +1,69 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (DS1000Dataset, ds1000_completion_postprocess,
ds1000_matplotlib_postprocess,
DS1000Evaluator)
ds1000_reader_cfg = dict(
input_columns=["prompt"],
output_column="test_column",
train_split='test',
test_split='test')
ds1000_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role="HUMAN",
prompt="{prompt}",
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
ds1000_eval_cfg = dict(
evaluator=dict(type=DS1000Evaluator),
pred_role="BOT",
pred_postprocessor=dict(type=ds1000_completion_postprocess),
)
# The DS-1000 dataset can be downloaded from
# https://github.com/HKUNLP/DS-1000/blob/main/ds1000_data.zip
ds1000_datasets = [
dict(
abbr=f"ds1000_{lib}",
type=DS1000Dataset,
path="./data/ds1000_data/",
libs=f"{lib}",
mode="Completion",
reader_cfg=ds1000_reader_cfg,
infer_cfg=ds1000_infer_cfg,
eval_cfg=ds1000_eval_cfg,
) for lib in [
'Pandas',
'Numpy',
'Tensorflow',
'Scipy',
'Sklearn',
'Pytorch',
]
]
ds1000_datasets.append(
dict(
abbr="ds1000_Matplotlib",
type=DS1000Dataset,
path="./data/ds1000_data/",
libs="Matplotlib",
mode="Completion",
reader_cfg=ds1000_reader_cfg,
infer_cfg=ds1000_infer_cfg,
eval_cfg=dict(
evaluator=dict(type=DS1000Evaluator),
pred_role="BOT",
pred_postprocessor=dict(type=ds1000_matplotlib_postprocess),
),
))
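As a convenience, a minimal download sketch for the DS-1000 data referenced in the comment above; the raw-file URL and the archive layout (unpacking to `ds1000_data/`) are assumptions, and downloading the zip manually works just as well:
```python
# Hypothetical helper, not part of this commit: fetch and unpack DS-1000 so
# that ./data/ds1000_data/ exists, as the config above expects.
import urllib.request
import zipfile

url = "https://github.com/HKUNLP/DS-1000/raw/main/ds1000_data.zip"  # assumed raw link
urllib.request.urlretrieve(url, "ds1000_data.zip")
with zipfile.ZipFile("ds1000_data.zip") as zf:
    zf.extractall("./data/")  # assumes the archive contains a ds1000_data/ folder
```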

View File

@ -0,0 +1,68 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import DS1000Dataset, DS1000ServiceEvaluator
ds1000_reader_cfg = dict(
input_columns=["prompt"],
output_column="test_column",
train_split='test',
test_split='test')
ds1000_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role="HUMAN",
prompt="{prompt}",
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
ds1000_eval_cfg_dict = {
lib: dict(
evaluator=dict(
type=DS1000ServiceEvaluator,
lib=lib,
ip_address="localhost",  # replace with your code_eval_server ip address and port
port=5000
),
pred_role="BOT")
for lib in [
'Pandas',
'Numpy',
'Tensorflow',
'Scipy',
'Sklearn',
'Pytorch',
'Matplotlib',
]
}
# The DS-1000 dataset can be downloaded from
# https://github.com/HKUNLP/DS-1000/blob/main/ds1000_data.zip
ds1000_datasets = [
dict(
abbr=f"ds1000_{lib}",
type=DS1000Dataset,
path="./data/ds1000_data/",
libs=f"{lib}",
mode="Completion",
reader_cfg=ds1000_reader_cfg,
infer_cfg=ds1000_infer_cfg,
eval_cfg=ds1000_eval_cfg_dict[lib],
) for lib in [
'Pandas',
'Numpy',
'Tensorflow',
'Scipy',
'Sklearn',
'Pytorch',
'Matplotlib',
]
]

View File

@ -45,7 +45,7 @@ gsm8k_eval_cfg = dict(
gsm8k_datasets = [
dict(
abbr='gsm8k',
abbr='gsm8k-agent',
type=GSM8KDataset,
path='./data/gsm8k',
reader_cfg=gsm8k_reader_cfg,

View File

@ -0,0 +1,39 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
gsm8k_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt="Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt="Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n"),
dict(role='HUMAN', prompt="Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt="Mark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\nHis team also scores 6 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\nThey scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\nAll together his team scored 50+24+10= 84 points\nMark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\nHis opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\nThey also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\nAll together Mark's opponents scored 100+12+5=117 points\nThe total score for the game is both team's scores added together, so it is 84+117=201 points\nThe answer is 201\n"),
dict(role='HUMAN', prompt="Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt="When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n"),
dict(role='HUMAN', prompt="Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt="For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n"),
dict(role='HUMAN', prompt="Question: {question}\nLet's think step by step\nAnswer:"),
],
)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=[":", "Question:", "Question"]))
gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
pred_postprocessor=dict(type=gsm8k_postprocess),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))
gsm8k_datasets = [
dict(
abbr='gsm8k',
type=GSM8KDataset,
path='./data/gsm8k',
reader_cfg=gsm8k_reader_cfg,
infer_cfg=gsm8k_infer_cfg,
eval_cfg=gsm8k_eval_cfg)
]

View File

@ -0,0 +1,57 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLOnlyInferencer
from opencompass.openicl.icl_evaluator import AveragePPLEvaluator
from opencompass.datasets import GSM8KDataset, GSM8KReferenceSkywork
gsm8k_datasets = []
gsm8k_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template="{question} {answer}"),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLOnlyInferencer),
)
gsm8k_eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator))
for split in ['train', 'test']:
gsm8k_reader_cfg = dict(
input_columns=['question', 'answer'],
output_column=None,
train_split=split,
test_split=split,
)
gsm8k_datasets.append(
dict(
abbr=f'gsm8k-{split}-ppl',
type=GSM8KDataset,
path='./data/gsm8k',
reader_cfg=gsm8k_reader_cfg,
infer_cfg=gsm8k_infer_cfg,
eval_cfg=gsm8k_eval_cfg)
)
gsm8k_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template="{text}"),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLOnlyInferencer),
)
gsm8k_eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator))
gsm8k_reader_cfg = dict(
input_columns=['text'],
output_column=None,
)
gsm8k_datasets.append(
dict(
abbr=f'gsm8k-ref-ppl',
type=GSM8KReferenceSkywork,
path='./data/gsm8k-extra/mock_gsm8k_test.jsonl',
reader_cfg=gsm8k_reader_cfg,
infer_cfg=gsm8k_infer_cfg,
eval_cfg=gsm8k_eval_cfg
)
)

View File

@ -79,7 +79,7 @@ math_eval_cfg = dict(
math_datasets = [
dict(
abbr='math',
abbr='math-agent',
type=MATHDataset,
path='./data/math/math.json',
reader_cfg=math_reader_cfg,

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .winogrande_ppl_55a66e import winogrande_datasets # noqa: F401, F403
from .winogrande_ppl_8be6c3 import winogrande_datasets # noqa: F401, F403

View File

@ -4,6 +4,10 @@ from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import winograndeDataset
# WARNING: This config cannot reproduce results in the paper.
# e.g. LLAMA2-7B Winogrande 69.2 (paper) -> 62.27 (this config)
# Please try winogrande_ppl_8be6c3
winogrande_reader_cfg = dict(
input_columns=['opt1', 'opt2'],
output_column='answer',

View File

@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import LoglikelihoodInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import winograndeDataset
winogrande_reader_cfg = dict(
input_columns=['opt1', 'opt2'],
output_column='answer',
)
winogrande_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
1: "{opt1}",
2: "{opt2}",
}
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=LoglikelihoodInferencer))
winogrande_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
winogrande_datasets = [
dict(
abbr='winogrande',
type=winograndeDataset,
path='./data/winogrande',
reader_cfg=winogrande_reader_cfg,
infer_cfg=winogrande_infer_cfg,
eval_cfg=winogrande_eval_cfg)
]

View File

@ -4,6 +4,10 @@ from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import winograndeDataset
# WARNING: This config cannot reproduce results in the paper.
# e.g. LLAMA2-7B Winogrande 69.2 (paper) -> 62.27 (this config)
# Please try winogrande_ppl_8be6c3
winogrande_reader_cfg = dict(
input_columns=['opt1', 'opt2'],
output_column='answer',

View File

@ -4,11 +4,20 @@ from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.models.lagent import LagentAgent
from lagent import PythonInterpreter, ReAct
from opencompass.lagent.actions.python_interpreter import PythonInterpreter
from lagent import ReAct
from lagent.agents.react import ReActProtocol
with read_base():
from .datasets.gsm8k.gsm8k_agent_gen_3ac57d import gsm8k_datasets as datasets
from .datasets.gsm8k.gsm8k_agent_gen_3ac57d import gsm8k_datasets
from .datasets.math.math_agent_gen_861b4f import math_datasets
from .datasets.MathBench.mathbench_agent_gen_568903 import mathbench_agent_datasets
from .summarizers.math_agent import summarizer
datasets = []
datasets += gsm8k_datasets
datasets += math_datasets
datasets += mathbench_agent_datasets
system_prompt = """You are a helpful assistant which use tools to solve mathematical reasoning questions. The code must be a function, and the function name must be 'solution'. For mathematics, please use code tool to calculate. The example format is as follows:
```

View File

@ -10,7 +10,7 @@ from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from .datasets.CIBench.CIBench_gen_eb42f9 import \
from .datasets.CIBench.CIBench_gen_8ab0dc import \
cibench_datasets as datasets
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
@ -36,7 +36,21 @@ Also please follow the guidelines:
3. The generated codes will be executed in an ipython manner and the results will be cached.
4. Your responded code should always be simple and only solves the problem in current step.
Begin!
For example:
File url: `xxxx`
### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
{thought} We should use `pandas` to solve this step.
{action} IPythonInterpreter
{action_input} ```python
import pandas as pd
url = "xxxx"
data = pd.read_csv(url)
```
{response} The code succeeded without any outputs.
Let us begin from here!
"""
IPYTHON_INTERPRETER_DESCRIPTION = '''\
@ -69,9 +83,6 @@ models = [
),
]
for dataset in datasets:
# Evaluate on every assistant response
dataset['infer_cfg']['inferencer']['infer_mode'] = 'every'
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000),

View File

@ -1,56 +0,0 @@
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.models.lagent import LagentAgent
from lagent import PythonInterpreter, ReAct
from lagent.agents.react import ReActProtocol
system_prompt = """You are a helpful assistant which use tools to solve mathematical reasoning questions. The code must be a function, and the function name must be 'solution'. For mathematics, please use code tool to calculate. The example format is as follows:
```
def solution():
variable_names_with_real_meaning = func(variable)
return variable_names_with_real_meaning
```"""
protocol = dict(
type=ReActProtocol,
action=dict(role="ACTION", begin="Tool:", end="\n"),
action_input=dict(role="ARGS", begin="Tool Input:", end="\n"),
finish=dict(role="FINISH", begin="FinalAnswer:", end="\n"),
call_protocol=system_prompt,
)
with read_base():
from .datasets.MathBench.mathbench_code_gen_568903 import mathbench_code_datasets as datasets
from .summarizers.mathbench import summarizer
models = [
dict(
abbr='gpt-3.5-react',
type=LagentAgent,
agent_type=ReAct,
max_turn=3,
llm=dict(
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
),
actions=[
dict(type=PythonInterpreter),
],
protocol=protocol,
batch_size=1,
),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000),
runner=dict(
type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)

View File

@ -0,0 +1,43 @@
from mmengine.config import read_base
with read_base():
from .models.qwen.hf_qwen_7b import models as hf_qwen_7b_base_models
from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat_models
from .datasets.ceval.ceval_ppl_578f8d import ceval_datasets as base_ceval_datasets
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets as chat_ceval_datasets
from .internal.clusters.slurm import infer, eval
# from .clusters.slurm import infer_split as infer, eval
# from .clusters.slurm import infer_size as infer, eval
# from .clusters.slurm import infer_size_split as infer, eval
base_ceval_datasets = base_ceval_datasets[:1]
chat_ceval_datasets = chat_ceval_datasets[-1:]
# If you do not want to run all the combinations of models and datasets, you
# can specify the combinations you want to run here. This is useful when you
# deliberately want to skip some subset of the combinations.
# Models and datasets in different combinations are recommended to be disjoint
# (different `abbr` in model & dataset configs), as we haven't tested this case
# thoroughly.
model_dataset_combinations = [
dict(models=hf_qwen_7b_base_models, datasets=base_ceval_datasets),
dict(models=hf_qwen_7b_chat_models, datasets=chat_ceval_datasets),
# dict(models=[model_cfg1, ...], datasets=[dataset_cfg1, ...]),
]
# The union of the models and datasets in model_dataset_combinations should be
# stored in the `models` and `datasets` variables below. Otherwise, modules
# like the summarizer will miss some information.
models = [*hf_qwen_7b_base_models, *hf_qwen_7b_chat_models]
datasets = [*base_ceval_datasets, *chat_ceval_datasets]
work_dir = './outputs/default/mdcomb/'
"""
dataset version metric mode qwen-7b-hf qwen-7b-chat-hf
---------------------- --------- -------- ------ ------------ -----------------
ceval-computer_network 9b9417 accuracy ppl 52.63 -
ceval-physician 6e277d accuracy gen - 59.18
"""

View File

@ -29,5 +29,6 @@ models = [
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=2, num_procs=1),
end_str='<eoa>',
)
]

View File

@ -29,5 +29,6 @@ models = [
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
)
]

View File

@ -29,5 +29,6 @@ models = [
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
)
]

View File

@ -22,12 +22,14 @@ models = [
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,),
use_fast=False,
),
pad_token_id=151643,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>',
)
]

View File

@ -22,12 +22,14 @@ models = [
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,),
use_fast=False,
),
pad_token_id=151643,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>',
)
]

View File

@ -0,0 +1,4 @@
_cibench = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
_cibench = ['cibench_' + i for i in _cibench]
cibench_summary_groups = [{'name': 'cibench', 'subsets': _cibench}]

View File

@ -0,0 +1,75 @@
mathbench_summary_groups = [
{
'name': 'mathbench-college',
'subsets': [
['mathbench-college-single_choice_cn', 'acc_1'],
['mathbench-college-cloze_en', 'accuracy'],
]
},
{
'name': 'mathbench-high',
'subsets': [
['mathbench-high-single_choice_cn', 'acc_1'],
['mathbench-high-single_choice_en', 'acc_1'],
]
},
{
'name': 'mathbench-middle',
'subsets': [
['mathbench-middle-single_choice_cn', 'acc_1'],
]
},
{
'name': 'mathbench-primary',
'subsets': [
['mathbench-primary-cloze_cn', 'accuracy'],
]
},
{
'name': 'mathbench',
'subsets': [
'mathbench-college',
'mathbench-high',
'mathbench-middle',
'mathbench-primary',
],
},
{
'name': 'mathbench-college-circular',
'subsets': [
['mathbench-college-single_choice_cn', 'perf_4'],
]
},
{
'name': 'mathbench-high-circular',
'subsets': [
['mathbench-high-single_choice_cn', 'perf_4'],
['mathbench-high-single_choice_en', 'perf_4'],
]
},
{
'name': 'mathbench-middle-circular',
'subsets': [
['mathbench-middle-single_choice_cn', 'perf_4'],
]
},
{
'name': 'mathbench-circular',
'subsets': [
'mathbench-college-circular',
'mathbench-high-circular',
'mathbench-middle-circular',
],
},
{
'name': 'mathbench-circular-and-cloze',
'subsets': [
'mathbench-high-circular',
'mathbench-middle-circular',
'mathbench-circular',
'mathbench-college-cloze_en',
'mathbench-primary-cloze_cn',
],
}
]

View File

@ -0,0 +1,28 @@
summarizer = dict(
dataset_abbrs=[
'######## GSM8K-Agent Accuracy ########', # category
['gsm8k-agent', 'follow_acc'],
['gsm8k-agent', 'reasoning_acc'],
['gsm8k-agent', 'code_acc'],
['gsm8k-agent', 'action_pct'],
'######## MATH-Agent Accuracy ########', # category
['math-agent', 'follow_acc'],
['math-agent', 'reasoning_acc'],
['math-agent', 'code_acc'],
['math-agent', 'action_pct'],
'######## MathBench-Agent Accuracy ########', # category
['mathbench-college-single_choice_cn-agent', 'acc_1'],
['mathbench-college-cloze_en-agent', 'accuracy'],
['mathbench-high-single_choice_cn-agent', 'acc_1'],
['mathbench-high-single_choice_en-agent', 'acc_1'],
['mathbench-middle-single_choice_cn-agent', 'acc_1'],
['mathbench-primary-cloze_cn-agent', 'accuracy'],
'######## MathBench-Agent CircularEval ########', # category
['mathbench-college-single_choice_cn-agent', 'perf_4'],
['mathbench-high-single_choice_cn-agent', 'perf_4'],
['mathbench-high-single_choice_en-agent', 'perf_4'],
['mathbench-middle-single_choice_cn-agent', 'perf_4'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], [])
)
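The `summary_groups` line above concatenates every `*_summary_groups` list that is in scope, such as the cibench and mathbench groups added in the files above; a small standalone sketch of the pattern (the group contents are made up):
```python
# Standalone illustration of the locals()-collection pattern used above.
cibench_summary_groups = [{'name': 'cibench', 'subsets': ['cibench_Pandas']}]
mathbench_summary_groups = [{'name': 'mathbench', 'subsets': ['mathbench-college']}]

summary_groups = sum(
    [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
print(len(summary_groups))  # 2: both lists are flattened into one
```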

View File

@ -2,13 +2,15 @@ import json
import os
import os.path as osp
import re
import subprocess
from collections import defaultdict
from typing import List, Optional
import numpy as np
from datasets import Dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from .base import BaseDataset
@ -18,16 +20,29 @@ def load_experiment(file: str) -> dict:
with open(file, 'r') as f:
notebook = json.load(f)
example = notebook['cells']
metadata = notebook['metadata']
modules = metadata.get('modules', [])
if modules:
# the two annotation lists should have the same length
assert len(modules) == len(metadata.get('step_types'))
# reformat annotations
modules = [[_m.strip() for _m in _modules.split('&')]
for _modules in modules]
questions = []
source_codes = []
outputs = []
tags = []
for cell in example:
if cell['cell_type'] == 'markdown':
text = ''.join(cell['source'])
text = ''.join(cell['source']).strip()
if modules:
_modules = modules.pop(0)
text += f"Please use {' and '.join(_modules)} modules."
text = text.strip() + '\n'
# append the formatted text
questions.append(text)
elif cell['cell_type'] == 'code':
source_codes.append(''.join(cell['source']))
if cell['outputs'] and 'data' in cell['outputs'][-1]:
if 'image/png' in cell['outputs'][-1]['data']:
# skip vis temporarily due to lack of evaluation
@ -39,15 +54,18 @@ def load_experiment(file: str) -> dict:
outputs.append(''.join(
cell['outputs'][-1]['data']['text/plain']))
else:
tags.append('executable')
tags.append('exec')
outputs.append(None)
return dict(
experiment=file,
questions=sum(([
dict(role='user', content=question),
dict(role='assistant', content=output)
] for question, output in zip(questions, outputs)), []),
references=dict(outputs=outputs, tags=tags, experiment=file),
dict(role='assistant', content=source_code)
] for question, source_code in zip(questions, source_codes)), []),
references=dict(outputs=outputs,
tags=tags,
metadata=metadata,
experiment=file),
)
@ -58,6 +76,7 @@ class CIBenchDataset(BaseDataset):
@staticmethod
def load(path: str):
"""Load whole dataset."""
assert os.path.exists(path), f'Path {path} does not exist.'
data_list = []
for cwd, dirs, files in os.walk(path):
dirs.sort()
@ -79,21 +98,57 @@ class CIBenchEvaluator(BaseEvaluator):
"""Evaluator for CI dataset.
Args:
text_evaluator (optional, dict): The text evaluator for text result
    comparison. Defaults to None, which uses Rouge as the default.
    Please note that an extra key `metric_name` should be set
    to get the exact metric result, such as `rouge1`.
output_dir (optional, str): The directory to save experiment
files in a markdown or notebook format.
with_ipynb (bool): Generate ipynb correspondingly.
Defaults to False.
user_data_dir (str): The directory to load local files.
Defaults to 'ENV', which means use environment variable
`USER_DATA_DIR` to get the data dir.
"""
def __init__(self,
text_evaluator: Optional[dict] = None,
output_dir: Optional[str] = None,
with_ipynb: bool = False,
user_data_dir: str = 'ENV') -> None:
if text_evaluator is None:
from opencompass.openicl.icl_evaluator import RougeEvaluator
self.text_evaluator = ICL_EVALUATORS.build(
dict(type=RougeEvaluator))
self.text_eval_metric = 'rouge1'
else:
self.text_eval_metric = text_evaluator.pop('metric_name')
self.text_evaluator = ICL_EVALUATORS.build(text_evaluator)
# TODO: should use work dir for this task.
self.output_dir = output_dir
self.user_data_dir = self.check_user_data_dir(user_data_dir)
self.with_ipynb = with_ipynb
self.TAG_MAPPING = {
'exec': ('executable', self.valid_step),
'general': ('general_correct', self.correct_step),
'num': ('numeric_correct', self.correct_step),
'text': ('text_score', self.text_step),
'vis': ('vis_sim', self.vis_similarity_step),
}
def check_user_data_dir(self, user_data_dir):
if user_data_dir == 'ENV':
user_data_dir = os.environ.get('USER_DATA_DIR', '')
self.user_data_dir = user_data_dir
user_data_dir = user_data_dir.rstrip('/')
basename = osp.basename(user_data_dir)
if basename and basename != 'data':
user_data_dir = osp.join(user_data_dir, 'data')
assert osp.exists(user_data_dir), \
f'a subfolder named `data` should exist under {user_data_dir}.'
elif basename:
assert osp.exists(user_data_dir), \
f'{user_data_dir} does not exist.'
return user_data_dir
@staticmethod
def valid_step(step):
@ -126,6 +181,24 @@ class CIBenchEvaluator(BaseEvaluator):
# Fall back to False
return False
def text_step(self, step, target):
"""Whether the step output is correct."""
# find the latest IPythonInterpreter action to determine correctness
for action in step[::-1]:
if action['type'] == 'IPythonInterpreter':
if action['result']:
try:
pred = action['result']['text']
match = re.search('```\n(.*?)\n```', pred, re.DOTALL)
if match:
out = match.group(1)
score = self.text_evaluator.score([out], [target])
return score[self.text_eval_metric] / 100
except Exception:
return False
# Fall back to False
return False
@staticmethod
def vis_similarity_step(step, target):
"""Whether the step output image has the same structure similarity with
@ -174,6 +247,7 @@ class CIBenchEvaluator(BaseEvaluator):
'the conversion processes.')
check_jupytext()
p_list = []
from opencompass.lagent.actions.ipython_interpreter import extract_code
for idx, (example_origin_prompt,
example_steps) in enumerate(zip(origin_prompt, steps)):
@ -198,20 +272,25 @@ class CIBenchEvaluator(BaseEvaluator):
f.writelines(markdown_lines)
# TODO: be careful with this; the result might differ from the infer
# process, please check carefully
# convert markdown to ipynb and execute with error tolerance
# subprocess.Popen(
# "jupytext --to ipynb --pipe-fmt ipynb "
# "--pipe 'jupyter nbconvert --to ipynb --execute "
# f"--allow-errors --stdin --stdout' {md_file}",
# shell=True)
if self.with_ipynb:
p = subprocess.Popen(
'jupytext --to ipynb --pipe-fmt ipynb '
"--pipe 'jupyter nbconvert --to ipynb --execute "
f"--allow-errors --stdin --stdout' {md_file}",
shell=True)
p_list.append(p)
# TODO: async wait
for p in p_list:
p.wait()
def set_data_dir(self, work_dir):
    """Set the work directory and link data files for saving notebook results."""
if self.user_data_dir:
if self.user_data_dir.endswith('/'):
basename = osp.basename(osp.split(self.user_data_dir)[0])
else:
basename = osp.basename(self.user_data_dir)
basename = osp.basename(self.user_data_dir)
if not osp.exists(osp.join(self.output_dir, basename)):
os.symlink(self.user_data_dir,
osp.join(self.output_dir, basename))
@ -221,10 +300,54 @@ class CIBenchEvaluator(BaseEvaluator):
"""Change work directory and keep the symlink."""
os.chdir(work_dir)
def single_exp(self, gold, steps):
tags = gold['tags']
outputs = gold['outputs']
metadata = gold['metadata']
hard_tags = metadata.get('step_types', [])
if hard_tags:
tags = hard_tags
# executable: exec succeed
# general_correct: general correct
# numeric_correct: numerical correct
# text_score: text score
# vis_sim: visual similarity
result = defaultdict(list)
for tag, step, output in zip(tags, steps, outputs):
# check whether this step is valid
result['executable'].append(self.valid_step(step))
if tag != 'exec':
key, func = self.TAG_MAPPING[tag]
result[key].append(func(step, output))
# add missing metric keys as empty lists for easier analysis
if hard_tags:
check_tags = ['exec', 'num', 'text', 'vis']
else:
check_tags = ['exec', 'general', 'vis']
for tag in check_tags:
key = self.TAG_MAPPING[tag][0]
if key not in result:
result[key] = []
return result
def get_output_dir(self):
"""Get output dir from eval task.
Notice: output dir should be in format xxx/data.
All the needed files should be
"""
# hard hack for get output dir from eval task
if hasattr(self, '_out_dir') and self.output_dir is None:
self.output_dir = self._out_dir
def score(self, predictions: List, references: List, steps: List,
origin_prompt: List):
"""Calculate accuracy."""
cwd = os.getcwd()
self.get_output_dir()
if self.output_dir:
if not osp.exists(self.output_dir):
os.makedirs(self.output_dir)
@ -232,56 +355,20 @@ class CIBenchEvaluator(BaseEvaluator):
self.save_results(origin_prompt, steps)
self.unset_data_dir(cwd)
num_cells_list = []
num_general_list = []
passed_list = []
correct_list = []
vis_list = []
total_results = defaultdict(float)
total_scores = defaultdict(float)
total_nums = defaultdict(int)
for gold, single_steps in zip(references, steps):
tags = gold['tags']
outputs = gold['outputs']
num_cells = len(tags)
num_general = sum([tag == 'general' for tag in tags])
result = self.single_exp(gold, single_steps)
passed = sum([self.valid_step(step) for step in single_steps])
correct = 0
vis_sim = []
for tag, step, output in zip(tags, single_steps, outputs):
if tag == 'general':
correct += self.correct_step(step, output)
elif tag == 'vis':
vis_sim.append(self.vis_similarity_step(step, output))
for k, v in result.items():
total_scores[k] += sum(v)
total_nums[k] += len(v)
num_cells_list.append(num_cells)
num_general_list.append(num_general)
passed_list.append(passed)
correct_list.append(correct)
if vis_sim:
vis_list.append(sum(vis_sim) / len(vis_sim))
for k, v in total_scores.items():
if total_nums[k] > 0:
total_results[k] = total_scores[k] / total_nums[k] * 100
else:
vis_list.append(-1)
total_results[k] = -1
if len([v for v in vis_list if v >= 0]) > 0:
visualize_similarity = sum([v for v in vis_list if v >= 0]) / len(
[v for v in vis_list if v >= 0])
else:
# not valid
visualize_similarity = -1
if sum(num_general_list) > 0:
general_accuracy = sum(correct_list) / sum(num_general_list)
else:
# not valid
general_accuracy = -1
result = dict(
executable_rate=sum(passed_list) / sum(num_cells_list) * 100,
general_accuracy=general_accuracy * 100,
visualize_similarity=visualize_similarity * 100,
num_cells_list=num_cells_list,
num_general_list=num_general_list,
passed_list=passed_list,
correct_list=correct_list,
vis_list=vis_list,
)
return result
return total_results
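The rewritten `score` method above averages every tag-specific list collected by `single_exp`; a toy, self-contained sketch of that aggregation (the per-experiment results are made up):
```python
# Toy illustration of the aggregation in CIBenchEvaluator.score (made-up data).
from collections import defaultdict

per_experiment = [
    {'executable': [True, True, False], 'numeric_correct': [True]},
    {'executable': [True], 'text_score': [0.42]},
]
total_scores, total_nums = defaultdict(float), defaultdict(int)
total_results = defaultdict(float)
for result in per_experiment:
    for k, v in result.items():
        total_scores[k] += sum(v)
        total_nums[k] += len(v)
for k, v in total_scores.items():
    total_results[k] = v / total_nums[k] * 100 if total_nums[k] > 0 else -1
print(dict(total_results))
# {'executable': 75.0, 'numeric_correct': 100.0, 'text_score': 42.0}
```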

View File

@ -16,6 +16,8 @@ class cmnliDataset(BaseDataset):
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
if line['label'] == '-':
continue
data.append(line)
return Dataset.from_list(data)

View File

@ -143,6 +143,17 @@ def ds1000_postprocess(text: str) -> str:
return text
@TEXT_POSTPROCESSORS.register_module('ds1000_completion')
def ds1000_completion_postprocess(text: str) -> str:
text += '</code>'
match = re.search('(.*?)</code>', text, re.DOTALL)
if match:
text = match.group(1)
return text
@TEXT_POSTPROCESSORS.register_module('ds1000_matplotlib')
def ds1000_matplotlib_postprocess(text: str) -> str:
text = ds1000_postprocess(text)
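For reference, a tiny usage sketch of the completion postprocessor added above, with made-up prediction strings:
```python
# The postprocessor keeps everything up to the first </code> (appending one
# first, so inputs without the tag pass through unchanged).
print(ds1000_completion_postprocess("df = df.dropna()\n</code>\nBEGIN SOLUTION"))
# -> "df = df.dropna()\n"
print(ds1000_completion_postprocess("result = data.mean()"))
# -> "result = data.mean()"
```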

View File

@ -142,6 +142,6 @@ class Gsm8kAgentEvaluator(BaseEvaluator):
reasoning_acc=100 *
(reasoning_scope + final_scope + row_reasoning_scope) / total,
code_acc=100 * (code_scope + final_scope) / total,
action_acc=100 * (action_scope + final_scope) / total,
action_pct=100 * (action_scope + final_scope) / total,
)
return result

View File

@ -25,7 +25,7 @@ class WikiBenchDataset(BaseDataset):
circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC']
data = []
with open(path, 'r') as infile:
with open(path, 'r', encoding='utf-8') as infile:
for id, line in enumerate(infile):
entry = json.loads(line)
if 'cloze' in name:

View File

@ -20,14 +20,14 @@ class winograndeDataset(BaseDataset):
for line in f:
line = json.loads(line)
prompt = line['sentence']
dataset_list.append({
'opt1':
prompt.replace('_', line['option1']),
'opt2':
prompt.replace('_', line['option2']),
'answer':
line['answer']
})
continue_prompt = prompt.split('_')
data_item = {
'opt1': prompt.replace('_', line['option1']),
'opt2': prompt.replace('_', line['option2']),
'answer': line['answer'],
'cont': continue_prompt[1]
}
dataset_list.append(data_item)
dataset_list = Dataset.from_list(dataset_list)
return dataset_list
@ -46,13 +46,11 @@ class winograndeDataset_V2(BaseDataset):
prompt = line['sentence']
answer = line['answer']
answer = ' AB'[int(answer)] if answer != '' else 'NULL'
dataset_list.append({
'opt1':
prompt.replace('_', line['option1']),
'opt2':
prompt.replace('_', line['option2']),
'answer':
answer
})
data_item = {
'opt1': prompt.replace('_', line['option1']),
'opt2': prompt.replace('_', line['option2']),
'answer': answer,
}
dataset_list.append(data_item)
dataset_list = Dataset.from_list(dataset_list)
return dataset_list

View File

@ -47,6 +47,10 @@ class IPythonInterpreter(BaseAction):
it is disabled. Defaults to None.
timeout (int): Upper bound of waiting time for Python script execution.
Defaults to 20.
trim_output (int, optional): Maximum number of characters kept in the
    ipython output. If None, no trimming is performed.
    TODO: Note that this is a character limit, not a token length; more
    trim strategies might be added later. Defaults to 1024.
user_data_dir (str): Specified the user data directory for files
loading. If set to `ENV`, use `USER_DATA_DIR` environment variable.
Defaults to `ENV`.
@ -60,6 +64,7 @@ class IPythonInterpreter(BaseAction):
enable: bool = True,
disable_description: Optional[str] = None,
timeout: int = 20,
trim_output: Optional[int] = 1024,
user_data_dir: str = 'ENV') -> None:
super().__init__(description, name, enable, disable_description)
@ -68,10 +73,11 @@ class IPythonInterpreter(BaseAction):
user_data_dir = os.environ.get('USER_DATA_DIR', '')
if user_data_dir:
user_data_dir = os.path.dirname(user_data_dir)
# user_data_dir = os.path.dirname(user_data_dir)
user_data_dir = f"import os\nos.chdir('{user_data_dir}')"
self.user_data_dir = user_data_dir
self._initialized = False
self.trim_output = trim_output
if not os.path.exists(WORK_DIR):
os.mkdir(WORK_DIR)
@ -178,6 +184,12 @@ class IPythonInterpreter(BaseAction):
if image:
result += f'\n\n{image}'
if finished:
# in case output text too long
# might need better design later
if self.trim_output and len(result) > self.trim_output:
ellip = '......'
half_len = int((self.trim_output - len(ellip)) / 2)
result = result[:half_len] + ellip + result[-half_len:]
return succeed, result
try:
@ -204,13 +216,20 @@ class IPythonInterpreter(BaseAction):
command: str,
timeout: Optional[int] = None) -> ActionReturn:
tool_return = ActionReturn(url=None, args=None, type=self.name)
tool_return.args = dict(text=command)
succeed, result = self._call(command, timeout)
if succeed:
tool_return.result = dict(text=result)
tool_return.state = ActionStatusCode.SUCCESS
extracted_command = extract_code(command)
tool_return.args = dict(text=command, extract_code=extracted_command)
if extracted_command:
succeed, result = self._call(extracted_command, timeout)
if succeed:
if not result:
result = 'The code succeeded without any outputs.'
tool_return.result = dict(text=result)
tool_return.state = ActionStatusCode.SUCCESS
else:
tool_return.errmsg = repr(result)
tool_return.state = ActionStatusCode.API_ERROR
else:
tool_return.errmsg = repr(result)
tool_return.errmsg = 'The input code is empty. Please follow the format.' # noqa
tool_return.state = ActionStatusCode.API_ERROR
return tool_return
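A standalone sketch of the output-trimming rule added earlier in this file, with a small `trim_output` chosen only for illustration:
```python
# Keeps the head and tail of an over-long output, with an ellipsis in between.
result = "".join(str(i % 10) for i in range(100))
trim_output = 20
ellip = '......'
half_len = int((trim_output - len(ellip)) / 2)
trimmed = result[:half_len] + ellip + result[-half_len:]
print(trimmed)       # 0123456......3456789
print(len(trimmed))  # 20
```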

View File

@ -115,6 +115,20 @@ class BaseModel:
inputs = self.parse_template(templates, mode='ppl')
return self.get_ppl(inputs, mask_length)
def get_loglikelihood_from_template(self,
                                    templates: List[PromptType],
                                    conts: List[str],
                                    mask_length=None):
    """Get loglikelihood scores given a list of templates.
    Args:
        templates (List[PromptType]): A list of templates.
        conts (List[str]): A list of continuation strings.
        mask_length (List[int]): A list of mask lengths. If provided, the
            loglikelihood will be calculated only on the unmasked tokens.
    """
inputs = self.parse_template(templates, mode='ppl')
return self.get_loglikelihood(inputs, conts, mask_length)
def generate_from_template(self, templates: List[PromptType],
max_out_len: int, **kwargs):
"""Generate completion from a list of templates.

View File

@ -1,9 +1,11 @@
import re
import sys
import threading
import time
import warnings
from abc import abstractmethod
from copy import deepcopy
from queue import Queue
from time import sleep
from typing import Dict, List, Optional, Tuple, Union
@ -37,6 +39,7 @@ class BaseAPIModel(BaseModel):
def __init__(self,
path: str,
query_per_second: int = 1,
rpm_verbose: bool = False,
retry: int = 2,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
@ -46,7 +49,7 @@ class BaseAPIModel(BaseModel):
self.meta_template = meta_template
self.retry = retry
self.query_per_second = query_per_second
self.token_bucket = TokenBucket(query_per_second)
self.token_bucket = TokenBucket(query_per_second, rpm_verbose)
self.template_parser = APITemplateParser(meta_template)
self.logger = get_logger()
self.generation_kwargs = generation_kwargs
@ -422,10 +425,13 @@ class TokenBucket:
query_per_second (float): The rate of the token bucket.
"""
def __init__(self, rate):
def __init__(self, rate, verbose=False):
self._rate = rate
self._tokens = threading.Semaphore(0)
self.started = False
self._request_queue = Queue()
self.logger = get_logger()
self.verbose = verbose
def _add_tokens(self):
"""Add tokens to the bucket."""
@ -440,3 +446,12 @@ class TokenBucket:
self.started = True
threading.Thread(target=self._add_tokens, daemon=True).start()
self._tokens.acquire()
if self.verbose:
cur_time = time.time()
while not self._request_queue.empty():
if cur_time - self._request_queue.queue[0] > 60:
self._request_queue.get()
else:
break
self._request_queue.put(cur_time)
self.logger.info(f'Current RPM {self._request_queue.qsize()}.')

View File

@ -3,6 +3,7 @@ from typing import Dict, List, Optional, Union
import numpy as np
import torch
import transformers
from opencompass.models.base import BaseModel
from opencompass.models.base_api import APITemplateParser
@ -13,6 +14,33 @@ from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str]
class MultiTokenEOSCriteria(transformers.StoppingCriteria):
"""Criteria to stop on the specified multi-token sequence."""
def __init__(
self,
sequence: str,
tokenizer: transformers.PreTrainedTokenizer,
batch_size: int,
):
self.done_tracker = [False] * batch_size
self.sequence = sequence
self.sequence_ids = tokenizer.encode(sequence,
add_special_tokens=False)
self.sequence_id_len = len(self.sequence_ids)
self.tokenizer = tokenizer
def __call__(self, input_ids, scores, **kwargs) -> bool:
# compare the last len(stop) tokens
lookback_ids_batch = input_ids[:, -self.sequence_id_len:]
lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
for i, done in enumerate(self.done_tracker):
if done:
continue
self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
return False not in self.done_tracker
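A minimal usage sketch of the stopping-criteria class above; the checkpoint name is a placeholder and not part of this commit:
```python
# Hypothetical usage; "gpt2" is only a placeholder checkpoint.
import transformers

tok = transformers.AutoTokenizer.from_pretrained("gpt2")
lm = transformers.AutoModelForCausalLM.from_pretrained("gpt2")

stop_words = ["Question:", tok.eos_token]
criteria = transformers.StoppingCriteriaList(
    [MultiTokenEOSCriteria(s, tok, batch_size=1) for s in stop_words])

ids = tok("Answer: 2 + 2 =", return_tensors="pt").input_ids
out = lm.generate(input_ids=ids, max_new_tokens=64, stopping_criteria=criteria)
print(tok.decode(out[0]))  # generation halts once "Question:" (or EOS) is produced
```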
@MODELS.register_module()
class HuggingFace(BaseModel):
"""Model wrapper around HuggingFace models.
@ -194,7 +222,10 @@ class HuggingFace(BaseModel):
self.model.config.eos_token_id = 2
self.model.config.pad_token_id = self.tokenizer.pad_token_id
def generate(self, inputs: List[str], max_out_len: int,
def generate(self,
inputs: List[str],
max_out_len: int,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]:
"""Generate results given a list of inputs.
@ -212,9 +243,12 @@ class HuggingFace(BaseModel):
max_out_len=max_out_len,
**generation_kwargs)
else:
return sum((self._single_generate(
inputs=[input_], max_out_len=max_out_len, **generation_kwargs)
for input_ in inputs), [])
return sum(
(self._single_generate(inputs=[input_],
max_out_len=max_out_len,
stopping_criteria=stopping_criteria,
**generation_kwargs)
for input_ in inputs), [])
def _batch_generate(self, inputs: List[str], max_out_len: int,
**kwargs) -> List[str]:
@ -275,7 +309,10 @@ class HuggingFace(BaseModel):
decodeds = [token.split(self.end_str)[0] for token in decodeds]
return decodeds
def _single_generate(self, inputs: List[str], max_out_len: int,
def _single_generate(self,
inputs: List[str],
max_out_len: int,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]:
"""Support for single prompt inference.
@ -319,6 +356,19 @@ class HuggingFace(BaseModel):
max_length=self.max_seq_len -
max_out_len)['input_ids']
input_ids = torch.tensor(input_ids, device=self.model.device)
if stopping_criteria:
# Construct huggingface stopping criteria
stopping_criteria = stopping_criteria + [self.tokenizer.eos_token]
stopping_criteria = transformers.StoppingCriteriaList([
*[
MultiTokenEOSCriteria(sequence, self.tokenizer,
input_ids.shape[0])
for sequence in stopping_criteria
],
])
kwargs['stopping_criteria'] = stopping_criteria
# To accommodate the PeftModel, parameters should be passed in
# key-value format for generate.
outputs = self.model.generate(input_ids=input_ids,
@ -434,6 +484,71 @@ class HuggingFace(BaseModel):
ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
return ce_loss
def get_loglikelihood(
self,
inputs: List[str],
conts: List[str],
mask_length: Optional[List[int]] = None) -> List[float]:
"""Get loglikelihood scores given a list of inputs.
Args:
inputs (List[str]): A list of strings.
conts (List[str]): A list of continuation strings (the slices
    after the space). mask_length is NOT supported yet.
mask_length (Optional[List[int]]): A list of mask lengths. If
    provided, the perplexity scores will be calculated with the
    first mask_length[i] tokens masked out. It's okay to skip
    its implementation if advanced features in PPLInferencer are
    not needed.
Returns:
List[float]: A list of loglikelihood scores.
"""
assert mask_length is None, 'Not support mask_length yet.'
if self.batch_padding and len(inputs) > 1:
raise NotImplementedError('Batch padding is not supported yet.')
# assert self.tokenizer.pad_token
# return self._get_loglikelihood(inputs, mask_length=mask_length)
return np.array([
self._get_loglikelihood(inputs=inputs[idx], conts=conts[idx])
for idx in range(len(inputs))
])
def _get_loglikelihood(self, inputs: str, conts: str) -> float:
"""Get loglikelihood scores given input string and continuation string.
Args:
inputs (str): The input string.
conts (str): The continuation string (the slice after the space).
Returns:
float: loglikelihood scores.
"""
input_ids = self.tokenizer(inputs,
padding=False,
truncation=True,
max_length=self.max_seq_len)['input_ids']
input_ids = torch.tensor(input_ids, device=self.model.device)
context_ids = self.tokenizer(inputs.replace(conts, ''),
padding=False,
truncation=True,
max_length=self.max_seq_len)['input_ids']
cont_ids = input_ids[len(context_ids):]
output = self.model(input_ids.unsqueeze(0))
logits = output['logits'][:, :-1]
logits = torch.nn.functional.log_softmax(logits, dim=-1)
contlen = cont_ids.shape[0]
logits = logits[:, -contlen:, :]
# Reducing the dimension will lead to a wrong outcome
logits_gather = torch.gather(
logits, 2,
cont_ids.unsqueeze(0).unsqueeze(-1)) # [1, seq]
# Answer: sum the likelihood of each token in continuation
answer = float(logits_gather.detach().cpu().sum())
return answer
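A small numeric sketch of the gather-and-sum step used in `_get_loglikelihood` above, with random logits for illustration:
```python
# Purely illustrative: score two continuation tokens under random logits.
# (The real method also shifts the logits by one position to align
# predictions with targets before slicing the continuation.)
import torch

logits = torch.log_softmax(torch.randn(1, 5, 32), dim=-1)  # [batch, seq, vocab]
cont_ids = torch.tensor([3, 7])                 # ids of the continuation tokens
cont_logits = logits[:, -len(cont_ids):, :]
gathered = torch.gather(cont_logits, 2, cont_ids.view(1, -1, 1))  # [1, 2, 1]
loglikelihood = float(gathered.sum())           # sum of per-token log-probs
print(loglikelihood)
```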
def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized strings.
@ -554,8 +669,8 @@ class HuggingFaceChatGLM3(HuggingFace):
'role': {
'HUMAN': 'user',
'BOT': 'assistant',
'SYSTEM': 'system'
}[item['role']]
'SYSTEM': 'system',
}[item['role'].upper()]
}
history.append(msg)
user_content = history[-1]['content']
@ -578,6 +693,9 @@ class HuggingFaceChatGLM3(HuggingFace):
response, history = self.model.chat(self.tokenizer,
user_content,
history=history)
# the response may sometimes be a dict
if isinstance(response, dict):
response = response.get('content', '')
responses.append(response)
except Exception:
responses.append('')

View File

@ -52,7 +52,7 @@ class LagentAgent:
def chat(self,
user_input: str,
history: List[dict] = None) -> Tuple[str, List[dict]]:
history: List[dict] = None) -> Tuple[str, List[dict], List[dict]]:
"""Chat with agent."""
if history:
self.agent._session_history = history
@ -60,6 +60,7 @@ class LagentAgent:
from lagent.schema import ActionReturn, AgentReturn
generation: AgentReturn = self.agent.chat(user_input)
inner_steps = generation.inner_steps
answer = generation.response
steps = []
@ -76,7 +77,7 @@ class LagentAgent:
valid=int(step.valid),
))
return answer, steps
return answer, steps, inner_steps
FORCE_STOP_PROMPT_EN = (

View File

@ -179,12 +179,14 @@ class Llama2Chat(BaseModel):
dialog = []
for item in input:
msg = {'content': item['prompt']}
if item['role'] == 'HUMAN':
if item['role'].upper() == 'HUMAN':
msg['role'] = 'user'
elif item['role'] == 'BOT':
elif item['role'].upper() == 'BOT':
msg['role'] = 'assistant'
elif item['role'] == 'SYSTEM':
elif item['role'].upper() == 'SYSTEM':
msg['role'] = 'system'
else:
raise ValueError(f'Unknown role: {item["role"]}')
dialog.append(msg)
dialogs.append(dialog)

View File

@ -58,6 +58,7 @@ class OpenAI(BaseAPIModel):
path: str = 'gpt-3.5-turbo',
max_seq_len: int = 4096,
query_per_second: int = 1,
rpm_verbose: bool = False,
retry: int = 2,
key: Union[str, List[str]] = 'ENV',
org: Optional[Union[str, List[str]]] = None,
@ -70,6 +71,7 @@ class OpenAI(BaseAPIModel):
max_seq_len=max_seq_len,
meta_template=meta_template,
query_per_second=query_per_second,
rpm_verbose=rpm_verbose,
retry=retry)
import tiktoken
self.tiktoken = tiktoken
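The new `rpm_verbose` flag is forwarded to the token bucket so the current requests-per-minute count gets logged; a hypothetical model config enabling it (all other values are placeholders):
```python
# Hypothetical config entry; only rpm_verbose is the new field from this commit.
models = [
    dict(
        abbr='gpt-3.5-turbo',
        type=OpenAI,
        path='gpt-3.5-turbo',
        key='ENV',
        query_per_second=1,
        rpm_verbose=True,   # log the current RPM after each acquired token
        max_seq_len=4096,
        batch_size=1,
    ),
]
```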

View File

@ -5,5 +5,6 @@ from .icl_circular_evaluator import CircularEvaluator # noqa
from .icl_em_evaluator import EMEvaluator # noqa
from .icl_hf_evaluator import * # noqa
from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator # noqa
from .icl_misc_evaluator import AveragePPLEvaluator # noqa
from .icl_toxic_evaluator import ToxicEvaluator # noqa
from .lm_evaluator import LMEvaluator # noqa

View File

@ -0,0 +1,11 @@
from opencompass.registry import ICL_EVALUATORS
from .icl_base_evaluator import BaseEvaluator
@ICL_EVALUATORS.register_module()
class AveragePPLEvaluator(BaseEvaluator):
def score(self, ppl):
average_ppl = sum(ppl) / len(ppl)
return {'average_ppl': average_ppl}

View File

@ -4,6 +4,8 @@ from .icl_base_inferencer import BaseInferencer # noqa
from .icl_chat_inferencer import ChatInferencer # noqa
from .icl_clp_inferencer import CLPInferencer # noqa
from .icl_gen_inferencer import GenInferencer # noqa
from .icl_loglikelihood_inferencer import LoglikelihoodInferencer # noqa
from .icl_ppl_inferencer import PPLInferencer # noqa
from .icl_ppl_only_inferencer import PPLOnlyInferencer # noqa
from .icl_sc_inferencer import SCInferencer # noqa
from .icl_tot_inferencer import ToTInferencer # noqa

View File

@ -89,7 +89,7 @@ class AgentInferencer(ChatInferencer):
user_idx = assistant_indices[-1] - 1
self.model.set_history(chat[:user_idx])
answer, steps = self.model.chat(chat[user_idx]['content'])
answer, steps, _ = self.model.chat(chat[user_idx]['content'])
output_handler.save_results(
origin_prompt=chat[user_idx]['content'],
prediction=answer,
@ -104,10 +104,11 @@ class AgentInferencer(ChatInferencer):
i for i, item in enumerate(chat) if item['role'] == 'assistant'
]
self.model.set_history(chat[:assistant_indices[0] - 1])
history = chat[:assistant_indices[0] - 1]
for i in assistant_indices:
answer, steps = self.model.chat(chat[i - 1]['content'])
answer, steps, inner_steps = self.model.chat(
chat[i - 1]['content'], history)
history += inner_steps
output_handler.save_multiround_results(
origin_prompt=chat[i - 1]['content'],
prediction=answer,
@ -125,7 +126,7 @@ class AgentInferencer(ChatInferencer):
for i in assistant_indices:
self.model.set_history(chat[:i - 1])
answer, steps = self.model.chat(chat[i - 1]['content'])
answer, steps, _ = self.model.chat(chat[i - 1]['content'])
output_handler.save_multiround_results(
origin_prompt=chat[i - 1]['content'],
prediction=answer,

View File

@ -68,11 +68,11 @@ class LMTemplateParser:
prompt = ''
if self.roles:
for dialog in chat:
role_cfg = self.roles.get(dialog['role'])
prompt += role_cfg['begin']
role_cfg = self.roles.get(dialog['role'], {})
prompt += (role_cfg.get('begin') or '')
prompt += (dialog.get('content') or '')
prompt += role_cfg['end']
prompt += self.roles['assistant']['begin']
prompt += (role_cfg.get('end') or '')
prompt += (self.roles['assistant'].get('begin') or '')
else:
# in case the model does not have any meta template
last_sep = ''
@ -227,9 +227,13 @@ class ChatInferencer(BaseInferencer):
'tmp_' + output_json_filename)
if osp.exists(tmp_json_filepath):
# TODO: move resume to output handler
tmp_result_dict = mmengine.load(tmp_json_filepath)
output_handler.results_dict = tmp_result_dict
index = len(tmp_result_dict)
try:
tmp_result_dict = mmengine.load(tmp_json_filepath)
except Exception:
pass
else:
output_handler.results_dict = tmp_result_dict
index = len(tmp_result_dict)
# 4. Wrap prompts with Dataloader
dataloader = self.get_dataloader(chat_list[index:], batch_size=1)

View File

@ -1,5 +1,6 @@
"""Direct Generation Inferencer."""
import inspect
import os
import os.path as osp
from typing import List, Optional
@ -46,6 +47,7 @@ class GenInferencer(BaseInferencer):
self,
model: BaseModel,
max_out_len: int,
stopping_criteria: List[str] = [],
max_seq_len: Optional[int] = None,
batch_size: Optional[int] = 1,
gen_field_replace_token: Optional[str] = '',
@ -64,6 +66,7 @@ class GenInferencer(BaseInferencer):
self.gen_field_replace_token = gen_field_replace_token
self.max_out_len = max_out_len
self.stopping_criteria = stopping_criteria
if self.model.is_api and save_every is None:
save_every = 1
@ -128,10 +131,14 @@ class GenInferencer(BaseInferencer):
entry = datum
golds = [None for _ in range(len(entry))]
# 5-1. Inference with local model
extra_gen_kwargs = {}
sig = inspect.signature(self.model.generate)
if 'stopping_criteria' in sig.parameters:
extra_gen_kwargs['stopping_criteria'] = self.stopping_criteria
with torch.no_grad():
parsed_entries = self.model.parse_template(entry, mode='gen')
results = self.model.generate_from_template(
entry, max_out_len=self.max_out_len)
entry, max_out_len=self.max_out_len, **extra_gen_kwargs)
generated = results
num_return_sequences = getattr(self.model, 'generation_kwargs',
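The inspect.signature check above forwards stopping_criteria only to models whose generate accepts it, so older model wrappers keep working unchanged. A self-contained sketch of that gate (both generate functions below are hypothetical stand-ins):

    import inspect

    def generate_old(prompts, max_out_len):
        return [p + ' ...' for p in prompts]

    def generate_new(prompts, max_out_len, stopping_criteria=[]):
        return [p + ' [stopped]' for p in prompts]

    for generate in (generate_old, generate_new):
        extra_gen_kwargs = {}
        # only pass the kwarg if the callee declares it
        if 'stopping_criteria' in inspect.signature(generate).parameters:
            extra_gen_kwargs['stopping_criteria'] = ['\nQ:']
        print(generate(['2 + 2 ='], max_out_len=16, **extra_gen_kwargs))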

View File

@ -0,0 +1,215 @@
"""PPL Inferencer."""
import os
from typing import List, Optional
import torch
from tqdm import trange
from opencompass.models.base import BaseModel
from opencompass.registry import ICL_INFERENCERS
from ..icl_prompt_template import PromptTemplate
from ..icl_retriever import BaseRetriever
from ..utils import get_logger
from .icl_base_inferencer import BaseInferencer, dump_results_dict
logger = get_logger(__name__)
@ICL_INFERENCERS.register_module()
class LoglikelihoodInferencer(BaseInferencer):
"""Loglikelihood Inferencer class to evaluate by loglikelihood.
Attributes:
model (:obj:`BaseModel`, optional): The module to inference.
max_seq_len (:obj:`int`): Maximum number of tokenized words allowed by
the LM.
batch_size (:obj:`int`, optional): Batch size for the :obj:`DataLoader`
output_json_filepath (:obj:`str`, optional): File path for output
`JSON` file.
output_json_filename (:obj:`str`, optional): File name for output
`JSON` file.
labels (:obj:`List`, optional): A list of labels for all classes.
"""
def __init__(
self,
model: BaseModel,
max_seq_len: Optional[int] = None,
batch_size: Optional[int] = 1,
output_json_filepath: Optional[str] = './icl_inference_output',
output_json_filename: Optional[str] = 'predictions',
labels: Optional[List] = None,
**kwargs) -> None:
super().__init__(
model=model,
max_seq_len=max_seq_len,
batch_size=batch_size,
output_json_filename=output_json_filename,
output_json_filepath=output_json_filepath,
**kwargs,
)
self.labels = labels
def inference(self,
retriever: BaseRetriever,
ice_template: Optional[PromptTemplate] = None,
prompt_template: Optional[PromptTemplate] = None,
output_json_filepath: Optional[str] = None,
output_json_filename: Optional[str] = None) -> List:
# 1. Preparation for output logs
output_handler = LoglikelihoodInferencerOutputHandler()
sub_predictions = []
ppl = []
ice = []
if output_json_filepath is None:
output_json_filepath = self.output_json_filepath
if output_json_filename is None:
output_json_filename = self.output_json_filename
# 2. Get results of retrieval process
ice_idx_list = retriever.retrieve()
# 3. Get labels of all the classes
if self.labels is None:
labels = retriever.get_labels(ice_template=ice_template,
prompt_template=prompt_template)
else:
labels = self.labels
# 4. Generate in-context examples for testing inputs
for idx in range(len(ice_idx_list)):
ice.append(
retriever.generate_ice(ice_idx_list[idx],
ice_template=ice_template))
output_handler.save_ice(self.model.parse_template(ice, mode='ppl'))
# 5. Calculating loglikelihood for prompts in each label's class
for label in labels:
index = 0
prompt_list = []
sub_ppl_list = []
token_num_list = []
cont_list = []
# 5.1 Generate prompts of current label and truncate
# TODO: Refactor
for idx in range(len(ice_idx_list)):
prompt = retriever.generate_label_prompt(
idx,
ice[idx],
label,
ice_template=ice_template,
prompt_template=prompt_template)
if self.max_seq_len is not None:
prompt_token_num = self.model.get_token_len_from_template(
prompt, mode='ppl')
while len(ice_idx_list[idx]
) > 0 and prompt_token_num > self.max_seq_len:
ice_idx_list[idx] = ice_idx_list[idx][:-1]
ice[idx] = retriever.generate_ice(
ice_idx_list[idx], ice_template=ice_template)
prompt = retriever.generate_label_prompt(
idx,
ice[idx],
label,
ice_template=ice_template,
prompt_template=prompt_template)
prompt_token_num = self.model.get_token_len_from_template( # noqa
prompt, mode='ppl') # noqa
prompt_list.append(prompt)
token_num_list.append(prompt_token_num)
cont_list.append(retriever.test_ds[idx]['cont'])
# 5.2 Get loglikelihood
logger.info(f"Calculating loglikelihood for prompts labeled '{label}'")
for idx in trange(0,
len(prompt_list),
self.batch_size,
disable=not self.is_main_process):
sub_prompt_list = prompt_list[idx:idx + self.batch_size]
sub_cont_list = cont_list[idx:idx + self.batch_size]
with torch.no_grad():
# main modification compared to PPLInferencer: score the
# continuation's loglikelihood instead of the full-prompt PPL
sub_res = self.model.get_loglikelihood_from_template(
sub_prompt_list, sub_cont_list).tolist()
for res, prompt in zip(
sub_res,
self.model.parse_template(sub_prompt_list,
mode='ppl')):
sub_ppl_list.append(res)
ice_str = self.model.parse_template(ice[idx], mode='ppl')
output_handler.save_prompt_and_loglikelihood(
label, prompt.replace(ice_str, ''), prompt, res, index)
index = index + 1
ppl.append(sub_ppl_list)
# 6. Pick the class with the highest loglikelihood as the prediction
ppl = list(zip(*ppl))
for single_ppl in ppl:
sub_predictions.append(labels[single_ppl.index(max(single_ppl))])
output_handler.save_predictions(sub_predictions)
# 7. Fetch gold answers if exist
ds_reader = retriever.dataset_reader
if ds_reader.output_column:
golds = ds_reader.dataset['test'][ds_reader.output_column]
output_handler.save_golds(golds)
# 8. Output
if self.is_main_process:
os.makedirs(output_json_filepath, exist_ok=True)
output_handler.write_to_json(output_json_filepath,
output_json_filename)
return [
sample['prediction']
for sample in output_handler.results_dict.values()
]
class LoglikelihoodInferencerOutputHandler:
results_dict = {}
def __init__(self) -> None:
self.results_dict = {}
def write_to_json(self, save_dir: str, filename: str):
"""Dump the result to a json file."""
dump_results_dict(self.results_dict, os.path.join(save_dir, filename))
def save_ice(self, ice):
for idx, example in enumerate(ice):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
self.results_dict[str(idx)]['in-context examples'] = example
def save_predictions(self, predictions):
for idx, prediction in enumerate(predictions):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
self.results_dict[str(idx)]['prediction'] = prediction
def save_prompt_and_loglikelihood(self, label, input, prompt,
loglikelihood, idx):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
if 'label: ' + str(label) not in self.results_dict[str(idx)].keys():
self.results_dict[str(idx)]['label: ' + str(label)] = {}
self.results_dict[str(idx)]['label: ' +
str(label)]['testing input'] = input
self.results_dict[str(idx)]['label: ' + str(label)]['prompt'] = prompt
self.results_dict[str(idx)][
'label: ' + str(label)]['Loglikelihood'] = loglikelihood
def save_golds(self, golds):
for idx, gold in enumerate(golds):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
self.results_dict[str(idx)]['gold'] = gold
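A worked toy example of the selection in step 6: the per-label score lists are transposed so each sample's scores across labels sit together, and the label with the highest loglikelihood wins (the values below are invented):

    labels = ['A', 'B']
    scores = [
        [-1.2, -4.0, -0.3],  # loglikelihood of each sample under label 'A'
        [-2.5, -1.1, -0.9],  # loglikelihood of each sample under label 'B'
    ]
    predictions = [labels[per_sample.index(max(per_sample))]
                   for per_sample in zip(*scores)]
    print(predictions)  # ['A', 'B', 'A']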

View File

@ -0,0 +1,188 @@
"""PPL Inferencer."""
import os
from typing import List, Optional
import mmengine
import torch
from tqdm import tqdm
from opencompass.models.base import BaseModel
from opencompass.registry import ICL_INFERENCERS
from ..icl_prompt_template import PromptTemplate
from ..icl_retriever import BaseRetriever
from ..utils import get_logger
from .icl_base_inferencer import BaseInferencer, dump_results_dict
logger = get_logger(__name__)
@ICL_INFERENCERS.register_module()
class PPLOnlyInferencer(BaseInferencer):
"""PPLOnlyInferencer class to calculate PPL and PPL only, no choice is
made. This Inferencer is usually used along with AveragePPLEvaluator.
Attributes:
model (:obj:`BaseModel`, optional): The module to inference.
max_seq_len (:obj:`int`): Maximum number of tokenized words allowed by
the LM.
batch_size (:obj:`int`, optional): Batch size for the :obj:`DataLoader`
output_json_filepath (:obj:`str`, optional): File path for output
`JSON` file.
output_json_filename (:obj:`str`, optional): File name for output
`JSON` file.
save_every (:obj:`int`, optional): Save intermediate results every
`save_every` iterations. Defaults to 1.
"""
def __init__(
self,
model: BaseModel,
max_seq_len: Optional[int] = None,
batch_size: Optional[int] = 1,
output_json_filepath: Optional[str] = './icl_inference_output',
output_json_filename: Optional[str] = 'predictions',
save_every: Optional[int] = 1,
**kwargs) -> None:
super().__init__(
model=model,
max_seq_len=max_seq_len,
batch_size=batch_size,
output_json_filename=output_json_filename,
output_json_filepath=output_json_filepath,
**kwargs,
)
self.save_every = save_every
def inference(self,
retriever: BaseRetriever,
ice_template: Optional[PromptTemplate] = None,
prompt_template: Optional[PromptTemplate] = None,
output_json_filepath: Optional[str] = None,
output_json_filename: Optional[str] = None) -> List:
# 1. Preparation for output logs
output_handler = PPLOnlyInferencerOutputHandler()
if output_json_filepath is None:
output_json_filepath = self.output_json_filepath
if output_json_filename is None:
output_json_filename = self.output_json_filename
# 2. Get results of retrieval process
ice_idx_list = retriever.retrieve()
# 3. Generate prompts for testing input
prompt_list = self.get_generation_prompt_list_from_retriever_indices(
ice_idx_list,
retriever,
max_seq_len=self.max_seq_len,
ice_template=ice_template,
prompt_template=prompt_template)
# 3.1 Fetch and zip prompt & gold answer if output column exists
ds_reader = retriever.dataset_reader
assert ds_reader.output_column is None, (
'PPLOnlyInferencer supports `output_column=None` only.')
# Create tmp json file for saving intermediate results and future
# resuming
index = 0
tmp_json_filepath = os.path.join(output_json_filepath,
'tmp_' + output_json_filename)
if os.path.exists(tmp_json_filepath):
# TODO: move resume to output handler
try:
tmp_result_dict = mmengine.load(tmp_json_filepath)
except Exception:
pass
else:
output_handler.results_dict = tmp_result_dict
index = len(tmp_result_dict)
# 4. Wrap prompts with Dataloader
dataloader = self.get_dataloader(prompt_list[index:], self.batch_size)
# 5. Inference for prompts in each batch
logger.info('Starting inference process...')
for datum in tqdm(dataloader, disable=not self.is_main_process):
entry = datum
# 5-1. Inference with local model
with torch.no_grad():
ppls = self.model.get_ppl_from_template(entry).tolist()
parsed_entries = self.model.parse_template(entry, mode='gen')
# 5-3. Save current output
for prompt, ppl in zip(parsed_entries, ppls):
output_handler.save_results(prompt, ppl, index)
index = index + 1
# 5-4. Save intermediate results
if (self.save_every is not None and index % self.save_every == 0
and self.is_main_process):
output_handler.write_to_json(output_json_filepath,
'tmp_' + output_json_filename)
# 6. Output
if self.is_main_process:
os.makedirs(output_json_filepath, exist_ok=True)
output_handler.write_to_json(output_json_filepath,
output_json_filename)
if os.path.exists(tmp_json_filepath):
os.remove(tmp_json_filepath)
return [
sample['ppl'] for sample in output_handler.results_dict.values()
]
def get_generation_prompt_list_from_retriever_indices(
self,
ice_idx_list: List[List[int]],
retriever: BaseRetriever,
max_seq_len: Optional[int] = None,
ice_template: Optional[PromptTemplate] = None,
prompt_template: Optional[PromptTemplate] = None):
prompt_list = []
for idx, ice_idx in enumerate(ice_idx_list):
ice = retriever.generate_ice(ice_idx, ice_template=ice_template)
prompt = retriever.generate_prompt_for_generate_task(
idx,
ice,
ice_template=ice_template,
prompt_template=prompt_template)
if max_seq_len is not None:
prompt_token_num = self.model.get_token_len_from_template(
prompt, mode='gen')
while len(ice_idx) > 0 and prompt_token_num > max_seq_len:
ice_idx = ice_idx[:-1]
ice = retriever.generate_ice(ice_idx,
ice_template=ice_template)
prompt = retriever.generate_prompt_for_generate_task(
idx,
ice,
ice_template=ice_template,
prompt_template=prompt_template)
prompt_token_num = self.model.get_token_len_from_template(
prompt, mode='gen')
prompt_list.append(prompt)
return prompt_list
class PPLOnlyInferencerOutputHandler:
origin_prompt_dict = {}
output_dict = {}
results_dict = {}
def __init__(self) -> None:
self.results_dict = {}
def write_to_json(self, save_dir: str, filename: str):
"""Dump the result to a json file."""
dump_results_dict(self.results_dict, os.path.join(save_dir, filename))
def save_results(self, origin_prompt, ppl, idx):
self.results_dict[str(idx)] = {
'origin_prompt': origin_prompt,
'ppl': ppl,
}
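A minimal, hypothetical config fragment using this inferencer; the dataset fields and import paths are assumptions following the repo's usual config layout, not taken from this commit. Note the hard requirement above that the reader's output_column be None:

    from opencompass.openicl.icl_prompt_template import PromptTemplate
    from opencompass.openicl.icl_retriever import ZeroRetriever
    from opencompass.openicl.icl_inferencer import PPLOnlyInferencer
    from opencompass.openicl.icl_evaluator import AveragePPLEvaluator

    demo_reader_cfg = dict(input_columns=['text'], output_column=None)

    demo_infer_cfg = dict(
        prompt_template=dict(type=PromptTemplate, template='{text}'),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=PPLOnlyInferencer),
    )

    demo_eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator))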

View File

@ -1,10 +1,12 @@
import inspect
from abc import abstractmethod
from copy import deepcopy
from typing import Dict, List, Optional
from mmengine.config import ConfigDict
from opencompass.utils import get_logger, task_abbr_from_cfg
from opencompass.utils import (dataset_abbr_from_cfg, get_logger,
model_abbr_from_cfg, task_abbr_from_cfg)
class BasePartitioner:
@ -54,8 +56,7 @@ class BasePartitioner:
List[Dict]: A list of tasks.
"""
cfg = deepcopy(cfg)
models = cfg['models']
datasets = cfg['datasets']
work_dir = cfg['work_dir']
add_cfg = {}
@ -74,10 +75,11 @@ class BasePartitioner:
self.logger.debug(f'Key {k} not found in config, ignored.')
self.logger.debug(f'Additional config: {add_cfg}')
tasks = self.partition(models,
datasets,
work_dir,
self.out_dir,
model_and_dataset_args = self.parse_model_dataset_args(cfg)
tasks = self.partition(**model_and_dataset_args,
work_dir=work_dir,
out_dir=self.out_dir,
add_cfg=add_cfg)
self.logger.info(f'Partitioned into {len(tasks)} tasks.')
@ -86,6 +88,41 @@ class BasePartitioner:
return tasks
def parse_model_dataset_args(self, cfg: ConfigDict):
models = cfg['models']
datasets = cfg['datasets']
sig = inspect.signature(self.partition)
if 'model_dataset_combinations' in sig.parameters:
combs = cfg.get('model_dataset_combinations', None)
if combs is None:
combs = [{'models': models, 'datasets': datasets}]
else:
# sanity check
model_abbrs = [model_abbr_from_cfg(model) for model in models]
dataset_abbrs = [
dataset_abbr_from_cfg(dataset) for dataset in datasets
]
for comb in combs:
for model in comb['models']:
if model_abbr_from_cfg(model) not in model_abbrs:
raise ValueError(
f'Model {model_abbr_from_cfg(model)} '
'not found in config.')
for dataset in comb['datasets']:
if dataset_abbr_from_cfg(dataset) not in dataset_abbrs:
raise ValueError(
f'Dataset {dataset_abbr_from_cfg(dataset)} '
'not found in config.')
used_kwargs = {'model_dataset_combinations': combs}
else:
if cfg.get('model_dataset_combinations', None) is not None:
self.logger.warning(
'model_dataset_combinations is not supported by '
f'{self.__class__.__name__}. Ignored.')
used_kwargs = {'models': models, 'datasets': datasets}
return used_kwargs
@abstractmethod
def partition(self,
models: List[ConfigDict],
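parse_model_dataset_args above lets a run config pair specific models with specific datasets. A hypothetical fragment with the expected shape (all abbr/type values are invented); every model and dataset referenced in a combination must also appear in the top-level models/datasets lists, otherwise the sanity check raises:

    model_a = dict(abbr='model-a', type='HuggingFaceCausalLM')
    model_b = dict(abbr='model-b', type='HuggingFaceCausalLM')
    dataset_x = dict(abbr='dataset-x', type='DemoDataset')
    dataset_y = dict(abbr='dataset-y', type='DemoDataset')

    models = [model_a, model_b]
    datasets = [dataset_x, dataset_y]
    # model_a runs on both datasets, model_b only on dataset_x
    model_dataset_combinations = [
        dict(models=[model_a], datasets=[dataset_x, dataset_y]),
        dict(models=[model_b], datasets=[dataset_x]),
    ]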

View File

@ -29,8 +29,8 @@ class NaivePartitioner(BasePartitioner):
self.n = n
def partition(self,
models: List[ConfigDict],
datasets: List[ConfigDict],
model_dataset_combinations: List[Dict[str,
List[ConfigDict]]],
work_dir: str,
out_dir: str,
add_cfg: Dict = {}) -> List[Dict]:
@ -48,8 +48,9 @@ class NaivePartitioner(BasePartitioner):
}
Args:
models (List[ConfigDict]): A list of model configs.
datasets (List[ConfigDict]): A list of dataset configs.
model_dataset_combinations (List[Dict]): List of
`{models: [...], datasets: [...]}` dicts. Each dict contains
a list of model configs and a list of dataset configs.
work_dir (str): The work dir for the task.
out_dir (str): The full output path for the task, intended for
Partitioners to check whether the task is finished via the
@ -60,20 +61,21 @@ class NaivePartitioner(BasePartitioner):
"""
tasks = []
for model in models:
chunks = []
for dataset in datasets:
filename = get_infer_output_path(model, dataset, out_dir)
if osp.exists(filename):
continue
chunks.append(dataset)
for comb in model_dataset_combinations:
for model in comb['models']:
chunks = []
for dataset in comb['datasets']:
filename = get_infer_output_path(model, dataset, out_dir)
if osp.exists(filename):
continue
chunks.append(dataset)
for i in range(0, len(chunks), self.n):
task = Config({
'models': [model],
'datasets': [chunks[i:i + self.n]],
'work_dir': work_dir,
**add_cfg
})
tasks.append(task)
for i in range(0, len(chunks), self.n):
task = Config({
'models': [model],
'datasets': [chunks[i:i + self.n]],
'work_dir': work_dir,
**add_cfg
})
tasks.append(task)
return tasks

View File

@ -51,8 +51,8 @@ class SizePartitioner(BasePartitioner):
self.strategy = strategy
def partition(self,
models: List[ConfigDict],
datasets: List[ConfigDict],
model_dataset_combinations: List[Dict[str,
List[ConfigDict]]],
work_dir: str,
out_dir: str,
add_cfg: Dict = {}) -> List[ConfigDict]:
@ -71,8 +71,9 @@ class SizePartitioner(BasePartitioner):
}
Args:
models (List[ConfigDict]): A list of model configs.
datasets (List[ConfigDict]): A list of dataset configs.
model_dataset_combinations (List[Dict]): List of
`{models: [...], datasets: [...]}` dicts. Each dict contains
a list of model configs and a list of dataset configs.
work_dir (str): The work dir for the task.
out_dir (str): The full output path for the task, intended for
Partitioners to check whether the task is finished via the
@ -84,52 +85,54 @@ class SizePartitioner(BasePartitioner):
List[ConfigDict]: A list of tasks.
"""
datasets = sorted(datasets,
key=lambda x: self.get_cost(x),
reverse=True)
tasks = []
for model in models:
chunks = [] # elements: tuple(size, dataset_chunk)
for dataset in datasets:
filename = get_infer_output_path(model, dataset, out_dir)
# skip the task if the task output exists
if osp.exists(filename):
continue
dataset_size = self.get_cost(dataset)
if dataset_size > self.max_task_size:
root, ext = osp.splitext(filename)
dataset_splits = self.split_dataset(dataset)
for i, dataset_split in enumerate(dataset_splits):
if not osp.exists(f'{root}_{i}{ext}'):
chunks.append((self.max_task_size, dataset_split))
else:
chunks.append((dataset_size, dataset))
for comb in model_dataset_combinations:
comb['datasets'] = sorted(comb['datasets'],
key=lambda x: self.get_cost(x),
reverse=True)
for model in comb['models']:
chunks = [] # elements: tuple(size, dataset_chunk)
for dataset in comb['datasets']:
filename = get_infer_output_path(model, dataset, out_dir)
# skip the task if the task output exists
if osp.exists(filename):
continue
dataset_size = self.get_cost(dataset)
if dataset_size > self.max_task_size:
root, ext = osp.splitext(filename)
dataset_splits = self.split_dataset(dataset)
for i, dataset_split in enumerate(dataset_splits):
if not osp.exists(f'{root}_{i}{ext}'):
chunks.append(
(self.max_task_size, dataset_split))
else:
chunks.append((dataset_size, dataset))
if self.strategy == 'heuristic':
chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
current_size, current_chunks = 0, []
for index in range(len(chunks)):
current_size += chunks[index][0]
current_chunks.append(chunks[index][1])
if index == len(chunks) - 1 or current_size + chunks[
index + 1][0] > self.max_task_size:
if self.strategy == 'heuristic':
chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
current_size, current_chunks = 0, []
for index in range(len(chunks)):
current_size += chunks[index][0]
current_chunks.append(chunks[index][1])
if index == len(chunks) - 1 or current_size + chunks[
index + 1][0] > self.max_task_size:
tasks.append(
Config({
'models': [model],
'datasets': [current_chunks],
'work_dir': work_dir,
**add_cfg
}))
current_size, current_chunks = 0, []
elif self.strategy == 'split':
for _, dataset in chunks:
tasks.append(
Config({
'models': [model],
'datasets': [current_chunks],
'datasets': [[dataset]],
'work_dir': work_dir,
**add_cfg
}))
current_size, current_chunks = 0, []
elif self.strategy == 'split':
for _, dataset in chunks:
tasks.append(
Config({
'models': [model],
'datasets': [[dataset]],
'work_dir': work_dir,
**add_cfg
}))
return tasks
@property
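The 'heuristic' branch above is a greedy packing: chunks are sorted by cost in descending order and accumulated until adding the next one would exceed max_task_size. A standalone toy illustration of that packing logic, without any OpenCompass types:

    max_task_size = 10
    chunks = [(5, 'ds_b'), (7, 'ds_a'), (2, 'ds_d'), (4, 'ds_c')]  # (cost, dataset)

    chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
    tasks, current_size, current_chunks = [], 0, []
    for index in range(len(chunks)):
        current_size += chunks[index][0]
        current_chunks.append(chunks[index][1])
        # flush the group when it is the last chunk or the next one overflows
        if index == len(chunks) - 1 or current_size + chunks[index + 1][0] > max_task_size:
            tasks.append(current_chunks)
            current_size, current_chunks = 0, []
    print(tasks)  # [['ds_a'], ['ds_b', 'ds_c'], ['ds_d']]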

View File

@ -13,7 +13,7 @@ from mmengine.config import ConfigDict
from tqdm import tqdm
from opencompass.registry import RUNNERS, TASKS
from opencompass.utils import get_logger
from opencompass.utils import batched, get_logger
from .base import BaseRunner
@ -131,15 +131,22 @@ class SlurmSequentialRunner(BaseRunner):
break
parent_conn.close()
for job_id in tqdm(job_ids, desc='clear sruns'):
if job_id is None:
continue
cmd = f'scancel {job_id}'
p = subprocess.Popen(cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
p.wait()
tbar = tqdm(total=len(job_ids), desc='clear sruns')
for batched_job_ids in batched(job_ids, 4):
ps = []
for job_id in batched_job_ids:
tbar.update()
if job_id is None:
continue
cmd = f'scancel {job_id}'
p = subprocess.Popen(cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
ps.append(p)
for p in ps:
p.wait()
tbar.close()
def _launch(self, cfg: ConfigDict, child_conn: Pipe = None):
logger = get_logger()
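The batched utility imported above is assumed to group an iterable into fixed-size chunks (like itertools.batched in Python 3.12), so at most four scancel processes run concurrently before the runner waits on them. A self-contained equivalent for illustration:

    from itertools import islice

    def batched(iterable, n):
        # yield successive tuples of at most n items
        it = iter(iterable)
        while chunk := tuple(islice(it, n)):
            yield chunk

    job_ids = ['1001', '1002', None, '1003', '1004']
    for group in batched(job_ids, 4):
        # the runner spawns one scancel per id in the group (skipping None)
        # and waits for the whole group before starting the next one
        print(group)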

View File

@ -121,8 +121,9 @@ class OpenICLEvalTask(BaseTask):
pred_dicts = copy.deepcopy(preds)
preds = {k: [pred.get(k) for pred in preds] for k in preds[0]}
pred_strs = preds.pop('prediction')
pred_list_flag = isinstance(pred_strs[0], list)
pred_strs = preds.pop('prediction', None)
pred_list_flag = pred_strs is not None and isinstance(
pred_strs[0], list)
if ('pred_role' in self.eval_cfg
and 'meta_template' in self.model_cfg
and not MODELS.get(self.model_cfg['type']).is_api):
@ -166,6 +167,12 @@ class OpenICLEvalTask(BaseTask):
]
icl_evaluator = ICL_EVALUATORS.build(self.eval_cfg['evaluator'])
# need results dir to save other files
out_path = get_infer_output_path(
self.model_cfg, self.dataset_cfg,
osp.join(self.work_dir, 'results'))
icl_evaluator._out_dir = osp.splitext(out_path)[
0] # strip extension
preds['predictions'] = pred_strs
preds['references'] = (test_set[self.output_column]

View File

@ -49,6 +49,14 @@ def first_capital_postprocess(text: str) -> str:
return ''
@TEXT_POSTPROCESSORS.register_module('last-capital')
def last_capital_postprocess(text: str) -> str:
for t in text[::-1]:
if t.isupper():
return t
return ''
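Usage example for the new postprocessor (inputs invented): it scans the text from the end and returns the first uppercase character it finds, or an empty string if there is none.

    assert last_capital_postprocess('The answer is (C). I will go with C') == 'C'
    assert last_capital_postprocess('no capital letters here') == ''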
def first_option_postprocess(text: str, options: str) -> str:
"""Find first valid option for text."""

7
requirements/agent.txt Normal file
View File

@ -0,0 +1,7 @@
json5
jupyter
jupyter_client
jupytext
lagent
scikit-image
sympy

View File

@ -1,4 +1 @@
faiss_gpu==1.7.2
jupyter
lagent
scikit-image