[Sync] update (#517)

Fengzhe Zhou 2023-10-27 20:31:22 +08:00 committed by GitHub
parent 6f07af3039
commit dbb20b8270
45 changed files with 580 additions and 89 deletions

.gitignore vendored
View File

@ -11,7 +11,7 @@ configs/eval_debug*.py
configs/viz_*.py
data
work_dirs
configs/internal/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@ -86,3 +86,6 @@ docs/zh_cn/_build/
# .zip
*.zip
# sft config ignore list
configs/sft_cfg/*B_*

View File

@ -0,0 +1,204 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator
from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi
agieval_reader_cfg = dict(
input_columns=['question', 'options'], output_column='label')
agieval_single_choice_sets = [
'gaokao-chinese',
'gaokao-english',
'gaokao-geography',
'gaokao-history',
'gaokao-biology',
'gaokao-chemistry',
'gaokao-physics',
'gaokao-mathqa',
'logiqa-zh',
'lsat-ar',
'lsat-lr',
'lsat-rc',
'logiqa-en',
'sat-math',
'sat-en',
'sat-en-without-passage',
'aqua-rat',
]
agieval_multiple_choices_sets = [
'jec-qa-kd',
'jec-qa-ca',
]
agieval_cloze_sets = ['gaokao-mathcloze', 'math']
agieval_chinese_sets = [
'gaokao-chinese',
'gaokao-english',
'gaokao-geography',
'gaokao-history',
'gaokao-biology',
'gaokao-chemistry',
'gaokao-physics',
'gaokao-mathqa',
'logiqa-zh',
'gaokao-mathcloze',
]
agieval_english_sets = [
'lsat-ar',
'lsat-lr',
'lsat-rc',
'logiqa-en',
'sat-math',
'sat-en',
'sat-en-without-passage',
'aqua-rat',
'math',
]
agieval_gaokao_sets = [
'gaokao-chinese',
'gaokao-english',
'gaokao-geography',
'gaokao-history',
'gaokao-biology',
'gaokao-chemistry',
'gaokao-physics',
'gaokao-mathqa',
]
agieval_datasets = []
for _name in agieval_single_choice_sets:
if _name in agieval_chinese_sets:
_hint = '答案是: '
else:
_hint = 'The answer is '
agieval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(role='HUMAN', prompt=f'{{question}}\n{{options}}\n{_hint}')
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024))
agieval_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type=first_capital_postprocess))
agieval_datasets.append(
dict(
type=AGIEvalDataset_v2,
path='./data/AGIEval/data/v1/',
name=_name,
abbr='agieval-' + _name,
setting_name='zero-shot',
reader_cfg=agieval_reader_cfg,
infer_cfg=agieval_infer_cfg.copy(),
eval_cfg=agieval_eval_cfg.copy()))
for _name in agieval_multiple_choices_sets:
if _name in agieval_chinese_sets:
_hint = '答案是: '
else:
_hint = 'The answer is '
agieval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(role='HUMAN', prompt=f'{{question}}\n{{options}}\n{_hint}')
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024))
agieval_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type=first_capital_postprocess_multi))
agieval_datasets.append(
dict(
type=AGIEvalDataset_v2,
path='./data/AGIEval/data/v1/',
name=_name,
abbr='agieval-' + _name,
setting_name='zero-shot',
reader_cfg=agieval_reader_cfg,
infer_cfg=agieval_infer_cfg.copy(),
eval_cfg=agieval_eval_cfg.copy()))
for _name in agieval_cloze_sets:
if _name in agieval_chinese_sets:
_hint = '答案是: '
else:
_hint = 'The answer is '
agieval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[dict(role='HUMAN', prompt=f'{{question}}\n{_hint}')])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024))
agieval_eval_cfg = dict(evaluator=dict(type=AGIEvalEvaluator))
agieval_datasets.append(
dict(
type=AGIEvalDataset_v2,
path='./data/AGIEval/data/v1/',
name=_name,
abbr='agieval-' + _name,
setting_name='zero-shot',
reader_cfg=agieval_reader_cfg,
infer_cfg=agieval_infer_cfg.copy(),
eval_cfg=agieval_eval_cfg.copy()))
for _item in agieval_datasets:
_name = _item['name']
_intro = {
'gaokao-chinese':
'以下是一道中国高考语文选择题,请选择正确的答案。',
'gaokao-english':
'以下是一道中国高考英语选择题,请选择正确的答案。',
'gaokao-geography':
'以下是一道中国高考地理选择题,请选择正确的答案。',
'gaokao-history':
'以下是一道中国高考历史选择题,请选择正确的答案。',
'gaokao-biology':
'以下是一道中国高考生物选择题,请选择正确的答案。',
'gaokao-chemistry':
'以下是一道中国高考化学选择题,请选择正确的答案。',
'gaokao-physics':
'以下是一道中国高考物理选择题,请选择正确的答案。',
'gaokao-mathqa':
'以下是一道中国高考数学选择题,请选择正确的答案。',
'logiqa-zh':
'以下是一道中国公务员考试题,请选择正确的答案。',
'lsat-ar':
'The following is a LSAT Analytical Reasoning question. Please select the correct answer.',
'lsat-lr':
'The following is a LSAT Logical Reasoning question. Please select the correct answer.',
'lsat-rc':
'The following is a LSAT Reading Comprehension question. Please select the correct answer.',
'logiqa-en':
'The following is a Logic Reasoning question. Please select the correct answer.',
'sat-math':
'The following is a SAT Math question. Please select the correct answer.',
'sat-en':
'The following is a SAT English question. Please select the correct answer.',
'sat-en-without-passage':
'The following is a SAT English question. Please select the correct answer.',
'aqua-rat':
'The following is a AQUA-RAT question. Please select the correct answer.',
'jec-qa-kd':
'以下是一道中国司法考试基础知识题,请选择正确的答案。',
'jec-qa-ca':
'以下是一道中国司法考试案例分析题,请选择正确的答案。',
'gaokao-mathcloze':
'以下是一道中国高考数学填空题,请填入正确的答案。',
'math':
'The following is a Math question. Please select the correct answer.',
}[_name]
_templates = _item['infer_cfg']['prompt_template']['template']
_templates['round'][0][
'prompt'] = _intro + '\n' + _templates['round'][0]['prompt']
del _item, _intro, _templates, _name, _hint, agieval_infer_cfg, agieval_eval_cfg
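
A dataset config like the one above is normally composed into a top-level experiment config rather than run on its own; a minimal sketch of that pattern, where the relative import path and variable placement are assumptions for illustration, not part of this commit:

```python
# Hypothetical top-level config: pull in the AGIEval dataset definitions above
# via mmengine's read_base mechanism, which OpenCompass configs use to compose
# sub-configs.
from mmengine.config import read_base

with read_base():
    # The import path is an assumption; adjust it to wherever this config lives.
    from .datasets.agieval.agieval_gen import agieval_datasets

datasets = [*agieval_datasets]
# A `models` list would be pulled in from another base config the same way.
```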

View File

@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator
from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator, AGIEvalEvaluator_mcq
from opencompass.utils.text_postprocessors import first_capital_postprocess_multi
agieval_single_choice_sets = [
@ -116,7 +116,7 @@ for _name in agieval_multiple_choices_sets:
inferencer=dict(type=GenInferencer, max_out_len=1024))
agieval_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
evaluator=dict(type=AGIEvalEvaluator_mcq),
pred_postprocessor=dict(type=first_capital_postprocess_multi))
agieval_datasets.append(

View File

@ -3,7 +3,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
bbh_reader_cfg = dict(input_columns=["input"], output_column="target")
@ -56,7 +56,7 @@ for _name in bbh_multiple_choice_sets:
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
evaluator=dict(type=BBHEvaluator_mcq),
pred_role="BOT",
pred_postprocessor=dict(type=bbh_mcq_postprocess),
dataset_postprocessor=dict(type=bbh_mcq_postprocess))

View File

@ -3,7 +3,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
bbh_reader_cfg = dict(input_columns=["input"], output_column="target")
@ -56,7 +56,7 @@ for _name in bbh_multiple_choice_sets:
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
evaluator=dict(type=BBHEvaluator_mcq),
pred_role="BOT",
pred_postprocessor=dict(type=bbh_mcq_postprocess),
dataset_postprocessor=dict(type=bbh_mcq_postprocess))

View File

@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
@ -25,7 +25,7 @@ gsm8k_infer_cfg = dict(
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
gsm8k_eval_cfg = dict(evaluator=dict(type=AccEvaluator),
gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
pred_postprocessor=dict(type=gsm8k_postprocess),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))

View File

@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
@ -72,7 +72,7 @@ Question: {question}{answer}
inferencer=dict(type=GenInferencer, max_out_len=512))
gsm8k_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
evaluator=dict(type=Gsm8kEvaluator),
pred_postprocessor=dict(type=gsm8k_postprocess),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))

View File

@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import SCInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer' )
generation_kwargs = dict(do_sample=True, temperature=0.7, top_k=40)
@ -73,7 +73,7 @@ Question: {question}{answer}
inferencer=dict(type=SCInferencer, max_out_len=512, generation_kwargs = generation_kwargs, infer_type='sc', sc_size = 20))
gsm8k_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
evaluator=dict(type=Gsm8kEvaluator),
pred_postprocessor=dict(type=gsm8k_postprocess),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
sc_size = 20)

View File

@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
@ -34,7 +34,7 @@ gsm8k_infer_cfg = dict(
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
gsm8k_eval_cfg = dict(evaluator=dict(type=AccEvaluator),
gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
pred_role="BOT",
pred_postprocessor=dict(type=gsm8k_postprocess),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))

View File

@ -1,6 +1,8 @@
from opencompass.models.claude_api.claude_api import Claude
from opencompass.models.claude_api.postprocessors import (
flores_postprocess, gsm8k_postprocess, humaneval_postprocess,
lcsts_postprocess, mbpp_postprocess, strategyqa_pred_postprocess)
from opencompass.utils.text_postprocessors import last_option_postprocess
from opencompass.models.claude_api.postprocessors import gsm8k_postprocess, humaneval_postprocess, lcsts_postprocess, mbpp_postprocess, strategyqa_pred_postprocess
agieval_single_choice_sets = [
'gaokao-chinese',
@ -47,6 +49,8 @@ claude_postprocessors = {
'lcsts': dict(type=lcsts_postprocess),
'mbpp': dict(type=mbpp_postprocess),
'strategyqa': dict(type=strategyqa_pred_postprocess),
'commonsense_qa': dict(type=last_option_postprocess, options='ABCDE'),
'flores_100_*-zho_simpl': dict(type=flores_postprocess),
}
for _name in agieval_multiple_choices_sets + agieval_single_choice_sets:

View File

@ -5,7 +5,7 @@
The program entry for the evaluation task is `run.py`. The usage is as follows:
```shell
python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run]
python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run] [--dump-eval-details]
```
Task Configuration (`$EXP`):
@ -66,6 +66,7 @@ The parameter explanation is as follows:
- `-w`: Specify the working path, default is `./outputs/default`.
- `-l`: Enable status reporting via Lark bot.
- `--dry-run`: When enabled, inference and evaluation tasks will be dispatched but won't actually run for debugging.
- `--dump-eval-details`: When enabled, the evaluation results under the `results` folder will include more details, such as the correctness of each sample (see the sketch after this list).
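
A rough sketch of what this flag maps to internally, mirroring the `run.py` change later in this diff; only the config field shown there is used, everything else is illustrative:

```python
# Equivalent of passing --dump-eval-details on the command line: run.py sets
# this field, and OpenICLEvalTask reads it to decide whether per-sample
# details are kept in the dumped results.
cfg.eval.runner.task.dump_details = True
```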
Using run mode `-m all` as an example, the overall execution flow is as follows:

View File

@ -5,7 +5,7 @@
评测任务的程序入口为 `run.py`,使用方法如下:
```shell
python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run]
python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run] [--dump-eval-details]
```
任务配置 (`$EXP`)
@ -66,6 +66,7 @@ python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--deb
- `-w`: 指定工作路径,默认为 `./outputs/default`
- `-l`: 打开飞书机器人状态上报。
- `--dry-run`: 开启时,推理和评测任务仅会分发但不会真正运行,便于调试;
- `--dump-eval-details`: 开启时,`results` 下的评测结果中将会包含更加详细的评测结果信息,例如每条样本是否正确等。
以运行模式 `-m all` 为例,整体运行流如下:

View File

@ -13,7 +13,7 @@ class AFQMCDataset_V2(BaseDataset):
@staticmethod
def load(path):
data = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
line['label'] = 'AB'[int(line['label'])]

View File

@ -64,9 +64,36 @@ class AGIEvalEvaluator(BaseEvaluator):
def score(self, predictions, references):
predictions = [parse_math_answer('', pred) for pred in predictions]
details = []
cnt = 0
for pred, ref in zip(predictions, references):
detail = {'pred': pred, 'answer': ref, 'correct': False}
if is_equiv(pred, ref):
cnt += 1
detail['correct'] = True
details.append(detail)
score = cnt / len(predictions) * 100
return {'score': score}
return {'score': score, 'details': details}
@ICL_EVALUATORS.register_module()
class AGIEvalEvaluator_mcq(BaseEvaluator):
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
details = []
cnt = 0
for pred, ref in zip(predictions, references):
detail = {'pred': pred, 'answer': ref, 'correct': False}
if pred == ref:
cnt += 1
detail['correct'] = True
details.append(detail)
score = cnt / len(predictions) * 100
return {'score': score, 'details': details}

View File

@ -61,11 +61,38 @@ class BBHEvaluator(BaseEvaluator):
predictions = [bbh_freeform_postprocess(pred) for pred in predictions]
details = []
cnt = 0
for pred, ref in zip(predictions, references):
detail = {'pred': pred, 'answer': ref, 'correct': False}
if pred == ref:
cnt += 1
detail['correct'] = True
details.append(detail)
score = cnt / len(predictions) * 100
return {'score': score}
return {'score': score, 'details': details}
@ICL_EVALUATORS.register_module()
class BBHEvaluator_mcq(BaseEvaluator):
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
details = []
cnt = 0
for pred, ref in zip(predictions, references):
detail = {'pred': pred, 'answer': ref, 'correct': False}
if pred == ref:
cnt += 1
detail['correct'] = True
details.append(detail)
score = cnt / len(predictions) * 100
return {'score': score, 'details': details}

View File

@ -13,7 +13,7 @@ class bustumDataset_V2(BaseDataset):
@staticmethod
def load(path):
data = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
line['label'] = 'AB'[int(line['label'])]

View File

@ -13,7 +13,7 @@ class C3Dataset(BaseDataset):
@staticmethod
def load(path: str):
with open(path) as f:
with open(path, 'r', encoding='utf-8') as f:
data = json.load(f)
rows = []
for _, row in enumerate(data):
@ -58,7 +58,7 @@ class C3Dataset_V2(BaseDataset):
@staticmethod
def load(path: str):
with open(path) as f:
with open(path, 'r', encoding='utf-8') as f:
raw = json.load(f)
data = []
for line in raw:

View File

@ -15,7 +15,8 @@ class CEvalDataset(BaseDataset):
def load(path: str, name: str):
dataset = {}
for split in ['dev', 'val', 'test']:
with open(osp.join(path, split, f'{name}_{split}.csv')) as f:
filename = osp.join(path, split, f'{name}_{split}.csv')
with open(filename, encoding='utf-8') as f:
reader = csv.reader(f)
header = next(reader)
for row in reader:

View File

@ -31,7 +31,7 @@ class CHIDDataset_V2(BaseDataset):
@staticmethod
def load(path):
data = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
item = {}

View File

@ -41,7 +41,7 @@ class CluewscDataset_V2(BaseDataset):
@staticmethod
def load(path):
data = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
item = {

View File

@ -13,9 +13,9 @@ class CMBDataset(BaseDataset):
@staticmethod
def load(path: str):
with open(osp.join(path, 'test.json'), 'r') as f:
with open(osp.join(path, 'test.json'), 'r', encoding='utf-8') as f:
test_data = json.load(f)
with open(osp.join(path, 'val.json'), 'r') as f:
with open(osp.join(path, 'val.json'), 'r', encoding='utf-8') as f:
val_data = json.load(f)
for da in test_data:

View File

@ -13,7 +13,7 @@ class cmnliDataset_V2(BaseDataset):
@staticmethod
def load(path):
data = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
if line['label'] == '-':

View File

@ -12,7 +12,7 @@ class CMRCDataset(BaseDataset):
@staticmethod
def load(path: str):
with open(path) as f:
with open(path, 'r', encoding='utf-8') as f:
data = json.load(f)
# 将原始数据转换为所需的格式
rows = []

View File

@ -13,7 +13,7 @@ class COPADataset_V2(BaseDataset):
@staticmethod
def load(path):
dataset = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
line['label'] = 'AB'[line['label']]

View File

@ -31,7 +31,7 @@ class CslDataset_V2(BaseDataset):
@staticmethod
def load(path):
data = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
item = {

View File

@ -12,7 +12,7 @@ class DRCDDataset(BaseDataset):
@staticmethod
def load(path: str):
with open(path) as f:
with open(path, 'r', encoding='utf-8') as f:
data = json.load(f)
# 将原始数据转换为所需的格式
rows = []

View File

@ -13,7 +13,7 @@ class eprstmtDataset_V2(BaseDataset):
@staticmethod
def load(path):
data = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
item = {

View File

@ -1,3 +1,4 @@
from opencompass.openicl import BaseEvaluator
from opencompass.registry import TEXT_POSTPROCESSORS
@ -26,3 +27,25 @@ def gsm8k_postprocess(text: str) -> str:
if ret[i].isdigit():
ret1 += ret[i]
return ret1
class Gsm8kEvaluator(BaseEvaluator):
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
correct = 0
count = 0
details = []
for i, j in zip(predictions, references):
detail = {'pred': i, 'answers': j, 'correct': False}
count += 1
if i == j:
correct += 1
detail['correct'] = True
details.append(detail)
result = {'accuracy': 100 * correct / count, 'details': details}
return result
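
A minimal usage sketch of the evaluator above, with toy predictions and references (not real GSM8K data), just to show the returned structure:

```python
# Exact-match scoring after post-processing, as the framework would call it.
evaluator = Gsm8kEvaluator()
result = evaluator.score(predictions=['18', '7', '42'],
                         references=['18', '7', '41'])
print(result['accuracy'])    # 66.66...
print(result['details'][0])  # {'pred': '18', 'answers': '18', 'correct': True}
```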

View File

@ -49,7 +49,7 @@ class hellaswagDataset_V3(BaseDataset):
@staticmethod
def load(path):
dataset = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
data = json.loads(line)
dataset.append({

View File

@ -148,11 +148,15 @@ class MATHEvaluator(BaseEvaluator):
}
correct = 0
count = 0
details = []
for i, j in zip(predictions, references):
detail = {'pred': i, 'answer': j, 'correct': False}
count += 1
if self.is_equiv(i, j):
correct += 1
result = {'accuracy': 100 * correct / count}
detail['correct'] = True
details.append(detail)
result = {'accuracy': 100 * correct / count, 'details': details}
return result
def _fix_fracs(self, string):

View File

@ -52,9 +52,14 @@ class NQEvaluator(BaseEvaluator):
processed_answers = [[general_postprocess(j).lower() for j in i]
for i in references]
details = []
cnt = 0
for pred, cand_ans in zip(processed_predictions, processed_answers):
detail = {'pred': pred, 'answer': cand_ans, 'correct': False}
cnt += int(any([cand == pred for cand in cand_ans]))
if int(any([cand == pred for cand in cand_ans])):
detail['correct'] = True
details.append(detail)
score = cnt / len(predictions) * 100
return {'score': score}
return {'score': score, 'details': details}

View File

@ -67,7 +67,7 @@ class TNewsDataset_V2(BaseDataset):
}
data = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
item = {

View File

@ -51,9 +51,14 @@ class TriviaQAEvaluator(BaseEvaluator):
processed_answers = [[general_postprocess(j).lower() for j in i]
for i in references]
details = []
cnt = 0
for pred, cand_ans in zip(processed_predictions, processed_answers):
detail = {'pred': pred, 'answer': cand_ans, 'correct': False}
cnt += int(any([cand == pred for cand in cand_ans]))
if int(any([cand == pred for cand in cand_ans])):
detail['correct'] = True
details.append(detail)
score = cnt / len(predictions) * 100
return {'score': score}
return {'score': score, 'details': details}

View File

@ -82,6 +82,20 @@ def strategyqa_pred_postprocess(text: str) -> str:
return ''
def flores_postprocess(text: str) -> str:
text = text.strip().split('\n')[-1].strip()
return text
def flores_postprocess_chinese(text: str) -> str:
text = text.strip().split('\n')[-1].strip()
import jieba
truncated_text = text.strip().split('\n')[0]
cleaned_text = re.sub(r'\s+', ' ', truncated_text).strip()
cleaned_text = ' '.join(jieba.cut(cleaned_text))
return cleaned_text
def record_postprocess(text: str) -> str:
match = re.search(r'(?<=refers to )[^.]+', text)

View File

@ -24,11 +24,18 @@ class EMEvaluator(BaseEvaluator):
for i in references]
cnt = 0
details = []
for pred, ans, origin_ans in zip(predictions, processed_answers,
references):
answers = list(set(ans + origin_ans))
detail = {'pred': pred, 'answer': answers}
if pred in ans or pred in origin_ans:
cnt += 1
detail['correct'] = True
else:
detail['correct'] = False
details.append(detail)
score = cnt / len(predictions) * 100
return {'score': score}
return {'score': score, 'details': details}

View File

@ -51,8 +51,7 @@ class BaseInferencer:
self.output_json_filepath = output_json_filepath
self.output_json_filename = output_json_filename
self.is_main_process = is_main_process()
if not os.path.exists(self.output_json_filepath):
os.makedirs(self.output_json_filepath)
os.makedirs(self.output_json_filepath, exist_ok=True)
def inference(self,
retriever: BaseRetriever,

View File

@ -94,6 +94,7 @@ class PPLInferencer(BaseInferencer):
index = 0
prompt_list = []
sub_ppl_list = []
token_num_list = []
normalizing_prompt_list = []
context_length_list = []
@ -144,6 +145,7 @@ class PPLInferencer(BaseInferencer):
mode='ppl'))
normalizing_prompt_list.append(normalizing_prompt)
prompt_list.append(prompt)
token_num_list.append(prompt_token_num)
if normalizing_str is not None:
normalizing_str_len = self.model.get_token_len_from_template(
@ -186,6 +188,10 @@ class PPLInferencer(BaseInferencer):
ice_str = self.model.parse_template(ice[idx], mode='ppl')
output_handler.save_prompt_and_ppl(
label, prompt.replace(ice_str, ''), prompt, res, index)
output_handler.results_dict[str(
index)][f'label: {str(label)}'][
'BPB'] = res * token_num_list[idx] / len(
prompt.replace(ice_str, '').encode())
index = index + 1
ppl.append(sub_ppl_list)
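
For reference, the per-label value stored above is just the PPL result rescaled by tokens per byte; a toy walk-through with made-up numbers (the exact meaning of `res` depends on the model's `get_ppl` implementation):

```python
# Made-up values, only to make the arithmetic in the new lines explicit.
res = 2.3          # per-token PPL/loss value returned for this label
token_num = 120    # token count of the full prompt (prompt_token_num)
n_bytes = 480      # UTF-8 byte length of the prompt with the ICE part stripped
bpb_like = res * token_num / n_bytes   # 0.575, the value stored under 'BPB'
```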

View File

@ -1,6 +1,6 @@
from abc import abstractmethod
from copy import deepcopy
from typing import Dict, List
from typing import Dict, List, Optional
from mmengine.config import ConfigDict
@ -13,16 +13,24 @@ class BasePartitioner:
Args:
out_dir (str): The output directory of tasks.
keep_keys (List[str]): The keys to be kept from the experiment config
to the task config.
keep_keys (Optional[List[str]], optional): The keys to be kept from the
experiment config to the task config. Defaults to None. If None,
the following keys will be kept:
- eval.runner.task.judge_cfg
- eval.runner.task.dump_details
"""
def __init__(self,
out_dir: str,
keep_keys: List[str] = ['eval.runner.task.judge_cfg']):
def __init__(self, out_dir: str, keep_keys: Optional[List[str]] = None):
self.logger = get_logger()
self.out_dir = out_dir
self.keep_keys = keep_keys
if keep_keys is None:
self.keep_keys = [
'eval.runner.task.judge_cfg',
'eval.runner.task.dump_details',
]
else:
self.keep_keys = keep_keys
def __call__(self, cfg: ConfigDict) -> List[Dict]:
"""Generate tasks from config. Each task is defined as a
@ -63,7 +71,8 @@ class BasePartitioner:
tgt_ptr = tgt_ptr[key]
tgt_ptr[key_chain[-1]] = ori_ptr[key_chain[-1]]
except Exception:
self.logger.warning(f'Key {k} not found in config, ignored.')
self.logger.debug(f'Key {k} not found in config, ignored.')
self.logger.debug(f'Additional config: {add_cfg}')
tasks = self.partition(models,
datasets,

View File

@ -1,5 +1,5 @@
import os.path as osp
from typing import Dict, List
from typing import Dict, List, Optional
from mmengine.config import Config, ConfigDict
@ -11,15 +11,23 @@ from .base import BasePartitioner
@PARTITIONERS.register_module()
class NaivePartitioner(BasePartitioner):
"""Naive task partitioner. This partitioner will generate a task for each
model-dataset pair.
"""Naive task partitioner. This partitioner will generate a task for each n
model-dataset pairs.
Args:
out_dir (str): The output directory of tasks.
n (int): The number of model-dataset pairs in each task.
keep_keys (List[str]): The keys to be kept from the experiment config
to the task config.
"""
def __init__(self,
out_dir: str,
n: int = 1,
keep_keys: Optional[List[str]] = None):
super().__init__(out_dir=out_dir, keep_keys=keep_keys)
self.n = n
def partition(self,
models: List[ConfigDict],
datasets: List[ConfigDict],
@ -53,13 +61,17 @@ class NaivePartitioner(BasePartitioner):
tasks = []
for model in models:
chunks = []
for dataset in datasets:
filename = get_infer_output_path(model, dataset, out_dir)
if osp.exists(filename):
continue
chunks.append(dataset)
for i in range(0, len(chunks), self.n):
task = Config({
'models': [model],
'datasets': [[dataset]],
'datasets': [chunks[i:i + self.n]],
'work_dir': work_dir,
**add_cfg
})

View File

@ -2,7 +2,7 @@ import copy
import math
import os.path as osp
from fnmatch import fnmatch
from typing import Dict, List, Tuple, Union
from typing import Dict, List, Optional, Tuple, Union
import mmengine
from mmengine.config import Config, ConfigDict
@ -24,6 +24,11 @@ class SizePartitioner(BasePartitioner):
max_task_size (int): The maximum size of a task.
gen_task_coef (int): The dataset cost measurement coefficient for
generation tasks.
strategy (str): The partition strategy. Supported strategies are:
'heuristic' and 'split'. Defaults to 'heuristic'.
heuristic: split large datasets into several tasks, merge small
datasets into one task.
split: split large datasets into several tasks only.
dataset_size_path (str): The path to the dataset size cache file.
keep_keys (list[str]): The keys to be kept from the experiment config
to the task config.
@ -33,12 +38,17 @@ class SizePartitioner(BasePartitioner):
out_dir: str,
max_task_size: int = 40000,
gen_task_coef: int = 20,
strategy: str = 'heuristic',
dataset_size_path: str = '.cache/dataset_size.json',
keep_keys: List[str] = ['eval.runner.task.judge_cfg']):
keep_keys: Optional[List[str]] = None):
super().__init__(out_dir=out_dir, keep_keys=keep_keys)
self.max_task_size = max_task_size
self.gen_task_coef = gen_task_coef
self.dataset_size_path = dataset_size_path
assert strategy in ('heuristic', 'split'), \
f'Unsupported partition strategy: {strategy}. '\
'Supported strategies are: `heuristic`, `split` .'
self.strategy = strategy
def partition(self,
models: List[ConfigDict],
@ -79,47 +89,47 @@ class SizePartitioner(BasePartitioner):
reverse=True)
tasks = []
for model in models:
task = Config({
'models': [model],
'datasets': [[]],
'work_dir': work_dir,
**add_cfg
})
num_data = 0
chunks = [] # elements: tuple(size, dataset_chunk)
for dataset in datasets:
filename = get_infer_output_path(model, dataset, out_dir)
root, ext = osp.splitext(filename)
# skip the task if the task output exists
if osp.exists(filename):
continue
dataset_size = self.get_cost(dataset)
if dataset_size > self.max_task_size:
root, ext = osp.splitext(filename)
dataset_splits = self.split_dataset(dataset)
for i, dataset_split in enumerate(dataset_splits):
# skip the task if the task output exists
if not osp.exists(f'{root}_{i}{ext}'):
tasks.append(
Config({
'models': [model],
'datasets': [[dataset_split]],
'work_dir': work_dir,
**add_cfg
}))
chunks.append((self.max_task_size, dataset_split))
else:
if num_data + dataset_size > self.max_task_size:
tasks.append(task)
task = Config({
chunks.append((dataset_size, dataset))
if self.strategy == 'heuristic':
chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
current_size, current_chunks = 0, []
for index in range(len(chunks)):
current_size += chunks[index][0]
current_chunks.append(chunks[index][1])
if index == len(chunks) - 1 or current_size + chunks[
index + 1][0] > self.max_task_size:
tasks.append(
Config({
'models': [model],
'datasets': [current_chunks],
'work_dir': work_dir,
**add_cfg
}))
current_size, current_chunks = 0, []
elif self.strategy == 'split':
for _, dataset in chunks:
tasks.append(
Config({
'models': [model],
'datasets': [[]],
'datasets': [[dataset]],
'work_dir': work_dir,
**add_cfg
})
num_data = 0
task['datasets'][0].append(dataset)
num_data = num_data + dataset_size
if task['datasets'][0]:
tasks.append(task)
}))
return tasks
@property
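
How the new `strategy` option would typically be chosen, as a hedged config sketch; the keys follow the constructor signature shown above, but the exact placement inside a real OpenCompass config may differ:

```python
# 'heuristic': split oversized datasets, then greedily pack chunks up to
#              max_task_size per task (the default).
# 'split':     only split oversized datasets; no packing of small ones.
from opencompass.partitioners import SizePartitioner  # assumed import path

infer = dict(
    partitioner=dict(
        type=SizePartitioner,
        max_task_size=40000,
        gen_task_coef=20,
        strategy='split',
    ),
)
```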

View File

@ -23,7 +23,7 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
mode: str,
out_dir: str,
model_pairs: Optional[List[Tuple]] = None,
keep_keys: List[str] = ['eval.runner.task.judge_cfg']):
keep_keys: Optional[List[str]] = None):
super().__init__(out_dir=out_dir, keep_keys=keep_keys)
assert mode in ['all', 'one_to_n', 'fixed']
self.mode = mode

View File

@ -72,6 +72,7 @@ class DefaultSummarizer:
if not osp.exists(filepath):
continue
result = mmengine.load(filepath)
result.pop('details', None)
raw_results[model_abbr][dataset_abbr] = result
if 'error' in result:
self.logger.debug(f'error in {model_abbr} {dataset_abbr} {result["error"]}')

View File

@ -1,11 +1,14 @@
import argparse
import copy
import fnmatch
import math
import os.path as osp
import statistics
import time
from collections import Counter
from inspect import signature
from shutil import which
from typing import Optional
from typing import List, Optional
import mmengine
from mmengine.config import Config, ConfigDict
@ -35,6 +38,8 @@ class OpenICLEvalTask(BaseTask):
super().__init__(cfg)
self.num_gpus = 0
self.logger = get_logger()
self.dump_details = cfg.get('eval', {}).get('runner', {}).get(
'task', {}).get('dump_details', False)
def get_command(self, cfg_path, template):
script_path = __file__
@ -113,7 +118,7 @@ class OpenICLEvalTask(BaseTask):
[sub_preds[str(i)] for i in range(len(sub_preds))])
filename = root + f'_{i}' + ext
i += 1
pred_dicts = copy.deepcopy(preds)
preds = {k: [pred.get(k) for pred in preds] for k in preds[0]}
pred_strs = preds.pop('prediction')
@ -163,6 +168,7 @@ class OpenICLEvalTask(BaseTask):
]
icl_evaluator = ICL_EVALUATORS.build(self.eval_cfg['evaluator'])
preds['predictions'] = pred_strs
preds['references'] = (test_set[self.output_column]
if self.output_column else None)
@ -172,18 +178,42 @@ class OpenICLEvalTask(BaseTask):
}
result = icl_evaluator.score(**preds)
if self.dump_details:
try:
details = result.pop('details', None)
result['details'] = self.format_details(
pred_strs, test_set[self.output_column], details,
pred_dicts)
result['type'] = result['details'].pop('type', None)
if 'PPL' in str(
self.dataset_cfg.infer_cfg.inferencer.type):
result['correct_bpb'], result[
'incorrect_bpb'] = self.calculate_bpb(pred_dicts)
else:
result['incorrect_bpb'] = result['correct_bpb'] = -1
except Exception:
result['incorrect_bpb'] = result['correct_bpb'] = -1
else:
result.pop('details', None)
if 'error' in result:
self.logger.error(
f'Task {task_abbr_from_cfg(self.cfg)}: {result["error"]}')
return
else:
self.logger.info(f'Task {task_abbr_from_cfg(self.cfg)}: {result}')
result_wo_details = {
i: result[i]
for i in result if i != 'details'
}
self.logger.info(
f'Task {task_abbr_from_cfg(self.cfg)}: {result_wo_details}')
# Save result
out_path = get_infer_output_path(self.model_cfg, self.dataset_cfg,
osp.join(self.work_dir, 'results'))
mkdir_or_exist(osp.split(out_path)[0])
mmengine.dump(result, out_path)
mmengine.dump(result, out_path, ensure_ascii=False, indent=4)
def _extract_role_pred(self, s: str, begin_str: Optional[str],
end_str: Optional[str]) -> str:
@ -215,6 +245,95 @@ class OpenICLEvalTask(BaseTask):
return s[start:end]
def format_details(self, predictions, references, details, pred_dicts):
"""This function is responsible for formatting prediction details.
Args:
predictions (list): The prediction list.
references (list): The reference list.
details (list): Contains the 'pred', 'answer' and 'correct' for each
sample. Such as `[{'pred': '光荣和ωforce',
'answers': ['光荣和ω-force', '光荣和ωforce'], 'correct': True}]`
pred_dicts (list): Contains a list of samples with the original
prompts. Such as
`[{'origin_prompt': '根据文章回答问题。你的答案应该尽可能3》…………',
'prediction': ' 光荣和ω-force\n', 'gold': ['光荣和ω-force']}]`
Returns:
list: The formatted prediction details.
"""
results = {}
for i in range(len(predictions)):
ppl_flag = False
result = {}
origin_prediction = copy.deepcopy(pred_dicts[i])
origin_prediction.pop('in-context examples', None)
origin_prediction.pop('prediction', None)
keys = copy.deepcopy(list(origin_prediction.keys()))
for key in keys:
if key.startswith('label:'):
ppl_flag = True
origin_prediction[key].pop('testing input', None)
new_key = key.replace('label: ', '')
origin_prediction[new_key] = origin_prediction.pop(key)
if ppl_flag:
results['type'] = 'PPL'
result['origin_prediction'] = origin_prediction
result['predictions'] = str(predictions[i])
result['references'] = str(references[i])
result['correct'] = str(predictions[i]) == str(references[i])
else:
results['type'] = 'GEN'
result['prompt'] = origin_prediction['origin_prompt']
result['origin_prediction'] = pred_dicts[i]['prediction']
result['predictions'] = details[i]['pred']
result['references'] = details[i]['answers']
result['correct'] = details[i]['correct']
results[str(i)] = result
return results
def calculate_bpb(self, pred_dicts: List):
"""This function is used to calculate the BPB (Bits Per Byte) for the
data. The correct BPB is obtained directly from the values in the
'predictions' file. The incorrect BPB is the average of the remaining
BPB values for each sample under different labels after subtracting the
correct BPB. The calculation of BPB (Bits Per Byte) is similar to PPL,
with the difference that it computes the additional bits needed on
average, in terms of character length, to encode the true sequence
based on the predictions. This calculation involves applying a
weighting factor based on the ratio of words to characters.
Args:
pred_dicts (list): Contains a list of samples with each options
and BPB scores.
Returns:
dict: Contains correct and incorrect bpb.
"""
incorrect_bpb_list = []
bpb_list = []
for pred_dict in pred_dicts:
preds = {
key: value
for key, value in pred_dict.items()
if key.startswith('label: ')
}
values = []
for item in preds.items():
values.append(item[1])
bpbs = [value['BPB'] for value in values]
incorrect_bpb_list.append(
(sum(bpbs) - min(bpbs)) / (len(bpbs) - 1))
bpb_list.append(statistics.mean(bpbs))
def filters(origins):
targets = [target for target in origins if not math.isnan(target)]
return targets
mean_incorrect = statistics.mean(filters(incorrect_bpb_list))
mean_correct = statistics.mean(filters(bpb_list))
return 100 * mean_correct, 100 * mean_incorrect
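
To make the aggregation above concrete, a toy walk-through for a single sample with made-up per-label scores:

```python
import statistics

# Made-up BPB values for the candidate labels of one sample.
bpbs = [0.50, 0.80, 0.95]
# Entry appended to bpb_list: plain mean over all labels.
sample_bpb = statistics.mean(bpbs)                            # 0.75
# Entry appended to incorrect_bpb_list: mean after dropping the smallest BPB.
sample_incorrect = (sum(bpbs) - min(bpbs)) / (len(bpbs) - 1)  # 0.875
# calculate_bpb then averages these across samples (NaNs filtered out) and
# returns both values scaled by 100.
```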
def parse_args():
parser = argparse.ArgumentParser(description='Score Calculator')

View File

@ -25,6 +25,7 @@ requests==2.31.0
rouge
rouge_chinese
rouge_score
sacrebleu
scikit_learn==1.2.1
seaborn
sentence_transformers==2.2.2

run.py
View File

@ -123,6 +123,12 @@ def parse_args():
'Will be overridden by the "retry" argument in the config.',
type=int,
default=2)
parser.add_argument(
'--dump-eval-details',
help='Whether to dump the evaluation details, including the '
'correctness of each sample, bpb, etc.',
action='store_true',
)
# set srun args
slurm_parser = parser.add_argument_group('slurm_args')
parse_slurm_args(slurm_parser)
@ -300,6 +306,8 @@ def main():
if args.dlc or args.slurm or cfg.get('eval', None) is None:
fill_eval_cfg(cfg, args)
if args.dump_eval_details:
cfg.eval.runner.task.dump_details = True
if args.partition is not None:
if RUNNERS.get(cfg.eval.runner.type) == SlurmRunner: