diff --git a/.gitignore b/.gitignore index 23bf2a52..f2eab368 100644 --- a/.gitignore +++ b/.gitignore @@ -11,7 +11,7 @@ configs/eval_debug*.py configs/viz_*.py data work_dirs - +configs/internal/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -86,3 +86,6 @@ docs/zh_cn/_build/ # .zip *.zip + +# sft config ignore list +configs/sft_cfg/*B_* diff --git a/configs/datasets/agieval/agieval_gen_397d81.py b/configs/datasets/agieval/agieval_gen_397d81.py new file mode 100644 index 00000000..523cb074 --- /dev/null +++ b/configs/datasets/agieval/agieval_gen_397d81.py @@ -0,0 +1,204 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator +from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi + +agieval_reader_cfg = dict( + input_columns=['question', 'options'], output_column='label') + +agieval_single_choice_sets = [ + 'gaokao-chinese', + 'gaokao-english', + 'gaokao-geography', + 'gaokao-history', + 'gaokao-biology', + 'gaokao-chemistry', + 'gaokao-physics', + 'gaokao-mathqa', + 'logiqa-zh', + 'lsat-ar', + 'lsat-lr', + 'lsat-rc', + 'logiqa-en', + 'sat-math', + 'sat-en', + 'sat-en-without-passage', + 'aqua-rat', +] +agieval_multiple_choices_sets = [ + 'jec-qa-kd', + 'jec-qa-ca', +] +agieval_cloze_sets = ['gaokao-mathcloze', 'math'] +agieval_chinese_sets = [ + 'gaokao-chinese', + 'gaokao-english', + 'gaokao-geography', + 'gaokao-history', + 'gaokao-biology', + 'gaokao-chemistry', + 'gaokao-physics', + 'gaokao-mathqa', + 'logiqa-zh', + 'gaokao-mathcloze', +] +agieval_english_sets = [ + 'lsat-ar', + 'lsat-lr', + 'lsat-rc', + 'logiqa-en', + 'sat-math', + 'sat-en', + 'sat-en-without-passage', + 'aqua-rat', + 'math', +] +agieval_gaokao_sets = [ + 'gaokao-chinese', + 'gaokao-english', + 'gaokao-geography', + 'gaokao-history', + 'gaokao-biology', + 'gaokao-chemistry', + 'gaokao-physics', + 'gaokao-mathqa', +] + +agieval_datasets = [] +for _name in agieval_single_choice_sets: + if _name in agieval_chinese_sets: + _hint = '答案是: ' + else: + _hint = 'The answer is ' + agieval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt=f'{{question}}\n{{options}}\n{_hint}') + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024)) + + agieval_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_capital_postprocess)) + + agieval_datasets.append( + dict( + type=AGIEvalDataset_v2, + path='./data/AGIEval/data/v1/', + name=_name, + abbr='agieval-' + _name, + setting_name='zero-shot', + reader_cfg=agieval_reader_cfg, + infer_cfg=agieval_infer_cfg.copy(), + eval_cfg=agieval_eval_cfg.copy())) + +for _name in agieval_multiple_choices_sets: + if _name in agieval_chinese_sets: + _hint = '答案是: ' + else: + _hint = 'The answer is ' + agieval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt=f'{{question}}\n{{options}}\n{_hint}') + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024)) + + agieval_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_capital_postprocess_multi)) + + agieval_datasets.append( + 
dict( + type=AGIEvalDataset_v2, + path='./data/AGIEval/data/v1/', + name=_name, + abbr='agieval-' + _name, + setting_name='zero-shot', + reader_cfg=agieval_reader_cfg, + infer_cfg=agieval_infer_cfg.copy(), + eval_cfg=agieval_eval_cfg.copy())) + +for _name in agieval_cloze_sets: + if _name in agieval_chinese_sets: + _hint = '答案是: ' + else: + _hint = 'The answer is ' + agieval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[dict(role='HUMAN', prompt=f'{{question}}\n{_hint}')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024)) + + agieval_eval_cfg = dict(evaluator=dict(type=AGIEvalEvaluator)) + + agieval_datasets.append( + dict( + type=AGIEvalDataset_v2, + path='./data/AGIEval/data/v1/', + name=_name, + abbr='agieval-' + _name, + setting_name='zero-shot', + reader_cfg=agieval_reader_cfg, + infer_cfg=agieval_infer_cfg.copy(), + eval_cfg=agieval_eval_cfg.copy())) + +for _item in agieval_datasets: + _name = _item['name'] + _intro = { + 'gaokao-chinese': + '以下是一道中国高考语文选择题,请选择正确的答案。', + 'gaokao-english': + '以下是一道中国高考英语选择题,请选择正确的答案。', + 'gaokao-geography': + '以下是一道中国高考地理选择题,请选择正确的答案。', + 'gaokao-history': + '以下是一道中国高考历史选择题,请选择正确的答案。', + 'gaokao-biology': + '以下是一道中国高考生物选择题,请选择正确的答案。', + 'gaokao-chemistry': + '以下是一道中国高考化学选择题,请选择正确的答案。', + 'gaokao-physics': + '以下是一道中国高考物理选择题,请选择正确的答案。', + 'gaokao-mathqa': + '以下是一道中国高考数学选择题,请选择正确的答案。', + 'logiqa-zh': + '以下是一道中国公务员考试题,请选择正确的答案。', + 'lsat-ar': + 'The following is a LSAT Analytical Reasoning question. Please select the correct answer.', + 'lsat-lr': + 'The following is a LSAT Logical Reasoning question. Please select the correct answer.', + 'lsat-rc': + 'The following is a LSAT Reading Comprehension question. Please select the correct answer.', + 'logiqa-en': + 'The following is a Logic Reasoning question. Please select the correct answer.', + 'sat-math': + 'The following is a SAT Math question. Please select the correct answer.', + 'sat-en': + 'The following is a SAT English question. Please select the correct answer.', + 'sat-en-without-passage': + 'The following is a SAT English question. Please select the correct answer.', + 'aqua-rat': + 'The following is a AQUA-RAT question. Please select the correct answer.', + 'jec-qa-kd': + '以下是一道中国司法考试基础知识题,请选择正确的答案。', + 'jec-qa-ca': + '以下是一道中国司法考试案例分析题,请选择正确的答案。', + 'gaokao-mathcloze': + '以下是一道中国高考数学填空题,请填入正确的答案。', + 'math': + 'The following is a Math question. 
Please select the correct answer.', + }[_name] + _templates = _item['infer_cfg']['prompt_template']['template'] + _templates['round'][0][ + 'prompt'] = _intro + '\n' + _templates['round'][0]['prompt'] + +del _item, _intro, _templates, _name, _hint, agieval_infer_cfg, agieval_eval_cfg diff --git a/configs/datasets/agieval/agieval_mixed_2f14ad.py b/configs/datasets/agieval/agieval_mixed_2f14ad.py index c9c952c6..169f8fe9 100644 --- a/configs/datasets/agieval/agieval_mixed_2f14ad.py +++ b/configs/datasets/agieval/agieval_mixed_2f14ad.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer, GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator +from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator, AGIEvalEvaluator_mcq from opencompass.utils.text_postprocessors import first_capital_postprocess_multi agieval_single_choice_sets = [ @@ -116,7 +116,7 @@ for _name in agieval_multiple_choices_sets: inferencer=dict(type=GenInferencer, max_out_len=1024)) agieval_eval_cfg = dict( - evaluator=dict(type=AccEvaluator), + evaluator=dict(type=AGIEvalEvaluator_mcq), pred_postprocessor=dict(type=first_capital_postprocess_multi)) agieval_datasets.append( diff --git a/configs/datasets/bbh/bbh_gen_5b92b0.py b/configs/datasets/bbh/bbh_gen_5b92b0.py index e3be3dce..91b38ac9 100644 --- a/configs/datasets/bbh/bbh_gen_5b92b0.py +++ b/configs/datasets/bbh/bbh_gen_5b92b0.py @@ -3,7 +3,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess +from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq bbh_reader_cfg = dict(input_columns=["input"], output_column="target") @@ -56,7 +56,7 @@ for _name in bbh_multiple_choice_sets: retriever=dict(type=ZeroRetriever), inferencer=dict(type=GenInferencer, max_out_len=512)) bbh_eval_cfg = dict( - evaluator=dict(type=AccEvaluator), + evaluator=dict(type=BBHEvaluator_mcq), pred_role="BOT", pred_postprocessor=dict(type=bbh_mcq_postprocess), dataset_postprocessor=dict(type=bbh_mcq_postprocess)) diff --git a/configs/datasets/bbh/bbh_gen_5bf00b.py b/configs/datasets/bbh/bbh_gen_5bf00b.py index 1c814d01..ec854d37 100644 --- a/configs/datasets/bbh/bbh_gen_5bf00b.py +++ b/configs/datasets/bbh/bbh_gen_5bf00b.py @@ -3,7 +3,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess +from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq bbh_reader_cfg = dict(input_columns=["input"], output_column="target") @@ -56,7 +56,7 @@ for _name in bbh_multiple_choice_sets: retriever=dict(type=ZeroRetriever), inferencer=dict(type=GenInferencer, max_out_len=512)) bbh_eval_cfg = dict( - evaluator=dict(type=AccEvaluator), + evaluator=dict(type=BBHEvaluator_mcq), pred_role="BOT", pred_postprocessor=dict(type=bbh_mcq_postprocess), 
dataset_postprocessor=dict(type=bbh_mcq_postprocess)) diff --git a/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py b/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py index 2bcb9c6f..15217aa2 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py +++ b/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess +from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') @@ -25,7 +25,7 @@ gsm8k_infer_cfg = dict( retriever=dict(type=ZeroRetriever), inferencer=dict(type=GenInferencer, max_out_len=512)) -gsm8k_eval_cfg = dict(evaluator=dict(type=AccEvaluator), +gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator), pred_postprocessor=dict(type=gsm8k_postprocess), dataset_postprocessor=dict(type=gsm8k_dataset_postprocess)) diff --git a/configs/datasets/gsm8k/gsm8k_gen_1dce88.py b/configs/datasets/gsm8k/gsm8k_gen_1dce88.py index 0e146a48..0e0860ed 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_1dce88.py +++ b/configs/datasets/gsm8k/gsm8k_gen_1dce88.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess +from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') @@ -72,7 +72,7 @@ Question: {question}{answer} inferencer=dict(type=GenInferencer, max_out_len=512)) gsm8k_eval_cfg = dict( - evaluator=dict(type=AccEvaluator), + evaluator=dict(type=Gsm8kEvaluator), pred_postprocessor=dict(type=gsm8k_postprocess), dataset_postprocessor=dict(type=gsm8k_dataset_postprocess)) diff --git a/configs/datasets/gsm8k/gsm8k_gen_a3e34a.py b/configs/datasets/gsm8k/gsm8k_gen_a3e34a.py index 16f27213..9d7657f4 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_a3e34a.py +++ b/configs/datasets/gsm8k/gsm8k_gen_a3e34a.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import SCInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess +from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer' ) generation_kwargs = dict(do_sample=True, temperature=0.7, top_k=40) @@ -73,7 +73,7 @@ Question: {question}{answer} inferencer=dict(type=SCInferencer, max_out_len=512, generation_kwargs = generation_kwargs, infer_type='sc', sc_size = 20)) gsm8k_eval_cfg = dict( - evaluator=dict(type=AccEvaluator), + evaluator=dict(type=Gsm8kEvaluator), pred_postprocessor=dict(type=gsm8k_postprocess), dataset_postprocessor=dict(type=gsm8k_dataset_postprocess), sc_size = 20) diff --git a/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py 
b/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py index f351c901..a5a9974b 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py +++ b/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess +from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') @@ -34,7 +34,7 @@ gsm8k_infer_cfg = dict( retriever=dict(type=ZeroRetriever), inferencer=dict(type=GenInferencer)) -gsm8k_eval_cfg = dict(evaluator=dict(type=AccEvaluator), +gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator), pred_role="BOT", pred_postprocessor=dict(type=gsm8k_postprocess), dataset_postprocessor=dict(type=gsm8k_dataset_postprocess)) diff --git a/configs/models/claude/claude.py b/configs/models/claude/claude.py index 7b52c637..b57a116e 100644 --- a/configs/models/claude/claude.py +++ b/configs/models/claude/claude.py @@ -1,6 +1,8 @@ from opencompass.models.claude_api.claude_api import Claude +from opencompass.models.claude_api.postprocessors import ( + flores_postprocess, gsm8k_postprocess, humaneval_postprocess, + lcsts_postprocess, mbpp_postprocess, strategyqa_pred_postprocess) from opencompass.utils.text_postprocessors import last_option_postprocess -from opencompass.models.claude_api.postprocessors import gsm8k_postprocess, humaneval_postprocess, lcsts_postprocess, mbpp_postprocess, strategyqa_pred_postprocess agieval_single_choice_sets = [ 'gaokao-chinese', @@ -47,6 +49,8 @@ claude_postprocessors = { 'lcsts': dict(type=lcsts_postprocess), 'mbpp': dict(type=mbpp_postprocess), 'strategyqa': dict(type=strategyqa_pred_postprocess), + 'commonsense_qa': dict(type=last_option_postprocess, options='ABCDE'), + 'flores_100_*-zho_simpl': dict(type=flores_postprocess), } for _name in agieval_multiple_choices_sets + agieval_single_choice_sets: diff --git a/docs/en/user_guides/experimentation.md b/docs/en/user_guides/experimentation.md index 56f2900a..96b3cca3 100644 --- a/docs/en/user_guides/experimentation.md +++ b/docs/en/user_guides/experimentation.md @@ -5,7 +5,7 @@ The program entry for the evaluation task is `run.py`. The usage is as follows: ```shell -python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run] +python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run] [--dump-eval-details] ``` Task Configuration (`$EXP`): @@ -66,6 +66,7 @@ The parameter explanation is as follows: - `-w`: Specify the working path, default is `./outputs/default`. - `-l`: Enable status reporting via Lark bot. - `--dry-run`: When enabled, inference and evaluation tasks will be dispatched but won't actually run for debugging. +- `--dump-eval-details`: When enabled,evaluation under the `results` folder will include more details, such as the correctness of each sample. 
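To make this concrete, here is a minimal illustrative sketch (not part of the patch) of the per-sample records produced by the detail-aware evaluators added in this diff, using the `Gsm8kEvaluator` that the GSM8K configs above now reference; the record layout follows the evaluator code added later in this patch:

```python
# Illustrative sketch: the new evaluators return per-sample records next to
# the aggregate metric; --dump-eval-details writes these records into the
# files under `results`.
from opencompass.datasets import Gsm8kEvaluator

evaluator = Gsm8kEvaluator()
result = evaluator.score(predictions=['18', '7'], references=['18', '5'])
# result == {
#     'accuracy': 50.0,
#     'details': [
#         {'pred': '18', 'answers': '18', 'correct': True},
#         {'pred': '7', 'answers': '5', 'correct': False},
#     ],
# }
```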
Using run mode `-m all` as an example, the overall execution flow is as follows: diff --git a/docs/zh_cn/user_guides/experimentation.md b/docs/zh_cn/user_guides/experimentation.md index 31cafc59..5b781f27 100644 --- a/docs/zh_cn/user_guides/experimentation.md +++ b/docs/zh_cn/user_guides/experimentation.md @@ -5,7 +5,7 @@ 评测任务的程序入口为 `run.py`,使用方法如下: ```shell -python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run] +python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run] [--dump-eval-details] ``` 任务配置 (`$EXP`): @@ -66,6 +66,7 @@ python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--deb - `-w`: 指定工作路径,默认为 `./outputs/default` - `-l`: 打开飞书机器人状态上报。 - `--dry-run`: 开启时,推理和评测任务仅会分发但不会真正运行,便于调试; +- `--dump-eval-details`: 开启时,`results` 下的评测结果中将会包含更加详细的评测结果信息,例如每条样本是否正确等。 以运行模式 `-m all` 为例,整体运行流如下: diff --git a/opencompass/datasets/afqmcd.py b/opencompass/datasets/afqmcd.py index 11a15ab2..f23ae6c3 100644 --- a/opencompass/datasets/afqmcd.py +++ b/opencompass/datasets/afqmcd.py @@ -13,7 +13,7 @@ class AFQMCDataset_V2(BaseDataset): @staticmethod def load(path): data = [] - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: line = json.loads(line) line['label'] = 'AB'[int(line['label'])] diff --git a/opencompass/datasets/agieval/agieval.py b/opencompass/datasets/agieval/agieval.py index 5e76979f..e10a17cc 100644 --- a/opencompass/datasets/agieval/agieval.py +++ b/opencompass/datasets/agieval/agieval.py @@ -64,9 +64,36 @@ class AGIEvalEvaluator(BaseEvaluator): def score(self, predictions, references): predictions = [parse_math_answer('', pred) for pred in predictions] + details = [] cnt = 0 for pred, ref in zip(predictions, references): + detail = {'pred': pred, 'answer': ref, 'correct': False} if is_equiv(pred, ref): cnt += 1 + detail['correct'] = True + details.append(detail) score = cnt / len(predictions) * 100 - return {'score': score} + return {'score': score, 'details': details} + + +@ICL_EVALUATORS.register_module() +class AGIEvalEvaluator_mcq(BaseEvaluator): + + def score(self, predictions, references): + if len(predictions) != len(references): + return { + 'error': 'predictions and references have different ' + 'length' + } + details = [] + cnt = 0 + for pred, ref in zip(predictions, references): + detail = {'pred': pred, 'answer': ref, 'correct': False} + if pred == ref: + cnt += 1 + detail['correct'] = True + details.append(detail) + + score = cnt / len(predictions) * 100 + + return {'score': score, 'details': details} diff --git a/opencompass/datasets/bbh.py b/opencompass/datasets/bbh.py index e803ca9e..38f3de39 100644 --- a/opencompass/datasets/bbh.py +++ b/opencompass/datasets/bbh.py @@ -61,11 +61,38 @@ class BBHEvaluator(BaseEvaluator): predictions = [bbh_freeform_postprocess(pred) for pred in predictions] + details = [] cnt = 0 for pred, ref in zip(predictions, references): + detail = {'pred': pred, 'answer': ref, 'correct': False} if pred == ref: cnt += 1 + detail['correct'] = True + details.append(detail) score = cnt / len(predictions) * 100 - return {'score': score} + return {'score': score, 'details': details} + + +@ICL_EVALUATORS.register_module() +class BBHEvaluator_mcq(BaseEvaluator): + + def score(self, predictions, references): + if len(predictions) != len(references): + return { + 'error': 'predictions and references have different ' + 'length' + } + details 
= [] + cnt = 0 + for pred, ref in zip(predictions, references): + detail = {'pred': pred, 'answer': ref, 'correct': False} + if pred == ref: + cnt += 1 + detail['correct'] = True + details.append(detail) + + score = cnt / len(predictions) * 100 + + return {'score': score, 'details': details} diff --git a/opencompass/datasets/bustum.py b/opencompass/datasets/bustum.py index d145f4f9..0f7a02f9 100644 --- a/opencompass/datasets/bustum.py +++ b/opencompass/datasets/bustum.py @@ -13,7 +13,7 @@ class bustumDataset_V2(BaseDataset): @staticmethod def load(path): data = [] - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: line = json.loads(line) line['label'] = 'AB'[int(line['label'])] diff --git a/opencompass/datasets/c3.py b/opencompass/datasets/c3.py index 1c630675..cda3ec58 100644 --- a/opencompass/datasets/c3.py +++ b/opencompass/datasets/c3.py @@ -13,7 +13,7 @@ class C3Dataset(BaseDataset): @staticmethod def load(path: str): - with open(path) as f: + with open(path, 'r', encoding='utf-8') as f: data = json.load(f) rows = [] for _, row in enumerate(data): @@ -58,7 +58,7 @@ class C3Dataset_V2(BaseDataset): @staticmethod def load(path: str): - with open(path) as f: + with open(path, 'r', encoding='utf-8') as f: raw = json.load(f) data = [] for line in raw: diff --git a/opencompass/datasets/ceval.py b/opencompass/datasets/ceval.py index 366e976b..b9f3476f 100644 --- a/opencompass/datasets/ceval.py +++ b/opencompass/datasets/ceval.py @@ -15,7 +15,8 @@ class CEvalDataset(BaseDataset): def load(path: str, name: str): dataset = {} for split in ['dev', 'val', 'test']: - with open(osp.join(path, split, f'{name}_{split}.csv')) as f: + filename = osp.join(path, split, f'{name}_{split}.csv') + with open(filename, encoding='utf-8') as f: reader = csv.reader(f) header = next(reader) for row in reader: diff --git a/opencompass/datasets/chid.py b/opencompass/datasets/chid.py index 6c218edc..a7a4ae5c 100644 --- a/opencompass/datasets/chid.py +++ b/opencompass/datasets/chid.py @@ -31,7 +31,7 @@ class CHIDDataset_V2(BaseDataset): @staticmethod def load(path): data = [] - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: line = json.loads(line) item = {} diff --git a/opencompass/datasets/cluewsc.py b/opencompass/datasets/cluewsc.py index 5f5e0803..8f62b344 100644 --- a/opencompass/datasets/cluewsc.py +++ b/opencompass/datasets/cluewsc.py @@ -41,7 +41,7 @@ class CluewscDataset_V2(BaseDataset): @staticmethod def load(path): data = [] - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: line = json.loads(line) item = { diff --git a/opencompass/datasets/cmb.py b/opencompass/datasets/cmb.py index 5f53ec14..684c88f5 100644 --- a/opencompass/datasets/cmb.py +++ b/opencompass/datasets/cmb.py @@ -13,9 +13,9 @@ class CMBDataset(BaseDataset): @staticmethod def load(path: str): - with open(osp.join(path, 'test.json'), 'r') as f: + with open(osp.join(path, 'test.json'), 'r', encoding='utf-8') as f: test_data = json.load(f) - with open(osp.join(path, 'val.json'), 'r') as f: + with open(osp.join(path, 'val.json'), 'r', encoding='utf-8') as f: val_data = json.load(f) for da in test_data: diff --git a/opencompass/datasets/cmnli.py b/opencompass/datasets/cmnli.py index 9cd9243c..653148d3 100644 --- a/opencompass/datasets/cmnli.py +++ b/opencompass/datasets/cmnli.py @@ -13,7 +13,7 @@ class cmnliDataset_V2(BaseDataset): @staticmethod def load(path): data = [] - with open(path, 'r') as f: + with open(path, 'r', 
encoding='utf-8') as f: for line in f: line = json.loads(line) if line['label'] == '-': diff --git a/opencompass/datasets/cmrc.py b/opencompass/datasets/cmrc.py index bb388976..fcb0a847 100644 --- a/opencompass/datasets/cmrc.py +++ b/opencompass/datasets/cmrc.py @@ -12,7 +12,7 @@ class CMRCDataset(BaseDataset): @staticmethod def load(path: str): - with open(path) as f: + with open(path, 'r', encoding='utf-8') as f: data = json.load(f) # 将原始数据转换为所需的格式 rows = [] diff --git a/opencompass/datasets/copa.py b/opencompass/datasets/copa.py index 34ad4039..3aaa195e 100644 --- a/opencompass/datasets/copa.py +++ b/opencompass/datasets/copa.py @@ -13,7 +13,7 @@ class COPADataset_V2(BaseDataset): @staticmethod def load(path): dataset = [] - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: line = json.loads(line) line['label'] = 'AB'[line['label']] diff --git a/opencompass/datasets/csl.py b/opencompass/datasets/csl.py index e9379f4f..1994b44c 100644 --- a/opencompass/datasets/csl.py +++ b/opencompass/datasets/csl.py @@ -31,7 +31,7 @@ class CslDataset_V2(BaseDataset): @staticmethod def load(path): data = [] - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: line = json.loads(line) item = { diff --git a/opencompass/datasets/drcd.py b/opencompass/datasets/drcd.py index 44466242..66bd0ca9 100644 --- a/opencompass/datasets/drcd.py +++ b/opencompass/datasets/drcd.py @@ -12,7 +12,7 @@ class DRCDDataset(BaseDataset): @staticmethod def load(path: str): - with open(path) as f: + with open(path, 'r', encoding='utf-8') as f: data = json.load(f) # 将原始数据转换为所需的格式 rows = [] diff --git a/opencompass/datasets/eprstmt.py b/opencompass/datasets/eprstmt.py index dd14b960..d333b3cf 100644 --- a/opencompass/datasets/eprstmt.py +++ b/opencompass/datasets/eprstmt.py @@ -13,7 +13,7 @@ class eprstmtDataset_V2(BaseDataset): @staticmethod def load(path): data = [] - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: line = json.loads(line) item = { diff --git a/opencompass/datasets/gsm8k.py b/opencompass/datasets/gsm8k.py index 089a5a7c..b300e598 100644 --- a/opencompass/datasets/gsm8k.py +++ b/opencompass/datasets/gsm8k.py @@ -1,3 +1,4 @@ +from opencompass.openicl import BaseEvaluator from opencompass.registry import TEXT_POSTPROCESSORS @@ -26,3 +27,25 @@ def gsm8k_postprocess(text: str) -> str: if ret[i].isdigit(): ret1 += ret[i] return ret1 + + +class Gsm8kEvaluator(BaseEvaluator): + + def score(self, predictions, references): + if len(predictions) != len(references): + return { + 'error': 'predictions and references have different ' + 'length' + } + correct = 0 + count = 0 + details = [] + for i, j in zip(predictions, references): + detail = {'pred': i, 'answers': j, 'correct': False} + count += 1 + if i == j: + correct += 1 + detail['correct'] = True + details.append(detail) + result = {'accuracy': 100 * correct / count, 'details': details} + return result diff --git a/opencompass/datasets/hellaswag.py b/opencompass/datasets/hellaswag.py index f6d2a7ba..4541ca18 100644 --- a/opencompass/datasets/hellaswag.py +++ b/opencompass/datasets/hellaswag.py @@ -49,7 +49,7 @@ class hellaswagDataset_V3(BaseDataset): @staticmethod def load(path): dataset = [] - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: data = json.loads(line) dataset.append({ diff --git a/opencompass/datasets/math.py b/opencompass/datasets/math.py index bdd02650..698329e0 100644 --- 
a/opencompass/datasets/math.py +++ b/opencompass/datasets/math.py @@ -148,11 +148,15 @@ class MATHEvaluator(BaseEvaluator): } correct = 0 count = 0 + details = [] for i, j in zip(predictions, references): + detail = {'pred': i, 'answer': j, 'correct': False} count += 1 if self.is_equiv(i, j): correct += 1 - result = {'accuracy': 100 * correct / count} + detail['correct'] = True + details.append(detail) + result = {'accuracy': 100 * correct / count, 'details': details} return result def _fix_fracs(self, string): diff --git a/opencompass/datasets/natural_question.py b/opencompass/datasets/natural_question.py index 6853b099..9882a1db 100644 --- a/opencompass/datasets/natural_question.py +++ b/opencompass/datasets/natural_question.py @@ -52,9 +52,14 @@ class NQEvaluator(BaseEvaluator): processed_answers = [[general_postprocess(j).lower() for j in i] for i in references] + details = [] cnt = 0 for pred, cand_ans in zip(processed_predictions, processed_answers): + detail = {'pred': pred, 'answer': cand_ans, 'correct': False} cnt += int(any([cand == pred for cand in cand_ans])) + if int(any([cand == pred for cand in cand_ans])): + detail['correct'] = True + details.append(detail) score = cnt / len(predictions) * 100 - return {'score': score} + return {'score': score, 'details': details} diff --git a/opencompass/datasets/tnews.py b/opencompass/datasets/tnews.py index 79cdc273..606ea40c 100644 --- a/opencompass/datasets/tnews.py +++ b/opencompass/datasets/tnews.py @@ -67,7 +67,7 @@ class TNewsDataset_V2(BaseDataset): } data = [] - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: line = json.loads(line) item = { diff --git a/opencompass/datasets/triviaqa.py b/opencompass/datasets/triviaqa.py index 122e8707..e4b11bdb 100644 --- a/opencompass/datasets/triviaqa.py +++ b/opencompass/datasets/triviaqa.py @@ -51,9 +51,14 @@ class TriviaQAEvaluator(BaseEvaluator): processed_answers = [[general_postprocess(j).lower() for j in i] for i in references] + details = [] cnt = 0 for pred, cand_ans in zip(processed_predictions, processed_answers): + detail = {'pred': pred, 'answer': cand_ans, 'correct': False} cnt += int(any([cand == pred for cand in cand_ans])) + if int(any([cand == pred for cand in cand_ans])): + detail['correct'] = True + details.append(detail) score = cnt / len(predictions) * 100 - return {'score': score} + return {'score': score, 'details': details} diff --git a/opencompass/models/claude_api/postprocessors.py b/opencompass/models/claude_api/postprocessors.py index 878f1669..3df242cf 100644 --- a/opencompass/models/claude_api/postprocessors.py +++ b/opencompass/models/claude_api/postprocessors.py @@ -82,6 +82,20 @@ def strategyqa_pred_postprocess(text: str) -> str: return '' +def flores_postprocess(text: str) -> str: + text = text.strip().split('\n')[-1].strip() + return text + + +def flores_postprocess_chinese(text: str) -> str: + text = text.strip().split('\n')[-1].strip() + import jieba + truncated_text = text.strip().split('\n')[0] + cleaned_text = re.sub(r'\s+', ' ', truncated_text).strip() + cleaned_text = ' '.join(jieba.cut(cleaned_text)) + return cleaned_text + + def record_postprocess(text: str) -> str: match = re.search(r'(?<=refers to )[^.]+', text) diff --git a/opencompass/openicl/icl_evaluator/icl_em_evaluator.py b/opencompass/openicl/icl_evaluator/icl_em_evaluator.py index 169f9966..e8e08128 100644 --- a/opencompass/openicl/icl_evaluator/icl_em_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_em_evaluator.py @@ -24,11 +24,18 @@ class 
EMEvaluator(BaseEvaluator): for i in references] cnt = 0 + details = [] for pred, ans, origin_ans in zip(predictions, processed_answers, references): + answers = list(set(ans + origin_ans)) + detail = {'pred': pred, 'answer': answers} if pred in ans or pred in origin_ans: cnt += 1 + detail['correct'] = True + else: + detail['correct'] = False + details.append(detail) score = cnt / len(predictions) * 100 - return {'score': score} + return {'score': score, 'details': details} diff --git a/opencompass/openicl/icl_inferencer/icl_base_inferencer.py b/opencompass/openicl/icl_inferencer/icl_base_inferencer.py index fd3fbde7..1775ba12 100644 --- a/opencompass/openicl/icl_inferencer/icl_base_inferencer.py +++ b/opencompass/openicl/icl_inferencer/icl_base_inferencer.py @@ -51,8 +51,7 @@ class BaseInferencer: self.output_json_filepath = output_json_filepath self.output_json_filename = output_json_filename self.is_main_process = is_main_process() - if not os.path.exists(self.output_json_filepath): - os.makedirs(self.output_json_filepath) + os.makedirs(self.output_json_filepath, exist_ok=True) def inference(self, retriever: BaseRetriever, diff --git a/opencompass/openicl/icl_inferencer/icl_ppl_inferencer.py b/opencompass/openicl/icl_inferencer/icl_ppl_inferencer.py index 606afd86..0fa60bee 100644 --- a/opencompass/openicl/icl_inferencer/icl_ppl_inferencer.py +++ b/opencompass/openicl/icl_inferencer/icl_ppl_inferencer.py @@ -94,6 +94,7 @@ class PPLInferencer(BaseInferencer): index = 0 prompt_list = [] sub_ppl_list = [] + token_num_list = [] normalizing_prompt_list = [] context_length_list = [] @@ -144,6 +145,7 @@ class PPLInferencer(BaseInferencer): mode='ppl')) normalizing_prompt_list.append(normalizing_prompt) prompt_list.append(prompt) + token_num_list.append(prompt_token_num) if normalizing_str is not None: normalizing_str_len = self.model.get_token_len_from_template( @@ -186,6 +188,10 @@ class PPLInferencer(BaseInferencer): ice_str = self.model.parse_template(ice[idx], mode='ppl') output_handler.save_prompt_and_ppl( label, prompt.replace(ice_str, ''), prompt, res, index) + output_handler.results_dict[str( + index)][f'label: {str(label)}'][ + 'BPB'] = res * token_num_list[idx] / len( + prompt.replace(ice_str, '').encode()) index = index + 1 ppl.append(sub_ppl_list) diff --git a/opencompass/partitioners/base.py b/opencompass/partitioners/base.py index c6015bbd..b3c4b1ee 100644 --- a/opencompass/partitioners/base.py +++ b/opencompass/partitioners/base.py @@ -1,6 +1,6 @@ from abc import abstractmethod from copy import deepcopy -from typing import Dict, List +from typing import Dict, List, Optional from mmengine.config import ConfigDict @@ -13,16 +13,24 @@ class BasePartitioner: Args: out_dir (str): The output directory of tasks. - keep_keys (List[str]): The keys to be kept from the experiment config - to the task config. + keep_keys (Optional[List[str]], optional): The keys to be kept from the + experiment config to the task config. Defaults to None. 
If None, + the following keys will be kept: + + - eval.runner.task.judge_cfg + - eval.runner.task.dump_details """ - def __init__(self, - out_dir: str, - keep_keys: List[str] = ['eval.runner.task.judge_cfg']): + def __init__(self, out_dir: str, keep_keys: Optional[List[str]] = None): self.logger = get_logger() self.out_dir = out_dir - self.keep_keys = keep_keys + if keep_keys is None: + self.keep_keys = [ + 'eval.runner.task.judge_cfg', + 'eval.runner.task.dump_details', + ] + else: + self.keep_keys = keep_keys def __call__(self, cfg: ConfigDict) -> List[Dict]: """Generate tasks from config. Each task is defined as a @@ -63,7 +71,8 @@ class BasePartitioner: tgt_ptr = tgt_ptr[key] tgt_ptr[key_chain[-1]] = ori_ptr[key_chain[-1]] except Exception: - self.logger.warning(f'Key {k} not found in config, ignored.') + self.logger.debug(f'Key {k} not found in config, ignored.') + self.logger.debug(f'Additional config: {add_cfg}') tasks = self.partition(models, datasets, diff --git a/opencompass/partitioners/naive.py b/opencompass/partitioners/naive.py index 42bfcf57..cc638ad9 100644 --- a/opencompass/partitioners/naive.py +++ b/opencompass/partitioners/naive.py @@ -1,5 +1,5 @@ import os.path as osp -from typing import Dict, List +from typing import Dict, List, Optional from mmengine.config import Config, ConfigDict @@ -11,15 +11,23 @@ from .base import BasePartitioner @PARTITIONERS.register_module() class NaivePartitioner(BasePartitioner): - """Naive task partitioner. This partitioner will generate a task for each - model-dataset pair. + """Naive task partitioner. This partitioner will generate a task for each n + model-dataset pairs. Args: out_dir (str): The output directory of tasks. + n (int): The number of model-dataset pairs in each task. keep_keys (List[str]): The keys to be kept from the experiment config to the task config. """ + def __init__(self, + out_dir: str, + n: int = 1, + keep_keys: Optional[List[str]] = None): + super().__init__(out_dir=out_dir, keep_keys=keep_keys) + self.n = n + def partition(self, models: List[ConfigDict], datasets: List[ConfigDict], @@ -53,13 +61,17 @@ class NaivePartitioner(BasePartitioner): tasks = [] for model in models: + chunks = [] for dataset in datasets: filename = get_infer_output_path(model, dataset, out_dir) if osp.exists(filename): continue + chunks.append(dataset) + + for i in range(0, len(chunks), self.n): task = Config({ 'models': [model], - 'datasets': [[dataset]], + 'datasets': [chunks[i:i + self.n]], 'work_dir': work_dir, **add_cfg }) diff --git a/opencompass/partitioners/size.py b/opencompass/partitioners/size.py index 3bbd17fa..7e843917 100644 --- a/opencompass/partitioners/size.py +++ b/opencompass/partitioners/size.py @@ -2,7 +2,7 @@ import copy import math import os.path as osp from fnmatch import fnmatch -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import mmengine from mmengine.config import Config, ConfigDict @@ -24,6 +24,11 @@ class SizePartitioner(BasePartitioner): max_task_size (int): The maximum size of a task. gen_task_coef (int): The dataset cost measurement coefficient for generation tasks. + strategy (str): The partition strategy. Supported strategies are: + 'heuristic' and 'split'. Defaults to 'heuristic'. + heuristic: split large datasets into several tasks, merge small + datasets into one task. + split: split large datasets into several tasks only. dataset_size_path (str): The path to the dataset size cache file. 
keep_keys (list[str]): The keys to be kept from the experiment config to the task config. @@ -33,12 +38,17 @@ class SizePartitioner(BasePartitioner): out_dir: str, max_task_size: int = 40000, gen_task_coef: int = 20, + strategy: str = 'heuristic', dataset_size_path: str = '.cache/dataset_size.json', - keep_keys: List[str] = ['eval.runner.task.judge_cfg']): + keep_keys: Optional[List[str]] = None): super().__init__(out_dir=out_dir, keep_keys=keep_keys) self.max_task_size = max_task_size self.gen_task_coef = gen_task_coef self.dataset_size_path = dataset_size_path + assert strategy in ('heuristic', 'split'), \ + f'Unsupported partition strategy: {strategy}. '\ + 'Supported strategies are: `heuristic`, `split` .' + self.strategy = strategy def partition(self, models: List[ConfigDict], @@ -79,47 +89,47 @@ class SizePartitioner(BasePartitioner): reverse=True) tasks = [] for model in models: - task = Config({ - 'models': [model], - 'datasets': [[]], - 'work_dir': work_dir, - **add_cfg - }) - num_data = 0 + chunks = [] # elements: tuple(size, dataset_chunk) for dataset in datasets: filename = get_infer_output_path(model, dataset, out_dir) - root, ext = osp.splitext(filename) # skip the task if the task output exists if osp.exists(filename): continue dataset_size = self.get_cost(dataset) if dataset_size > self.max_task_size: + root, ext = osp.splitext(filename) dataset_splits = self.split_dataset(dataset) for i, dataset_split in enumerate(dataset_splits): - # skip the task it the task output exists if not osp.exists(f'{root}_{i}{ext}'): - tasks.append( - Config({ - 'models': [model], - 'datasets': [[dataset_split]], - 'work_dir': work_dir, - **add_cfg - })) + chunks.append((self.max_task_size, dataset_split)) else: - if num_data + dataset_size > self.max_task_size: - tasks.append(task) - task = Config({ + chunks.append((dataset_size, dataset)) + + if self.strategy == 'heuristic': + chunks = sorted(chunks, key=lambda x: x[0], reverse=True) + current_size, current_chunks = 0, [] + for index in range(len(chunks)): + current_size += chunks[index][0] + current_chunks.append(chunks[index][1]) + if index == len(chunks) - 1 or current_size + chunks[ + index + 1][0] > self.max_task_size: + tasks.append( + Config({ + 'models': [model], + 'datasets': [current_chunks], + 'work_dir': work_dir, + **add_cfg + })) + current_size, current_chunks = 0, [] + elif self.strategy == 'split': + for _, dataset in chunks: + tasks.append( + Config({ 'models': [model], - 'datasets': [[]], + 'datasets': [[dataset]], 'work_dir': work_dir, **add_cfg - }) - num_data = 0 - task['datasets'][0].append(dataset) - num_data = num_data + dataset_size - if task['datasets'][0]: - tasks.append(task) - + })) return tasks @property diff --git a/opencompass/partitioners/sub_naive.py b/opencompass/partitioners/sub_naive.py index a7417539..d4624b2e 100644 --- a/opencompass/partitioners/sub_naive.py +++ b/opencompass/partitioners/sub_naive.py @@ -23,7 +23,7 @@ class SubjectiveNaivePartitioner(NaivePartitioner): mode: str, out_dir: str, model_pairs: Optional[List[Tuple]] = None, - keep_keys: List[str] = ['eval.runner.task.judge_cfg']): + keep_keys: Optional[List[str]] = None): super().__init__(out_dir=out_dir, keep_keys=keep_keys) assert mode in ['all', 'one_to_n', 'fixed'] self.mode = mode diff --git a/opencompass/summarizers/default.py b/opencompass/summarizers/default.py index 447a5a18..77b6cdef 100644 --- a/opencompass/summarizers/default.py +++ b/opencompass/summarizers/default.py @@ -72,6 +72,7 @@ class DefaultSummarizer: if not 
osp.exists(filepath): continue result = mmengine.load(filepath) + result.pop('details', None) raw_results[model_abbr][dataset_abbr] = result if 'error' in result: self.logger.debug(f'error in {model_abbr} {dataset_abbr} {result["error"]}') diff --git a/opencompass/tasks/openicl_eval.py b/opencompass/tasks/openicl_eval.py index 5538a517..68be3d27 100644 --- a/opencompass/tasks/openicl_eval.py +++ b/opencompass/tasks/openicl_eval.py @@ -1,11 +1,14 @@ import argparse +import copy import fnmatch +import math import os.path as osp +import statistics import time from collections import Counter from inspect import signature from shutil import which -from typing import Optional +from typing import List, Optional import mmengine from mmengine.config import Config, ConfigDict @@ -35,6 +38,8 @@ class OpenICLEvalTask(BaseTask): super().__init__(cfg) self.num_gpus = 0 self.logger = get_logger() + self.dump_details = cfg.get('eval', {}).get('runner', {}).get( + 'task', {}).get('dump_details', False) def get_command(self, cfg_path, template): script_path = __file__ @@ -113,7 +118,7 @@ class OpenICLEvalTask(BaseTask): [sub_preds[str(i)] for i in range(len(sub_preds))]) filename = root + f'_{i}' + ext i += 1 - + pred_dicts = copy.deepcopy(preds) preds = {k: [pred.get(k) for pred in preds] for k in preds[0]} pred_strs = preds.pop('prediction') @@ -163,6 +168,7 @@ class OpenICLEvalTask(BaseTask): ] icl_evaluator = ICL_EVALUATORS.build(self.eval_cfg['evaluator']) + preds['predictions'] = pred_strs preds['references'] = (test_set[self.output_column] if self.output_column else None) @@ -172,18 +178,42 @@ class OpenICLEvalTask(BaseTask): } result = icl_evaluator.score(**preds) + if self.dump_details: + try: + details = result.pop('details', None) + result['details'] = self.format_details( + pred_strs, test_set[self.output_column], details, + pred_dicts) + result['type'] = result['details'].pop('type', None) + + if 'PPL' in str( + self.dataset_cfg.infer_cfg.inferencer.type): + result['correct_bpb'], result[ + 'incorrect_bpb'] = self.calculate_bpb(pred_dicts) + else: + result['incorrect_bpb'] = result['correct_bpb'] = -1 + except Exception: + result['incorrect_bpb'] = result['correct_bpb'] = -1 + else: + result.pop('details', None) + if 'error' in result: self.logger.error( f'Task {task_abbr_from_cfg(self.cfg)}: {result["error"]}') return else: - self.logger.info(f'Task {task_abbr_from_cfg(self.cfg)}: {result}') + result_wo_details = { + i: result[i] + for i in result if i != 'details' + } + self.logger.info( + f'Task {task_abbr_from_cfg(self.cfg)}: {result_wo_details}') # Save result out_path = get_infer_output_path(self.model_cfg, self.dataset_cfg, osp.join(self.work_dir, 'results')) mkdir_or_exist(osp.split(out_path)[0]) - mmengine.dump(result, out_path) + mmengine.dump(result, out_path, ensure_ascii=False, indent=4) def _extract_role_pred(self, s: str, begin_str: Optional[str], end_str: Optional[str]) -> str: @@ -215,6 +245,95 @@ class OpenICLEvalTask(BaseTask): return s[start:end] + def format_details(self, predictions, references, details, pred_dicts): + """This function is responsible for formatting prediction details. + + Args: + predictions (list): The prediction list. + references (list): The reference list. + details (list): Contains the 'pred' 'answer' and 'correct' for each + sample. Such as `[{'pred': '光荣和ωforce', + 'answers': ['光荣和ω-force', '光荣和ωforce'], 'correct': True}]` + pred_dicts (list): Contains a list of samples with the original + prompts. 
Such as + `[{'origin_prompt': '根据文章回答问题。你的答案应该尽可能3》…………', + 'prediction': ' 光荣和ω-force\n', 'gold': ['光荣和ω-force']}]` + + Returns: + list: The formatted prediction details. + """ + results = {} + for i in range(len(predictions)): + ppl_flag = False + result = {} + origin_prediction = copy.deepcopy(pred_dicts[i]) + origin_prediction.pop('in-context examples', None) + origin_prediction.pop('prediction', None) + keys = copy.deepcopy(list(origin_prediction.keys())) + for key in keys: + if key.startswith('label:'): + ppl_flag = True + origin_prediction[key].pop('testing input', None) + new_key = key.replace('label: ', '') + origin_prediction[new_key] = origin_prediction.pop(key) + if ppl_flag: + results['type'] = 'PPL' + result['origin_prediction'] = origin_prediction + result['predictions'] = str(predictions[i]) + result['references'] = str(references[i]) + result['correct'] = str(predictions[i]) == str(references[i]) + else: + results['type'] = 'GEN' + result['prompt'] = origin_prediction['origin_prompt'] + result['origin_prediction'] = pred_dicts[i]['prediction'] + result['predictions'] = details[i]['pred'] + result['references'] = details[i]['answers'] + result['correct'] = details[i]['correct'] + results[str(i)] = result + return results + + def calculate_bpb(self, pred_dicts: List): + """This function is used to calculate the BPB (Bits Per Byte) for the + data. The correct BPB is obtained directly from the values in the + 'predictions' file. The incorrect BPB is the average of the remaining + BPB values for each sample under different labels after subtracting the + correct BPB. The calculation of BPB (Bits Per Byte) is similar to PPL, + with the difference that it computes the additional bits needed on + average, in terms of character length, to encode the true sequence + based on the predictions. This calculation involves applying a + weighting factor based on the ratio of words to characters. + + Args: + pred_dicts (list): Contains a list of samples with each options + and BPB scores. + + Returns: + dict: Contains correct and incorrect bpb. 
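Example (illustrative numbers): if one PPL-mode sample has per-label BPB values 0.8, 1.4 and 1.7, it contributes (0.8 + 1.4 + 1.7 - 0.8) / 2 = 1.55 to the incorrect-BPB average and mean(0.8, 1.4, 1.7) = 1.3 to the correct-BPB average; both averages are taken over all samples and scaled by 100 before being returned.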
+ """ + incorrect_bpb_list = [] + bpb_list = [] + for pred_dict in pred_dicts: + preds = { + key: value + for key, value in pred_dict.items() + if key.startswith('label: ') + } + values = [] + for item in preds.items(): + values.append(item[1]) + bpbs = [value['BPB'] for value in values] + incorrect_bpb_list.append( + (sum(bpbs) - min(bpbs)) / (len(bpbs) - 1)) + bpb_list.append(statistics.mean(bpbs)) + + def filters(origins): + targets = [target for target in origins if not math.isnan(target)] + return targets + + mean_incorrect = statistics.mean(filters(incorrect_bpb_list)) + mean_correct = statistics.mean(filters(bpb_list)) + return 100 * mean_correct, 100 * mean_incorrect + def parse_args(): parser = argparse.ArgumentParser(description='Score Calculator') diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 288752b5..99d475b9 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -25,6 +25,7 @@ requests==2.31.0 rouge rouge_chinese rouge_score +sacrebleu scikit_learn==1.2.1 seaborn sentence_transformers==2.2.2 diff --git a/run.py b/run.py index f5512a67..fd323c58 100644 --- a/run.py +++ b/run.py @@ -123,6 +123,12 @@ def parse_args(): 'Will be overrideen by the "retry" argument in the config.', type=int, default=2) + parser.add_argument( + '--dump-eval-details', + help='Whether to dump the evaluation details, including the ' + 'correctness of each sample, bpb, etc.', + action='store_true', + ) # set srun args slurm_parser = parser.add_argument_group('slurm_args') parse_slurm_args(slurm_parser) @@ -300,6 +306,8 @@ def main(): if args.dlc or args.slurm or cfg.get('eval', None) is None: fill_eval_cfg(cfg, args) + if args.dump_eval_details: + cfg.eval.runner.task.dump_details = True if args.partition is not None: if RUNNERS.get(cfg.eval.runner.type) == SlurmRunner:
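For completeness, a sketch of how the new `dump_details` switch could be set directly in a config instead of via the CLI flag. Only the nested `eval.runner.task.dump_details` key comes from this patch (it mirrors the assignment in `run.py` above and is read by `OpenICLEvalTask`); the `LocalRunner` choice and the surrounding boilerplate are assumptions about a typical setup, not part of the diff.

```python
# Config-side equivalent of `--dump-eval-details` (sketch; the runner choice
# is an assumption, only `dump_details=True` is taken from this patch).
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask

eval = dict(
    runner=dict(
        type=LocalRunner,
        task=dict(
            type=OpenICLEvalTask,
            dump_details=True,  # read by OpenICLEvalTask as self.dump_details
        ),
    ),
)
```

Partitioners forward this key into each task config because `eval.runner.task.dump_details` is now part of `BasePartitioner`'s default `keep_keys`.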