diff --git a/.gitignore b/.gitignore index 23bf2a52..f2eab368 100644 --- a/.gitignore +++ b/.gitignore @@ -11,7 +11,7 @@ configs/eval_debug*.py configs/viz_*.py data work_dirs - +configs/internal/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -86,3 +86,6 @@ docs/zh_cn/_build/ # .zip *.zip + +# sft config ignore list +configs/sft_cfg/*B_* diff --git a/configs/datasets/agieval/agieval_gen_397d81.py b/configs/datasets/agieval/agieval_gen_397d81.py new file mode 100644 index 00000000..523cb074 --- /dev/null +++ b/configs/datasets/agieval/agieval_gen_397d81.py @@ -0,0 +1,204 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator +from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi + +agieval_reader_cfg = dict( + input_columns=['question', 'options'], output_column='label') + +agieval_single_choice_sets = [ + 'gaokao-chinese', + 'gaokao-english', + 'gaokao-geography', + 'gaokao-history', + 'gaokao-biology', + 'gaokao-chemistry', + 'gaokao-physics', + 'gaokao-mathqa', + 'logiqa-zh', + 'lsat-ar', + 'lsat-lr', + 'lsat-rc', + 'logiqa-en', + 'sat-math', + 'sat-en', + 'sat-en-without-passage', + 'aqua-rat', +] +agieval_multiple_choices_sets = [ + 'jec-qa-kd', + 'jec-qa-ca', +] +agieval_cloze_sets = ['gaokao-mathcloze', 'math'] +agieval_chinese_sets = [ + 'gaokao-chinese', + 'gaokao-english', + 'gaokao-geography', + 'gaokao-history', + 'gaokao-biology', + 'gaokao-chemistry', + 'gaokao-physics', + 'gaokao-mathqa', + 'logiqa-zh', + 'gaokao-mathcloze', +] +agieval_english_sets = [ + 'lsat-ar', + 'lsat-lr', + 'lsat-rc', + 'logiqa-en', + 'sat-math', + 'sat-en', + 'sat-en-without-passage', + 'aqua-rat', + 'math', +] +agieval_gaokao_sets = [ + 'gaokao-chinese', + 'gaokao-english', + 'gaokao-geography', + 'gaokao-history', + 'gaokao-biology', + 'gaokao-chemistry', + 'gaokao-physics', + 'gaokao-mathqa', +] + +agieval_datasets = [] +for _name in agieval_single_choice_sets: + if _name in agieval_chinese_sets: + _hint = '答案是: ' + else: + _hint = 'The answer is ' + agieval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt=f'{{question}}\n{{options}}\n{_hint}') + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024)) + + agieval_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_capital_postprocess)) + + agieval_datasets.append( + dict( + type=AGIEvalDataset_v2, + path='./data/AGIEval/data/v1/', + name=_name, + abbr='agieval-' + _name, + setting_name='zero-shot', + reader_cfg=agieval_reader_cfg, + infer_cfg=agieval_infer_cfg.copy(), + eval_cfg=agieval_eval_cfg.copy())) + +for _name in agieval_multiple_choices_sets: + if _name in agieval_chinese_sets: + _hint = '答案是: ' + else: + _hint = 'The answer is ' + agieval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt=f'{{question}}\n{{options}}\n{_hint}') + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024)) + + agieval_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_capital_postprocess_multi)) + + agieval_datasets.append( + 
dict( + type=AGIEvalDataset_v2, + path='./data/AGIEval/data/v1/', + name=_name, + abbr='agieval-' + _name, + setting_name='zero-shot', + reader_cfg=agieval_reader_cfg, + infer_cfg=agieval_infer_cfg.copy(), + eval_cfg=agieval_eval_cfg.copy())) + +for _name in agieval_cloze_sets: + if _name in agieval_chinese_sets: + _hint = '答案是: ' + else: + _hint = 'The answer is ' + agieval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[dict(role='HUMAN', prompt=f'{{question}}\n{_hint}')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024)) + + agieval_eval_cfg = dict(evaluator=dict(type=AGIEvalEvaluator)) + + agieval_datasets.append( + dict( + type=AGIEvalDataset_v2, + path='./data/AGIEval/data/v1/', + name=_name, + abbr='agieval-' + _name, + setting_name='zero-shot', + reader_cfg=agieval_reader_cfg, + infer_cfg=agieval_infer_cfg.copy(), + eval_cfg=agieval_eval_cfg.copy())) + +for _item in agieval_datasets: + _name = _item['name'] + _intro = { + 'gaokao-chinese': + '以下是一道中国高考语文选择题,请选择正确的答案。', + 'gaokao-english': + '以下是一道中国高考英语选择题,请选择正确的答案。', + 'gaokao-geography': + '以下是一道中国高考地理选择题,请选择正确的答案。', + 'gaokao-history': + '以下是一道中国高考历史选择题,请选择正确的答案。', + 'gaokao-biology': + '以下是一道中国高考生物选择题,请选择正确的答案。', + 'gaokao-chemistry': + '以下是一道中国高考化学选择题,请选择正确的答案。', + 'gaokao-physics': + '以下是一道中国高考物理选择题,请选择正确的答案。', + 'gaokao-mathqa': + '以下是一道中国高考数学选择题,请选择正确的答案。', + 'logiqa-zh': + '以下是一道中国公务员考试题,请选择正确的答案。', + 'lsat-ar': + 'The following is a LSAT Analytical Reasoning question. Please select the correct answer.', + 'lsat-lr': + 'The following is a LSAT Logical Reasoning question. Please select the correct answer.', + 'lsat-rc': + 'The following is a LSAT Reading Comprehension question. Please select the correct answer.', + 'logiqa-en': + 'The following is a Logic Reasoning question. Please select the correct answer.', + 'sat-math': + 'The following is a SAT Math question. Please select the correct answer.', + 'sat-en': + 'The following is a SAT English question. Please select the correct answer.', + 'sat-en-without-passage': + 'The following is a SAT English question. Please select the correct answer.', + 'aqua-rat': + 'The following is a AQUA-RAT question. Please select the correct answer.', + 'jec-qa-kd': + '以下是一道中国司法考试基础知识题,请选择正确的答案。', + 'jec-qa-ca': + '以下是一道中国司法考试案例分析题,请选择正确的答案。', + 'gaokao-mathcloze': + '以下是一道中国高考数学填空题,请填入正确的答案。', + 'math': + 'The following is a Math question. 
Please select the correct answer.', + }[_name] + _templates = _item['infer_cfg']['prompt_template']['template'] + _templates['round'][0][ + 'prompt'] = _intro + '\n' + _templates['round'][0]['prompt'] + +del _item, _intro, _templates, _name, _hint, agieval_infer_cfg, agieval_eval_cfg diff --git a/configs/datasets/agieval/agieval_mixed_2f14ad.py b/configs/datasets/agieval/agieval_mixed_2f14ad.py index c9c952c6..169f8fe9 100644 --- a/configs/datasets/agieval/agieval_mixed_2f14ad.py +++ b/configs/datasets/agieval/agieval_mixed_2f14ad.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer, GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator +from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator, AGIEvalEvaluator_mcq from opencompass.utils.text_postprocessors import first_capital_postprocess_multi agieval_single_choice_sets = [ @@ -116,7 +116,7 @@ for _name in agieval_multiple_choices_sets: inferencer=dict(type=GenInferencer, max_out_len=1024)) agieval_eval_cfg = dict( - evaluator=dict(type=AccEvaluator), + evaluator=dict(type=AGIEvalEvaluator_mcq), pred_postprocessor=dict(type=first_capital_postprocess_multi)) agieval_datasets.append( diff --git a/configs/datasets/bbh/bbh_gen_5b92b0.py b/configs/datasets/bbh/bbh_gen_5b92b0.py index e3be3dce..91b38ac9 100644 --- a/configs/datasets/bbh/bbh_gen_5b92b0.py +++ b/configs/datasets/bbh/bbh_gen_5b92b0.py @@ -3,7 +3,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess +from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq bbh_reader_cfg = dict(input_columns=["input"], output_column="target") @@ -56,7 +56,7 @@ for _name in bbh_multiple_choice_sets: retriever=dict(type=ZeroRetriever), inferencer=dict(type=GenInferencer, max_out_len=512)) bbh_eval_cfg = dict( - evaluator=dict(type=AccEvaluator), + evaluator=dict(type=BBHEvaluator_mcq), pred_role="BOT", pred_postprocessor=dict(type=bbh_mcq_postprocess), dataset_postprocessor=dict(type=bbh_mcq_postprocess)) diff --git a/configs/datasets/bbh/bbh_gen_5bf00b.py b/configs/datasets/bbh/bbh_gen_5bf00b.py index 1c814d01..ec854d37 100644 --- a/configs/datasets/bbh/bbh_gen_5bf00b.py +++ b/configs/datasets/bbh/bbh_gen_5bf00b.py @@ -3,7 +3,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess +from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq bbh_reader_cfg = dict(input_columns=["input"], output_column="target") @@ -56,7 +56,7 @@ for _name in bbh_multiple_choice_sets: retriever=dict(type=ZeroRetriever), inferencer=dict(type=GenInferencer, max_out_len=512)) bbh_eval_cfg = dict( - evaluator=dict(type=AccEvaluator), + evaluator=dict(type=BBHEvaluator_mcq), pred_role="BOT", pred_postprocessor=dict(type=bbh_mcq_postprocess), 
dataset_postprocessor=dict(type=bbh_mcq_postprocess)) diff --git a/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py b/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py index 2bcb9c6f..15217aa2 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py +++ b/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess +from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') @@ -25,7 +25,7 @@ gsm8k_infer_cfg = dict( retriever=dict(type=ZeroRetriever), inferencer=dict(type=GenInferencer, max_out_len=512)) -gsm8k_eval_cfg = dict(evaluator=dict(type=AccEvaluator), +gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator), pred_postprocessor=dict(type=gsm8k_postprocess), dataset_postprocessor=dict(type=gsm8k_dataset_postprocess)) diff --git a/configs/datasets/gsm8k/gsm8k_gen_1dce88.py b/configs/datasets/gsm8k/gsm8k_gen_1dce88.py index 0e146a48..0e0860ed 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_1dce88.py +++ b/configs/datasets/gsm8k/gsm8k_gen_1dce88.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess +from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') @@ -72,7 +72,7 @@ Question: {question}{answer} inferencer=dict(type=GenInferencer, max_out_len=512)) gsm8k_eval_cfg = dict( - evaluator=dict(type=AccEvaluator), + evaluator=dict(type=Gsm8kEvaluator), pred_postprocessor=dict(type=gsm8k_postprocess), dataset_postprocessor=dict(type=gsm8k_dataset_postprocess)) diff --git a/configs/datasets/gsm8k/gsm8k_gen_a3e34a.py b/configs/datasets/gsm8k/gsm8k_gen_a3e34a.py index 16f27213..9d7657f4 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_a3e34a.py +++ b/configs/datasets/gsm8k/gsm8k_gen_a3e34a.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import SCInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess +from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer' ) generation_kwargs = dict(do_sample=True, temperature=0.7, top_k=40) @@ -73,7 +73,7 @@ Question: {question}{answer} inferencer=dict(type=SCInferencer, max_out_len=512, generation_kwargs = generation_kwargs, infer_type='sc', sc_size = 20)) gsm8k_eval_cfg = dict( - evaluator=dict(type=AccEvaluator), + evaluator=dict(type=Gsm8kEvaluator), pred_postprocessor=dict(type=gsm8k_postprocess), dataset_postprocessor=dict(type=gsm8k_dataset_postprocess), sc_size = 20) diff --git a/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py 
b/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py index f351c901..a5a9974b 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py +++ b/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess +from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') @@ -34,7 +34,7 @@ gsm8k_infer_cfg = dict( retriever=dict(type=ZeroRetriever), inferencer=dict(type=GenInferencer)) -gsm8k_eval_cfg = dict(evaluator=dict(type=AccEvaluator), +gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator), pred_role="BOT", pred_postprocessor=dict(type=gsm8k_postprocess), dataset_postprocessor=dict(type=gsm8k_dataset_postprocess)) diff --git a/configs/models/claude/claude.py b/configs/models/claude/claude.py index 7b52c637..b57a116e 100644 --- a/configs/models/claude/claude.py +++ b/configs/models/claude/claude.py @@ -1,6 +1,8 @@ from opencompass.models.claude_api.claude_api import Claude +from opencompass.models.claude_api.postprocessors import ( + flores_postprocess, gsm8k_postprocess, humaneval_postprocess, + lcsts_postprocess, mbpp_postprocess, strategyqa_pred_postprocess) from opencompass.utils.text_postprocessors import last_option_postprocess -from opencompass.models.claude_api.postprocessors import gsm8k_postprocess, humaneval_postprocess, lcsts_postprocess, mbpp_postprocess, strategyqa_pred_postprocess agieval_single_choice_sets = [ 'gaokao-chinese', @@ -47,6 +49,8 @@ claude_postprocessors = { 'lcsts': dict(type=lcsts_postprocess), 'mbpp': dict(type=mbpp_postprocess), 'strategyqa': dict(type=strategyqa_pred_postprocess), + 'commonsense_qa': dict(type=last_option_postprocess, options='ABCDE'), + 'flores_100_*-zho_simpl': dict(type=flores_postprocess), } for _name in agieval_multiple_choices_sets + agieval_single_choice_sets: diff --git a/docs/en/user_guides/experimentation.md b/docs/en/user_guides/experimentation.md index 56f2900a..96b3cca3 100644 --- a/docs/en/user_guides/experimentation.md +++ b/docs/en/user_guides/experimentation.md @@ -5,7 +5,7 @@ The program entry for the evaluation task is `run.py`. The usage is as follows: ```shell -python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run] +python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run] [--dump-eval-details] ``` Task Configuration (`$EXP`): @@ -66,6 +66,7 @@ The parameter explanation is as follows: - `-w`: Specify the working path, default is `./outputs/default`. - `-l`: Enable status reporting via Lark bot. - `--dry-run`: When enabled, inference and evaluation tasks will be dispatched but won't actually run for debugging. +- `--dump-eval-details`: When enabled,evaluation under the `results` folder will include more details, such as the correctness of each sample. 
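To make this concrete, here is a minimal illustrative sketch (not part of the patch) of the per-sample records produced by the detail-aware evaluators added in this diff, using the `Gsm8kEvaluator` that the GSM8K configs above now reference; the record layout follows the evaluator code added later in this patch:

```python
# Illustrative sketch: the new evaluators return per-sample records next to
# the aggregate metric; --dump-eval-details writes these records into the
# files under `results`.
from opencompass.datasets import Gsm8kEvaluator

evaluator = Gsm8kEvaluator()
result = evaluator.score(predictions=['18', '7'], references=['18', '5'])
# result == {
#     'accuracy': 50.0,
#     'details': [
#         {'pred': '18', 'answers': '18', 'correct': True},
#         {'pred': '7', 'answers': '5', 'correct': False},
#     ],
# }
```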
Using run mode `-m all` as an example, the overall execution flow is as follows: diff --git a/docs/zh_cn/user_guides/experimentation.md b/docs/zh_cn/user_guides/experimentation.md index 31cafc59..5b781f27 100644 --- a/docs/zh_cn/user_guides/experimentation.md +++ b/docs/zh_cn/user_guides/experimentation.md @@ -5,7 +5,7 @@ 评测任务的程序入口为 `run.py`,使用方法如下: ```shell -python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run] +python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run] [--dump-eval-details] ``` 任务配置 (`$EXP`): @@ -66,6 +66,7 @@ python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--deb - `-w`: 指定工作路径,默认为 `./outputs/default` - `-l`: 打开飞书机器人状态上报。 - `--dry-run`: 开启时,推理和评测任务仅会分发但不会真正运行,便于调试; +- `--dump-eval-details`: 开启时,`results` 下的评测结果中将会包含更加详细的评测结果信息,例如每条样本是否正确等。 以运行模式 `-m all` 为例,整体运行流如下: diff --git a/opencompass/datasets/afqmcd.py b/opencompass/datasets/afqmcd.py index 11a15ab2..f23ae6c3 100644 --- a/opencompass/datasets/afqmcd.py +++ b/opencompass/datasets/afqmcd.py @@ -13,7 +13,7 @@ class AFQMCDataset_V2(BaseDataset): @staticmethod def load(path): data = [] - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: line = json.loads(line) line['label'] = 'AB'[int(line['label'])] diff --git a/opencompass/datasets/agieval/agieval.py b/opencompass/datasets/agieval/agieval.py index 5e76979f..e10a17cc 100644 --- a/opencompass/datasets/agieval/agieval.py +++ b/opencompass/datasets/agieval/agieval.py @@ -64,9 +64,36 @@ class AGIEvalEvaluator(BaseEvaluator): def score(self, predictions, references): predictions = [parse_math_answer('', pred) for pred in predictions] + details = [] cnt = 0 for pred, ref in zip(predictions, references): + detail = {'pred': pred, 'answer': ref, 'correct': False} if is_equiv(pred, ref): cnt += 1 + detail['correct'] = True + details.append(detail) score = cnt / len(predictions) * 100 - return {'score': score} + return {'score': score, 'details': details} + + +@ICL_EVALUATORS.register_module() +class AGIEvalEvaluator_mcq(BaseEvaluator): + + def score(self, predictions, references): + if len(predictions) != len(references): + return { + 'error': 'predictions and references have different ' + 'length' + } + details = [] + cnt = 0 + for pred, ref in zip(predictions, references): + detail = {'pred': pred, 'answer': ref, 'correct': False} + if pred == ref: + cnt += 1 + detail['correct'] = True + details.append(detail) + + score = cnt / len(predictions) * 100 + + return {'score': score, 'details': details} diff --git a/opencompass/datasets/bbh.py b/opencompass/datasets/bbh.py index e803ca9e..38f3de39 100644 --- a/opencompass/datasets/bbh.py +++ b/opencompass/datasets/bbh.py @@ -61,11 +61,38 @@ class BBHEvaluator(BaseEvaluator): predictions = [bbh_freeform_postprocess(pred) for pred in predictions] + details = [] cnt = 0 for pred, ref in zip(predictions, references): + detail = {'pred': pred, 'answer': ref, 'correct': False} if pred == ref: cnt += 1 + detail['correct'] = True + details.append(detail) score = cnt / len(predictions) * 100 - return {'score': score} + return {'score': score, 'details': details} + + +@ICL_EVALUATORS.register_module() +class BBHEvaluator_mcq(BaseEvaluator): + + def score(self, predictions, references): + if len(predictions) != len(references): + return { + 'error': 'predictions and references have different ' + 'length' + } + details 
= [] + cnt = 0 + for pred, ref in zip(predictions, references): + detail = {'pred': pred, 'answer': ref, 'correct': False} + if pred == ref: + cnt += 1 + detail['correct'] = True + details.append(detail) + + score = cnt / len(predictions) * 100 + + return {'score': score, 'details': details} diff --git a/opencompass/datasets/bustum.py b/opencompass/datasets/bustum.py index d145f4f9..0f7a02f9 100644 --- a/opencompass/datasets/bustum.py +++ b/opencompass/datasets/bustum.py @@ -13,7 +13,7 @@ class bustumDataset_V2(BaseDataset): @staticmethod def load(path): data = [] - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: line = json.loads(line) line['label'] = 'AB'[int(line['label'])] diff --git a/opencompass/datasets/c3.py b/opencompass/datasets/c3.py index 1c630675..cda3ec58 100644 --- a/opencompass/datasets/c3.py +++ b/opencompass/datasets/c3.py @@ -13,7 +13,7 @@ class C3Dataset(BaseDataset): @staticmethod def load(path: str): - with open(path) as f: + with open(path, 'r', encoding='utf-8') as f: data = json.load(f) rows = [] for _, row in enumerate(data): @@ -58,7 +58,7 @@ class C3Dataset_V2(BaseDataset): @staticmethod def load(path: str): - with open(path) as f: + with open(path, 'r', encoding='utf-8') as f: raw = json.load(f) data = [] for line in raw: diff --git a/opencompass/datasets/ceval.py b/opencompass/datasets/ceval.py index 366e976b..b9f3476f 100644 --- a/opencompass/datasets/ceval.py +++ b/opencompass/datasets/ceval.py @@ -15,7 +15,8 @@ class CEvalDataset(BaseDataset): def load(path: str, name: str): dataset = {} for split in ['dev', 'val', 'test']: - with open(osp.join(path, split, f'{name}_{split}.csv')) as f: + filename = osp.join(path, split, f'{name}_{split}.csv') + with open(filename, encoding='utf-8') as f: reader = csv.reader(f) header = next(reader) for row in reader: diff --git a/opencompass/datasets/chid.py b/opencompass/datasets/chid.py index 6c218edc..a7a4ae5c 100644 --- a/opencompass/datasets/chid.py +++ b/opencompass/datasets/chid.py @@ -31,7 +31,7 @@ class CHIDDataset_V2(BaseDataset): @staticmethod def load(path): data = [] - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: line = json.loads(line) item = {} diff --git a/opencompass/datasets/cluewsc.py b/opencompass/datasets/cluewsc.py index 5f5e0803..8f62b344 100644 --- a/opencompass/datasets/cluewsc.py +++ b/opencompass/datasets/cluewsc.py @@ -41,7 +41,7 @@ class CluewscDataset_V2(BaseDataset): @staticmethod def load(path): data = [] - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: line = json.loads(line) item = { diff --git a/opencompass/datasets/cmb.py b/opencompass/datasets/cmb.py index 5f53ec14..684c88f5 100644 --- a/opencompass/datasets/cmb.py +++ b/opencompass/datasets/cmb.py @@ -13,9 +13,9 @@ class CMBDataset(BaseDataset): @staticmethod def load(path: str): - with open(osp.join(path, 'test.json'), 'r') as f: + with open(osp.join(path, 'test.json'), 'r', encoding='utf-8') as f: test_data = json.load(f) - with open(osp.join(path, 'val.json'), 'r') as f: + with open(osp.join(path, 'val.json'), 'r', encoding='utf-8') as f: val_data = json.load(f) for da in test_data: diff --git a/opencompass/datasets/cmnli.py b/opencompass/datasets/cmnli.py index 9cd9243c..653148d3 100644 --- a/opencompass/datasets/cmnli.py +++ b/opencompass/datasets/cmnli.py @@ -13,7 +13,7 @@ class cmnliDataset_V2(BaseDataset): @staticmethod def load(path): data = [] - with open(path, 'r') as f: + with open(path, 'r', 
encoding='utf-8') as f: for line in f: line = json.loads(line) if line['label'] == '-': diff --git a/opencompass/datasets/cmrc.py b/opencompass/datasets/cmrc.py index bb388976..fcb0a847 100644 --- a/opencompass/datasets/cmrc.py +++ b/opencompass/datasets/cmrc.py @@ -12,7 +12,7 @@ class CMRCDataset(BaseDataset): @staticmethod def load(path: str): - with open(path) as f: + with open(path, 'r', encoding='utf-8') as f: data = json.load(f) # 将原始数据转换为所需的格式 rows = [] diff --git a/opencompass/datasets/copa.py b/opencompass/datasets/copa.py index 34ad4039..3aaa195e 100644 --- a/opencompass/datasets/copa.py +++ b/opencompass/datasets/copa.py @@ -13,7 +13,7 @@ class COPADataset_V2(BaseDataset): @staticmethod def load(path): dataset = [] - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: line = json.loads(line) line['label'] = 'AB'[line['label']] diff --git a/opencompass/datasets/csl.py b/opencompass/datasets/csl.py index e9379f4f..1994b44c 100644 --- a/opencompass/datasets/csl.py +++ b/opencompass/datasets/csl.py @@ -31,7 +31,7 @@ class CslDataset_V2(BaseDataset): @staticmethod def load(path): data = [] - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: line = json.loads(line) item = { diff --git a/opencompass/datasets/drcd.py b/opencompass/datasets/drcd.py index 44466242..66bd0ca9 100644 --- a/opencompass/datasets/drcd.py +++ b/opencompass/datasets/drcd.py @@ -12,7 +12,7 @@ class DRCDDataset(BaseDataset): @staticmethod def load(path: str): - with open(path) as f: + with open(path, 'r', encoding='utf-8') as f: data = json.load(f) # 将原始数据转换为所需的格式 rows = [] diff --git a/opencompass/datasets/eprstmt.py b/opencompass/datasets/eprstmt.py index dd14b960..d333b3cf 100644 --- a/opencompass/datasets/eprstmt.py +++ b/opencompass/datasets/eprstmt.py @@ -13,7 +13,7 @@ class eprstmtDataset_V2(BaseDataset): @staticmethod def load(path): data = [] - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: line = json.loads(line) item = { diff --git a/opencompass/datasets/gsm8k.py b/opencompass/datasets/gsm8k.py index 089a5a7c..b300e598 100644 --- a/opencompass/datasets/gsm8k.py +++ b/opencompass/datasets/gsm8k.py @@ -1,3 +1,4 @@ +from opencompass.openicl import BaseEvaluator from opencompass.registry import TEXT_POSTPROCESSORS @@ -26,3 +27,25 @@ def gsm8k_postprocess(text: str) -> str: if ret[i].isdigit(): ret1 += ret[i] return ret1 + + +class Gsm8kEvaluator(BaseEvaluator): + + def score(self, predictions, references): + if len(predictions) != len(references): + return { + 'error': 'predictions and references have different ' + 'length' + } + correct = 0 + count = 0 + details = [] + for i, j in zip(predictions, references): + detail = {'pred': i, 'answers': j, 'correct': False} + count += 1 + if i == j: + correct += 1 + detail['correct'] = True + details.append(detail) + result = {'accuracy': 100 * correct / count, 'details': details} + return result diff --git a/opencompass/datasets/hellaswag.py b/opencompass/datasets/hellaswag.py index f6d2a7ba..4541ca18 100644 --- a/opencompass/datasets/hellaswag.py +++ b/opencompass/datasets/hellaswag.py @@ -49,7 +49,7 @@ class hellaswagDataset_V3(BaseDataset): @staticmethod def load(path): dataset = [] - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: data = json.loads(line) dataset.append({ diff --git a/opencompass/datasets/math.py b/opencompass/datasets/math.py index bdd02650..698329e0 100644 --- 
a/opencompass/datasets/math.py +++ b/opencompass/datasets/math.py @@ -148,11 +148,15 @@ class MATHEvaluator(BaseEvaluator): } correct = 0 count = 0 + details = [] for i, j in zip(predictions, references): + detail = {'pred': i, 'answer': j, 'correct': False} count += 1 if self.is_equiv(i, j): correct += 1 - result = {'accuracy': 100 * correct / count} + detail['correct'] = True + details.append(detail) + result = {'accuracy': 100 * correct / count, 'details': details} return result def _fix_fracs(self, string): diff --git a/opencompass/datasets/natural_question.py b/opencompass/datasets/natural_question.py index 6853b099..9882a1db 100644 --- a/opencompass/datasets/natural_question.py +++ b/opencompass/datasets/natural_question.py @@ -52,9 +52,14 @@ class NQEvaluator(BaseEvaluator): processed_answers = [[general_postprocess(j).lower() for j in i] for i in references] + details = [] cnt = 0 for pred, cand_ans in zip(processed_predictions, processed_answers): + detail = {'pred': pred, 'answer': cand_ans, 'correct': False} cnt += int(any([cand == pred for cand in cand_ans])) + if int(any([cand == pred for cand in cand_ans])): + detail['correct'] = True + details.append(detail) score = cnt / len(predictions) * 100 - return {'score': score} + return {'score': score, 'details': details} diff --git a/opencompass/datasets/tnews.py b/opencompass/datasets/tnews.py index 79cdc273..606ea40c 100644 --- a/opencompass/datasets/tnews.py +++ b/opencompass/datasets/tnews.py @@ -67,7 +67,7 @@ class TNewsDataset_V2(BaseDataset): } data = [] - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: line = json.loads(line) item = { diff --git a/opencompass/datasets/triviaqa.py b/opencompass/datasets/triviaqa.py index 122e8707..e4b11bdb 100644 --- a/opencompass/datasets/triviaqa.py +++ b/opencompass/datasets/triviaqa.py @@ -51,9 +51,14 @@ class TriviaQAEvaluator(BaseEvaluator): processed_answers = [[general_postprocess(j).lower() for j in i] for i in references] + details = [] cnt = 0 for pred, cand_ans in zip(processed_predictions, processed_answers): + detail = {'pred': pred, 'answer': cand_ans, 'correct': False} cnt += int(any([cand == pred for cand in cand_ans])) + if int(any([cand == pred for cand in cand_ans])): + detail['correct'] = True + details.append(detail) score = cnt / len(predictions) * 100 - return {'score': score} + return {'score': score, 'details': details} diff --git a/opencompass/models/claude_api/postprocessors.py b/opencompass/models/claude_api/postprocessors.py index 878f1669..3df242cf 100644 --- a/opencompass/models/claude_api/postprocessors.py +++ b/opencompass/models/claude_api/postprocessors.py @@ -82,6 +82,20 @@ def strategyqa_pred_postprocess(text: str) -> str: return '' +def flores_postprocess(text: str) -> str: + text = text.strip().split('\n')[-1].strip() + return text + + +def flores_postprocess_chinese(text: str) -> str: + text = text.strip().split('\n')[-1].strip() + import jieba + truncated_text = text.strip().split('\n')[0] + cleaned_text = re.sub(r'\s+', ' ', truncated_text).strip() + cleaned_text = ' '.join(jieba.cut(cleaned_text)) + return cleaned_text + + def record_postprocess(text: str) -> str: match = re.search(r'(?<=refers to )[^.]+', text) diff --git a/opencompass/openicl/icl_evaluator/icl_em_evaluator.py b/opencompass/openicl/icl_evaluator/icl_em_evaluator.py index 169f9966..e8e08128 100644 --- a/opencompass/openicl/icl_evaluator/icl_em_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_em_evaluator.py @@ -24,11 +24,18 @@ class 
EMEvaluator(BaseEvaluator): for i in references] cnt = 0 + details = [] for pred, ans, origin_ans in zip(predictions, processed_answers, references): + answers = list(set(ans + origin_ans)) + detail = {'pred': pred, 'answer': answers} if pred in ans or pred in origin_ans: cnt += 1 + detail['correct'] = True + else: + detail['correct'] = False + details.append(detail) score = cnt / len(predictions) * 100 - return {'score': score} + return {'score': score, 'details': details} diff --git a/opencompass/openicl/icl_inferencer/icl_base_inferencer.py b/opencompass/openicl/icl_inferencer/icl_base_inferencer.py index fd3fbde7..1775ba12 100644 --- a/opencompass/openicl/icl_inferencer/icl_base_inferencer.py +++ b/opencompass/openicl/icl_inferencer/icl_base_inferencer.py @@ -51,8 +51,7 @@ class BaseInferencer: self.output_json_filepath = output_json_filepath self.output_json_filename = output_json_filename self.is_main_process = is_main_process() - if not os.path.exists(self.output_json_filepath): - os.makedirs(self.output_json_filepath) + os.makedirs(self.output_json_filepath, exist_ok=True) def inference(self, retriever: BaseRetriever, diff --git a/opencompass/openicl/icl_inferencer/icl_ppl_inferencer.py b/opencompass/openicl/icl_inferencer/icl_ppl_inferencer.py index 606afd86..0fa60bee 100644 --- a/opencompass/openicl/icl_inferencer/icl_ppl_inferencer.py +++ b/opencompass/openicl/icl_inferencer/icl_ppl_inferencer.py @@ -94,6 +94,7 @@ class PPLInferencer(BaseInferencer): index = 0 prompt_list = [] sub_ppl_list = [] + token_num_list = [] normalizing_prompt_list = [] context_length_list = [] @@ -144,6 +145,7 @@ class PPLInferencer(BaseInferencer): mode='ppl')) normalizing_prompt_list.append(normalizing_prompt) prompt_list.append(prompt) + token_num_list.append(prompt_token_num) if normalizing_str is not None: normalizing_str_len = self.model.get_token_len_from_template( @@ -186,6 +188,10 @@ class PPLInferencer(BaseInferencer): ice_str = self.model.parse_template(ice[idx], mode='ppl') output_handler.save_prompt_and_ppl( label, prompt.replace(ice_str, ''), prompt, res, index) + output_handler.results_dict[str( + index)][f'label: {str(label)}'][ + 'BPB'] = res * token_num_list[idx] / len( + prompt.replace(ice_str, '').encode()) index = index + 1 ppl.append(sub_ppl_list) diff --git a/opencompass/partitioners/base.py b/opencompass/partitioners/base.py index c6015bbd..b3c4b1ee 100644 --- a/opencompass/partitioners/base.py +++ b/opencompass/partitioners/base.py @@ -1,6 +1,6 @@ from abc import abstractmethod from copy import deepcopy -from typing import Dict, List +from typing import Dict, List, Optional from mmengine.config import ConfigDict @@ -13,16 +13,24 @@ class BasePartitioner: Args: out_dir (str): The output directory of tasks. - keep_keys (List[str]): The keys to be kept from the experiment config - to the task config. + keep_keys (Optional[List[str]], optional): The keys to be kept from the + experiment config to the task config. Defaults to None. 
If None, + the following keys will be kept: + + - eval.runner.task.judge_cfg + - eval.runner.task.dump_details """ - def __init__(self, - out_dir: str, - keep_keys: List[str] = ['eval.runner.task.judge_cfg']): + def __init__(self, out_dir: str, keep_keys: Optional[List[str]] = None): self.logger = get_logger() self.out_dir = out_dir - self.keep_keys = keep_keys + if keep_keys is None: + self.keep_keys = [ + 'eval.runner.task.judge_cfg', + 'eval.runner.task.dump_details', + ] + else: + self.keep_keys = keep_keys def __call__(self, cfg: ConfigDict) -> List[Dict]: """Generate tasks from config. Each task is defined as a @@ -63,7 +71,8 @@ class BasePartitioner: tgt_ptr = tgt_ptr[key] tgt_ptr[key_chain[-1]] = ori_ptr[key_chain[-1]] except Exception: - self.logger.warning(f'Key {k} not found in config, ignored.') + self.logger.debug(f'Key {k} not found in config, ignored.') + self.logger.debug(f'Additional config: {add_cfg}') tasks = self.partition(models, datasets, diff --git a/opencompass/partitioners/naive.py b/opencompass/partitioners/naive.py index 42bfcf57..cc638ad9 100644 --- a/opencompass/partitioners/naive.py +++ b/opencompass/partitioners/naive.py @@ -1,5 +1,5 @@ import os.path as osp -from typing import Dict, List +from typing import Dict, List, Optional from mmengine.config import Config, ConfigDict @@ -11,15 +11,23 @@ from .base import BasePartitioner @PARTITIONERS.register_module() class NaivePartitioner(BasePartitioner): - """Naive task partitioner. This partitioner will generate a task for each - model-dataset pair. + """Naive task partitioner. This partitioner will generate a task for each n + model-dataset pairs. Args: out_dir (str): The output directory of tasks. + n (int): The number of model-dataset pairs in each task. keep_keys (List[str]): The keys to be kept from the experiment config to the task config. """ + def __init__(self, + out_dir: str, + n: int = 1, + keep_keys: Optional[List[str]] = None): + super().__init__(out_dir=out_dir, keep_keys=keep_keys) + self.n = n + def partition(self, models: List[ConfigDict], datasets: List[ConfigDict], @@ -53,13 +61,17 @@ class NaivePartitioner(BasePartitioner): tasks = [] for model in models: + chunks = [] for dataset in datasets: filename = get_infer_output_path(model, dataset, out_dir) if osp.exists(filename): continue + chunks.append(dataset) + + for i in range(0, len(chunks), self.n): task = Config({ 'models': [model], - 'datasets': [[dataset]], + 'datasets': [chunks[i:i + self.n]], 'work_dir': work_dir, **add_cfg }) diff --git a/opencompass/partitioners/size.py b/opencompass/partitioners/size.py index 3bbd17fa..7e843917 100644 --- a/opencompass/partitioners/size.py +++ b/opencompass/partitioners/size.py @@ -2,7 +2,7 @@ import copy import math import os.path as osp from fnmatch import fnmatch -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import mmengine from mmengine.config import Config, ConfigDict @@ -24,6 +24,11 @@ class SizePartitioner(BasePartitioner): max_task_size (int): The maximum size of a task. gen_task_coef (int): The dataset cost measurement coefficient for generation tasks. + strategy (str): The partition strategy. Supported strategies are: + 'heuristic' and 'split'. Defaults to 'heuristic'. + heuristic: split large datasets into several tasks, merge small + datasets into one task. + split: split large datasets into several tasks only. dataset_size_path (str): The path to the dataset size cache file. 
keep_keys (list[str]): The keys to be kept from the experiment config to the task config. @@ -33,12 +38,17 @@ class SizePartitioner(BasePartitioner): out_dir: str, max_task_size: int = 40000, gen_task_coef: int = 20, + strategy: str = 'heuristic', dataset_size_path: str = '.cache/dataset_size.json', - keep_keys: List[str] = ['eval.runner.task.judge_cfg']): + keep_keys: Optional[List[str]] = None): super().__init__(out_dir=out_dir, keep_keys=keep_keys) self.max_task_size = max_task_size self.gen_task_coef = gen_task_coef self.dataset_size_path = dataset_size_path + assert strategy in ('heuristic', 'split'), \ + f'Unsupported partition strategy: {strategy}. '\ + 'Supported strategies are: `heuristic`, `split` .' + self.strategy = strategy def partition(self, models: List[ConfigDict], @@ -79,47 +89,47 @@ class SizePartitioner(BasePartitioner): reverse=True) tasks = [] for model in models: - task = Config({ - 'models': [model], - 'datasets': [[]], - 'work_dir': work_dir, - **add_cfg - }) - num_data = 0 + chunks = [] # elements: tuple(size, dataset_chunk) for dataset in datasets: filename = get_infer_output_path(model, dataset, out_dir) - root, ext = osp.splitext(filename) # skip the task if the task output exists if osp.exists(filename): continue dataset_size = self.get_cost(dataset) if dataset_size > self.max_task_size: + root, ext = osp.splitext(filename) dataset_splits = self.split_dataset(dataset) for i, dataset_split in enumerate(dataset_splits): - # skip the task it the task output exists if not osp.exists(f'{root}_{i}{ext}'): - tasks.append( - Config({ - 'models': [model], - 'datasets': [[dataset_split]], - 'work_dir': work_dir, - **add_cfg - })) + chunks.append((self.max_task_size, dataset_split)) else: - if num_data + dataset_size > self.max_task_size: - tasks.append(task) - task = Config({ + chunks.append((dataset_size, dataset)) + + if self.strategy == 'heuristic': + chunks = sorted(chunks, key=lambda x: x[0], reverse=True) + current_size, current_chunks = 0, [] + for index in range(len(chunks)): + current_size += chunks[index][0] + current_chunks.append(chunks[index][1]) + if index == len(chunks) - 1 or current_size + chunks[ + index + 1][0] > self.max_task_size: + tasks.append( + Config({ + 'models': [model], + 'datasets': [current_chunks], + 'work_dir': work_dir, + **add_cfg + })) + current_size, current_chunks = 0, [] + elif self.strategy == 'split': + for _, dataset in chunks: + tasks.append( + Config({ 'models': [model], - 'datasets': [[]], + 'datasets': [[dataset]], 'work_dir': work_dir, **add_cfg - }) - num_data = 0 - task['datasets'][0].append(dataset) - num_data = num_data + dataset_size - if task['datasets'][0]: - tasks.append(task) - + })) return tasks @property diff --git a/opencompass/partitioners/sub_naive.py b/opencompass/partitioners/sub_naive.py index a7417539..d4624b2e 100644 --- a/opencompass/partitioners/sub_naive.py +++ b/opencompass/partitioners/sub_naive.py @@ -23,7 +23,7 @@ class SubjectiveNaivePartitioner(NaivePartitioner): mode: str, out_dir: str, model_pairs: Optional[List[Tuple]] = None, - keep_keys: List[str] = ['eval.runner.task.judge_cfg']): + keep_keys: Optional[List[str]] = None): super().__init__(out_dir=out_dir, keep_keys=keep_keys) assert mode in ['all', 'one_to_n', 'fixed'] self.mode = mode diff --git a/opencompass/summarizers/default.py b/opencompass/summarizers/default.py index 447a5a18..77b6cdef 100644 --- a/opencompass/summarizers/default.py +++ b/opencompass/summarizers/default.py @@ -72,6 +72,7 @@ class DefaultSummarizer: if not 
osp.exists(filepath): continue result = mmengine.load(filepath) + result.pop('details', None) raw_results[model_abbr][dataset_abbr] = result if 'error' in result: self.logger.debug(f'error in {model_abbr} {dataset_abbr} {result["error"]}') diff --git a/opencompass/tasks/openicl_eval.py b/opencompass/tasks/openicl_eval.py index 5538a517..68be3d27 100644 --- a/opencompass/tasks/openicl_eval.py +++ b/opencompass/tasks/openicl_eval.py @@ -1,11 +1,14 @@ import argparse +import copy import fnmatch +import math import os.path as osp +import statistics import time from collections import Counter from inspect import signature from shutil import which -from typing import Optional +from typing import List, Optional import mmengine from mmengine.config import Config, ConfigDict @@ -35,6 +38,8 @@ class OpenICLEvalTask(BaseTask): super().__init__(cfg) self.num_gpus = 0 self.logger = get_logger() + self.dump_details = cfg.get('eval', {}).get('runner', {}).get( + 'task', {}).get('dump_details', False) def get_command(self, cfg_path, template): script_path = __file__ @@ -113,7 +118,7 @@ class OpenICLEvalTask(BaseTask): [sub_preds[str(i)] for i in range(len(sub_preds))]) filename = root + f'_{i}' + ext i += 1 - + pred_dicts = copy.deepcopy(preds) preds = {k: [pred.get(k) for pred in preds] for k in preds[0]} pred_strs = preds.pop('prediction') @@ -163,6 +168,7 @@ class OpenICLEvalTask(BaseTask): ] icl_evaluator = ICL_EVALUATORS.build(self.eval_cfg['evaluator']) + preds['predictions'] = pred_strs preds['references'] = (test_set[self.output_column] if self.output_column else None) @@ -172,18 +178,42 @@ class OpenICLEvalTask(BaseTask): } result = icl_evaluator.score(**preds) + if self.dump_details: + try: + details = result.pop('details', None) + result['details'] = self.format_details( + pred_strs, test_set[self.output_column], details, + pred_dicts) + result['type'] = result['details'].pop('type', None) + + if 'PPL' in str( + self.dataset_cfg.infer_cfg.inferencer.type): + result['correct_bpb'], result[ + 'incorrect_bpb'] = self.calculate_bpb(pred_dicts) + else: + result['incorrect_bpb'] = result['correct_bpb'] = -1 + except Exception: + result['incorrect_bpb'] = result['correct_bpb'] = -1 + else: + result.pop('details', None) + if 'error' in result: self.logger.error( f'Task {task_abbr_from_cfg(self.cfg)}: {result["error"]}') return else: - self.logger.info(f'Task {task_abbr_from_cfg(self.cfg)}: {result}') + result_wo_details = { + i: result[i] + for i in result if i != 'details' + } + self.logger.info( + f'Task {task_abbr_from_cfg(self.cfg)}: {result_wo_details}') # Save result out_path = get_infer_output_path(self.model_cfg, self.dataset_cfg, osp.join(self.work_dir, 'results')) mkdir_or_exist(osp.split(out_path)[0]) - mmengine.dump(result, out_path) + mmengine.dump(result, out_path, ensure_ascii=False, indent=4) def _extract_role_pred(self, s: str, begin_str: Optional[str], end_str: Optional[str]) -> str: @@ -215,6 +245,95 @@ class OpenICLEvalTask(BaseTask): return s[start:end] + def format_details(self, predictions, references, details, pred_dicts): + """This function is responsible for formatting prediction details. + + Args: + predictions (list): The prediction list. + references (list): The reference list. + details (list): Contains the 'pred' 'answer' and 'correct' for each + sample. Such as `[{'pred': '光荣和ωforce', + 'answers': ['光荣和ω-force', '光荣和ωforce'], 'correct': True}]` + pred_dicts (list): Contains a list of samples with the original + prompts. 
Such as + `[{'origin_prompt': '根据文章回答问题。你的答案应该尽可能3》…………', + 'prediction': ' 光荣和ω-force\n', 'gold': ['光荣和ω-force']}]` + + Returns: + list: The formatted prediction details. + """ + results = {} + for i in range(len(predictions)): + ppl_flag = False + result = {} + origin_prediction = copy.deepcopy(pred_dicts[i]) + origin_prediction.pop('in-context examples', None) + origin_prediction.pop('prediction', None) + keys = copy.deepcopy(list(origin_prediction.keys())) + for key in keys: + if key.startswith('label:'): + ppl_flag = True + origin_prediction[key].pop('testing input', None) + new_key = key.replace('label: ', '') + origin_prediction[new_key] = origin_prediction.pop(key) + if ppl_flag: + results['type'] = 'PPL' + result['origin_prediction'] = origin_prediction + result['predictions'] = str(predictions[i]) + result['references'] = str(references[i]) + result['correct'] = str(predictions[i]) == str(references[i]) + else: + results['type'] = 'GEN' + result['prompt'] = origin_prediction['origin_prompt'] + result['origin_prediction'] = pred_dicts[i]['prediction'] + result['predictions'] = details[i]['pred'] + result['references'] = details[i]['answers'] + result['correct'] = details[i]['correct'] + results[str(i)] = result + return results + + def calculate_bpb(self, pred_dicts: List): + """This function is used to calculate the BPB (Bits Per Byte) for the + data. The correct BPB is obtained directly from the values in the + 'predictions' file. The incorrect BPB is the average of the remaining + BPB values for each sample under different labels after subtracting the + correct BPB. The calculation of BPB (Bits Per Byte) is similar to PPL, + with the difference that it computes the additional bits needed on + average, in terms of character length, to encode the true sequence + based on the predictions. This calculation involves applying a + weighting factor based on the ratio of words to characters. + + Args: + pred_dicts (list): Contains a list of samples with each options + and BPB scores. + + Returns: + dict: Contains correct and incorrect bpb. 
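Example (illustrative numbers): if one PPL-mode sample has per-label BPB values 0.8, 1.4 and 1.7, it contributes (0.8 + 1.4 + 1.7 - 0.8) / 2 = 1.55 to the incorrect-BPB average and mean(0.8, 1.4, 1.7) = 1.3 to the correct-BPB average; both averages are taken over all samples and scaled by 100 before being returned.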
+ """ + incorrect_bpb_list = [] + bpb_list = [] + for pred_dict in pred_dicts: + preds = { + key: value + for key, value in pred_dict.items() + if key.startswith('label: ') + } + values = [] + for item in preds.items(): + values.append(item[1]) + bpbs = [value['BPB'] for value in values] + incorrect_bpb_list.append( + (sum(bpbs) - min(bpbs)) / (len(bpbs) - 1)) + bpb_list.append(statistics.mean(bpbs)) + + def filters(origins): + targets = [target for target in origins if not math.isnan(target)] + return targets + + mean_incorrect = statistics.mean(filters(incorrect_bpb_list)) + mean_correct = statistics.mean(filters(bpb_list)) + return 100 * mean_correct, 100 * mean_incorrect + def parse_args(): parser = argparse.ArgumentParser(description='Score Calculator') diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 288752b5..99d475b9 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -25,6 +25,7 @@ requests==2.31.0 rouge rouge_chinese rouge_score +sacrebleu scikit_learn==1.2.1 seaborn sentence_transformers==2.2.2 diff --git a/run.py b/run.py index f5512a67..fd323c58 100644 --- a/run.py +++ b/run.py @@ -123,6 +123,12 @@ def parse_args(): 'Will be overrideen by the "retry" argument in the config.', type=int, default=2) + parser.add_argument( + '--dump-eval-details', + help='Whether to dump the evaluation details, including the ' + 'correctness of each sample, bpb, etc.', + action='store_true', + ) # set srun args slurm_parser = parser.add_argument_group('slurm_args') parse_slurm_args(slurm_parser) @@ -300,6 +306,8 @@ def main(): if args.dlc or args.slurm or cfg.get('eval', None) is None: fill_eval_cfg(cfg, args) + if args.dump_eval_details: + cfg.eval.runner.task.dump_details = True if args.partition is not None: if RUNNERS.get(cfg.eval.runner.type) == SlurmRunner:
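For completeness, a sketch of how the new `dump_details` switch could be set directly in a config instead of via the CLI flag. Only the nested `eval.runner.task.dump_details` key comes from this patch (it mirrors the assignment in `run.py` above and is read by `OpenICLEvalTask`); the `LocalRunner` choice and the surrounding boilerplate are assumptions about a typical setup, not part of the diff.

```python
# Config-side equivalent of `--dump-eval-details` (sketch; the runner choice
# is an assumption, only `dump_details=True` is taken from this patch).
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask

eval = dict(
    runner=dict(
        type=LocalRunner,
        task=dict(
            type=OpenICLEvalTask,
            dump_details=True,  # read by OpenICLEvalTask as self.dump_details
        ),
    ),
)
```

Partitioners forward this key into each task config because `eval.runner.task.dump_details` is now part of `BasePartitioner`'s default `keep_keys`.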