Mirror of https://github.com/open-compass/opencompass.git, synced 2025-05-30 16:03:24 +08:00.
[Sync] update (#517)
parent 6f07af3039
commit dbb20b8270
.gitignore (vendored): 5 changes
@@ -11,7 +11,7 @@ configs/eval_debug*.py
configs/viz_*.py
data
work_dirs

configs/internal/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -86,3 +86,6 @@ docs/zh_cn/_build/

# .zip
*.zip

+# sft config ignore list
+configs/sft_cfg/*B_*
configs/datasets/agieval/agieval_gen_397d81.py (new file): 204 lines
@@ -0,0 +1,204 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator
from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi

agieval_reader_cfg = dict(
    input_columns=['question', 'options'], output_column='label')

agieval_single_choice_sets = [
    'gaokao-chinese',
    'gaokao-english',
    'gaokao-geography',
    'gaokao-history',
    'gaokao-biology',
    'gaokao-chemistry',
    'gaokao-physics',
    'gaokao-mathqa',
    'logiqa-zh',
    'lsat-ar',
    'lsat-lr',
    'lsat-rc',
    'logiqa-en',
    'sat-math',
    'sat-en',
    'sat-en-without-passage',
    'aqua-rat',
]
agieval_multiple_choices_sets = [
    'jec-qa-kd',
    'jec-qa-ca',
]
agieval_cloze_sets = ['gaokao-mathcloze', 'math']
agieval_chinese_sets = [
    'gaokao-chinese',
    'gaokao-english',
    'gaokao-geography',
    'gaokao-history',
    'gaokao-biology',
    'gaokao-chemistry',
    'gaokao-physics',
    'gaokao-mathqa',
    'logiqa-zh',
    'gaokao-mathcloze',
]
agieval_english_sets = [
    'lsat-ar',
    'lsat-lr',
    'lsat-rc',
    'logiqa-en',
    'sat-math',
    'sat-en',
    'sat-en-without-passage',
    'aqua-rat',
    'math',
]
agieval_gaokao_sets = [
    'gaokao-chinese',
    'gaokao-english',
    'gaokao-geography',
    'gaokao-history',
    'gaokao-biology',
    'gaokao-chemistry',
    'gaokao-physics',
    'gaokao-mathqa',
]

agieval_datasets = []
for _name in agieval_single_choice_sets:
    if _name in agieval_chinese_sets:
        _hint = '答案是: '
    else:
        _hint = 'The answer is '
    agieval_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN', prompt=f'{{question}}\n{{options}}\n{_hint}')
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=1024))

    agieval_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_capital_postprocess))

    agieval_datasets.append(
        dict(
            type=AGIEvalDataset_v2,
            path='./data/AGIEval/data/v1/',
            name=_name,
            abbr='agieval-' + _name,
            setting_name='zero-shot',
            reader_cfg=agieval_reader_cfg,
            infer_cfg=agieval_infer_cfg.copy(),
            eval_cfg=agieval_eval_cfg.copy()))

for _name in agieval_multiple_choices_sets:
    if _name in agieval_chinese_sets:
        _hint = '答案是: '
    else:
        _hint = 'The answer is '
    agieval_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN', prompt=f'{{question}}\n{{options}}\n{_hint}')
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=1024))

    agieval_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_capital_postprocess_multi))

    agieval_datasets.append(
        dict(
            type=AGIEvalDataset_v2,
            path='./data/AGIEval/data/v1/',
            name=_name,
            abbr='agieval-' + _name,
            setting_name='zero-shot',
            reader_cfg=agieval_reader_cfg,
            infer_cfg=agieval_infer_cfg.copy(),
            eval_cfg=agieval_eval_cfg.copy()))

for _name in agieval_cloze_sets:
    if _name in agieval_chinese_sets:
        _hint = '答案是: '
    else:
        _hint = 'The answer is '
    agieval_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[dict(role='HUMAN', prompt=f'{{question}}\n{_hint}')])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=1024))

    agieval_eval_cfg = dict(evaluator=dict(type=AGIEvalEvaluator))

    agieval_datasets.append(
        dict(
            type=AGIEvalDataset_v2,
            path='./data/AGIEval/data/v1/',
            name=_name,
            abbr='agieval-' + _name,
            setting_name='zero-shot',
            reader_cfg=agieval_reader_cfg,
            infer_cfg=agieval_infer_cfg.copy(),
            eval_cfg=agieval_eval_cfg.copy()))

for _item in agieval_datasets:
    _name = _item['name']
    _intro = {
        'gaokao-chinese':
        '以下是一道中国高考语文选择题,请选择正确的答案。',
        'gaokao-english':
        '以下是一道中国高考英语选择题,请选择正确的答案。',
        'gaokao-geography':
        '以下是一道中国高考地理选择题,请选择正确的答案。',
        'gaokao-history':
        '以下是一道中国高考历史选择题,请选择正确的答案。',
        'gaokao-biology':
        '以下是一道中国高考生物选择题,请选择正确的答案。',
        'gaokao-chemistry':
        '以下是一道中国高考化学选择题,请选择正确的答案。',
        'gaokao-physics':
        '以下是一道中国高考物理选择题,请选择正确的答案。',
        'gaokao-mathqa':
        '以下是一道中国高考数学选择题,请选择正确的答案。',
        'logiqa-zh':
        '以下是一道中国公务员考试题,请选择正确的答案。',
        'lsat-ar':
        'The following is a LSAT Analytical Reasoning question. Please select the correct answer.',
        'lsat-lr':
        'The following is a LSAT Logical Reasoning question. Please select the correct answer.',
        'lsat-rc':
        'The following is a LSAT Reading Comprehension question. Please select the correct answer.',
        'logiqa-en':
        'The following is a Logic Reasoning question. Please select the correct answer.',
        'sat-math':
        'The following is a SAT Math question. Please select the correct answer.',
        'sat-en':
        'The following is a SAT English question. Please select the correct answer.',
        'sat-en-without-passage':
        'The following is a SAT English question. Please select the correct answer.',
        'aqua-rat':
        'The following is a AQUA-RAT question. Please select the correct answer.',
        'jec-qa-kd':
        '以下是一道中国司法考试基础知识题,请选择正确的答案。',
        'jec-qa-ca':
        '以下是一道中国司法考试案例分析题,请选择正确的答案。',
        'gaokao-mathcloze':
        '以下是一道中国高考数学填空题,请填入正确的答案。',
        'math':
        'The following is a Math question. Please select the correct answer.',
    }[_name]
    _templates = _item['infer_cfg']['prompt_template']['template']
    _templates['round'][0][
        'prompt'] = _intro + '\n' + _templates['round'][0]['prompt']

del _item, _intro, _templates, _name, _hint, agieval_infer_cfg, agieval_eval_cfg
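For context, a dataset config like the one above is normally consumed from a top-level evaluation config through mmengine's `read_base` mechanism. A minimal sketch, assuming a hypothetical entry config and model config (neither is part of this commit):

```python
# eval_agieval.py (hypothetical entry config)
from mmengine.config import read_base

with read_base():
    # importing the config module executes it and exposes agieval_datasets
    from .datasets.agieval.agieval_gen_397d81 import agieval_datasets
    from .models.hf_llama_7b import models  # hypothetical model config

datasets = [*agieval_datasets]
```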
@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator
+from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator, AGIEvalEvaluator_mcq
from opencompass.utils.text_postprocessors import first_capital_postprocess_multi

agieval_single_choice_sets = [
@@ -116,7 +116,7 @@ for _name in agieval_multiple_choices_sets:
        inferencer=dict(type=GenInferencer, max_out_len=1024))

    agieval_eval_cfg = dict(
-        evaluator=dict(type=AccEvaluator),
+        evaluator=dict(type=AGIEvalEvaluator_mcq),
        pred_postprocessor=dict(type=first_capital_postprocess_multi))

    agieval_datasets.append(
@@ -3,7 +3,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess
+from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

bbh_reader_cfg = dict(input_columns=["input"], output_column="target")

@@ -56,7 +56,7 @@ for _name in bbh_multiple_choice_sets:
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(
-        evaluator=dict(type=AccEvaluator),
+        evaluator=dict(type=BBHEvaluator_mcq),
        pred_role="BOT",
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))
@@ -3,7 +3,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess
+from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

bbh_reader_cfg = dict(input_columns=["input"], output_column="target")

@@ -56,7 +56,7 @@ for _name in bbh_multiple_choice_sets:
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(
-        evaluator=dict(type=AccEvaluator),
+        evaluator=dict(type=BBHEvaluator_mcq),
        pred_role="BOT",
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))
@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
+from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator

gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')

@@ -25,7 +25,7 @@ gsm8k_infer_cfg = dict(
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

-gsm8k_eval_cfg = dict(evaluator=dict(type=AccEvaluator),
+gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
                      pred_postprocessor=dict(type=gsm8k_postprocess),
                      dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))

@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
+from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator

gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')

@@ -72,7 +72,7 @@ Question: {question}{answer}
    inferencer=dict(type=GenInferencer, max_out_len=512))

gsm8k_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
+    evaluator=dict(type=Gsm8kEvaluator),
    pred_postprocessor=dict(type=gsm8k_postprocess),
    dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))

@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import SCInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
+from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator

gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer' )
generation_kwargs = dict(do_sample=True, temperature=0.7, top_k=40)
@@ -73,7 +73,7 @@ Question: {question}{answer}
    inferencer=dict(type=SCInferencer, max_out_len=512, generation_kwargs = generation_kwargs, infer_type='sc', sc_size = 20))

gsm8k_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
+    evaluator=dict(type=Gsm8kEvaluator),
    pred_postprocessor=dict(type=gsm8k_postprocess),
    dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
    sc_size = 20)

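The config above switches GSM8K to `SCInferencer` with `sc_size = 20`, i.e. 20 sampled generations per question (`do_sample=True, temperature=0.7, top_k=40`). Self-consistency then aggregates the sampled answers, typically by majority vote after post-processing; a toy sketch of that aggregation idea, independent of OpenCompass internals:

```python
from collections import Counter

def majority_vote(answers: list) -> str:
    """Pick the most frequent post-processed answer among sampled generations."""
    return Counter(answers).most_common(1)[0][0]

# e.g. 20 sampled answers for one question, already run through gsm8k_postprocess
samples = ['18'] * 12 + ['17'] * 5 + ['21'] * 3
assert majority_vote(samples) == '18'
```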
@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
+from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator

gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')

@@ -34,7 +34,7 @@ gsm8k_infer_cfg = dict(
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

-gsm8k_eval_cfg = dict(evaluator=dict(type=AccEvaluator),
+gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
                      pred_role="BOT",
                      pred_postprocessor=dict(type=gsm8k_postprocess),
                      dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))

@@ -1,6 +1,8 @@
from opencompass.models.claude_api.claude_api import Claude
-from opencompass.models.claude_api.postprocessors import gsm8k_postprocess, humaneval_postprocess, lcsts_postprocess, mbpp_postprocess, strategyqa_pred_postprocess
+from opencompass.models.claude_api.postprocessors import (
+    flores_postprocess, gsm8k_postprocess, humaneval_postprocess,
+    lcsts_postprocess, mbpp_postprocess, strategyqa_pred_postprocess)
+from opencompass.utils.text_postprocessors import last_option_postprocess

agieval_single_choice_sets = [
    'gaokao-chinese',
@@ -47,6 +49,8 @@ claude_postprocessors = {
    'lcsts': dict(type=lcsts_postprocess),
    'mbpp': dict(type=mbpp_postprocess),
    'strategyqa': dict(type=strategyqa_pred_postprocess),
+    'commonsense_qa': dict(type=last_option_postprocess, options='ABCDE'),
+    'flores_100_*-zho_simpl': dict(type=flores_postprocess),
}

for _name in agieval_multiple_choices_sets + agieval_single_choice_sets:
@@ -5,7 +5,7 @@
The program entry for the evaluation task is `run.py`. The usage is as follows:

```shell
-python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run]
+python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run] [--dump-eval-details]
```

Task Configuration (`$EXP`):
@@ -66,6 +66,7 @@ The parameter explanation is as follows:
- `-w`: Specify the working path, default is `./outputs/default`.
- `-l`: Enable status reporting via Lark bot.
- `--dry-run`: When enabled, inference and evaluation tasks will be dispatched but won't actually run, for debugging.
+- `--dump-eval-details`: When enabled, evaluation results under the `results` folder will include more details, such as the correctness of each sample.

Using run mode `-m all` as an example, the overall execution flow is as follows:
@@ -5,7 +5,7 @@
评测任务的程序入口为 `run.py`,使用方法如下:

```shell
-python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run]
+python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run] [--dump-eval-details]
```

任务配置 (`$EXP`):
@@ -66,6 +66,7 @@ python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--deb
- `-w`: 指定工作路径,默认为 `./outputs/default`
- `-l`: 打开飞书机器人状态上报。
- `--dry-run`: 开启时,推理和评测任务仅会分发但不会真正运行,便于调试;
+- `--dump-eval-details`: 开启时,`results` 下的评测结果中将会包含更加详细的评测结果信息,例如每条样本是否正确等。

以运行模式 `-m all` 为例,整体运行流如下:
@@ -13,7 +13,7 @@ class AFQMCDataset_V2(BaseDataset):
    @staticmethod
    def load(path):
        data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = json.loads(line)
                line['label'] = 'AB'[int(line['label'])]
@@ -64,9 +64,36 @@ class AGIEvalEvaluator(BaseEvaluator):

    def score(self, predictions, references):
        predictions = [parse_math_answer('', pred) for pred in predictions]
+        details = []
        cnt = 0
        for pred, ref in zip(predictions, references):
+            detail = {'pred': pred, 'answer': ref, 'correct': False}
            if is_equiv(pred, ref):
                cnt += 1
+                detail['correct'] = True
+            details.append(detail)
        score = cnt / len(predictions) * 100
-        return {'score': score}
+        return {'score': score, 'details': details}


+@ICL_EVALUATORS.register_module()
+class AGIEvalEvaluator_mcq(BaseEvaluator):
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+        details = []
+        cnt = 0
+        for pred, ref in zip(predictions, references):
+            detail = {'pred': pred, 'answer': ref, 'correct': False}
+            if pred == ref:
+                cnt += 1
+                detail['correct'] = True
+            details.append(detail)
+
+        score = cnt / len(predictions) * 100
+
+        return {'score': score, 'details': details}
@@ -61,11 +61,38 @@ class BBHEvaluator(BaseEvaluator):

        predictions = [bbh_freeform_postprocess(pred) for pred in predictions]

+        details = []
        cnt = 0
        for pred, ref in zip(predictions, references):
+            detail = {'pred': pred, 'answer': ref, 'correct': False}
            if pred == ref:
                cnt += 1
+                detail['correct'] = True
+            details.append(detail)

        score = cnt / len(predictions) * 100

-        return {'score': score}
+        return {'score': score, 'details': details}


+@ICL_EVALUATORS.register_module()
+class BBHEvaluator_mcq(BaseEvaluator):
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+        details = []
+        cnt = 0
+        for pred, ref in zip(predictions, references):
+            detail = {'pred': pred, 'answer': ref, 'correct': False}
+            if pred == ref:
+                cnt += 1
+                detail['correct'] = True
+            details.append(detail)
+
+        score = cnt / len(predictions) * 100
+
+        return {'score': score, 'details': details}
@@ -13,7 +13,7 @@ class bustumDataset_V2(BaseDataset):
    @staticmethod
    def load(path):
        data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = json.loads(line)
                line['label'] = 'AB'[int(line['label'])]
@@ -13,7 +13,7 @@ class C3Dataset(BaseDataset):
    @staticmethod
    def load(path: str):

-        with open(path) as f:
+        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            rows = []
            for _, row in enumerate(data):
@@ -58,7 +58,7 @@ class C3Dataset_V2(BaseDataset):

    @staticmethod
    def load(path: str):
-        with open(path) as f:
+        with open(path, 'r', encoding='utf-8') as f:
            raw = json.load(f)
            data = []
            for line in raw:
@@ -15,7 +15,8 @@ class CEvalDataset(BaseDataset):
    def load(path: str, name: str):
        dataset = {}
        for split in ['dev', 'val', 'test']:
-            with open(osp.join(path, split, f'{name}_{split}.csv')) as f:
+            filename = osp.join(path, split, f'{name}_{split}.csv')
+            with open(filename, encoding='utf-8') as f:
                reader = csv.reader(f)
                header = next(reader)
                for row in reader:
@@ -31,7 +31,7 @@ class CHIDDataset_V2(BaseDataset):
    @staticmethod
    def load(path):
        data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = json.loads(line)
                item = {}
@@ -41,7 +41,7 @@ class CluewscDataset_V2(BaseDataset):
    @staticmethod
    def load(path):
        data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = json.loads(line)
                item = {
@@ -13,9 +13,9 @@ class CMBDataset(BaseDataset):

    @staticmethod
    def load(path: str):
-        with open(osp.join(path, 'test.json'), 'r') as f:
+        with open(osp.join(path, 'test.json'), 'r', encoding='utf-8') as f:
            test_data = json.load(f)
-        with open(osp.join(path, 'val.json'), 'r') as f:
+        with open(osp.join(path, 'val.json'), 'r', encoding='utf-8') as f:
            val_data = json.load(f)

        for da in test_data:
@@ -13,7 +13,7 @@ class cmnliDataset_V2(BaseDataset):
    @staticmethod
    def load(path):
        data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = json.loads(line)
                if line['label'] == '-':
@@ -12,7 +12,7 @@ class CMRCDataset(BaseDataset):

    @staticmethod
    def load(path: str):
-        with open(path) as f:
+        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            # 将原始数据转换为所需的格式
            rows = []
@@ -13,7 +13,7 @@ class COPADataset_V2(BaseDataset):
    @staticmethod
    def load(path):
        dataset = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = json.loads(line)
                line['label'] = 'AB'[line['label']]
@@ -31,7 +31,7 @@ class CslDataset_V2(BaseDataset):
    @staticmethod
    def load(path):
        data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = json.loads(line)
                item = {
@@ -12,7 +12,7 @@ class DRCDDataset(BaseDataset):

    @staticmethod
    def load(path: str):
-        with open(path) as f:
+        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            # 将原始数据转换为所需的格式
            rows = []
@@ -13,7 +13,7 @@ class eprstmtDataset_V2(BaseDataset):
    @staticmethod
    def load(path):
        data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = json.loads(line)
                item = {
@@ -1,3 +1,4 @@
+from opencompass.openicl import BaseEvaluator
from opencompass.registry import TEXT_POSTPROCESSORS


@@ -26,3 +27,25 @@ def gsm8k_postprocess(text: str) -> str:
        if ret[i].isdigit():
            ret1 += ret[i]
    return ret1


+class Gsm8kEvaluator(BaseEvaluator):
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+        correct = 0
+        count = 0
+        details = []
+        for i, j in zip(predictions, references):
+            detail = {'pred': i, 'answers': j, 'correct': False}
+            count += 1
+            if i == j:
+                correct += 1
+                detail['correct'] = True
+            details.append(detail)
+        result = {'accuracy': 100 * correct / count, 'details': details}
+        return result
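As a quick illustration of the new evaluator's contract (a sketch, assuming `BaseEvaluator` needs no constructor arguments): `score` returns an accuracy percentage plus a per-sample `details` list.

```python
from opencompass.datasets import Gsm8kEvaluator  # re-exported, as used in the configs above

evaluator = Gsm8kEvaluator()
result = evaluator.score(predictions=['18', '7'], references=['18', '9'])
# result == {'accuracy': 50.0,
#            'details': [{'pred': '18', 'answers': '18', 'correct': True},
#                        {'pred': '7', 'answers': '9', 'correct': False}]}
```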
@@ -49,7 +49,7 @@ class hellaswagDataset_V3(BaseDataset):
    @staticmethod
    def load(path):
        dataset = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                dataset.append({
@@ -148,11 +148,15 @@ class MATHEvaluator(BaseEvaluator):
            }
        correct = 0
        count = 0
+        details = []
        for i, j in zip(predictions, references):
+            detail = {'pred': i, 'answer': j, 'correct': False}
            count += 1
            if self.is_equiv(i, j):
                correct += 1
-        result = {'accuracy': 100 * correct / count}
+                detail['correct'] = True
+            details.append(detail)
+        result = {'accuracy': 100 * correct / count, 'details': details}
        return result

    def _fix_fracs(self, string):
@@ -52,9 +52,14 @@ class NQEvaluator(BaseEvaluator):
        processed_answers = [[general_postprocess(j).lower() for j in i]
                             for i in references]

+        details = []
        cnt = 0
        for pred, cand_ans in zip(processed_predictions, processed_answers):
+            detail = {'pred': pred, 'answer': cand_ans, 'correct': False}
            cnt += int(any([cand == pred for cand in cand_ans]))
+            if int(any([cand == pred for cand in cand_ans])):
+                detail['correct'] = True
+            details.append(detail)
        score = cnt / len(predictions) * 100

-        return {'score': score}
+        return {'score': score, 'details': details}
@@ -67,7 +67,7 @@ class TNewsDataset_V2(BaseDataset):
        }

        data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = json.loads(line)
                item = {
@@ -51,9 +51,14 @@ class TriviaQAEvaluator(BaseEvaluator):
        processed_answers = [[general_postprocess(j).lower() for j in i]
                             for i in references]

+        details = []
        cnt = 0
        for pred, cand_ans in zip(processed_predictions, processed_answers):
+            detail = {'pred': pred, 'answer': cand_ans, 'correct': False}
            cnt += int(any([cand == pred for cand in cand_ans]))
+            if int(any([cand == pred for cand in cand_ans])):
+                detail['correct'] = True
+            details.append(detail)
        score = cnt / len(predictions) * 100

-        return {'score': score}
+        return {'score': score, 'details': details}
@@ -82,6 +82,20 @@ def strategyqa_pred_postprocess(text: str) -> str:
    return ''


+def flores_postprocess(text: str) -> str:
+    text = text.strip().split('\n')[-1].strip()
+    return text


+def flores_postprocess_chinese(text: str) -> str:
+    text = text.strip().split('\n')[-1].strip()
+    import jieba
+    truncated_text = text.strip().split('\n')[0]
+    cleaned_text = re.sub(r'\s+', ' ', truncated_text).strip()
+    cleaned_text = ' '.join(jieba.cut(cleaned_text))
+    return cleaned_text


def record_postprocess(text: str) -> str:
    match = re.search(r'(?<=refers to )[^.]+', text)

@@ -24,11 +24,18 @@ class EMEvaluator(BaseEvaluator):
                             for i in references]

        cnt = 0
+        details = []
        for pred, ans, origin_ans in zip(predictions, processed_answers,
                                         references):
+            answers = list(set(ans + origin_ans))
+            detail = {'pred': pred, 'answer': answers}
            if pred in ans or pred in origin_ans:
                cnt += 1
+                detail['correct'] = True
+            else:
+                detail['correct'] = False
+            details.append(detail)

        score = cnt / len(predictions) * 100

-        return {'score': score}
+        return {'score': score, 'details': details}
@@ -51,8 +51,7 @@ class BaseInferencer:
        self.output_json_filepath = output_json_filepath
        self.output_json_filename = output_json_filename
        self.is_main_process = is_main_process()
-        if not os.path.exists(self.output_json_filepath):
-            os.makedirs(self.output_json_filepath)
+        os.makedirs(self.output_json_filepath, exist_ok=True)

    def inference(self,
                  retriever: BaseRetriever,
@@ -94,6 +94,7 @@ class PPLInferencer(BaseInferencer):
            index = 0
+            prompt_list = []
            sub_ppl_list = []
            token_num_list = []
            normalizing_prompt_list = []
            context_length_list = []

@@ -144,6 +145,7 @@ class PPLInferencer(BaseInferencer):
                            mode='ppl'))
                    normalizing_prompt_list.append(normalizing_prompt)
+                prompt_list.append(prompt)
                token_num_list.append(prompt_token_num)

            if normalizing_str is not None:
                normalizing_str_len = self.model.get_token_len_from_template(
@@ -186,6 +188,10 @@ class PPLInferencer(BaseInferencer):
                ice_str = self.model.parse_template(ice[idx], mode='ppl')
                output_handler.save_prompt_and_ppl(
                    label, prompt.replace(ice_str, ''), prompt, res, index)
+                output_handler.results_dict[str(
+                    index)][f'label: {str(label)}'][
+                        'BPB'] = res * token_num_list[idx] / len(
+                            prompt.replace(ice_str, '').encode())
                index = index + 1
            ppl.append(sub_ppl_list)

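The new `'BPB'` field rescales the per-token score returned by the PPL pass into a per-byte quantity: the score `res` is multiplied by the prompt's token count and divided by the byte length of the prompt with in-context examples stripped. A toy illustration of that rescaling (all numbers are made up):

```python
# hypothetical numbers, only to illustrate the rescaling used in the 'BPB' line above
res = 1.25               # per-token score from the PPL pass
prompt_token_num = 40    # tokens in the scored prompt
prompt_bytes = 160       # len(prompt.replace(ice_str, '').encode())

bpb = res * prompt_token_num / prompt_bytes   # 0.3125, i.e. cost per byte rather than per token
```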
@@ -1,6 +1,6 @@
from abc import abstractmethod
from copy import deepcopy
-from typing import Dict, List
+from typing import Dict, List, Optional

from mmengine.config import ConfigDict

@@ -13,16 +13,24 @@ class BasePartitioner:

    Args:
        out_dir (str): The output directory of tasks.
-        keep_keys (List[str]): The keys to be kept from the experiment config
-            to the task config.
+        keep_keys (Optional[List[str]], optional): The keys to be kept from the
+            experiment config to the task config. Defaults to None. If None,
+            the following keys will be kept:
+
+            - eval.runner.task.judge_cfg
+            - eval.runner.task.dump_details
    """

-    def __init__(self,
-                 out_dir: str,
-                 keep_keys: List[str] = ['eval.runner.task.judge_cfg']):
+    def __init__(self, out_dir: str, keep_keys: Optional[List[str]] = None):
        self.logger = get_logger()
        self.out_dir = out_dir
-        self.keep_keys = keep_keys
+        if keep_keys is None:
+            self.keep_keys = [
+                'eval.runner.task.judge_cfg',
+                'eval.runner.task.dump_details',
+            ]
+        else:
+            self.keep_keys = keep_keys

    def __call__(self, cfg: ConfigDict) -> List[Dict]:
        """Generate tasks from config. Each task is defined as a
@@ -63,7 +71,8 @@ class BasePartitioner:
                    tgt_ptr = tgt_ptr[key]
                tgt_ptr[key_chain[-1]] = ori_ptr[key_chain[-1]]
            except Exception:
-                self.logger.warning(f'Key {k} not found in config, ignored.')
+                self.logger.debug(f'Key {k} not found in config, ignored.')
+        self.logger.debug(f'Additional config: {add_cfg}')

        tasks = self.partition(models,
                               datasets,
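The `keep_keys` entries are dotted paths that the `__call__` loop above copies from the experiment config into every task config via `key_chain`. A minimal standalone sketch of that copy logic, with hypothetical config values:

```python
def copy_key_chain(src: dict, dst: dict, dotted_key: str) -> None:
    """Copy src['a']['b']['c'] into dst (creating intermediate dicts) for dotted_key 'a.b.c'."""
    keys = dotted_key.split('.')
    src_ptr, dst_ptr = src, dst
    for key in keys[:-1]:
        src_ptr = src_ptr[key]
        dst_ptr = dst_ptr.setdefault(key, {})
    dst_ptr[keys[-1]] = src_ptr[keys[-1]]

cfg = {'eval': {'runner': {'task': {'dump_details': True, 'judge_cfg': {}}}}}
task_cfg = {}
for k in ['eval.runner.task.judge_cfg', 'eval.runner.task.dump_details']:
    copy_key_chain(cfg, task_cfg, k)
# task_cfg == {'eval': {'runner': {'task': {'judge_cfg': {}, 'dump_details': True}}}}
```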
@@ -1,5 +1,5 @@
import os.path as osp
-from typing import Dict, List
+from typing import Dict, List, Optional

from mmengine.config import Config, ConfigDict

@@ -11,15 +11,23 @@ from .base import BasePartitioner

@PARTITIONERS.register_module()
class NaivePartitioner(BasePartitioner):
-    """Naive task partitioner. This partitioner will generate a task for each
-    model-dataset pair.
+    """Naive task partitioner. This partitioner will generate a task for each n
+    model-dataset pairs.

    Args:
        out_dir (str): The output directory of tasks.
+        n (int): The number of model-dataset pairs in each task.
        keep_keys (List[str]): The keys to be kept from the experiment config
            to the task config.
    """

+    def __init__(self,
+                 out_dir: str,
+                 n: int = 1,
+                 keep_keys: Optional[List[str]] = None):
+        super().__init__(out_dir=out_dir, keep_keys=keep_keys)
+        self.n = n

    def partition(self,
                  models: List[ConfigDict],
                  datasets: List[ConfigDict],
@@ -53,13 +61,17 @@ class NaivePartitioner(BasePartitioner):

        tasks = []
        for model in models:
+            chunks = []
            for dataset in datasets:
                filename = get_infer_output_path(model, dataset, out_dir)
                if osp.exists(filename):
                    continue
+                chunks.append(dataset)

+            for i in range(0, len(chunks), self.n):
                task = Config({
                    'models': [model],
-                    'datasets': [[dataset]],
+                    'datasets': [chunks[i:i + self.n]],
                    'work_dir': work_dir,
                    **add_cfg
                })
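With the new `n` argument, each task carries up to `n` pending model-dataset pairs instead of exactly one. For example, a model with five unfinished datasets and `n=2` yields three tasks; a toy sketch of the grouping (dataset names are placeholders):

```python
pending = ['ds_a', 'ds_b', 'ds_c', 'ds_d', 'ds_e']   # datasets without cached predictions
n = 2
tasks = [pending[i:i + n] for i in range(0, len(pending), n)]
# tasks == [['ds_a', 'ds_b'], ['ds_c', 'ds_d'], ['ds_e']]
```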
@@ -2,7 +2,7 @@ import copy
import math
import os.path as osp
from fnmatch import fnmatch
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union

import mmengine
from mmengine.config import Config, ConfigDict
@@ -24,6 +24,11 @@ class SizePartitioner(BasePartitioner):
        max_task_size (int): The maximum size of a task.
        gen_task_coef (int): The dataset cost measurement coefficient for
            generation tasks.
+        strategy (str): The partition strategy. Supported strategies are:
+            'heuristic' and 'split'. Defaults to 'heuristic'.
+            heuristic: split large datasets into several tasks, merge small
+                datasets into one task.
+            split: split large datasets into several tasks only.
        dataset_size_path (str): The path to the dataset size cache file.
        keep_keys (list[str]): The keys to be kept from the experiment config
            to the task config.
@@ -33,12 +38,17 @@ class SizePartitioner(BasePartitioner):
                 out_dir: str,
                 max_task_size: int = 40000,
                 gen_task_coef: int = 20,
+                 strategy: str = 'heuristic',
                 dataset_size_path: str = '.cache/dataset_size.json',
-                 keep_keys: List[str] = ['eval.runner.task.judge_cfg']):
+                 keep_keys: Optional[List[str]] = None):
        super().__init__(out_dir=out_dir, keep_keys=keep_keys)
        self.max_task_size = max_task_size
        self.gen_task_coef = gen_task_coef
        self.dataset_size_path = dataset_size_path
+        assert strategy in ('heuristic', 'split'), \
+            f'Unsupported partition strategy: {strategy}. '\
+            'Supported strategies are: `heuristic`, `split` .'
+        self.strategy = strategy

    def partition(self,
                  models: List[ConfigDict],
@@ -79,47 +89,47 @@ class SizePartitioner(BasePartitioner):
                          reverse=True)
        tasks = []
        for model in models:
            task = Config({
                'models': [model],
                'datasets': [[]],
                'work_dir': work_dir,
                **add_cfg
            })
            num_data = 0
            chunks = []  # elements: tuple(size, dataset_chunk)
            for dataset in datasets:
                filename = get_infer_output_path(model, dataset, out_dir)
                root, ext = osp.splitext(filename)
                # skip the task if the task output exists
                if osp.exists(filename):
                    continue
                dataset_size = self.get_cost(dataset)
                if dataset_size > self.max_task_size:
                    root, ext = osp.splitext(filename)
                    dataset_splits = self.split_dataset(dataset)
                    for i, dataset_split in enumerate(dataset_splits):
                        # skip the task if the task output exists
                        if not osp.exists(f'{root}_{i}{ext}'):
                            tasks.append(
                                Config({
                                    'models': [model],
                                    'datasets': [[dataset_split]],
                                    'work_dir': work_dir,
                                    **add_cfg
                                }))
                            chunks.append((self.max_task_size, dataset_split))
                else:
                    if num_data + dataset_size > self.max_task_size:
                        tasks.append(task)
                        task = Config({
                    chunks.append((dataset_size, dataset))

            if self.strategy == 'heuristic':
                chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
                current_size, current_chunks = 0, []
                for index in range(len(chunks)):
                    current_size += chunks[index][0]
                    current_chunks.append(chunks[index][1])
                    if index == len(chunks) - 1 or current_size + chunks[
                            index + 1][0] > self.max_task_size:
                        tasks.append(
                            Config({
                                'models': [model],
                                'datasets': [current_chunks],
                                'work_dir': work_dir,
                                **add_cfg
                            }))
                        current_size, current_chunks = 0, []
            elif self.strategy == 'split':
                for _, dataset in chunks:
                    tasks.append(
                        Config({
                            'models': [model],
                            'datasets': [[]],
                            'datasets': [[dataset]],
                            'work_dir': work_dir,
                            **add_cfg
                        })
                        num_data = 0
                    task['datasets'][0].append(dataset)
                    num_data = num_data + dataset_size
            if task['datasets'][0]:
                tasks.append(task)

                        }))
        return tasks

    @property
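Under the new `'heuristic'` strategy, oversized datasets are first split into chunks of at most `max_task_size`, the chunks are sorted by cost in descending order, and consecutive chunks are packed into one task until adding the next chunk would exceed `max_task_size`; `'split'` skips the packing and emits one task per chunk. A toy walk-through of the greedy packing with made-up costs:

```python
max_task_size = 100
chunks = sorted([30, 80, 20, 60, 10], reverse=True)   # hypothetical per-dataset costs
tasks, current, current_size = [], [], 0
for i, size in enumerate(chunks):
    current.append(size)
    current_size += size
    if i == len(chunks) - 1 or current_size + chunks[i + 1] > max_task_size:
        tasks.append(current)
        current, current_size = [], 0
# tasks == [[80], [60, 30], [20, 10]]
```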
@@ -23,7 +23,7 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
                 mode: str,
                 out_dir: str,
                 model_pairs: Optional[List[Tuple]] = None,
-                 keep_keys: List[str] = ['eval.runner.task.judge_cfg']):
+                 keep_keys: Optional[List[str]] = None):
        super().__init__(out_dir=out_dir, keep_keys=keep_keys)
        assert mode in ['all', 'one_to_n', 'fixed']
        self.mode = mode
@@ -72,6 +72,7 @@ class DefaultSummarizer:
                if not osp.exists(filepath):
                    continue
                result = mmengine.load(filepath)
+                result.pop('details', None)
                raw_results[model_abbr][dataset_abbr] = result
                if 'error' in result:
                    self.logger.debug(f'error in {model_abbr} {dataset_abbr} {result["error"]}')
@@ -1,11 +1,14 @@
import argparse
+import copy
import fnmatch
+import math
import os.path as osp
+import statistics
import time
from collections import Counter
from inspect import signature
from shutil import which
-from typing import Optional
+from typing import List, Optional

import mmengine
from mmengine.config import Config, ConfigDict
@@ -35,6 +38,8 @@ class OpenICLEvalTask(BaseTask):
        super().__init__(cfg)
        self.num_gpus = 0
        self.logger = get_logger()
+        self.dump_details = cfg.get('eval', {}).get('runner', {}).get(
+            'task', {}).get('dump_details', False)

    def get_command(self, cfg_path, template):
        script_path = __file__
@@ -113,7 +118,7 @@ class OpenICLEvalTask(BaseTask):
                    [sub_preds[str(i)] for i in range(len(sub_preds))])
                filename = root + f'_{i}' + ext
                i += 1

        pred_dicts = copy.deepcopy(preds)
        preds = {k: [pred.get(k) for pred in preds] for k in preds[0]}

        pred_strs = preds.pop('prediction')
@@ -163,6 +168,7 @@ class OpenICLEvalTask(BaseTask):
            ]

        icl_evaluator = ICL_EVALUATORS.build(self.eval_cfg['evaluator'])

        preds['predictions'] = pred_strs
        preds['references'] = (test_set[self.output_column]
                               if self.output_column else None)
@@ -172,18 +178,42 @@
        }
        result = icl_evaluator.score(**preds)

        if self.dump_details:
            try:
                details = result.pop('details', None)
                result['details'] = self.format_details(
                    pred_strs, test_set[self.output_column], details,
                    pred_dicts)
                result['type'] = result['details'].pop('type', None)

                if 'PPL' in str(
                        self.dataset_cfg.infer_cfg.inferencer.type):
                    result['correct_bpb'], result[
                        'incorrect_bpb'] = self.calculate_bpb(pred_dicts)
                else:
                    result['incorrect_bpb'] = result['correct_bpb'] = -1
            except Exception:
                result['incorrect_bpb'] = result['correct_bpb'] = -1
        else:
            result.pop('details', None)

        if 'error' in result:
            self.logger.error(
                f'Task {task_abbr_from_cfg(self.cfg)}: {result["error"]}')
            return
        else:
            self.logger.info(f'Task {task_abbr_from_cfg(self.cfg)}: {result}')
            result_wo_details = {
                i: result[i]
                for i in result if i != 'details'
            }
            self.logger.info(
                f'Task {task_abbr_from_cfg(self.cfg)}: {result_wo_details}')

        # Save result
        out_path = get_infer_output_path(self.model_cfg, self.dataset_cfg,
                                         osp.join(self.work_dir, 'results'))
        mkdir_or_exist(osp.split(out_path)[0])
        mmengine.dump(result, out_path)
        mmengine.dump(result, out_path, ensure_ascii=False, indent=4)

    def _extract_role_pred(self, s: str, begin_str: Optional[str],
                           end_str: Optional[str]) -> str:
@@ -215,6 +245,95 @@ class OpenICLEvalTask(BaseTask):

        return s[start:end]

    def format_details(self, predictions, references, details, pred_dicts):
        """This function is responsible for formatting prediction details.

        Args:
            predictions (list): The prediction list.
            references (list): The reference list.
            details (list): Contains the 'pred', 'answer' and 'correct' for each
                sample. Such as `[{'pred': '光荣和ωforce',
                'answers': ['光荣和ω-force', '光荣和ωforce'], 'correct': True}]`
            pred_dicts (list): Contains a list of samples with the original
                prompts. Such as
                `[{'origin_prompt': '根据文章回答问题。你的答案应该尽可能3》…………',
                'prediction': ' 光荣和ω-force\n', 'gold': ['光荣和ω-force']}]`

        Returns:
            list: The formatted prediction details.
        """
        results = {}
        for i in range(len(predictions)):
            ppl_flag = False
            result = {}
            origin_prediction = copy.deepcopy(pred_dicts[i])
            origin_prediction.pop('in-context examples', None)
            origin_prediction.pop('prediction', None)
            keys = copy.deepcopy(list(origin_prediction.keys()))
            for key in keys:
                if key.startswith('label:'):
                    ppl_flag = True
                    origin_prediction[key].pop('testing input', None)
                    new_key = key.replace('label: ', '')
                    origin_prediction[new_key] = origin_prediction.pop(key)
            if ppl_flag:
                results['type'] = 'PPL'
                result['origin_prediction'] = origin_prediction
                result['predictions'] = str(predictions[i])
                result['references'] = str(references[i])
                result['correct'] = str(predictions[i]) == str(references[i])
            else:
                results['type'] = 'GEN'
                result['prompt'] = origin_prediction['origin_prompt']
                result['origin_prediction'] = pred_dicts[i]['prediction']
                result['predictions'] = details[i]['pred']
                result['references'] = details[i]['answers']
                result['correct'] = details[i]['correct']
            results[str(i)] = result
        return results

    def calculate_bpb(self, pred_dicts: List):
        """This function is used to calculate the BPB (Bits Per Byte) for the
        data. The correct BPB is obtained directly from the values in the
        'predictions' file. The incorrect BPB is the average of the remaining
        BPB values for each sample under different labels after subtracting the
        correct BPB. The calculation of BPB (Bits Per Byte) is similar to PPL,
        with the difference that it computes the additional bits needed on
        average, in terms of character length, to encode the true sequence
        based on the predictions. This calculation involves applying a
        weighting factor based on the ratio of words to characters.

        Args:
            pred_dicts (list): Contains a list of samples with each option
                and its BPB score.

        Returns:
            dict: Contains correct and incorrect bpb.
        """
        incorrect_bpb_list = []
        bpb_list = []
        for pred_dict in pred_dicts:
            preds = {
                key: value
                for key, value in pred_dict.items()
                if key.startswith('label: ')
            }
            values = []
            for item in preds.items():
                values.append(item[1])
            bpbs = [value['BPB'] for value in values]
            incorrect_bpb_list.append(
                (sum(bpbs) - min(bpbs)) / (len(bpbs) - 1))
            bpb_list.append(statistics.mean(bpbs))

        def filters(origins):
            targets = [target for target in origins if not math.isnan(target)]
            return targets

        mean_incorrect = statistics.mean(filters(incorrect_bpb_list))
        mean_correct = statistics.mean(filters(bpb_list))
        return 100 * mean_correct, 100 * mean_incorrect


def parse_args():
    parser = argparse.ArgumentParser(description='Score Calculator')
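As a concrete illustration of the per-sample arithmetic in `calculate_bpb` (the label BPB values here are made up): the 'incorrect' value averages every label's BPB except the minimum, while the other aggregate is the plain mean; both are averaged across samples (NaNs filtered out) and scaled by 100.

```python
import statistics

bpbs = [0.8, 1.4, 1.6, 1.8]     # hypothetical per-label BPB scores for one sample
incorrect_bpb = (sum(bpbs) - min(bpbs)) / (len(bpbs) - 1)   # (1.4 + 1.6 + 1.8) / 3 = 1.6
mean_bpb = statistics.mean(bpbs)                            # 1.4
```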
@@ -25,6 +25,7 @@ requests==2.31.0
rouge
rouge_chinese
rouge_score
+sacrebleu
scikit_learn==1.2.1
seaborn
sentence_transformers==2.2.2
run.py: 8 changes
@@ -123,6 +123,12 @@ def parse_args():
        'Will be overridden by the "retry" argument in the config.',
        type=int,
        default=2)
+    parser.add_argument(
+        '--dump-eval-details',
+        help='Whether to dump the evaluation details, including the '
+        'correctness of each sample, bpb, etc.',
+        action='store_true',
+    )
    # set srun args
    slurm_parser = parser.add_argument_group('slurm_args')
    parse_slurm_args(slurm_parser)
@@ -300,6 +306,8 @@ def main():

    if args.dlc or args.slurm or cfg.get('eval', None) is None:
        fill_eval_cfg(cfg, args)
+        if args.dump_eval_details:
+            cfg.eval.runner.task.dump_details = True

    if args.partition is not None:
        if RUNNERS.get(cfg.eval.runner.type) == SlurmRunner: