[Sync] update (#517)

Fengzhe Zhou 2023-10-27 20:31:22 +08:00 committed by GitHub
parent 6f07af3039
commit dbb20b8270
45 changed files with 580 additions and 89 deletions

.gitignore vendored
View File

@ -11,7 +11,7 @@ configs/eval_debug*.py
configs/viz_*.py
data
work_dirs
configs/internal/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@ -86,3 +86,6 @@ docs/zh_cn/_build/
# .zip
*.zip
# sft config ignore list
configs/sft_cfg/*B_*

View File

@ -0,0 +1,204 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator
from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi
agieval_reader_cfg = dict(
input_columns=['question', 'options'], output_column='label')
agieval_single_choice_sets = [
'gaokao-chinese',
'gaokao-english',
'gaokao-geography',
'gaokao-history',
'gaokao-biology',
'gaokao-chemistry',
'gaokao-physics',
'gaokao-mathqa',
'logiqa-zh',
'lsat-ar',
'lsat-lr',
'lsat-rc',
'logiqa-en',
'sat-math',
'sat-en',
'sat-en-without-passage',
'aqua-rat',
]
agieval_multiple_choices_sets = [
'jec-qa-kd',
'jec-qa-ca',
]
agieval_cloze_sets = ['gaokao-mathcloze', 'math']
agieval_chinese_sets = [
'gaokao-chinese',
'gaokao-english',
'gaokao-geography',
'gaokao-history',
'gaokao-biology',
'gaokao-chemistry',
'gaokao-physics',
'gaokao-mathqa',
'logiqa-zh',
'gaokao-mathcloze',
]
agieval_english_sets = [
'lsat-ar',
'lsat-lr',
'lsat-rc',
'logiqa-en',
'sat-math',
'sat-en',
'sat-en-without-passage',
'aqua-rat',
'math',
]
agieval_gaokao_sets = [
'gaokao-chinese',
'gaokao-english',
'gaokao-geography',
'gaokao-history',
'gaokao-biology',
'gaokao-chemistry',
'gaokao-physics',
'gaokao-mathqa',
]
agieval_datasets = []
for _name in agieval_single_choice_sets:
if _name in agieval_chinese_sets:
_hint = '答案是: '
else:
_hint = 'The answer is '
agieval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(role='HUMAN', prompt=f'{{question}}\n{{options}}\n{_hint}')
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024))
agieval_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type=first_capital_postprocess))
agieval_datasets.append(
dict(
type=AGIEvalDataset_v2,
path='./data/AGIEval/data/v1/',
name=_name,
abbr='agieval-' + _name,
setting_name='zero-shot',
reader_cfg=agieval_reader_cfg,
infer_cfg=agieval_infer_cfg.copy(),
eval_cfg=agieval_eval_cfg.copy()))
for _name in agieval_multiple_choices_sets:
if _name in agieval_chinese_sets:
_hint = '答案是: '
else:
_hint = 'The answer is '
agieval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(role='HUMAN', prompt=f'{{question}}\n{{options}}\n{_hint}')
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024))
agieval_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type=first_capital_postprocess_multi))
agieval_datasets.append(
dict(
type=AGIEvalDataset_v2,
path='./data/AGIEval/data/v1/',
name=_name,
abbr='agieval-' + _name,
setting_name='zero-shot',
reader_cfg=agieval_reader_cfg,
infer_cfg=agieval_infer_cfg.copy(),
eval_cfg=agieval_eval_cfg.copy()))
for _name in agieval_cloze_sets:
if _name in agieval_chinese_sets:
_hint = '答案是: '
else:
_hint = 'The answer is '
agieval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[dict(role='HUMAN', prompt=f'{{question}}\n{_hint}')])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024))
agieval_eval_cfg = dict(evaluator=dict(type=AGIEvalEvaluator))
agieval_datasets.append(
dict(
type=AGIEvalDataset_v2,
path='./data/AGIEval/data/v1/',
name=_name,
abbr='agieval-' + _name,
setting_name='zero-shot',
reader_cfg=agieval_reader_cfg,
infer_cfg=agieval_infer_cfg.copy(),
eval_cfg=agieval_eval_cfg.copy()))
for _item in agieval_datasets:
_name = _item['name']
_intro = {
'gaokao-chinese':
'以下是一道中国高考语文选择题,请选择正确的答案。',
'gaokao-english':
'以下是一道中国高考英语选择题,请选择正确的答案。',
'gaokao-geography':
'以下是一道中国高考地理选择题,请选择正确的答案。',
'gaokao-history':
'以下是一道中国高考历史选择题,请选择正确的答案。',
'gaokao-biology':
'以下是一道中国高考生物选择题,请选择正确的答案。',
'gaokao-chemistry':
'以下是一道中国高考化学选择题,请选择正确的答案。',
'gaokao-physics':
'以下是一道中国高考物理选择题,请选择正确的答案。',
'gaokao-mathqa':
'以下是一道中国高考数学选择题,请选择正确的答案。',
'logiqa-zh':
'以下是一道中国公务员考试题,请选择正确的答案。',
'lsat-ar':
'The following is a LSAT Analytical Reasoning question. Please select the correct answer.',
'lsat-lr':
'The following is a LSAT Logical Reasoning question. Please select the correct answer.',
'lsat-rc':
'The following is a LSAT Reading Comprehension question. Please select the correct answer.',
'logiqa-en':
'The following is a Logic Reasoning question. Please select the correct answer.',
'sat-math':
'The following is a SAT Math question. Please select the correct answer.',
'sat-en':
'The following is a SAT English question. Please select the correct answer.',
'sat-en-without-passage':
'The following is a SAT English question. Please select the correct answer.',
'aqua-rat':
'The following is a AQUA-RAT question. Please select the correct answer.',
'jec-qa-kd':
'以下是一道中国司法考试基础知识题,请选择正确的答案。',
'jec-qa-ca':
'以下是一道中国司法考试案例分析题,请选择正确的答案。',
'gaokao-mathcloze':
'以下是一道中国高考数学填空题,请填入正确的答案。',
'math':
'The following is a Math question. Please select the correct answer.',
}[_name]
_templates = _item['infer_cfg']['prompt_template']['template']
_templates['round'][0][
'prompt'] = _intro + '\n' + _templates['round'][0]['prompt']
del _item, _intro, _templates, _name, _hint, agieval_infer_cfg, agieval_eval_cfg
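
A dataset config like the one above is normally composed into a top-level experiment config rather than run on its own; a minimal sketch of that pattern, where the relative import path and variable placement are assumptions for illustration, not part of this commit:

```python
# Hypothetical top-level config: pull in the AGIEval dataset definitions above
# via mmengine's read_base mechanism, which OpenCompass configs use to compose
# sub-configs.
from mmengine.config import read_base

with read_base():
    # The import path is an assumption; adjust it to wherever this config lives.
    from .datasets.agieval.agieval_gen import agieval_datasets

datasets = [*agieval_datasets]
# A `models` list would be pulled in from another base config the same way.
```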

View File

@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator
from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator, AGIEvalEvaluator_mcq
from opencompass.utils.text_postprocessors import first_capital_postprocess_multi
agieval_single_choice_sets = [
@ -116,7 +116,7 @@ for _name in agieval_multiple_choices_sets:
inferencer=dict(type=GenInferencer, max_out_len=1024))
agieval_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
evaluator=dict(type=AGIEvalEvaluator_mcq),
pred_postprocessor=dict(type=first_capital_postprocess_multi))
agieval_datasets.append(

View File

@ -3,7 +3,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
bbh_reader_cfg = dict(input_columns=["input"], output_column="target")
@ -56,7 +56,7 @@ for _name in bbh_multiple_choice_sets:
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
evaluator=dict(type=BBHEvaluator_mcq),
pred_role="BOT",
pred_postprocessor=dict(type=bbh_mcq_postprocess),
dataset_postprocessor=dict(type=bbh_mcq_postprocess))

View File

@ -3,7 +3,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
bbh_reader_cfg = dict(input_columns=["input"], output_column="target")
@ -56,7 +56,7 @@ for _name in bbh_multiple_choice_sets:
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
evaluator=dict(type=BBHEvaluator_mcq),
pred_role="BOT",
pred_postprocessor=dict(type=bbh_mcq_postprocess),
dataset_postprocessor=dict(type=bbh_mcq_postprocess))

View File

@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
@ -25,7 +25,7 @@ gsm8k_infer_cfg = dict(
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
gsm8k_eval_cfg = dict(evaluator=dict(type=AccEvaluator),
gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
pred_postprocessor=dict(type=gsm8k_postprocess),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))

View File

@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
@ -72,7 +72,7 @@ Question: {question}{answer}
inferencer=dict(type=GenInferencer, max_out_len=512))
gsm8k_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
evaluator=dict(type=Gsm8kEvaluator),
pred_postprocessor=dict(type=gsm8k_postprocess),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))

View File

@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import SCInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer' )
generation_kwargs = dict(do_sample=True, temperature=0.7, top_k=40)
@ -73,7 +73,7 @@ Question: {question}{answer}
inferencer=dict(type=SCInferencer, max_out_len=512, generation_kwargs = generation_kwargs, infer_type='sc', sc_size = 20))
gsm8k_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
evaluator=dict(type=Gsm8kEvaluator),
pred_postprocessor=dict(type=gsm8k_postprocess),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
sc_size = 20)

View File

@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
@ -34,7 +34,7 @@ gsm8k_infer_cfg = dict(
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
gsm8k_eval_cfg = dict(evaluator=dict(type=AccEvaluator),
gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
pred_role="BOT",
pred_postprocessor=dict(type=gsm8k_postprocess),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))

View File

@ -1,6 +1,8 @@
from opencompass.models.claude_api.claude_api import Claude
from opencompass.models.claude_api.postprocessors import (
flores_postprocess, gsm8k_postprocess, humaneval_postprocess,
lcsts_postprocess, mbpp_postprocess, strategyqa_pred_postprocess)
from opencompass.utils.text_postprocessors import last_option_postprocess
from opencompass.models.claude_api.postprocessors import gsm8k_postprocess, humaneval_postprocess, lcsts_postprocess, mbpp_postprocess, strategyqa_pred_postprocess
agieval_single_choice_sets = [
'gaokao-chinese',
@ -47,6 +49,8 @@ claude_postprocessors = {
'lcsts': dict(type=lcsts_postprocess),
'mbpp': dict(type=mbpp_postprocess),
'strategyqa': dict(type=strategyqa_pred_postprocess),
'commonsense_qa': dict(type=last_option_postprocess, options='ABCDE'),
'flores_100_*-zho_simpl': dict(type=flores_postprocess),
}
for _name in agieval_multiple_choices_sets + agieval_single_choice_sets:

View File

@ -5,7 +5,7 @@
The program entry for the evaluation task is `run.py`. The usage is as follows:
```shell
python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run]
python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run] [--dump-eval-details]
```
Task Configuration (`$EXP`):
@ -66,6 +66,7 @@ The parameter explanation is as follows:
- `-w`: Specify the working path, default is `./outputs/default`.
- `-l`: Enable status reporting via Lark bot.
- `--dry-run`: When enabled, inference and evaluation tasks will be dispatched but won't actually run for debugging.
- `--dump-eval-details`: When enabled, the evaluation results under the `results` folder will include more details, such as the correctness of each sample (see the sketch after this list).
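
A rough sketch of what this flag maps to internally, mirroring the `run.py` change later in this diff; only the config field shown there is used, everything else is illustrative:

```python
# Equivalent of passing --dump-eval-details on the command line: run.py sets
# this field, and OpenICLEvalTask reads it to decide whether per-sample
# details are kept in the dumped results.
cfg.eval.runner.task.dump_details = True
```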
Using run mode `-m all` as an example, the overall execution flow is as follows:

View File

@ -5,7 +5,7 @@
评测任务的程序入口为 `run.py`,使用方法如下:
```shell
python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run]
python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run] [--dump-eval-details]
```
任务配置 (`$EXP`)
@ -66,6 +66,7 @@ python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--deb
- `-w`: 指定工作路径,默认为 `./outputs/default`
- `-l`: 打开飞书机器人状态上报。
- `--dry-run`: 开启时,推理和评测任务仅会分发但不会真正运行,便于调试;
- `--dump-eval-details`: 开启时,`results` 下的评测结果中将会包含更加详细的评测结果信息,例如每条样本是否正确等。
以运行模式 `-m all` 为例,整体运行流如下:

View File

@ -13,7 +13,7 @@ class AFQMCDataset_V2(BaseDataset):
@staticmethod
def load(path):
data = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
line['label'] = 'AB'[int(line['label'])]

View File

@ -64,9 +64,36 @@ class AGIEvalEvaluator(BaseEvaluator):
def score(self, predictions, references):
predictions = [parse_math_answer('', pred) for pred in predictions]
details = []
cnt = 0
for pred, ref in zip(predictions, references):
detail = {'pred': pred, 'answer': ref, 'correct': False}
if is_equiv(pred, ref):
cnt += 1
detail['correct'] = True
details.append(detail)
score = cnt / len(predictions) * 100
return {'score': score}
return {'score': score, 'details': details}
@ICL_EVALUATORS.register_module()
class AGIEvalEvaluator_mcq(BaseEvaluator):
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
details = []
cnt = 0
for pred, ref in zip(predictions, references):
detail = {'pred': pred, 'answer': ref, 'correct': False}
if pred == ref:
cnt += 1
detail['correct'] = True
details.append(detail)
score = cnt / len(predictions) * 100
return {'score': score, 'details': details}

View File

@ -61,11 +61,38 @@ class BBHEvaluator(BaseEvaluator):
predictions = [bbh_freeform_postprocess(pred) for pred in predictions]
details = []
cnt = 0
for pred, ref in zip(predictions, references):
detail = {'pred': pred, 'answer': ref, 'correct': False}
if pred == ref:
cnt += 1
detail['correct'] = True
details.append(detail)
score = cnt / len(predictions) * 100
return {'score': score}
return {'score': score, 'details': details}
@ICL_EVALUATORS.register_module()
class BBHEvaluator_mcq(BaseEvaluator):
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
details = []
cnt = 0
for pred, ref in zip(predictions, references):
detail = {'pred': pred, 'answer': ref, 'correct': False}
if pred == ref:
cnt += 1
detail['correct'] = True
details.append(detail)
score = cnt / len(predictions) * 100
return {'score': score, 'details': details}

View File

@ -13,7 +13,7 @@ class bustumDataset_V2(BaseDataset):
@staticmethod
def load(path):
data = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
line['label'] = 'AB'[int(line['label'])]

View File

@ -13,7 +13,7 @@ class C3Dataset(BaseDataset):
@staticmethod
def load(path: str):
with open(path) as f:
with open(path, 'r', encoding='utf-8') as f:
data = json.load(f)
rows = []
for _, row in enumerate(data):
@ -58,7 +58,7 @@ class C3Dataset_V2(BaseDataset):
@staticmethod
def load(path: str):
with open(path) as f:
with open(path, 'r', encoding='utf-8') as f:
raw = json.load(f)
data = []
for line in raw:

View File

@ -15,7 +15,8 @@ class CEvalDataset(BaseDataset):
def load(path: str, name: str):
dataset = {}
for split in ['dev', 'val', 'test']:
with open(osp.join(path, split, f'{name}_{split}.csv')) as f:
filename = osp.join(path, split, f'{name}_{split}.csv')
with open(filename, encoding='utf-8') as f:
reader = csv.reader(f)
header = next(reader)
for row in reader:

View File

@ -31,7 +31,7 @@ class CHIDDataset_V2(BaseDataset):
@staticmethod
def load(path):
data = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
item = {}

View File

@ -41,7 +41,7 @@ class CluewscDataset_V2(BaseDataset):
@staticmethod
def load(path):
data = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
item = {

View File

@ -13,9 +13,9 @@ class CMBDataset(BaseDataset):
@staticmethod
def load(path: str):
with open(osp.join(path, 'test.json'), 'r') as f:
with open(osp.join(path, 'test.json'), 'r', encoding='utf-8') as f:
test_data = json.load(f)
with open(osp.join(path, 'val.json'), 'r') as f:
with open(osp.join(path, 'val.json'), 'r', encoding='utf-8') as f:
val_data = json.load(f)
for da in test_data:

View File

@ -13,7 +13,7 @@ class cmnliDataset_V2(BaseDataset):
@staticmethod
def load(path):
data = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
if line['label'] == '-':

View File

@ -12,7 +12,7 @@ class CMRCDataset(BaseDataset):
@staticmethod
def load(path: str):
with open(path) as f:
with open(path, 'r', encoding='utf-8') as f:
data = json.load(f)
# 将原始数据转换为所需的格式
rows = []

View File

@ -13,7 +13,7 @@ class COPADataset_V2(BaseDataset):
@staticmethod
def load(path):
dataset = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
line['label'] = 'AB'[line['label']]

View File

@ -31,7 +31,7 @@ class CslDataset_V2(BaseDataset):
@staticmethod
def load(path):
data = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
item = {

View File

@ -12,7 +12,7 @@ class DRCDDataset(BaseDataset):
@staticmethod
def load(path: str):
with open(path) as f:
with open(path, 'r', encoding='utf-8') as f:
data = json.load(f)
# 将原始数据转换为所需的格式
rows = []

View File

@ -13,7 +13,7 @@ class eprstmtDataset_V2(BaseDataset):
@staticmethod
def load(path):
data = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
item = {

View File

@ -1,3 +1,4 @@
from opencompass.openicl import BaseEvaluator
from opencompass.registry import TEXT_POSTPROCESSORS
@ -26,3 +27,25 @@ def gsm8k_postprocess(text: str) -> str:
if ret[i].isdigit():
ret1 += ret[i]
return ret1
class Gsm8kEvaluator(BaseEvaluator):
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
correct = 0
count = 0
details = []
for i, j in zip(predictions, references):
detail = {'pred': i, 'answers': j, 'correct': False}
count += 1
if i == j:
correct += 1
detail['correct'] = True
details.append(detail)
result = {'accuracy': 100 * correct / count, 'details': details}
return result
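
A minimal usage sketch of the evaluator above, with toy predictions and references (not real GSM8K data), just to show the returned structure:

```python
# Exact-match scoring after post-processing, as the framework would call it.
evaluator = Gsm8kEvaluator()
result = evaluator.score(predictions=['18', '7', '42'],
                         references=['18', '7', '41'])
print(result['accuracy'])    # 66.66...
print(result['details'][0])  # {'pred': '18', 'answers': '18', 'correct': True}
```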

View File

@ -49,7 +49,7 @@ class hellaswagDataset_V3(BaseDataset):
@staticmethod
def load(path):
dataset = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
data = json.loads(line)
dataset.append({

View File

@ -148,11 +148,15 @@ class MATHEvaluator(BaseEvaluator):
}
correct = 0
count = 0
details = []
for i, j in zip(predictions, references):
detail = {'pred': i, 'answer': j, 'correct': False}
count += 1
if self.is_equiv(i, j):
correct += 1
result = {'accuracy': 100 * correct / count}
detail['correct'] = True
details.append(detail)
result = {'accuracy': 100 * correct / count, 'details': details}
return result
def _fix_fracs(self, string):

View File

@ -52,9 +52,14 @@ class NQEvaluator(BaseEvaluator):
processed_answers = [[general_postprocess(j).lower() for j in i]
for i in references]
details = []
cnt = 0
for pred, cand_ans in zip(processed_predictions, processed_answers):
detail = {'pred': pred, 'answer': cand_ans, 'correct': False}
cnt += int(any([cand == pred for cand in cand_ans]))
if int(any([cand == pred for cand in cand_ans])):
detail['correct'] = True
details.append(detail)
score = cnt / len(predictions) * 100
return {'score': score}
return {'score': score, 'details': details}

View File

@ -67,7 +67,7 @@ class TNewsDataset_V2(BaseDataset):
}
data = []
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
item = {

View File

@ -51,9 +51,14 @@ class TriviaQAEvaluator(BaseEvaluator):
processed_answers = [[general_postprocess(j).lower() for j in i]
for i in references]
details = []
cnt = 0
for pred, cand_ans in zip(processed_predictions, processed_answers):
detail = {'pred': pred, 'answer': cand_ans, 'correct': False}
cnt += int(any([cand == pred for cand in cand_ans]))
if int(any([cand == pred for cand in cand_ans])):
detail['correct'] = True
details.append(detail)
score = cnt / len(predictions) * 100
return {'score': score}
return {'score': score, 'details': details}

View File

@ -82,6 +82,20 @@ def strategyqa_pred_postprocess(text: str) -> str:
return ''
def flores_postprocess(text: str) -> str:
text = text.strip().split('\n')[-1].strip()
return text
def flores_postprocess_chinese(text: str) -> str:
text = text.strip().split('\n')[-1].strip()
import jieba
truncated_text = text.strip().split('\n')[0]
cleaned_text = re.sub(r'\s+', ' ', truncated_text).strip()
cleaned_text = ' '.join(jieba.cut(cleaned_text))
return cleaned_text
def record_postprocess(text: str) -> str:
match = re.search(r'(?<=refers to )[^.]+', text)

View File

@ -24,11 +24,18 @@ class EMEvaluator(BaseEvaluator):
for i in references]
cnt = 0
details = []
for pred, ans, origin_ans in zip(predictions, processed_answers,
references):
answers = list(set(ans + origin_ans))
detail = {'pred': pred, 'answer': answers}
if pred in ans or pred in origin_ans:
cnt += 1
detail['correct'] = True
else:
detail['correct'] = False
details.append(detail)
score = cnt / len(predictions) * 100
return {'score': score}
return {'score': score, 'details': details}

View File

@ -51,8 +51,7 @@ class BaseInferencer:
self.output_json_filepath = output_json_filepath
self.output_json_filename = output_json_filename
self.is_main_process = is_main_process()
if not os.path.exists(self.output_json_filepath):
os.makedirs(self.output_json_filepath)
os.makedirs(self.output_json_filepath, exist_ok=True)
def inference(self,
retriever: BaseRetriever,

View File

@ -94,6 +94,7 @@ class PPLInferencer(BaseInferencer):
index = 0
prompt_list = []
sub_ppl_list = []
token_num_list = []
normalizing_prompt_list = []
context_length_list = []
@ -144,6 +145,7 @@ class PPLInferencer(BaseInferencer):
mode='ppl'))
normalizing_prompt_list.append(normalizing_prompt)
prompt_list.append(prompt)
token_num_list.append(prompt_token_num)
if normalizing_str is not None:
normalizing_str_len = self.model.get_token_len_from_template(
@ -186,6 +188,10 @@ class PPLInferencer(BaseInferencer):
ice_str = self.model.parse_template(ice[idx], mode='ppl')
output_handler.save_prompt_and_ppl(
label, prompt.replace(ice_str, ''), prompt, res, index)
output_handler.results_dict[str(
index)][f'label: {str(label)}'][
'BPB'] = res * token_num_list[idx] / len(
prompt.replace(ice_str, '').encode())
index = index + 1
ppl.append(sub_ppl_list)
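
For reference, the per-label value stored above is just the PPL result rescaled by tokens per byte; a toy walk-through with made-up numbers (the exact meaning of `res` depends on the model's `get_ppl` implementation):

```python
# Made-up values, only to make the arithmetic in the new lines explicit.
res = 2.3          # per-token PPL/loss value returned for this label
token_num = 120    # token count of the full prompt (prompt_token_num)
n_bytes = 480      # UTF-8 byte length of the prompt with the ICE part stripped
bpb_like = res * token_num / n_bytes   # 0.575, the value stored under 'BPB'
```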

View File

@ -1,6 +1,6 @@
from abc import abstractmethod
from copy import deepcopy
from typing import Dict, List
from typing import Dict, List, Optional
from mmengine.config import ConfigDict
@ -13,16 +13,24 @@ class BasePartitioner:
Args:
out_dir (str): The output directory of tasks.
keep_keys (List[str]): The keys to be kept from the experiment config
to the task config.
keep_keys (Optional[List[str]], optional): The keys to be kept from the
experiment config to the task config. Defaults to None. If None,
the following keys will be kept:
- eval.runner.task.judge_cfg
- eval.runner.task.dump_details
"""
def __init__(self,
out_dir: str,
keep_keys: List[str] = ['eval.runner.task.judge_cfg']):
def __init__(self, out_dir: str, keep_keys: Optional[List[str]] = None):
self.logger = get_logger()
self.out_dir = out_dir
self.keep_keys = keep_keys
if keep_keys is None:
self.keep_keys = [
'eval.runner.task.judge_cfg',
'eval.runner.task.dump_details',
]
else:
self.keep_keys = keep_keys
def __call__(self, cfg: ConfigDict) -> List[Dict]:
"""Generate tasks from config. Each task is defined as a
@ -63,7 +71,8 @@ class BasePartitioner:
tgt_ptr = tgt_ptr[key]
tgt_ptr[key_chain[-1]] = ori_ptr[key_chain[-1]]
except Exception:
self.logger.warning(f'Key {k} not found in config, ignored.')
self.logger.debug(f'Key {k} not found in config, ignored.')
self.logger.debug(f'Additional config: {add_cfg}')
tasks = self.partition(models,
datasets,

View File

@ -1,5 +1,5 @@
import os.path as osp
from typing import Dict, List
from typing import Dict, List, Optional
from mmengine.config import Config, ConfigDict
@ -11,15 +11,23 @@ from .base import BasePartitioner
@PARTITIONERS.register_module()
class NaivePartitioner(BasePartitioner):
"""Naive task partitioner. This partitioner will generate a task for each
model-dataset pair.
"""Naive task partitioner. This partitioner will generate a task for each n
model-dataset pairs.
Args:
out_dir (str): The output directory of tasks.
n (int): The number of model-dataset pairs in each task.
keep_keys (List[str]): The keys to be kept from the experiment config
to the task config.
"""
def __init__(self,
out_dir: str,
n: int = 1,
keep_keys: Optional[List[str]] = None):
super().__init__(out_dir=out_dir, keep_keys=keep_keys)
self.n = n
def partition(self,
models: List[ConfigDict],
datasets: List[ConfigDict],
@ -53,13 +61,17 @@ class NaivePartitioner(BasePartitioner):
tasks = []
for model in models:
chunks = []
for dataset in datasets:
filename = get_infer_output_path(model, dataset, out_dir)
if osp.exists(filename):
continue
chunks.append(dataset)
for i in range(0, len(chunks), self.n):
task = Config({
'models': [model],
'datasets': [[dataset]],
'datasets': [chunks[i:i + self.n]],
'work_dir': work_dir,
**add_cfg
})

View File

@ -2,7 +2,7 @@ import copy
import math
import os.path as osp
from fnmatch import fnmatch
from typing import Dict, List, Tuple, Union
from typing import Dict, List, Optional, Tuple, Union
import mmengine
from mmengine.config import Config, ConfigDict
@ -24,6 +24,11 @@ class SizePartitioner(BasePartitioner):
max_task_size (int): The maximum size of a task.
gen_task_coef (int): The dataset cost measurement coefficient for
generation tasks.
strategy (str): The partition strategy. Supported strategies are:
'heuristic' and 'split'. Defaults to 'heuristic'.
heuristic: split large datasets into several tasks, merge small
datasets into one task.
split: split large datasets into several tasks only.
dataset_size_path (str): The path to the dataset size cache file.
keep_keys (list[str]): The keys to be kept from the experiment config
to the task config.
@ -33,12 +38,17 @@ class SizePartitioner(BasePartitioner):
out_dir: str,
max_task_size: int = 40000,
gen_task_coef: int = 20,
strategy: str = 'heuristic',
dataset_size_path: str = '.cache/dataset_size.json',
keep_keys: List[str] = ['eval.runner.task.judge_cfg']):
keep_keys: Optional[List[str]] = None):
super().__init__(out_dir=out_dir, keep_keys=keep_keys)
self.max_task_size = max_task_size
self.gen_task_coef = gen_task_coef
self.dataset_size_path = dataset_size_path
assert strategy in ('heuristic', 'split'), \
f'Unsupported partition strategy: {strategy}. '\
'Supported strategies are: `heuristic`, `split` .'
self.strategy = strategy
def partition(self,
models: List[ConfigDict],
@ -79,47 +89,47 @@ class SizePartitioner(BasePartitioner):
reverse=True)
tasks = []
for model in models:
task = Config({
'models': [model],
'datasets': [[]],
'work_dir': work_dir,
**add_cfg
})
num_data = 0
chunks = [] # elements: tuple(size, dataset_chunk)
for dataset in datasets:
filename = get_infer_output_path(model, dataset, out_dir)
root, ext = osp.splitext(filename)
# skip the task if the task output exists
if osp.exists(filename):
continue
dataset_size = self.get_cost(dataset)
if dataset_size > self.max_task_size:
root, ext = osp.splitext(filename)
dataset_splits = self.split_dataset(dataset)
for i, dataset_split in enumerate(dataset_splits):
# skip the task if the task output exists
if not osp.exists(f'{root}_{i}{ext}'):
tasks.append(
Config({
'models': [model],
'datasets': [[dataset_split]],
'work_dir': work_dir,
**add_cfg
}))
chunks.append((self.max_task_size, dataset_split))
else:
if num_data + dataset_size > self.max_task_size:
tasks.append(task)
task = Config({
chunks.append((dataset_size, dataset))
if self.strategy == 'heuristic':
chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
current_size, current_chunks = 0, []
for index in range(len(chunks)):
current_size += chunks[index][0]
current_chunks.append(chunks[index][1])
if index == len(chunks) - 1 or current_size + chunks[
index + 1][0] > self.max_task_size:
tasks.append(
Config({
'models': [model],
'datasets': [current_chunks],
'work_dir': work_dir,
**add_cfg
}))
current_size, current_chunks = 0, []
elif self.strategy == 'split':
for _, dataset in chunks:
tasks.append(
Config({
'models': [model],
'datasets': [[]],
'datasets': [[dataset]],
'work_dir': work_dir,
**add_cfg
})
num_data = 0
task['datasets'][0].append(dataset)
num_data = num_data + dataset_size
if task['datasets'][0]:
tasks.append(task)
}))
return tasks
@property
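
How the new `strategy` option would typically be chosen, as a hedged config sketch; the keys follow the constructor signature shown above, but the exact placement inside a real OpenCompass config may differ:

```python
# 'heuristic': split oversized datasets, then greedily pack chunks up to
#              max_task_size per task (the default).
# 'split':     only split oversized datasets; no packing of small ones.
from opencompass.partitioners import SizePartitioner  # assumed import path

infer = dict(
    partitioner=dict(
        type=SizePartitioner,
        max_task_size=40000,
        gen_task_coef=20,
        strategy='split',
    ),
)
```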

View File

@ -23,7 +23,7 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
mode: str,
out_dir: str,
model_pairs: Optional[List[Tuple]] = None,
keep_keys: List[str] = ['eval.runner.task.judge_cfg']):
keep_keys: Optional[List[str]] = None):
super().__init__(out_dir=out_dir, keep_keys=keep_keys)
assert mode in ['all', 'one_to_n', 'fixed']
self.mode = mode

View File

@ -72,6 +72,7 @@ class DefaultSummarizer:
if not osp.exists(filepath):
continue
result = mmengine.load(filepath)
result.pop('details', None)
raw_results[model_abbr][dataset_abbr] = result
if 'error' in result:
self.logger.debug(f'error in {model_abbr} {dataset_abbr} {result["error"]}')

View File

@ -1,11 +1,14 @@
import argparse
import copy
import fnmatch
import math
import os.path as osp
import statistics
import time
from collections import Counter
from inspect import signature
from shutil import which
from typing import Optional
from typing import List, Optional
import mmengine
from mmengine.config import Config, ConfigDict
@ -35,6 +38,8 @@ class OpenICLEvalTask(BaseTask):
super().__init__(cfg)
self.num_gpus = 0
self.logger = get_logger()
self.dump_details = cfg.get('eval', {}).get('runner', {}).get(
'task', {}).get('dump_details', False)
def get_command(self, cfg_path, template):
script_path = __file__
@ -113,7 +118,7 @@ class OpenICLEvalTask(BaseTask):
[sub_preds[str(i)] for i in range(len(sub_preds))])
filename = root + f'_{i}' + ext
i += 1
pred_dicts = copy.deepcopy(preds)
preds = {k: [pred.get(k) for pred in preds] for k in preds[0]}
pred_strs = preds.pop('prediction')
@ -163,6 +168,7 @@ class OpenICLEvalTask(BaseTask):
]
icl_evaluator = ICL_EVALUATORS.build(self.eval_cfg['evaluator'])
preds['predictions'] = pred_strs
preds['references'] = (test_set[self.output_column]
if self.output_column else None)
@ -172,18 +178,42 @@ class OpenICLEvalTask(BaseTask):
}
result = icl_evaluator.score(**preds)
if self.dump_details:
try:
details = result.pop('details', None)
result['details'] = self.format_details(
pred_strs, test_set[self.output_column], details,
pred_dicts)
result['type'] = result['details'].pop('type', None)
if 'PPL' in str(
self.dataset_cfg.infer_cfg.inferencer.type):
result['correct_bpb'], result[
'incorrect_bpb'] = self.calculate_bpb(pred_dicts)
else:
result['incorrect_bpb'] = result['correct_bpb'] = -1
except Exception:
result['incorrect_bpb'] = result['correct_bpb'] = -1
else:
result.pop('details', None)
if 'error' in result:
self.logger.error(
f'Task {task_abbr_from_cfg(self.cfg)}: {result["error"]}')
return
else:
self.logger.info(f'Task {task_abbr_from_cfg(self.cfg)}: {result}')
result_wo_details = {
i: result[i]
for i in result if i != 'details'
}
self.logger.info(
f'Task {task_abbr_from_cfg(self.cfg)}: {result_wo_details}')
# Save result
out_path = get_infer_output_path(self.model_cfg, self.dataset_cfg,
osp.join(self.work_dir, 'results'))
mkdir_or_exist(osp.split(out_path)[0])
mmengine.dump(result, out_path)
mmengine.dump(result, out_path, ensure_ascii=False, indent=4)
def _extract_role_pred(self, s: str, begin_str: Optional[str],
end_str: Optional[str]) -> str:
@ -215,6 +245,95 @@ class OpenICLEvalTask(BaseTask):
return s[start:end]
def format_details(self, predictions, references, details, pred_dicts):
"""This function is responsible for formatting prediction details.
Args:
predictions (list): The prediction list.
references (list): The reference list.
details (list): Contains the 'pred', 'answer' and 'correct' for each
sample. Such as `[{'pred': '光荣和ωforce',
'answers': ['光荣和ω-force', '光荣和ωforce'], 'correct': True}]`
pred_dicts (list): Contains a list of samples with the original
prompts. Such as
`[{'origin_prompt': '根据文章回答问题。你的答案应该尽可能3》…………',
'prediction': ' 光荣和ω-force\n', 'gold': ['光荣和ω-force']}]`
Returns:
list: The formatted prediction details.
"""
results = {}
for i in range(len(predictions)):
ppl_flag = False
result = {}
origin_prediction = copy.deepcopy(pred_dicts[i])
origin_prediction.pop('in-context examples', None)
origin_prediction.pop('prediction', None)
keys = copy.deepcopy(list(origin_prediction.keys()))
for key in keys:
if key.startswith('label:'):
ppl_flag = True
origin_prediction[key].pop('testing input', None)
new_key = key.replace('label: ', '')
origin_prediction[new_key] = origin_prediction.pop(key)
if ppl_flag:
results['type'] = 'PPL'
result['origin_prediction'] = origin_prediction
result['predictions'] = str(predictions[i])
result['references'] = str(references[i])
result['correct'] = str(predictions[i]) == str(references[i])
else:
results['type'] = 'GEN'
result['prompt'] = origin_prediction['origin_prompt']
result['origin_prediction'] = pred_dicts[i]['prediction']
result['predictions'] = details[i]['pred']
result['references'] = details[i]['answers']
result['correct'] = details[i]['correct']
results[str(i)] = result
return results
def calculate_bpb(self, pred_dicts: List):
"""This function is used to calculate the BPB (Bits Per Byte) for the
data. The correct BPB is obtained directly from the values in the
'predictions' file. The incorrect BPB is the average of the remaining
BPB values for each sample under different labels after subtracting the
correct BPB. The calculation of BPB (Bits Per Byte) is similar to PPL,
with the difference that it computes the additional bits needed on
average, in terms of character length, to encode the true sequence
based on the predictions. This calculation involves applying a
weighting factor based on the ratio of words to characters.
Args:
pred_dicts (list): Contains a list of samples with each options
and BPB scores.
Returns:
dict: Contains correct and incorrect bpb.
"""
incorrect_bpb_list = []
bpb_list = []
for pred_dict in pred_dicts:
preds = {
key: value
for key, value in pred_dict.items()
if key.startswith('label: ')
}
values = []
for item in preds.items():
values.append(item[1])
bpbs = [value['BPB'] for value in values]
incorrect_bpb_list.append(
(sum(bpbs) - min(bpbs)) / (len(bpbs) - 1))
bpb_list.append(statistics.mean(bpbs))
def filters(origins):
targets = [target for target in origins if not math.isnan(target)]
return targets
mean_incorrect = statistics.mean(filters(incorrect_bpb_list))
mean_correct = statistics.mean(filters(bpb_list))
return 100 * mean_correct, 100 * mean_incorrect
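
To make the aggregation above concrete, a toy walk-through for a single sample with made-up per-label scores:

```python
import statistics

# Made-up BPB values for the candidate labels of one sample.
bpbs = [0.50, 0.80, 0.95]
# Entry appended to bpb_list: plain mean over all labels.
sample_bpb = statistics.mean(bpbs)                            # 0.75
# Entry appended to incorrect_bpb_list: mean after dropping the smallest BPB.
sample_incorrect = (sum(bpbs) - min(bpbs)) / (len(bpbs) - 1)  # 0.875
# calculate_bpb then averages these across samples (NaNs filtered out) and
# returns both values scaled by 100.
```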
def parse_args():
parser = argparse.ArgumentParser(description='Score Calculator')

View File

@ -25,6 +25,7 @@ requests==2.31.0
rouge
rouge_chinese
rouge_score
sacrebleu
scikit_learn==1.2.1
seaborn
sentence_transformers==2.2.2

run.py
View File

@ -123,6 +123,12 @@ def parse_args():
'Will be overridden by the "retry" argument in the config.',
type=int,
default=2)
parser.add_argument(
'--dump-eval-details',
help='Whether to dump the evaluation details, including the '
'correctness of each sample, bpb, etc.',
action='store_true',
)
# set srun args
slurm_parser = parser.add_argument_group('slurm_args')
parse_slurm_args(slurm_parser)
@ -300,6 +306,8 @@ def main():
if args.dlc or args.slurm or cfg.get('eval', None) is None:
fill_eval_cfg(cfg, args)
if args.dump_eval_details:
cfg.eval.runner.task.dump_details = True
if args.partition is not None:
if RUNNERS.get(cfg.eval.runner.type) == SlurmRunner: