Mirror of https://github.com/open-compass/opencompass.git, synced 2025-05-30 16:03:24 +08:00
[Feature] Add P-MMEval (#1714)
* Update with PMMEval
* Update
* Update __init__.py
* Fix Bugs
* Delete .pre-commit-config.yaml
* Pull merge

Co-authored-by: liushz <qq1791167085@163.com>
This commit is contained in: parent f7dbe6bb7d, commit 90efcf2216
configs/eval_PMMEval.py (new executable file, 32 lines)
@@ -0,0 +1,32 @@
from mmengine.config import read_base

from opencompass.models import HuggingFacewithChatTemplate


with read_base():
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models

    # from opencompass.configs.datasets.PMMEval.flores_gen import PMMEval_flores_datasets
    # from opencompass.configs.datasets.PMMEval.humanevalxl_gen import PMMEval_HumanEvalXL_datasets
    # from opencompass.configs.datasets.PMMEval.mgsm_gen import PMMEval_MGSM_datasets
    # from opencompass.configs.datasets.PMMEval.mhellaswag_gen import PMMEval_MHellaswag_datasets
    # from opencompass.configs.datasets.PMMEval.mifeval_gen import PMMEval_MIFEval_datasets
    # from opencompass.configs.datasets.PMMEval.mlogiqa_gen import PMMEval_MLogiQA_datasets
    # from opencompass.configs.datasets.PMMEval.mmmlu_gen import PMMEval_MMMLU_datasets
    # from opencompass.configs.datasets.PMMEval.xnli_gen import PMMEval_XNLI_datasets

    from opencompass.configs.datasets.PMMEval.pmmeval_gen import PMMEval_datasets

    from opencompass.configs.summarizers.PMMEval import summarizer


# Uncomment one of the following to evaluate a single sub-benchmark only.
# datasets = PMMEval_flores_datasets
# datasets = PMMEval_HumanEvalXL_datasets
# datasets = PMMEval_MGSM_datasets
# datasets = PMMEval_MHellaswag_datasets
# datasets = PMMEval_MIFEval_datasets
# datasets = PMMEval_MLogiQA_datasets
# datasets = PMMEval_MMMLU_datasets
# datasets = PMMEval_XNLI_datasets

datasets = PMMEval_datasets
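Note: with the aggregate `PMMEval_datasets` selected above, this config runs the whole suite. Assuming a standard OpenCompass checkout, it is launched like any other config, e.g. `python run.py configs/eval_PMMEval.py`; switching the `datasets` assignment to one of the commented alternatives restricts the run to a single sub-benchmark.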
opencompass/configs/datasets/PMMEval/flores_gen.py (new executable file, 4 lines)
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .flores_gen_2697d7 import PMMEval_flores_datasets
opencompass/configs/datasets/PMMEval/flores_gen_2697d7.py (new executable file, 65 lines)
@@ -0,0 +1,65 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalFloresDataset, PMMEvalFloresEvaluator, pmmeval_flores_postprocess

NATURAL_LANGUAGE_FULLNAMES_FLORES = ['Chinese', 'Arabic', 'Spanish', 'French', 'Japanese', 'Korean', 'Portuguese', 'Thai', 'Vietnamese']

PROMPT = {
    "Chinese": "将这个句子从英语翻译成中文。\n\n{src}",
    "Arabic": "ترجم هذه الجملة من الإنجليزية إلى العربية.\n\n{src}",
    "Spanish": "Traduce esta oración del inglés al español.\n\n{src}",
    "Japanese": "この文を英語から日本語に翻訳してください。\n\n{src}",
    "Korean": "이 문장을 영어에서 한국어로 번역하세요.\n\n{src}",
    "Thai": "แปลประโยคนี้จากภาษาอังกฤษเป็นภาษาไทย.\n\n{src}",
    "French": "Traduisez cette phrase de l'anglais en français.\n\n{src}",
    "Portuguese": "Traduza esta frase do inglês para o português.\n\n{src}",
    "Vietnamese": "Dịch câu này từ tiếng Anh sang tiếng Việt.\n\n{src}"
}

# Add flores_200
PMMEval_flores_reader_cfg = dict(
    input_columns=['src'],
    output_column='tgt',
    test_split='test'
)

PMMEval_flores_datasets = list()

for lang_fullname in NATURAL_LANGUAGE_FULLNAMES_FLORES:
    PMMEval_flores_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(
                        role='HUMAN',
                        prompt=PROMPT[lang_fullname]
                    )
                ]
            )
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    PMMEval_flores_eval_cfg = dict(
        evaluator=dict(type=PMMEvalFloresEvaluator),
        pred_role='BOT',
        pred_postprocessor=dict(type=pmmeval_flores_postprocess, lang_fullname=lang_fullname)
    )

    PMMEval_flores_datasets.append(
        dict(
            abbr=f'flores-{lang_fullname}',
            type=PMMEvalFloresDataset,
            path='P-MMEval',
            lang_fullname=lang_fullname,
            reader_cfg=PMMEval_flores_reader_cfg,
            infer_cfg=PMMEval_flores_infer_cfg,
            eval_cfg=PMMEval_flores_eval_cfg)
    )
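Each pass through the loop freezes one target language's prompt into an independent dataset entry. A minimal sketch of the abbreviations this produces (names taken from the config above):

    langs = ['Chinese', 'Arabic', 'Spanish', 'French', 'Japanese',
             'Korean', 'Portuguese', 'Thai', 'Vietnamese']
    print([f'flores-{lang}' for lang in langs])
    # ['flores-Chinese', 'flores-Arabic', ...] -- the subset names that the
    # 'flores' summary group averages into one score later on.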
opencompass/configs/datasets/PMMEval/humanevalxl_gen.py (new executable file, 4 lines)
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .humanevalxl_gen_bdec92 import PMMEval_HumanEvalXL_datasets
opencompass/configs/datasets/PMMEval/humanevalxl_gen_bdec92.py (new executable file, 49 lines)
@@ -0,0 +1,49 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalHumanEvalXLDataset, PMMEvalHumanEvalXLEvaluator

NATURAL_LANGUAGE_FULLNAMES = ['English', 'Chinese', 'Arabic', 'Spanish', 'French', 'Japanese', 'Korean', 'Portuguese', 'Thai', 'Vietnamese']

PMMEval_HumanEvalXL_reader_cfg = dict(
    input_columns=['task_id', 'prompt', 'entry_point', 'test', 'language', 'description', 'natural_language'],
    output_column='declaration',
    test_split='test'
)

PMMEval_HumanEvalXL_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='{prompt}'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

PMMEval_HumanEvalXL_datasets = list()

for lang_fullname in NATURAL_LANGUAGE_FULLNAMES:
    for program_lang in ['python', 'java', 'javascript']:

        PMMEval_HumanEvalXL_eval_cfg = dict(
            evaluator=dict(
                type=PMMEvalHumanEvalXLEvaluator,
                language=program_lang,
                text_language=lang_fullname,
                ip_address='localhost',
                port=5001),
            pred_role='BOT')

        PMMEval_HumanEvalXL_datasets.append(
            dict(
                abbr=f'humanevalxl-{program_lang}-{lang_fullname}',
                type=PMMEvalHumanEvalXLDataset,
                path='P-MMEval',
                lang=lang_fullname,
                program_lang=program_lang,
                reader_cfg=PMMEval_HumanEvalXL_reader_cfg,
                infer_cfg=PMMEval_HumanEvalXL_infer_cfg,
                eval_cfg=PMMEval_HumanEvalXL_eval_cfg)
        )
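Note that `PMMEvalHumanEvalXLEvaluator` does not execute generated code locally: it uploads generations to an external code-evaluation service, which this config expects at localhost:5001 (see `_code_eval_service` in opencompass/datasets/PMMEval/humanevalxl.py below). A reachability sketch, assuming that service is already running and `preds.json` is a hypothetical predictions file:

    import subprocess

    # Mirrors the curl invocation the evaluator itself issues.
    subprocess.run(['curl', '-X', 'POST', '-F', 'file=@preds.json', '-F',
                    'dataset=humanevalx/python', 'localhost:5001/evaluate'])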
opencompass/configs/datasets/PMMEval/mgsm_gen.py (new executable file, 4 lines)
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .mgsm_gen_679720 import PMMEval_MGSM_datasets
opencompass/configs/datasets/PMMEval/mgsm_gen_679720.py (new executable file, 62 lines)
@@ -0,0 +1,62 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalMGSMDataset, PMMEvalMGSMEvaluator

NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']

LANG_TO_INSTRUCTIONS = {
    "en": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"The answer is \". Do not add anything other than the integer answer after \"The answer is \".\n\n{question}",
    "es": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"La respuesta es \". Do not add anything other than the integer answer after \"La respuesta es \".\n\n{question}",
    "fr": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"La réponse est \". Do not add anything other than the integer answer after \"La réponse est \".\n\n{question}",
    "zh": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"答案是 \". Do not add anything other than the integer answer after \"答案是 \".\n\n{question}",
    "ja": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"答えは \". Do not add anything other than the integer answer after \"答えは \".\n\n{question}",
    "th": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"คำตอบคือ \". Do not add anything other than the integer answer after \"คำตอบคือ \".\n\n{question}",
    "ko": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"답변은 \". Do not add anything other than the integer answer after \"답변은 \".\n\n{question}",
    "pt": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"A resposta é \". Do not add anything other than the integer answer after \"A resposta é \".\n\n{question}",
    "vi": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"Câu trả lời là \". Do not add anything other than the integer answer after \"Câu trả lời là \".\n\n{question}",
    "ar": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"الجواب هو \". Do not add anything other than the integer answer after \"الجواب هو \".\n\n{question}"
}

PMMEval_MGSM_datasets = list()

PMMEval_MGSM_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer',
    test_split='test'
)

PMMEval_MGSM_eval_cfg = dict(
    evaluator=dict(type=PMMEvalMGSMEvaluator),
    pred_role='BOT')


for lang_code in NATURAL_LANGUAGE_CODES:
    PMMEval_MGSM_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(
                        role='HUMAN',
                        prompt=LANG_TO_INSTRUCTIONS[lang_code]
                    )
                ]
            )
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    PMMEval_MGSM_datasets.append(
        dict(
            abbr=f'mgsm-{lang_code}',
            type=PMMEvalMGSMDataset,
            path='P-MMEval',
            lang=lang_code,
            reader_cfg=PMMEval_MGSM_reader_cfg,
            infer_cfg=PMMEval_MGSM_infer_cfg,
            eval_cfg=PMMEval_MGSM_eval_cfg)
    )
opencompass/configs/datasets/PMMEval/mhellaswag_gen.py (new executable file, 4 lines)
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .mhellaswag_gen_1a6b73 import PMMEval_MHellaswag_datasets
opencompass/configs/datasets/PMMEval/mhellaswag_gen_1a6b73.py (new executable file, 54 lines)
@@ -0,0 +1,54 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalMHellaswagDataset, PMMEvalMHellaswagEvaluator, pmmeval_mhellaswag_postprocess

NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']

PMMEVAL_MHELLASWAG_TEMPLATE = "Input: {ctx}\nOptions: \nA. {option_1}\nB. {option_2}\nC. {option_3}\nD. {option_4}\nPick the correct ending for the sentence from A, B, C, and D, and return it in the following JSON format:\n{\"answer\": \"[choice]\"}\nwhere [choice] must be one of A, B, C or D."

PMMEval_MHellaswag_datasets = list()

PMMEval_MHellaswag_reader_cfg = dict(
    input_columns=['ctx', 'option_1', 'option_2', 'option_3', 'option_4'],
    output_column='label',
    test_split='test'
)

PMMEval_MHellaswag_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=PMMEVAL_MHELLASWAG_TEMPLATE
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

for lang_code in NATURAL_LANGUAGE_CODES:
    PMMEval_MHellaswag_eval_cfg = dict(
        evaluator=dict(type=PMMEvalMHellaswagEvaluator),
        pred_role='BOT',
        pred_postprocessor=dict(type=pmmeval_mhellaswag_postprocess, lang_code=lang_code)
    )

    PMMEval_MHellaswag_datasets.append(
        dict(
            abbr=f'mhellaswag-{lang_code}',
            type=PMMEvalMHellaswagDataset,
            path='P-MMEval',
            lang=lang_code,
            reader_cfg=PMMEval_MHellaswag_reader_cfg,
            infer_cfg=PMMEval_MHellaswag_infer_cfg,
            eval_cfg=PMMEval_MHellaswag_eval_cfg)
    )
opencompass/configs/datasets/PMMEval/mifeval_gen.py (new executable file, 4 lines)
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .mifeval_gen_79f8fb import PMMEval_MIFEval_datasets
opencompass/configs/datasets/PMMEval/mifeval_gen_79f8fb.py (new executable file, 51 lines)
@@ -0,0 +1,51 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalMIFEvalDataset, PMMEvalMIFEvalEvaluator, pmmeval_mifeval_postprocess

NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']

PMMEVAL_MIFEVAL_TEMPLATE = "{prompt}"

PMMEval_MIFEval_datasets = list()

PMMEval_MIFEval_reader_cfg = dict(
    input_columns=['prompt', 'instruction_id_list', 'kwargs'],
    output_column=None,
    test_split='test'
)

PMMEval_MIFEval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=PMMEVAL_MIFEVAL_TEMPLATE
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

for lang_code in NATURAL_LANGUAGE_CODES:
    PMMEval_MIFEval_eval_cfg = dict(
        evaluator=dict(type=PMMEvalMIFEvalEvaluator),
        pred_role='BOT',
        pred_postprocessor=dict(type=pmmeval_mifeval_postprocess, lang_code=lang_code)
    )

    PMMEval_MIFEval_datasets.append(
        dict(
            abbr=f'mifeval-{lang_code}',
            type=PMMEvalMIFEvalDataset,
            path='P-MMEval',
            lang=lang_code,
            reader_cfg=PMMEval_MIFEval_reader_cfg,
            infer_cfg=PMMEval_MIFEval_infer_cfg,
            eval_cfg=PMMEval_MIFEval_eval_cfg)
    )
opencompass/configs/datasets/PMMEval/mlogiqa_gen.py (new executable file, 4 lines)
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .mlogiqa_gen_36c4f9 import PMMEval_MLogiQA_datasets
opencompass/configs/datasets/PMMEval/mlogiqa_gen_36c4f9.py (new executable file, 50 lines)
@@ -0,0 +1,50 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalMLogiQADataset, PMMEvalMLogiQAEvaluator, pmmeval_mlogiqa_postprocess

NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']

PMMEVAL_MLOGIQA_TEMPLATE = "Passage: {context}\nQuestion: {question}\nChoices:\nA.{option_1}\nB.{option_2}\nC.{option_3}\nD.{option_4}\nPlease choose the most suitable one among A, B, C and D as the answer to this question, and return it in the following JSON format:\n{'answer': '[choice]'}\nwhere [choice] must be one of A, B, C and D."

PMMEval_MLogiQA_datasets = []

PMMEval_MLogiQA_reader_cfg = dict(
    input_columns=['context', 'question', 'option_1', 'option_2', 'option_3', 'option_4'],
    output_column='answer',
    train_split='test')

PMMEval_MLogiQA_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=PMMEVAL_MLOGIQA_TEMPLATE
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

for lang_code in NATURAL_LANGUAGE_CODES:
    PMMEval_MLogiQA_eval_cfg = dict(
        evaluator=dict(type=PMMEvalMLogiQAEvaluator),
        pred_role='BOT',
        pred_postprocessor=dict(type=pmmeval_mlogiqa_postprocess, lang_code=lang_code))

    PMMEval_MLogiQA_datasets.append(
        dict(
            abbr=f'mlogiqa-{lang_code}',
            type=PMMEvalMLogiQADataset,
            path='P-MMEval',
            lang=lang_code,
            reader_cfg=PMMEval_MLogiQA_reader_cfg,
            infer_cfg=PMMEval_MLogiQA_infer_cfg,
            eval_cfg=PMMEval_MLogiQA_eval_cfg)
    )
opencompass/configs/datasets/PMMEval/mmmlu_gen.py (new executable file, 4 lines)
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .mmmlu_gen_d5017d import PMMEval_MMMLU_datasets
opencompass/configs/datasets/PMMEval/mmmlu_gen_d5017d.py (new executable file, 52 lines)
@@ -0,0 +1,52 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalMMMLUDataset, PMMEvalMMMLUEvaluator, pmmeval_mmmlu_postprocess

NATURAL_LANGUAGE_CODES_MMMLU = ['EN-US', 'ZH-CN', 'AR-XY', 'ES-LA', 'FR-FR', 'JA-JP', 'KO-KR', 'PT-BR', 'TH-TL', 'VI-VT']

PMMEVAL_MMMLU_TEMPLATE = "The following is a multiple-choice question. Please choose the most suitable one among A, B, C and D as the answer to this question, and return it in the following JSON format:\n{\"answer\": \"[choice]\"}\nwhere [choice] must be one of A, B, C and D.\n\n{Question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}"

PMMEval_MMMLU_datasets = []

PMMEval_MMMLU_reader_cfg = dict(
    input_columns=['Question', 'A', 'B', 'C', 'D'],
    output_column='Answer',
    train_split='test')

PMMEval_MMMLU_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=PMMEVAL_MMMLU_TEMPLATE
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

for lang_code in NATURAL_LANGUAGE_CODES_MMMLU:
    PMMEval_MMMLU_eval_cfg = dict(
        evaluator=dict(type=PMMEvalMMMLUEvaluator),
        pred_role='BOT',
        pred_postprocessor=dict(type=pmmeval_mmmlu_postprocess, lang_code=lang_code))

    PMMEval_MMMLU_datasets.append(
        dict(
            abbr=f'mmmlu-{lang_code}',
            type=PMMEvalMMMLUDataset,
            path='P-MMEval',
            lang=lang_code,
            difficulty='all',
            reader_cfg=PMMEval_MMMLU_reader_cfg,
            infer_cfg=PMMEval_MMMLU_infer_cfg,
            eval_cfg=PMMEval_MMMLU_eval_cfg)
    )
opencompass/configs/datasets/PMMEval/pmmeval_gen.py (new executable file, 14 lines)
@@ -0,0 +1,14 @@
from mmengine.config import read_base

with read_base():
    from .flores_gen_2697d7 import PMMEval_flores_datasets
    from .humanevalxl_gen_bdec92 import PMMEval_HumanEvalXL_datasets
    from .mgsm_gen_679720 import PMMEval_MGSM_datasets
    from .mhellaswag_gen_1a6b73 import PMMEval_MHellaswag_datasets
    from .mifeval_gen_79f8fb import PMMEval_MIFEval_datasets
    from .mlogiqa_gen_36c4f9 import PMMEval_MLogiQA_datasets
    from .mmmlu_gen_d5017d import PMMEval_MMMLU_datasets
    from .xnli_gen_973734 import PMMEval_XNLI_datasets


PMMEval_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
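The last line depends on every imported name ending in `_datasets` being a list; `sum(..., [])` then concatenates them into one flat list. A minimal standalone sketch of the same trick:

    a_datasets = [dict(abbr='a-en')]
    b_datasets = [dict(abbr='b-en'), dict(abbr='b-zh')]
    combined = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
    print(len(combined))  # 3 -- every *_datasets list, concatenated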
opencompass/configs/datasets/PMMEval/xnli_gen.py (new executable file, 4 lines)
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .xnli_gen_973734 import PMMEval_XNLI_datasets
opencompass/configs/datasets/PMMEval/xnli_gen_973734.py (new executable file, 60 lines)
@@ -0,0 +1,60 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalXNLIDataset, PMMEvalXNLIEvaluator, pmmeval_xnli_postprocess

NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']

PMMEVAL_XNLI_TEMPLATE = """Take the following as truth: {premise}
Then the following statement: \"{statement}\" is
Options:
A. true
B. inconclusive
C. false
Select the correct option from A, B, and C, and return it in the following JSON format:
{"answer": "[choice]"}
where [choice] must be one of A, B, and C."""

PMMEval_XNLI_datasets = list()

PMMEval_XNLI_reader_cfg = dict(
    input_columns=['premise', 'statement'],
    output_column='answer',
    test_split='test'
)

PMMEval_XNLI_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=PMMEVAL_XNLI_TEMPLATE
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

for lang_code in NATURAL_LANGUAGE_CODES:
    PMMEval_XNLI_eval_cfg = dict(
        evaluator=dict(type=PMMEvalXNLIEvaluator),
        pred_role='BOT',
        pred_postprocessor=dict(type=pmmeval_xnli_postprocess, lang_code=lang_code))

    PMMEval_XNLI_datasets.append(
        dict(
            abbr=f'xnli-{lang_code}',
            type=PMMEvalXNLIDataset,
            path='P-MMEval',
            lang=lang_code,
            reader_cfg=PMMEval_XNLI_reader_cfg,
            infer_cfg=PMMEval_XNLI_infer_cfg,
            eval_cfg=PMMEval_XNLI_eval_cfg)
    )
opencompass/configs/summarizers/PMMEval.py (new file, 22 lines)
@@ -0,0 +1,22 @@
from mmengine.config import read_base

with read_base():
    from .groups.PMMEval import PMMEval_summary_groups


summarizer = dict(
    dataset_abbrs=[
        'flores',
        'humanevalxl',
        'mgsm',
        'mhellaswag',
        'mifeval',
        'mlogiqa',
        'mmmlu',
        'xnli'
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []
    ),
)
opencompass/configs/summarizers/groups/PMMEval.py (new file, 41 lines)
@@ -0,0 +1,41 @@
NATURAL_LANGUAGE_FULLNAMES = ['English', 'Chinese', 'Arabic', 'Spanish', 'French', 'Japanese', 'Korean', 'Portuguese', 'Thai', 'Vietnamese']
NATURAL_LANGUAGE_FULLNAMES_FLORES = ['Chinese', 'Arabic', 'Spanish', 'French', 'Japanese', 'Korean', 'Portuguese', 'Thai', 'Vietnamese']
NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']
NATURAL_LANGUAGE_CODES_MMMLU = ['EN-US', 'ZH-CN', 'AR-XY', 'ES-LA', 'FR-FR', 'JA-JP', 'KO-KR', 'PT-BR', 'TH-TL', 'VI-VT']

PMMEval_summary_groups = [
    {
        'name': 'flores',
        'subsets': [f'flores-{lang_fullname}' for lang_fullname in NATURAL_LANGUAGE_FULLNAMES_FLORES]
    },
    {
        'name': 'humanevalxl',
        'subsets': [f'humanevalxl-python-{lang_fullname}' for lang_fullname in NATURAL_LANGUAGE_FULLNAMES] +
                   [f'humanevalxl-java-{lang_fullname}' for lang_fullname in NATURAL_LANGUAGE_FULLNAMES] +
                   [f'humanevalxl-javascript-{lang_fullname}' for lang_fullname in NATURAL_LANGUAGE_FULLNAMES]
    },
    {
        'name': 'mgsm',
        'subsets': [f'mgsm-{lang_code}' for lang_code in NATURAL_LANGUAGE_CODES]
    },
    {
        'name': 'mhellaswag',
        'subsets': [f'mhellaswag-{lang_code}' for lang_code in NATURAL_LANGUAGE_CODES]
    },
    {
        'name': 'mifeval',
        'subsets': [f'mifeval-{lang_code}' for lang_code in NATURAL_LANGUAGE_CODES]
    },
    {
        'name': 'mlogiqa',
        'subsets': [f'mlogiqa-{lang_code}' for lang_code in NATURAL_LANGUAGE_CODES]
    },
    {
        'name': 'mmmlu',
        'subsets': [f'mmmlu-{lang_code}' for lang_code in NATURAL_LANGUAGE_CODES_MMMLU]
    },
    {
        'name': 'xnli',
        'subsets': [f'xnli-{lang_code}' for lang_code in NATURAL_LANGUAGE_CODES]
    }
]
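Each summary group collapses its per-language subsets into one headline row, which is what the `dataset_abbrs` in the summarizer above refer to (with no explicit weights given, OpenCompass should report the plain average over the subsets, as far as this config specifies). For example:

    mgsm_group = next(g for g in PMMEval_summary_groups if g['name'] == 'mgsm')
    print(len(mgsm_group['subsets']))  # 10 -- mgsm-en ... mgsm-vi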
opencompass/datasets/PMMEval/__init__.py (new executable file, 8 lines)
@@ -0,0 +1,8 @@
from .flores import *  # noqa: F401, F403
from .humanevalxl import *  # noqa: F401, F403
from .mgsm import *  # noqa: F401, F403
from .mhellaswag import *  # noqa: F401, F403
from .mifeval import *  # noqa: F401, F403
from .mlogiqa import *  # noqa: F401, F403
from .mmmlu import *  # noqa: F401, F403
from .xnli import *  # noqa: F401, F403
opencompass/datasets/PMMEval/flores.py (new executable file, 162 lines)
@@ -0,0 +1,162 @@
import json
import os
import re
from typing import Tuple

import numpy as np
from datasets import Dataset
from sacrebleu.metrics import BLEU
from sacrebleu.tokenizers.tokenizer_13a import Tokenizer13a
from sacrebleu.tokenizers.tokenizer_zh import TokenizerZh

from opencompass.datasets.base import BaseDataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from opencompass.utils import get_data_path


def wmt_postprocess(text: str, lang: str) -> str:
    # Drop empty lines, then tokenize for the given language.
    text = text.strip()
    texts = list(x.strip() for x in text.split('\n'))
    texts = list(x for x in texts if x != '')
    text = '\n'.join(texts)
    text = tokenize(text, lang)
    return text


def compute_maximum_bleu_value(gen: str, ref: str, lang: str):
    gens = list(x.strip() for x in gen.split('\n'))
    gens = list(x for x in gens if x != '')

    gens_tokens = list(wmt_postprocess(x, lang) for x in gens)
    ref_tokens = wmt_postprocess(ref, lang)

    scorer = BLEU(tokenize='13a', effective_order=True)

    maximum_bleu_value = -100.0
    maximum_bleu_object = None

    # Score every contiguous block of generated lines and keep the best.
    for i in range(0, len(gens_tokens)):
        for j in range(i, len(gens_tokens)):
            gens_tokens_region = ' '.join(gens_tokens[i:j + 1])
            sentence_bleu = scorer.sentence_score(gens_tokens_region,
                                                  [ref_tokens])

            if sentence_bleu.score > maximum_bleu_value:
                maximum_bleu_value = sentence_bleu.score
                maximum_bleu_object = sentence_bleu

    if maximum_bleu_object is None:
        sentence_bleu = scorer.sentence_score('', [ref_tokens])
        return sentence_bleu
    else:
        return maximum_bleu_object


def trim_multiple_space(tokes):
    return ''.join(tokes).strip().split()


class SpaceTokenizer(object):

    def __call__(self, sent):
        if isinstance(sent, list):
            print(sent)
            raise ValueError()
        return ' '.join(sent.strip().split())


class NonASCIITokenizer(object):

    def __init__(self):
        self.is_cjk = re.compile('([\u2e80-\u9fff]|'  # CJK
                                 '[\ua960-\ua97f]|'  # Hangul Jamo Extended-A
                                 '[\uac00-\ud7ff]|'  # Hangul syllables and Jamo Extended-B
                                 '[\u0E00-\u0E7F]'  # Thai
                                 ')')

    def __call__(self, sent):
        sent = sent.strip()
        chs = list(sent)
        line_chtok = []
        for ch in chs:
            if self.is_cjk.match(ch):
                line_chtok.append(' ')
                line_chtok.append(ch)
                line_chtok.append(' ')
            else:
                line_chtok.append(ch)
        line_chtok = trim_multiple_space(line_chtok)
        return ' '.join(line_chtok)


def build_tokenizer(lang: str):
    if lang == 'Chinese':
        return TokenizerZh()
    elif lang in {'Japanese', 'Korean', 'Thai'}:
        return NonASCIITokenizer()
    else:
        return SpaceTokenizer()


def tokenize(sent, lang):
    tokenizer = build_tokenizer(lang)
    final_tokenizer = Tokenizer13a()
    return final_tokenizer(tokenizer(sent))


@TEXT_POSTPROCESSORS.register_module('pmmeval_flores')
def pmmeval_flores_postprocess(text: str, lang_fullname: str) -> Tuple[str, str]:
    return text, lang_fullname


@LOAD_DATASET.register_module()
class PMMEvalFloresDataset(BaseDataset):

    @staticmethod
    def load(path: str, lang_fullname: str):
        data_path = get_data_path(path)

        if os.environ.get('DATASET_SOURCE') == 'ModelScope':
            from modelscope import MsDataset
            dataset = MsDataset.load(dataset_name=data_path,
                                     subset_name='flores',
                                     split=f'test/{lang_fullname}')
        else:
            dataset = list()
            filename = os.path.join(data_path,
                                    f'flores/test/{lang_fullname}.jsonl')
            with open(filename, mode='r', encoding='utf-8') as f:
                for line in f:
                    line = json.loads(line.strip())
                    dataset.append(line)
            dataset = Dataset.from_list(dataset)

        return dataset


class PMMEvalFloresEvaluator(BaseEvaluator):

    def score(self, predictions, references):
        maximum_bleu_results = list()
        for (pred, tgt_lang), ref in zip(predictions, references):
            maximum_bleu_results.append(
                compute_maximum_bleu_value(pred, ref, tgt_lang))

        # Recompose a corpus-level BLEU from the per-sentence statistics.
        maximum_corpus_bleu_counts = sum(
            np.array(x.counts) for x in maximum_bleu_results).tolist()
        maximum_corpus_bleu_totals = sum(
            np.array(x.totals) for x in maximum_bleu_results).tolist()
        maximum_corpus_bleu_sys_len = sum(x.sys_len
                                          for x in maximum_bleu_results)
        maximum_corpus_bleu_ref_len = sum(x.ref_len
                                          for x in maximum_bleu_results)

        maximum_bleu_result = BLEU.compute_bleu(
            correct=maximum_corpus_bleu_counts,
            total=maximum_corpus_bleu_totals,
            sys_len=maximum_corpus_bleu_sys_len,
            ref_len=maximum_corpus_bleu_ref_len)

        result = {'BLEU': round(maximum_bleu_result.score, 2)}
        return result
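`compute_maximum_bleu_value` is deliberately lenient toward chatty generations: it scores every contiguous block of non-empty output lines against the reference and keeps the best-scoring block, and `PMMEvalFloresEvaluator` then recomposes a corpus-level BLEU from those per-sentence statistics. A usage sketch, assuming sacrebleu is installed:

    best = compute_maximum_bleu_value(
        'Sure, here is the translation:\nHello world.', 'Hello world.', 'English')
    print(round(best.score, 2))  # BLEU of the best-matching block, not the full output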
opencompass/datasets/PMMEval/humanevalxl.py (new executable file, 226 lines)
@@ -0,0 +1,226 @@
import json
import os
import os.path as osp
import re
import subprocess
import tempfile
import time
from shutil import copyfile

from datasets import Dataset

from opencompass.datasets.base import BaseDataset
from opencompass.datasets.humaneval import humaneval_postprocess_v2
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path

_LANGUAGE_NAME_DICT = {
    'java': 'Java',
    'javascript': 'JavaScript',
    'js': 'JavaScript',
    'python': 'Python',
}


@LOAD_DATASET.register_module()
class PMMEvalHumanEvalXLDataset(BaseDataset):

    @staticmethod
    def load(path: str, lang: str, program_lang: str):
        data_path = get_data_path(path)

        if os.environ.get('DATASET_SOURCE') == 'ModelScope':
            from modelscope import MsDataset
            dataset = MsDataset.load(dataset_name=data_path,
                                     subset_name='humaneval-xl',
                                     split=f'test/{program_lang}/{lang}')
        else:
            dataset = list()
            filename = os.path.join(
                data_path, f'humaneval-xl/test/{program_lang}/{lang}.jsonl')
            with open(filename, mode='r', encoding='utf-8') as f:
                for line in f:
                    line = json.loads(line.strip())
                    dataset.append(line)
            dataset = Dataset.from_list(dataset)

        return dataset


class PMMEvalHumanEvalXLEvaluator(BaseEvaluator):

    def __init__(self,
                 language,
                 ip_address='localhost',
                 text_language='',
                 port='',
                 retry=2,
                 timeout=600) -> None:
        assert language in _LANGUAGE_NAME_DICT.keys(), (
            f'language must be in {list(_LANGUAGE_NAME_DICT.keys())}')
        if language == 'rust':
            timeout *= 10  # rust needs more time
        self.language = language
        self.text_language = text_language
        self.ip_address = ip_address
        self.port = port
        self.retry = retry
        self.timeout = timeout
        super().__init__()

    def score(self, predictions, references):
        predictions = [{
            'task_id':
            f'{_LANGUAGE_NAME_DICT[self.language]}/{i}',
            'generation':
            _clean_up_code(pred, self.language, refer),
        } for i, (pred, refer) in enumerate(zip(predictions, references))]
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_out_path = osp.join(
                tmp_dir,
                f'humanevalx_{self.language}_{self.text_language}.json')
            with open(tmp_out_path, 'w') as f:
                for pred in predictions:
                    f.write(json.dumps(pred) + '\n')

            num_retry = 0
            while num_retry < self.retry:
                succeed, output = self._code_eval_service(
                    file_path=tmp_out_path)
                if not succeed and '(56) Recv failure' in output:
                    # only retry when the connection failed
                    num_retry += 1
                    # wait a minute in case the service load is too high
                    time.sleep(60)
                else:
                    break

            if succeed:
                if isinstance(output, str):
                    return json.loads(output)
                elif isinstance(output, dict):
                    return output

            ref_url = 'https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html'  # noqa
            if hasattr(self, '_out_dir'):
                result_file_path = re.sub('results', 'mid_results',
                                          self._out_dir) + '.json'  # noqa
                if not osp.exists(osp.dirname(result_file_path)):
                    os.makedirs(osp.dirname(result_file_path))
            else:
                result_file_path = os.path.join(
                    'outputs', f'humanevalx_{self.language}.json')
            copyfile(tmp_out_path, result_file_path)
            raise Exception(
                'Call CodeEvalService Error in `PMMEvalHumanEvalXLEvaluator`. '
                f"The results have been saved in path '{result_file_path}'. You "
                'need to check that your code evaluation service is launched '
                'and that the network to the service is connected; you can '
                f'also get results directly with a `curl` command, see {ref_url}.'
                f'\nError Information: {output}')

    def _code_eval_service(self, file_path):
        if self.port:
            eval_server_url = f'{self.ip_address}:{self.port}/evaluate'
        else:
            eval_server_url = f'{self.ip_address}/evaluate'
        exec_result = subprocess.run([
            'curl', '-X', 'POST', '-F', f'file=@{file_path}', '-F',
            f'dataset=humanevalx/{self.language}', f'{eval_server_url}'
        ],
                                     timeout=self.timeout,
                                     capture_output=True)
        if exec_result.returncode == 0 and re.match(
                "\"{.*:.*}\"", exec_result.stdout.decode('utf-8')):
            return True, json.loads(exec_result.stdout.decode('utf-8'))
        else:
            if exec_result.stderr:
                try:
                    err = exec_result.stderr.decode()
                except Exception:
                    err = exec_result.stderr
            else:
                try:
                    err = exec_result.stdout.decode()
                except Exception:
                    err = exec_result.stdout
            return False, err


def _clean_up_code(text: str, language_type: str, reference) -> str:
    """Cleans up the generated code."""
    try:
        # for chatGLM-related text that arrives as a quoted string literal
        eval_text = eval(text)
    except Exception:
        pass
    else:
        if isinstance(eval_text, str):
            text = eval_text
    # extract code from a fenced code block
    text = text.lstrip('\n')
    if '```' in text:
        blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
        if len(blocks) == 0:
            text = text.split('```')[1]  # fall back to default strategy
        else:
            text = blocks[0]  # fetch the first code block
            if not text.startswith('\n'):  # in case of starting with ```xxx
                text = text[max(text.find('\n') + 1, 0):]
    if language_type.lower() == 'python':
        text = humaneval_postprocess_v2(text)
        # we need to take care of the first line:
        # append extra spaces to the first line for correct indentation
        text = '    ' + text.lstrip()

        text_splits = text.split('\n')
        is_empty_line = False
        ind_empty_line = None
        for i, line in enumerate(text_splits):
            if len(line.strip()) > 0 and line[0] != ' ' and line[0] != '\t':
                is_empty_line = True
                ind_empty_line = i
                break
        if is_empty_line:
            text = '\n'.join(text_splits[:ind_empty_line])
        else:
            end_words = [
                '\ndef', '\nclass', '\n#', '\nassert', '\n"""', '\nprint',
                '\nif', '\n\n\n'
            ]
            for w in end_words:
                if w in text:
                    text = text[:text.rfind(w)]
    # strip the function head for all other languages
    func_name = reference.strip().split('\n')[-1]
    if func_name:
        func_name = func_name.strip().strip('{')
        if func_name in text:
            text = '\n'.join(text[text.find(func_name):].split('\n')[1:])
    if language_type.lower() == 'java':
        main_pos = text.find('public static void main')
        if main_pos != -1:
            text = text[:main_pos] + '}'
        if '}' in text:
            text = text[:text.rfind('}')] + '}'
        if text.count('{') + 1 == text.count('}'):
            text += '\n}'
    elif language_type.lower() == 'go':
        if '\nfunc main(' in text:
            text = text[:text.rfind('func main(')]
        if '}' in text:
            text = text[:text.rfind('}')] + '}'
    elif language_type.lower() == 'cpp':
        if '\nint main()' in text:
            text = text[:text.rfind('int main()')]
        if '}' in text:
            text = text[:text.rfind('}')] + '}'
    elif language_type.lower() == 'js':
        if '}' in text:
            text = text[:text.rfind('}')] + '}'
    elif language_type.lower() == 'rust':
        if '}' in text:
            text = text[:text.rfind('}')] + '}'

    return text
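`_clean_up_code` is a stack of heuristics: unwrap the first fenced block, and for brace languages drop any generated `main` stub and re-close the braces so only the method body survives. A sketch of the Java path (output traced by hand, so treat it as illustrative):

    gen = ('```java\n    return a + b;\n}\n'
           'public static void main(String[] args) { }\n}\n```')
    ref = 'public static int add(int a, int b) {'
    print(_clean_up_code(gen, 'java', ref))
    #     return a + b;
    # }
    # }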
opencompass/datasets/PMMEval/mgsm.py (new executable file, 79 lines)
@@ -0,0 +1,79 @@
import json
import os
import re

from datasets import Dataset

from opencompass.datasets.base import BaseDataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path


def _get_last_digit(s):
    _PAT_LAST_DIGIT = re.compile(
        r'([+-])?(?=([0-9]|\.[0-9]))(0|([1-9](\d{0,2}(,\d{3})*)|\d*))?(\.\d*)?(?=\D|$)'  # noqa E501
    )
    match = list(_PAT_LAST_DIGIT.finditer(s))
    if match:
        last_digit = match[-1].group().replace(',', '').replace(
            '+', '').strip().strip('.')
        # print(f"The last digit in {s} is {last_digit}")
    else:
        last_digit = None
        # logger.warning(f"No digits found in {s!r}")
    return last_digit


@LOAD_DATASET.register_module()
class PMMEvalMGSMDataset(BaseDataset):

    @staticmethod
    def load(path: str, lang: str):
        data_path = get_data_path(path)

        if os.environ.get('DATASET_SOURCE') == 'ModelScope':
            from modelscope import MsDataset
            dataset = MsDataset.load(dataset_name=data_path,
                                     subset_name='mgsm',
                                     split=f'test/{lang}')
        else:
            dataset = list()
            filename = os.path.join(data_path, f'mgsm/test/{lang}.jsonl')
            with open(filename, mode='r', encoding='utf-8') as f:
                for line in f:
                    line = json.loads(line.strip())
                    dataset.append(line)
            dataset = Dataset.from_list(dataset)

        return dataset


class PMMEvalMGSMEvaluator(BaseEvaluator):

    def score(self, predictions, references):
        assert len(predictions) == len(references)

        num_correct, total = 0, 0
        details = {}
        for index, (references_answer, predictions_answer) in enumerate(
                zip(references, predictions)):
            extracted_answer = _get_last_digit(predictions_answer)
            references_answer = references_answer.replace(',', '')
            if references_answer == extracted_answer:
                is_correct = True
            else:
                is_correct = False

            num_correct += is_correct
            total += 1
            details[str(index)] = {
                'references': references_answer,
                'predictions': predictions_answer,
                'extracted': extracted_answer,
                'correct': is_correct,
            }

        accuracy = round(num_correct / total * 100, 2)
        final_result = {'accuracy': accuracy, 'details': details}
        return final_result
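The evaluator compares the gold answer with the last number appearing in the completion, so any amount of reasoning text before the final line is tolerated. A sketch of the extractor:

    print(_get_last_digit('so the total is 1,234. The answer is 1234'))  # '1234'
    print(_get_last_digit('no numbers at all'))  # None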
opencompass/datasets/PMMEval/mhellaswag.py (new executable file, 151 lines)
@@ -0,0 +1,151 @@
import json
import os
import re
from typing import Tuple

from datasets import Dataset

from opencompass.datasets.base import BaseDataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from opencompass.utils import get_data_path

langs_dict = {
    'fr': ['La réponse est', 'la réponse est'],
    'en': ['the answer is', 'The answer is'],
    'vi': ['Câu trả lời là', 'câu trả lời là'],
    'ar': ['الجواب هو'],
    'th': ['คำตอบคือ'],
    'zh': ['答案是'],
    'ko': ['답변은'],
    'pt': ['A resposta é'],
    'ja': ['答えは'],
    'es': ['La respuesta es']
}


def extract_choice(gen, lang):
    r"""Extract the option letter from a JSON-style answer such as::

        {
            "answer": "A|B|C|D"
        }
    """
    patterns = [
        r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}",
        r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}",
        r"\"answer\"\s*:\s*\"?(A|B|C|D)\"?",
        r"[\'\"]answer[\'\"]\s*:\s*[\'\"](A|B|C|D)[\'\"]"
    ]
    for pattern in patterns:
        res = re.findall(pattern, gen, flags=re.DOTALL)
        if len(res) >= 1:
            return res[-1]
    else:
        # No JSON-style answer found: fall back to the language-specific
        # answer prefix.
        res = None
        pattern = langs_dict[lang]
        for p in pattern:
            if p in gen and p != gen:
                res = gen.split(p)
                if len(res) > 1 and len(res[-1].strip()) > 0:
                    res = res[-1].strip()[0]
                else:
                    res = None
                break

        temp = ['A', 'B', 'C', 'D', 'a', 'b', 'c', 'd']
        if res in temp:
            return res
        else:
            return None


def extract_choice_fuzzy(gen, lang):
    options = ['A', 'B', 'C', 'D']  # the candidate options
    for option in options:
        if option in gen:  # check whether the option appears in the text
            return option  # return the first option that appears
    return None


@TEXT_POSTPROCESSORS.register_module('pmmeval_mhellaswag')
def pmmeval_mhellaswag_postprocess(text: str, lang_code: str) -> Tuple[str, str]:
    return text, lang_code


@LOAD_DATASET.register_module()
class PMMEvalMHellaswagDataset(BaseDataset):

    @staticmethod
    def load(path: str, lang: str):
        data_path = get_data_path(path)

        if os.environ.get('DATASET_SOURCE') == 'ModelScope':
            from modelscope import MsDataset
            dataset = MsDataset.load(dataset_name=data_path,
                                     subset_name='mhellaswag',
                                     split=f'test/{lang}')
        else:
            dataset = list()
            filename = os.path.join(data_path, f'mhellaswag/test/{lang}.jsonl')
            with open(filename, mode='r', encoding='utf-8') as f:
                for line in f:
                    line = json.loads(line.strip())
                    dataset.append(line)
            dataset = Dataset.from_list(dataset)

        return dataset


class PMMEvalMHellaswagEvaluator(BaseEvaluator):

    def score(self, predictions, references):
        assert len(predictions) == len(references)

        all_results = list()

        for (pred, lang), ref in zip(predictions, references):
            answer = chr(int(ref) + 65)
            choice = extract_choice(pred, lang)
            acc = 0
            failed_strict = 0
            failed = 1
            if choice is not None:
                failed = 0
                if answer.lower() == choice.lower():
                    acc = 1
                else:
                    acc = 0
            else:
                choice = extract_choice_fuzzy(pred, lang)
                if choice is None:
                    acc = 0
                    failed_strict = 1
                else:
                    failed_strict = 0
                    if answer.lower() == choice.lower():
                        acc = 1
                    else:
                        acc = 0

            all_results.append({
                'acc': float(acc),
                'failed': float(failed),
                'failed_strict': float(failed_strict),
                'extracted_answer': pred if pred else 'no answer',
            })

        final_result = {
            'accuracy':
            round(
                sum(x['acc'] for x in all_results) / len(all_results) * 100,
                2),
            'details':
            all_results
        }

        return final_result
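Scoring is two-tiered: `extract_choice` insists on the JSON shape (or a language-specific answer prefix), while `extract_choice_fuzzy` merely scans for a bare option letter; the `failed`/`failed_strict` fields in the details record which tier was needed. A sketch:

    print(extract_choice('{"answer": "B"}', 'en'))           # 'B'
    print(extract_choice('The answer is C', 'en'))           # 'C'
    print(extract_choice_fuzzy('I would pick A here', 'en')) # 'A'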
opencompass/datasets/PMMEval/mifeval.py (new executable file, 147 lines)
@@ -0,0 +1,147 @@
import json
import os
from typing import Tuple

from datasets import Dataset

from opencompass.datasets.base import BaseDataset
from opencompass.datasets.PMMEval.mifeval_utils import mifeval_class_map
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from opencompass.utils import get_data_path


def test_instruction_following_strict(inp, response, lang_code):
    """Tests the response to see if the instructions are followed."""
    instruction_list = inp['instruction_id_list']
    is_following_list = []

    for index, instruction_id in enumerate(instruction_list):
        instruction_id_0, instruction_id_1 = tuple(instruction_id.split(':'))
        instruction_function_info = mifeval_class_map[instruction_id_0][
            instruction_id_1]

        instruction_function = instruction_function_info['function']
        instruction_function_args = dict()

        if instruction_function_info['required_lang_code']:
            instruction_function_args['lang_code'] = lang_code
        for kwarg_dict in inp['kwargs']:
            for k, v in kwarg_dict.items():
                if v is None:
                    continue
                instruction_function_args[k] = v
        instruction_function_args['input_string'] = response

        if response.strip() and instruction_function(
                **instruction_function_args):
            is_following_list.append(True)
        else:
            is_following_list.append(False)

    return 1.0 if all(is_following_list) else 0.0


def test_instruction_following_loose(inp, response, lang_code):
    """Tests the response for an upper bound of following instructions."""
    r = response.split('\n')
    response_remove_first = '\n'.join(r[1:]).strip()
    response_remove_last = '\n'.join(r[:-1]).strip()
    response_remove_both = '\n'.join(r[1:-1]).strip()
    revised_response = response.replace('*', '')
    revised_response_remove_first = response_remove_first.replace('*', '')
    revised_response_remove_last = response_remove_last.replace('*', '')
    revised_response_remove_both = response_remove_both.replace('*', '')
    all_responses = [
        response,
        revised_response,
        response_remove_first,
        response_remove_last,
        response_remove_both,
        revised_response_remove_first,
        revised_response_remove_last,
        revised_response_remove_both,
    ]
    instruction_list = inp['instruction_id_list']
    is_following_list = []

    for index, instruction_id in enumerate(instruction_list):
        instruction_id_0, instruction_id_1 = tuple(instruction_id.split(':'))
        instruction_function_info = mifeval_class_map[instruction_id_0][
            instruction_id_1]

        instruction_function = instruction_function_info['function']
        instruction_function_args = dict()

        if instruction_function_info['required_lang_code']:
            instruction_function_args['lang_code'] = lang_code
        for kwarg_dict in inp['kwargs']:
            for k, v in kwarg_dict.items():
                instruction_function_args[k] = v

        is_following = False
        for r in all_responses:
            # Test each relaxed variant of the response, not just the
            # original one.
            instruction_function_args['input_string'] = r
            if r.strip() and instruction_function(**instruction_function_args):
                is_following = True
                break

        is_following_list.append(is_following)

    return 1.0 if all(is_following_list) else 0.0


@TEXT_POSTPROCESSORS.register_module('pmmeval_mifeval')
def pmmeval_mifeval_postprocess(text: str, lang_code: str) -> Tuple[str, str]:
    return text, lang_code


@LOAD_DATASET.register_module()
class PMMEvalMIFEvalDataset(BaseDataset):

    @staticmethod
    def load(path: str, lang: str):
        data_path = get_data_path(path)

        if os.environ.get('DATASET_SOURCE') == 'ModelScope':
            from modelscope import MsDataset
            dataset = MsDataset.load(dataset_name=data_path,
                                     subset_name='mifeval',
                                     split=f'test/{lang}')
        else:
            dataset = list()
            filename = os.path.join(data_path, f'mifeval/test/{lang}.jsonl')
            with open(filename, mode='r', encoding='utf-8') as f:
                for line in f:
                    line = json.loads(line.strip())
                    dataset.append(line)
            dataset = Dataset.from_list(dataset)

        return dataset


class PMMEvalMIFEvalEvaluator(BaseEvaluator):

    def score(self, predictions, references, test_set):
        all_results = list()
        for (pred, lang), example in zip(predictions, test_set):
            temp_result = {
                'strict_acc':
                test_instruction_following_strict(example, pred, lang),
                'loose_acc':
                test_instruction_following_loose(example, pred, lang)
            }

            all_results.append(temp_result)

        result = {
            'strict_acc':
            round(
                sum(x['strict_acc']
                    for x in all_results) / len(all_results) * 100, 2),
            'loose_acc':
            round(
                sum(x['loose_acc']
                    for x in all_results) / len(all_results) * 100, 2)
        }
        return result
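Strict scoring checks the raw response once; loose scoring retries with asterisks stripped and the first/last lines removed, giving an upper bound. A sketch with the `detectable_format:json_format` rule defined in mifeval_utils:

    example = {'instruction_id_list': ['detectable_format:json_format'],
               'kwargs': [{}]}
    print(test_instruction_following_strict(example, '{"ok": true}', 'en'))  # 1.0
    print(test_instruction_following_strict(example, 'plain text', 'en'))    # 0.0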
opencompass/datasets/PMMEval/mifeval_utils/__init__.py (new executable file, 17 lines)
@@ -0,0 +1,17 @@
from .combination_checker import combination_checker
from .detectable_content_checker import detectable_content_checker
from .detectable_format_checker import detectable_format_checker
from .keywords_checker import keywords_checker
from .length_constraints_checker import length_constraints_checker
from .punctuation_checker import punctuation_checker
from .startend_checker import startend_checker

mifeval_class_map = {
    'combination': combination_checker,
    'detectable_content': detectable_content_checker,
    'detectable_format': detectable_format_checker,
    'keywords': keywords_checker,
    'length_constraints': length_constraints_checker,
    'punctuation': punctuation_checker,
    'startend': startend_checker
}
opencompass/datasets/PMMEval/mifeval_utils/combination_checker.py (new executable file, 32 lines)
@@ -0,0 +1,32 @@
def repeat_prompt_checker(input_string: str, prompt_to_repeat: str, **kwargs):
    if input_string.strip().lower().startswith(
            prompt_to_repeat.strip().lower()):
        return True
    return False


def two_responses_checker(input_string: str, **kwargs):
    valid_responses = list()
    responses = input_string.split('******')
    for index, response in enumerate(responses):
        if not response.strip():
            if index != 0 and index != len(responses) - 1:
                return False
        else:
            valid_responses.append(response)
    return (len(valid_responses) == 2
            and valid_responses[0].strip() != valid_responses[1].strip())


combination_checker = {
    'repeat_prompt': {
        'function': repeat_prompt_checker,
        'required_lang_code': False,
        'num_of_params': 2
    },
    'two_responses': {
        'function': two_responses_checker,
        'required_lang_code': False,
        'num_of_params': 1
    }
}
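`two_responses_checker` expects exactly two distinct, non-empty answers separated by a literal `'******'`; empty fragments are only tolerated at the very ends of the split. A sketch:

    print(two_responses_checker('first answer\n******\nsecond answer'))  # True
    print(two_responses_checker('same\n******\nsame'))                   # False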
opencompass/datasets/PMMEval/mifeval_utils/detectable_content_checker.py (new executable file, 30 lines)
@@ -0,0 +1,30 @@
import re


def number_placeholders_checker(input_string: str, num_placeholders: int,
                                **kwargs):
    placeholders = re.findall(r'\[.*?\]', input_string)
    return len(placeholders) >= num_placeholders


def postscript_checker(input_string: str, postscript_marker: str, **kwargs):
    input_string = input_string.lower()
    postscript_pattern = r'\s*' + postscript_marker.lower() + r'.*$'
    postscript = re.findall(postscript_pattern,
                            input_string,
                            flags=re.MULTILINE)
    return True if postscript else False


detectable_content_checker = {
    'number_placeholders': {
        'function': number_placeholders_checker,
        'required_lang_code': False,
        'num_of_params': 2
    },
    'postscript': {
        'function': postscript_checker,
        'required_lang_code': False,
        'num_of_params': 2
    }
}
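`number_placeholders_checker` counts bracketed spans such as `[name]` and enforces a minimum, not an exact count. A sketch:

    print(number_placeholders_checker('Dear [name], meet at [time].',
                                      num_placeholders=2))  # True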
opencompass/datasets/PMMEval/mifeval_utils/detectable_format_checker.py (new executable file, 122 lines)
@ -0,0 +1,122 @@
|
||||
import json
|
||||
import re
|
||||
|
||||
|
||||
def removeprefix(s, prefix):
|
||||
if s.startswith(prefix):
|
||||
return s[len(prefix):]
|
||||
else:
|
||||
return s
|
||||
|
||||
|
||||
def removesuffix(s, suffix):
|
||||
if s.endswith(suffix):
|
||||
return s[:-len(suffix)]
|
||||
else:
|
||||
return s
|
||||
|
||||
|
||||
constrained_response = {
|
||||
'ar': ['إجابتي هي نعم.', 'إجابتي هي لا.', 'إجابتي هي ربما.'],
|
||||
'es':
|
||||
['Mi respuesta es sí.', 'Mi respuesta es no.', 'Mi respuesta es tal vez.'],
|
||||
'fr': [
|
||||
'Ma réponse est oui.', 'Ma réponse est non.',
|
||||
'Ma réponse est peut-être.'
|
||||
],
|
||||
'ja': ['私の答えははいです。', '私の答えはいいえです。', '私の答えはたぶんです。'],
|
||||
'ko': ['제 대답은 예입니다.', '제 대답은 아닙니다.', '제 대답은 아마도입니다.'],
|
||||
'pt': [
|
||||
'Minha resposta é sim.', 'Minha resposta é não.',
|
||||
'Minha resposta é talvez.'
|
||||
],
|
||||
'th': ['คำตอบของฉันคือใช่', 'คำตอบของฉันคือไม่', 'คำตอบของฉันคืออาจจะ'],
|
||||
'vi': [
|
||||
'Câu trả lời của tôi là có.', 'Câu trả lời của tôi là không.',
|
||||
'Câu trả lời của tôi là có thể.'
|
||||
],
|
||||
'en': ['My answer is yes.', 'My answer is no.', 'My answer is maybe.'],
|
||||
'zh': ['我的答案是是。', '我的答案是否。', '我的答案是不确定。']
|
||||
}
|
||||
|
||||
|
||||
def constrained_response_checker(input_string: str, lang_code: str, **kwargs):
|
||||
allowable_responses = constrained_response[lang_code]
|
||||
return any(response in input_string for response in allowable_responses)
|
||||
|
||||
|
||||
def number_bullet_lists_checker(input_string: str, num_bullets: int, **kwargs):
|
||||
bullet_lists = re.findall(r'^\s*\*[^\*].*$',
|
||||
input_string,
|
||||
flags=re.MULTILINE)
|
||||
bullet_lists_2 = re.findall(r'^\s*-.*$', input_string, flags=re.MULTILINE)
|
||||
num_bullet_lists = len(bullet_lists) + len(bullet_lists_2)
|
||||
return num_bullet_lists == num_bullets
|
||||
|
||||
|
||||
def number_highlighted_sections_checker(input_string: str, num_highlights: int,
|
||||
**kwargs):
|
||||
temp_num_highlights = 0
|
||||
highlights = re.findall(r'\*[^\n\*]*\*', input_string)
|
||||
double_highlights = re.findall(r'\*\*[^\n\*]*\*\*', input_string)
|
||||
for highlight in highlights:
|
||||
if highlight.strip('*').strip():
|
||||
temp_num_highlights += 1
|
||||
for highlight in double_highlights:
|
||||
if removesuffix(removeprefix(highlight, '**'), '**').strip():
|
||||
temp_num_highlights += 1
|
||||
|
||||
return temp_num_highlights >= num_highlights
|
||||
|
||||
|
||||
def title_checker(input_string: str, **kwargs):
|
||||
pattern = r'<<[^\n]+>>'
|
||||
re_pattern = re.compile(pattern)
|
||||
titles = re.findall(re_pattern, input_string)
|
||||
|
||||
for title in titles:
|
||||
if title.lstrip('<').rstrip('>').strip():
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def json_format_checker(input_string: str, **kwargs):
|
||||
value = (removesuffix(
|
||||
removeprefix(
|
||||
removeprefix(
|
||||
removeprefix(removeprefix(input_string.strip(), '```json'),
|
||||
'```Json'), '```JSON'), '```'), '```').strip())
|
||||
try:
|
||||
json.loads(value)
|
||||
except ValueError as e: # noqa F841
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
detectable_format_checker = {
|
||||
'constrained_response': {
|
||||
'function': constrained_response_checker,
|
||||
'required_lang_code': True,
|
||||
'num_of_params': 2
|
||||
},
|
||||
'json_format': {
|
||||
'function': json_format_checker,
|
||||
'required_lang_code': False,
|
||||
'num_of_params': 1
|
||||
},
|
||||
'number_bullet_lists': {
|
||||
'function': number_bullet_lists_checker,
|
||||
'required_lang_code': False,
|
||||
'num_of_parmas': 2
|
||||
},
|
||||
'number_highlighted_sections': {
|
||||
'function': number_highlighted_sections_checker,
|
||||
'required_lang_code': False,
|
||||
'num_of_params': 2
|
||||
},
|
||||
'title': {
|
||||
'function': title_checker,
|
||||
'required_lang_code': False,
|
||||
'num_of_params': 1
|
||||
}
|
||||
}
|
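`json_format_checker` tolerates an optional Markdown fence ('```json', '```Json', '```JSON', or a bare '```') around the payload before parsing; illustrative, made-up inputs:

assert json_format_checker('```json\n{"answer": "A"}\n```')
assert json_format_checker('{"ok": true}')
assert not json_format_checker('not json at all')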
12
opencompass/datasets/PMMEval/mifeval_utils/keywords_checker.py
Executable file
@ -0,0 +1,12 @@
def forbidden_words_checker(input_string: str, forbidden_words: list,
                            **kwargs):
    return not any(word in input_string for word in forbidden_words)


keywords_checker = {
    'forbidden_words': {
        'function': forbidden_words_checker,
        'required_lang_code': False,
        'num_of_params': 2
    },
}
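Note that `forbidden_words_checker` uses plain substring containment, so a forbidden word like 'cat' also rejects 'category':

assert not forbidden_words_checker('The category is A.', ['cat'])
assert forbidden_words_checker('All clear.', ['cat'])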
93
opencompass/datasets/PMMEval/mifeval_utils/length_constraints_checker.py
Executable file
@ -0,0 +1,93 @@
import re


def nth_paragraph_first_word_checker(input_string: str, num_paragraphs: int,
                                     nth_paragraph: int, first_word: str,
                                     lang_code: str, **kwargs):
    paragraphs = re.split(r'\n\n', input_string)
    paragraphs = list(paragraph.strip() for paragraph in paragraphs
                      if paragraph.strip() != '')

    if len(paragraphs) < num_paragraphs:
        return False

    if len(paragraphs) < nth_paragraph:
        return False

    paragraph = paragraphs[nth_paragraph - 1].strip()

    if paragraph.lower().startswith(first_word.lower()):
        return True
    else:
        return False


def number_paragraphs_checker(input_string: str, num_paragraphs: int,
                              **kwargs):
    paragraphs = re.split(r'\s?\*\*\*\s?', input_string)
    paragraphs = list(paragraph.strip() for paragraph in paragraphs
                      if paragraph.strip() != '')
    return len(paragraphs) == num_paragraphs


def number_sentences_checker(input_string: str, relation: str,
                             num_sentences: int, lang_code: str, **kwargs):
    sentences = list(x.strip() for x in input_string.strip().split('\n'))
    sentences = list(x for x in sentences if x != '')

    if relation == 'less than':
        if len(sentences) <= num_sentences:
            return True
        else:
            return False
    elif relation == 'at least':
        if len(sentences) >= num_sentences:
            return True
        else:
            return False


def number_words_checker(input_string: str, relation: str, num_words: int,
                         lang_code: str, **kwargs):
    if lang_code in ['en', 'es', 'fr', 'in', 'pt', 'ru', 'vi']:
        words = input_string.split()
        words = list(x for x in words if x != '')
    else:
        words = ''.join(input_string.split())

    if relation == 'less than':
        if len(words) <= num_words:
            return True
        else:
            return False
    elif relation == 'at least':
        if len(words) >= num_words:
            return True
        else:
            return False


length_constraints_checker = {
    'nth_paragraph_first_word': {
        'function': nth_paragraph_first_word_checker,
        'required_lang_code': True,
        'num_of_params': 5
    },
    'number_paragraphs': {
        'function': number_paragraphs_checker,
        'required_lang_code': False,
        'num_of_params': 2
    },
    'number_sentences': {
        'function': number_sentences_checker,
        'required_lang_code': True,
        'num_of_params': 3
    },
    'number_words': {
        'function': number_words_checker,
        'required_lang_code': True,
        'num_of_params': 4
    }
}
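`number_words_checker` counts whitespace-separated tokens for space-delimited languages and otherwise counts characters of the joined string (a common proxy for languages like zh, ja, and th); for example:

assert number_words_checker('one two three', 'at least', 3, 'en')
assert number_words_checker('你好世界', 'at least', 4, 'zh')  # 4 characters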
30
opencompass/datasets/PMMEval/mifeval_utils/punctuation_checker.py
Executable file
@ -0,0 +1,30 @@
import re

comma_unicode = {
    'ar': re.compile(r'[\u060C]'),
    'es': re.compile(r'[,\uFF0C]'),
    'fr': re.compile(r'[,\u2026]'),
    'ja': re.compile(r'[,\u3001]'),
    'ko': re.compile(r'[,]'),
    'pt': re.compile(r'[,\uFF0C]'),
    'th': re.compile(r'[\u0E25]'),
    'vi': re.compile(r'[,\uFF0C]'),
    'en': re.compile(r'[,]'),
    'zh': re.compile(r'[,,]')
}


def no_comma_checker(input_string: str, lang_code: str, **kwargs):
    if len(comma_unicode[lang_code].findall(input_string)) > 0:
        return False
    else:
        return True


punctuation_checker = {
    'no_comma': {
        'function': no_comma_checker,
        'required_lang_code': True,
        'num_of_params': 2
    }
}
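`no_comma_checker` matches language-specific comma code points (e.g. the fullwidth comma U+FF0C, or the Japanese ideographic comma U+3001):

assert not no_comma_checker('你好,世界', 'zh')
assert no_comma_checker('你好 世界', 'zh')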
38
opencompass/datasets/PMMEval/mifeval_utils/startend_checker.py
Executable file
@ -0,0 +1,38 @@
def end_checker_checker(input_string: str, end_phrase: str, **kwargs):
    if input_string.strip().endswith(end_phrase):
        return True
    else:
        return False


def quotation_checker(input_string: str, lang_code: str, **kwargs):
    input_string = input_string.strip()
    if input_string.startswith('"') and input_string.endswith('"'):
        return True
    elif lang_code in [
            'ar', 'es', 'fr', 'pt', 'ru'
    ] and input_string.startswith('«') and input_string.endswith('»'):
        return True
    elif lang_code in [
            'ar', 'es', 'fr', 'ko', 'pt', 'th', 'vi', 'zh'
    ] and input_string.startswith('“') and input_string.endswith('”'):
        return True
    elif lang_code == 'ja' and input_string.startswith(
            '『') and input_string.endswith('』'):
        return True
    else:
        return False


startend_checker = {
    'end_checker': {
        'function': end_checker_checker,
        'required_lang_code': False,
        'num_of_params': 2
    },
    'quotation': {
        'function': quotation_checker,
        'required_lang_code': True,
        'num_of_params': 2
    }
}
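`quotation_checker` accepts ASCII double quotes for every language, plus language-appropriate marks such as guillemets and Japanese corner brackets:

assert quotation_checker('«Bonjour»', 'fr')
assert quotation_checker('『こんにちは』', 'ja')
assert not quotation_checker('No quotes', 'en')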
152
opencompass/datasets/PMMEval/mlogiqa.py
Executable file
@ -0,0 +1,152 @@
import json
import os
import re
from typing import Tuple

from datasets import Dataset

from opencompass.datasets.base import BaseDataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from opencompass.utils import get_data_path

langs_dict = {
    'fr': ['La réponse est', 'la réponse est'],
    'en': ['the answer is', 'The answer is'],
    'vi': ['Câu trả lời là', 'câu trả lời là'],
    'ar': ['الجواب هو'],
    'th': ['คำตอบคือ'],
    'zh': ['答案是'],
    'ko': ['답변은'],
    'pt': ['A resposta é'],
    'ja': ['答えは'],
    'es': ['La respuesta es']
}


def extract_choice(gen, lang):
    r"""
    {
        "answer": "A|B|C|D"
    }
    """
    patterns = [
        r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}",
        r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}",
        r"\"answer\"\s*:\s*\"?(A|B|C|D)\"?",
        r"[\'\"]answer[\'\"]\s*:\s*[\'\"](A|B|C|D)[\'\"]"
    ]
    for pattern in patterns:
        res = re.findall(pattern, gen, flags=re.DOTALL)
        if len(res) >= 1:
            return res[-1]

    res = None
    pattern = langs_dict[lang]
    for p in pattern:
        if p in gen and p != gen:
            res = gen.split(p)
            if len(res) > 1 and len(res[-1].strip()) > 0:
                res = res[-1].strip()[0]
            else:
                res = None
            break

    temp = ['A', 'B', 'C', 'D', 'a', 'b', 'c', 'd']
    if res in temp:
        return res
    else:
        return None


def extract_choice_fuzzy(gen):
    options = ['A', 'B', 'C', 'D']
    for option in options:
        if option in gen:
            return option
    return None


@TEXT_POSTPROCESSORS.register_module('pmmeval_mlogiqa')
def pmmeval_mlogiqa_postprocess(text: str, lang_code: str) -> Tuple[str, str]:
    return text, lang_code


@LOAD_DATASET.register_module()
class PMMEvalMLogiQADataset(BaseDataset):

    @staticmethod
    def load(path: str, lang: str):
        data_path = get_data_path(path)

        if os.environ.get('DATASET_SOURCE') == 'ModelScope':
            from modelscope import MsDataset
            dataset = MsDataset.load(dataset_name=data_path,
                                     subset_name='mlogiqa',
                                     split=f'test/{lang}')
        else:
            dataset = list()
            filename = os.path.join(data_path, f'mlogiqa/test/{lang}.jsonl')
            with open(filename, mode='r', encoding='utf-8') as f:
                for line in f:
                    line = json.loads(line.strip())
                    dataset.append(line)
            dataset = Dataset.from_list(dataset)

        return dataset


class PMMEvalMLogiQAEvaluator(BaseEvaluator):

    def score(self, predictions, references):
        assert len(predictions) == len(references)

        all_results = list()

        for (pred, lang), ref in zip(predictions, references):
            answer = chr(int(ref) + 65)
            choice = extract_choice(pred, lang)
            acc = 0
            failed_strict = 0
            failed = 1
            if choice is not None:
                failed = 0
                if answer.lower() == choice.lower():
                    acc = 1
                else:
                    acc = 0
            else:
                choice = extract_choice_fuzzy(pred)
                if choice is None:
                    acc = 0
                    failed_strict = 1
                else:
                    failed_strict = 0
                    if answer.lower() == choice.lower():
                        acc = 1
                    else:
                        acc = 0

            all_results.append({
                'acc': float(acc),
                'failed': float(failed),
                'failed_strict': float(failed_strict),
                'extracted_answer': choice if choice else 'no answer',
            })

        final_result = {
            'accuracy':
            round(
                sum(x['acc'] for x in all_results) / len(all_results) * 100,
                2),
            'details':
            all_results
        }

        return final_result
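The extraction is two-stage: strict JSON-style patterns first, then a split on the language-specific answer phrase; the strings below are made-up model generations:

assert extract_choice('{"answer": "B"}', 'en') == 'B'
assert extract_choice('Therefore, the answer is C.', 'en') == 'C'
assert extract_choice('no choice given', 'en') is None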
157
opencompass/datasets/PMMEval/mmmlu.py
Executable file
@ -0,0 +1,157 @@
import json
import os
import re
from typing import Tuple

from datasets import Dataset

from opencompass.datasets.base import BaseDataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from opencompass.utils import get_data_path

langs_dict = {
    'FR-FR': ['La réponse est', 'la réponse est'],
    'EN-US': ['the answer is', 'The answer is'],
    'VI-VT': ['Câu trả lời là', 'câu trả lời là'],
    'AR-XY': ['الجواب هو'],
    'TH-TL': ['คำตอบคือ'],
    'ZH-CN': ['答案是'],
    'KO-KR': ['답변은'],
    'PT-BR': ['A resposta é'],
    'JA-JP': ['答えは'],
    'ES-LA': ['La respuesta es']
}


def extract_choice(gen, lang):
    r"""
    {
        "answer": "A|B|C|D"
    }
    """
    patterns = [
        r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}",
        r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}",
        r"\"answer\"\s*:\s*\"?(A|B|C|D)\"?",
        r"[\'\"]answer[\'\"]\s*:\s*[\'\"](A|B|C|D)[\'\"]"
    ]
    for pattern in patterns:
        res = re.findall(pattern, gen, flags=re.DOTALL)
        if len(res) >= 1:
            return res[-1]

    res = None
    pattern = langs_dict[lang]
    for p in pattern:
        if p in gen and p != gen:
            res = gen.split(p)
            if len(res) > 1 and len(res[-1].strip()) > 0:
                res = res[-1].strip()[0]
            else:
                res = None
            break

    temp = ['A', 'B', 'C', 'D', 'a', 'b', 'c', 'd']
    if res in temp:
        return res
    else:
        return None


def extract_choice_fuzzy(gen):
    options = ['A', 'B', 'C', 'D']
    for option in options:
        if option in gen:
            return option
    return None


@TEXT_POSTPROCESSORS.register_module('pmmeval_mmmlu')
def pmmeval_mmmlu_postprocess(text: str, lang_code: str) -> Tuple[str, str]:
    return text, lang_code


@LOAD_DATASET.register_module()
class PMMEvalMMMLUDataset(BaseDataset):

    @staticmethod
    def load(path: str, lang: str, difficulty: str):
        assert difficulty in [
            'easy', 'hard', 'all'
        ], '`difficulty` should be one choice among "easy", "hard", and "all"!'
        data_path = get_data_path(path)

        if os.environ.get('DATASET_SOURCE') == 'ModelScope':
            dataset_list = list()
            from modelscope import MsDataset
            if difficulty == 'easy' or difficulty == 'all':
                dataset_list.append(
                    MsDataset.load(dataset_name=data_path,
                                   subset_name='mmmlu',
                                   split=f'easy/test/mmlu_{lang}'))
            if difficulty == 'hard' or difficulty == 'all':
                dataset_list.append(
                    MsDataset.load(dataset_name=data_path,
                                   subset_name='mmmlu',
                                   split=f'hard/test/mmlu_{lang}'))
            # TODO: combine the two datasets
            dataset = dataset_list[0] + dataset_list[1] if len(
                dataset_list) == 2 else dataset_list[0]
        else:
            dataset = list()
            if difficulty == 'easy' or difficulty == 'all':
                filename = os.path.join(data_path,
                                        f'mmmlu/easy/test/mmlu_{lang}.jsonl')
                with open(filename, mode='r', encoding='utf-8') as f:
                    for line in f:
                        line = json.loads(line.strip())
                        dataset.append(line)
            if difficulty == 'hard' or difficulty == 'all':
                filename = os.path.join(data_path,
                                        f'mmmlu/hard/test/mmlu_{lang}.jsonl')
                with open(filename, mode='r', encoding='utf-8') as f:
                    for line in f:
                        line = json.loads(line.strip())
                        dataset.append(line)

            dataset = Dataset.from_list(dataset)

        return dataset


class PMMEvalMMMLUEvaluator(BaseEvaluator):

    def score(self, predictions, references):
        all_results = list()
        for (pred, lang), ref in zip(predictions, references):
            answer = extract_choice(pred, lang)
            if answer is None:
                answer = extract_choice_fuzzy(pred)
            if answer is None:
                acc = 0.0
                failed = 1.0
            else:
                acc = 1.0 if ref.lower() == answer.lower() else 0.0
                failed = 0.0

            all_results.append({
                'acc': acc,
                'failed': failed,
                'extracted_answer': answer if answer else 'no answer'
            })

        final_result = {
            'accuracy':
            round(
                sum(x['acc'] for x in all_results) / len(all_results) * 100,
                2),
            'details':
            all_results
        }

        return final_result
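A hypothetical local-load call for the dataset class above (the directory layout under ./data/P-MMEval/ is inferred from the paths in `load`):

dataset = PMMEvalMMMLUDataset.load(path='./data/P-MMEval/',
                                   lang='EN-US',
                                   difficulty='all')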
150
opencompass/datasets/PMMEval/xnli.py
Executable file
@ -0,0 +1,150 @@
import json
import os
import re
from typing import Tuple

from datasets import Dataset

from opencompass.datasets.base import BaseDataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from opencompass.utils import get_data_path

langs_dict = {
    'fr': ['La réponse est', 'la réponse est'],
    'en': ['the answer is', 'The answer is'],
    'vi': ['Câu trả lời là', 'câu trả lời là'],
    'ar': ['الجواب هو'],
    'th': ['คำตอบคือ'],
    'zh': ['答案是'],
    'ko': ['답변은'],
    'pt': ['A resposta é'],
    'ja': ['答えは'],
    'id': ['Jawaban adalah', 'jawaban adalah'],
    'es': ['La respuesta es']
}


def extract_choice(gen, lang):
    r"""
    {
        "answer": "A|B|C|D"
    }
    """
    patterns = [
        r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}",
        r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}",
        r"\"answer\"\s*:\s*\"?(A|B|C|D)\"?",
        r"[\'\"]answer[\'\"]\s*:\s*[\'\"](A|B|C|D)[\'\"]"
    ]
    for pattern in patterns:
        res = re.findall(pattern, gen, flags=re.DOTALL)
        if len(res) >= 1:
            return res[-1]

    res = None
    pattern = langs_dict[lang]
    for p in pattern:
        if p in gen and p != gen:
            res = gen.split(p)
            if len(res) > 1 and len(res[-1].strip()) > 0:
                res = res[-1].strip()[0]
            else:
                res = None
            break

    temp = ['A', 'B', 'C', 'D', 'a', 'b', 'c', 'd']
    if res in temp:
        return res
    else:
        return None


def extract_choice_fuzzy(gen, lang):
    options = ['A', 'B', 'C', 'D']  # the candidate options
    for option in options:
        if option in gen:  # check whether the option appears in the text
            return option  # return the first option that occurs
    return None


@TEXT_POSTPROCESSORS.register_module('pmmeval_xnli')
def pmmeval_xnli_postprocess(text: str, lang_code: str) -> Tuple[str, str]:
    return text, lang_code


@LOAD_DATASET.register_module()
class PMMEvalXNLIDataset(BaseDataset):

    @staticmethod
    def load(path: str, lang: str):
        data_path = get_data_path(path)
        if os.environ.get('DATASET_SOURCE') == 'ModelScope':
            from modelscope import MsDataset
            dataset = MsDataset.load(dataset_name=data_path,
                                     subset_name='xnli',
                                     split=f'test/{lang}')
        else:
            dataset = list()
            filename = os.path.join(data_path, f'xnli/test/{lang}.jsonl')
            with open(filename, mode='r', encoding='utf-8') as f:
                for line in f:
                    line = json.loads(line.strip())
                    dataset.append(line)
            dataset = Dataset.from_list(dataset)

        return dataset


class PMMEvalXNLIEvaluator(BaseEvaluator):

    def score(self, predictions, references):
        assert len(predictions) == len(references)

        all_results = list()

        for (pred, lang), ref in zip(predictions, references):
            choice = extract_choice(pred, lang)
            acc = 0
            failed_strict = 0
            failed = 1
            if choice is not None:
                failed = 0
                if ref.lower() == choice.lower():
                    acc = 1
                else:
                    acc = 0
            else:
                choice = extract_choice_fuzzy(pred, lang)
                if choice is None:
                    acc = 0
                    failed_strict = 1
                else:
                    failed_strict = 0
                    if ref.lower() == choice.lower():
                        acc = 1
                    else:
                        acc = 0

            all_results.append({
                'acc': float(acc),
                'failed': float(failed),
                'failed_strict': float(failed_strict),
                'extracted_answer': choice if choice else 'no answer',
            })

        final_result = {
            'accuracy':
            round(
                sum(x['acc'] for x in all_results) / len(all_results) * 100,
                2),
            'details':
            all_results
        }

        return final_result
@ -343,6 +343,11 @@ DATASETS_MAPPING = {
        "hf_id": "",
        "local": "./data/babilong/data/",
    },
    "P-MMEval": {
        "ms_id": "",
        "hf_id": "",
        "local": "./data/P-MMEval/",
    },
    "opencompass/arc_prize_public_evaluation": {
        "ms_id": "",
        "hf_id": "",
@ -530,7 +535,7 @@ DATASETS_URL = {
    "/cmo": {
        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmo.zip",
        "md5": "fad52c81290506a8ca74f46b5400d8fc",
    },
    "/nq-open": {
        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/nq-open.zip",
        "md5": "a340521e5c9ec591227dcb367f718b25",
@ -566,5 +571,9 @@ DATASETS_URL = {
    "/arc_prize_public_evaluation": {
        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/arc_prize_public_evaluation.zip",
        "md5": "367a33977651496efddba7670009807e"
    },
    "P-MMEval": {
        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/pmmeval.zip",
        "md5": "589c8be1551a609d94231f1410cf22eb",
    }
}
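Roughly speaking, these tables are what `get_data_path` (imported by the dataset classes above) consults: with `DATASET_SOURCE` unset, "P-MMEval" should resolve to the local directory, and the zip registered in DATASETS_URL is what the download machinery fetches on first use; a minimal sketch under that assumption:

from opencompass.utils import get_data_path

# resolves to './data/P-MMEval/' when no remote dataset source is configured
data_path = get_data_path('P-MMEval')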