[Feature] Add P-MMEval (#1714)

* Update with PMMEval

* Update

* Update __init__.py

* Fix Bugs

* Delete .pre-commit-config.yaml

* Pull merge

---------

Co-authored-by: liushz <qq1791167085@163.com>
wanyu2018umac 2024-11-27 21:26:18 +08:00 committed by GitHub
parent f7dbe6bb7d
commit 90efcf2216
38 changed files with 2200 additions and 1 deletion

configs/eval_PMMEval.py Executable file

@ -0,0 +1,32 @@
from mmengine.config import read_base
from opencompass.models import HuggingFacewithChatTemplate
with read_base():
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models
# from opencompass.configs.datasets.PMMEval.flores_gen import PMMEval_flores_datasets
# from opencompass.configs.datasets.PMMEval.humanevalxl_gen import PMMEval_HumanEvalXL_datasets
# from opencompass.configs.datasets.PMMEval.mgsm_gen import PMMEval_MGSM_datasets
# from opencompass.configs.datasets.PMMEval.mhellaswag_gen import PMMEval_MHellaswag_datasets
# from opencompass.configs.datasets.PMMEval.mifeval_gen import PMMEval_MIFEval_datasets
# from opencompass.configs.datasets.PMMEval.mlogiqa_gen import PMMEval_MLogiQA_datasets
# from opencompass.configs.datasets.PMMEval.mmmlu_gen import PMMEval_MMMLU_datasets
# from opencompass.configs.datasets.PMMEval.xnli import PMMEval_XNLI_datasets
from opencompass.configs.datasets.PMMEval.pmmeval_gen import PMMEval_datasets
from opencompass.configs.summarizers.PMMEval import summarizer
# datasets = PMMEval_flores_datasets
# datasets = PMMEval_HumanEvalXL_datasets
# datasets = PMMEval_MGSM_datasets
# datasets = PMMEval_MHellaswag_datasets
# datasets = PMMEval_MIFEval_datasets
# datasets = PMMEval_MLogiQA_datasets
# datasets = PMMEval_MMMLU_datasets
# datasets = PMMEval_XNLI_datasets
datasets = PMMEval_datasets
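# Example launch (sketch): this config is typically passed to the OpenCompass
# entry point, e.g. `python run.py configs/eval_PMMEval.py`, which picks up the
# `models`, `datasets` and `summarizer` objects defined above.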


@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .flores_gen_2697d7 import PMMEval_flores_datasets


@ -0,0 +1,65 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalFloresDataset, PMMEvalFloresEvaluator, pmmeval_flores_postprocess
NATURAL_LANGUAGE_FULLNAMES_FLORES = ['Chinese', 'Arabic', 'Spanish', 'French', 'Japanese', 'Korean', 'Portuguese', 'Thai', 'Vietnamese']
PROMPT = {
"Chinese": "将这个句子从英语翻译成中文。\n\n{src}",
"Arabic": "ترجم هذه الجملة من الإنجليزية إلى العربية.\n\n{src}",
"Spanish": "Traduce esta oración del inglés al español.\n\n{src}",
"Japanese": "この文を英語から日本語に翻訳してください。\n\n{src}",
"Korean": "이 문장을 영어에서 한국어로 번역하세요.\n\n{src}",
"Thai": "แปลประโยคนี้จากภาษาอังกฤษเป็นภาษาไทย.\n\n{src}",
"French": "Traduisez cette phrase de l'anglais en français.\n\n{src}",
"Portuguese": "Traduza esta frase do inglês para o português.\n\n{src}",
"Vietnamese": "Dịch câu này từ tiếng Anh sang tiếng Việt.\n\n{src}"
}
PMMEval_flores_datasets = list()
# Add flores_200
PMMEval_flores_reader_cfg = dict(
input_columns=['src'],
output_column='tgt',
test_split='test'
)
for lang_fullname in NATURAL_LANGUAGE_FULLNAMES_FLORES:
PMMEval_flores_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=PROMPT[lang_fullname]
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
PMMEval_flores_eval_cfg = dict(
evaluator=dict(type=PMMEvalFloresEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=pmmeval_flores_postprocess, lang_fullname=lang_fullname)
)
PMMEval_flores_datasets.append(
dict(
abbr=f'flores-{lang_fullname}',
type=PMMEvalFloresDataset,
path='P-MMEval',
lang_fullname=lang_fullname,
reader_cfg=PMMEval_flores_reader_cfg,
infer_cfg=PMMEval_flores_infer_cfg,
eval_cfg=PMMEval_flores_eval_cfg)
)


@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .humanevalxl_gen_4dfef4 import PMMEval_HumanEvalXL_datasets


@ -0,0 +1,49 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalHumanEvalXLDataset, PMMEvalHumanEvalXLEvaluator
NATURAL_LANGUAGE_FULLNAMES = ['English', 'Chinese', 'Arabic', 'Spanish', 'French', 'Japanese', 'Korean', 'Portuguese', 'Thai', 'Vietnamese']
PMMEval_HumanEvalXL_datasets = list()
PMMEval_HumanEvalXL_reader_cfg = dict(
input_columns=['task_id', 'prompt', 'entry_point', 'test', 'language', 'description', 'natural_language'],
output_column='declaration',
test_split='test'
)
PMMEval_HumanEvalXL_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='{prompt}'),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
for lang_fullname in NATURAL_LANGUAGE_FULLNAMES:
for program_lang in ['python', 'java', 'javascript']:
PMMEval_HumanEvalXL_eval_cfg = dict(
evaluator=dict(
type=PMMEvalHumanEvalXLEvaluator,
language=program_lang,
text_language=lang_fullname,
ip_address='localhost',
port=5001),
pred_role='BOT')
PMMEval_HumanEvalXL_datasets.append(
dict(
abbr=f'humanevalxl-{program_lang}-{lang_fullname}',
type=PMMEvalHumanEvalXLDataset,
path='P-MMEval',
lang=lang_fullname,
program_lang=program_lang,
reader_cfg=PMMEval_HumanEvalXL_reader_cfg,
infer_cfg=PMMEval_HumanEvalXL_infer_cfg,
eval_cfg=PMMEval_HumanEvalXL_eval_cfg)
)


@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .mgsm_gen_679720 import PMMEval_MGSM_datasets


@ -0,0 +1,62 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalMGSMDataset, PMMEvalMGSMEvaluator
NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']
LANG_TO_INSTRUCTIONS = {
"en": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"The answer is \". Do not add anything other than the integer answer after \"The answer is \".\n\n{question}",
"es": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"La respuesta es \". Do not add anything other than the integer answer after \"La respuesta es \".\n\n{question}",
"fr": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"La réponse est \". Do not add anything other than the integer answer after \"La réponse est \".\n\n{question}",
"zh": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"答案是 \". Do not add anything other than the integer answer after \"答案是 \".\n\n{question}",
"ja": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"答えは \". Do not add anything other than the integer answer after \"答えは \".\n\n{question}",
"th": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"คำตอบคือ \". Do not add anything other than the integer answer after \"คำตอบคือ \".\n\n{question}",
"ko": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"답변은 \". Do not add anything other than the integer answer after \"답변은 \".\n\n{question}",
"pt": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"A resposta é \". Do not add anything other than the integer answer after \"A resposta é \".\n\n{question}",
"vi": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"Câu trả lời là \". Do not add anything other than the integer answer after \"Câu trả lời là \".\n\n{question}",
"ar": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"الجواب هو \". Do not add anything other than the integer answer after \"الجواب هو \".\n\n{question}"
}
PMMEval_MGSM_datasets = list()
PMMEval_MGSM_reader_cfg = dict(
input_columns=['question'],
output_column='answer',
test_split='test'
)
PMMEval_MGSM_eval_cfg = dict(
evaluator=dict(type=PMMEvalMGSMEvaluator),
pred_role='BOT')
for lang_code in NATURAL_LANGUAGE_CODES:
PMMEval_MGSM_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=LANG_TO_INSTRUCTIONS[lang_code]
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
PMMEval_MGSM_datasets.append(
dict(
abbr=f'mgsm-{lang_code}',
type=PMMEvalMGSMDataset,
path='P-MMEval',
lang=lang_code,
reader_cfg=PMMEval_MGSM_reader_cfg,
infer_cfg=PMMEval_MGSM_infer_cfg,
eval_cfg=PMMEval_MGSM_eval_cfg)
)


@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .mhellaswag_gen_1a6b73 import PMMEval_MHellaswag_datasets


@ -0,0 +1,54 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalMHellaswagDataset, PMMEvalMHellaswagEvaluator, pmmeval_mhellaswag_postprocess
NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']
PMMEVAL_MHELLASWAG_TEMPLATE = "Input: {ctx}\nOptions: \nA. {option_1}\nB. {option_2}\nC. {option_3}\nD. {option_4}\nPick the correct ending for the sentence from A, B, C, and D, and return it in the following JSON format:\n{\"answer\": \"[choice]\"}\nwhere [choice] must be one of A, B, C or D."
PMMEval_MHellaswag_datasets = list()
PMMEval_MHellaswag_reader_cfg = dict(
input_columns=['ctx', 'option_1', 'option_2', 'option_3', 'option_4'],
output_column='label',
test_split='test'
)
PMMEval_MHellaswag_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=PMMEVAL_MHELLASWAG_TEMPLATE
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
for lang_code in NATURAL_LANGUAGE_CODES:
PMMEval_MHellaswag_eval_cfg = dict(
evaluator=dict(type=PMMEvalMHellaswagEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=pmmeval_mhellaswag_postprocess, lang_code=lang_code)
)
PMMEval_MHellaswag_datasets.append(
dict(
abbr=f'mhellaswag-{lang_code}',
type=PMMEvalMHellaswagDataset,
path='P-MMEval',
lang=lang_code,
reader_cfg=PMMEval_MHellaswag_reader_cfg,
infer_cfg=PMMEval_MHellaswag_infer_cfg,
eval_cfg=PMMEval_MHellaswag_eval_cfg)
)


@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .mifeval_gen_79f8fb import PMMEval_MIFEval_datasets


@ -0,0 +1,51 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalMIFEvalDataset, PMMEvalMIFEvalEvaluator, pmmeval_mifeval_postprocess
NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']
PMMEVAL_MIFEVAL_TEMPLATE = "{prompt}"
PMMEval_MIFEval_datasets = list()
PMMEval_MIFEval_reader_cfg = dict(
input_columns=['prompt', 'instruction_id_list', 'kwargs'],
output_column=None,
test_split='test'
)
PMMEval_MIFEval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=PMMEVAL_MIFEVAL_TEMPLATE
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
for lang_code in NATURAL_LANGUAGE_CODES:
PMMEval_MIFEval_eval_cfg = dict(
evaluator=dict(type=PMMEvalMIFEvalEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=pmmeval_mifeval_postprocess, lang_code=lang_code)
)
PMMEval_MIFEval_datasets.append(
dict(
abbr=f'mifeval-{lang_code}',
type=PMMEvalMIFEvalDataset,
path='P-MMEval',
lang=lang_code,
reader_cfg=PMMEval_MIFEval_reader_cfg,
infer_cfg=PMMEval_MIFEval_infer_cfg,
eval_cfg=PMMEval_MIFEval_eval_cfg)
)


@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .mlogiqa_gen_36c4f9 import PMMEval_MLogiQA_datasets


@ -0,0 +1,50 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalMLogiQADataset, PMMEvalMLogiQAEvaluator, pmmeval_mlogiqa_postprocess
NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']
PMMEVAL_MLOGIQA_TEMPLATE = "Passage: {context}\nQuestion: {question}\nChoices:\nA.{option_1}\nB.{option_2}\nC.{option_3}\nD.{option_4}\nPlease choose the most suitable one among A, B, C and D as the answer to this question, and return it in the following JSON format:\n{'answer': '[choice]'}\nwhere [choice] must be one of A, B, C and D."
PMMEval_MLogiQA_datasets = []
PMMEval_MLogiQA_reader_cfg = dict(
input_columns=['context', 'question', 'option_1', 'option_2', 'option_3', 'option_4'],
output_column='answer',
train_split='test')
PMMEval_MLogiQA_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=PMMEVAL_MLOGIQA_TEMPLATE
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
for lang_code in NATURAL_LANGUAGE_CODES:
PMMEval_MLogiQA_eval_cfg = dict(
evaluator=dict(type=PMMEvalMLogiQAEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=pmmeval_mlogiqa_postprocess, lang_code=lang_code))
PMMEval_MLogiQA_datasets.append(
dict(
abbr=f'mlogiqa-{lang_code}',
type=PMMEvalMLogiQADataset,
path='P-MMEval',
lang=lang_code,
reader_cfg=PMMEval_MLogiQA_reader_cfg,
infer_cfg=PMMEval_MLogiQA_infer_cfg,
eval_cfg=PMMEval_MLogiQA_eval_cfg)
)


@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .mmmlu_gen_d5017d import PMMEval_MMMLU_datasets


@ -0,0 +1,52 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalMMMLUDataset, PMMEvalMMMLUEvaluator, pmmeval_mmmlu_postprocess
NATURAL_LANGUAGE_CODES_MMMLU = ['EN-US', 'ZH-CN', 'AR-XY', 'ES-LA', 'FR-FR', 'JA-JP', 'KO-KR', 'PT-BR', 'TH-TL', 'VI-VT']
PMMEVAL_MMMLU_TEMPLATE = "The following is a multiple-choice question. Please choose the most suitable one among A, B, C and D as the answer to this question, and return it in the following JSON format:\n{\"answer\": \"[choice]\"}\nwhere [choice] must be one of A, B, C and D.\n\n{Question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}"
PMMEval_MMMLU_datasets = []
PMMEval_MMMLU_reader_cfg = dict(
input_columns=['Question', 'A', 'B', 'C', 'D'],
output_column='Answer',
train_split='test')
PMMEval_MMMLU_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=PMMEVAL_MMMLU_TEMPLATE
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
for lang_code in NATURAL_LANGUAGE_CODES_MMMLU:
PMMEval_MMMLU_eval_cfg = dict(
evaluator=dict(type=PMMEvalMMMLUEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=pmmeval_mmmlu_postprocess, lang_code=lang_code))
PMMEval_MMMLU_datasets.append(
dict(
abbr=f'mmmlu-{lang_code}',
type=PMMEvalMMMLUDataset,
path='P-MMEval',
lang=lang_code,
difficulty='all',
reader_cfg=PMMEval_MMMLU_reader_cfg,
infer_cfg=PMMEval_MMMLU_infer_cfg,
eval_cfg=PMMEval_MMMLU_eval_cfg)
)


@ -0,0 +1,14 @@
from mmengine.config import read_base
with read_base():
from .flores_gen_2697d7 import PMMEval_flores_datasets
from .humanevalxl_gen_4dfef4 import PMMEval_HumanEvalXL_datasets
from .mgsm_gen_679720 import PMMEval_MGSM_datasets
from .mhellaswag_gen_1a6b73 import PMMEval_MHellaswag_datasets
from .mifeval_gen_79f8fb import PMMEval_MIFEval_datasets
from .mlogiqa_gen_36c4f9 import PMMEval_MLogiQA_datasets
from .mmmlu_gen_d5017d import PMMEval_MMMLU_datasets
from .xnli_gen_973734 import PMMEval_XNLI_datasets
PMMEval_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
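# `PMMEval_datasets` gathers every `*_datasets` list imported in the read_base()
# block above, so importing this single file pulls in all eight P-MMEval subtasks.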


@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .xnli_gen_973734 import PMMEval_XNLI_datasets


@ -0,0 +1,60 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalXNLIDataset, PMMEvalXNLIEvaluator, pmmeval_xnli_postprocess
NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']
PMMEVAL_XNLI_TEMPLATE = """Take the following as truth: {premise}
Then the following statement: \"{statement}\" is
Options:
A. true
B. inconclusive
C. false
Select the correct option from A, B, and C, and return it in the following JSON format:
{"answer": "[choice]"}
where [choice] must be one of A, B, and C."""
PMMEval_XNLI_datasets = list()
PMMEval_XNLI_reader_cfg = dict(
input_columns=['premise', 'statement'],
output_column='answer',
test_split='test'
)
PMMEval_XNLI_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=PMMEVAL_XNLI_TEMPLATE
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
for lang_code in NATURAL_LANGUAGE_CODES:
PMMEval_XNLI_eval_cfg = dict(
evaluator=dict(type=PMMEvalXNLIEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=pmmeval_xnli_postprocess, lang_code=lang_code))
PMMEval_XNLI_datasets.append(
dict(
abbr=f'xnli-{lang_code}',
type=PMMEvalXNLIDataset,
path='P-MMEval',
lang=lang_code,
reader_cfg=PMMEval_XNLI_reader_cfg,
infer_cfg=PMMEval_XNLI_infer_cfg,
eval_cfg=PMMEval_XNLI_eval_cfg)
)


@ -0,0 +1,22 @@
from mmengine.config import read_base
with read_base():
from .groups.PMMEval import PMMEval_summary_groups
summarizer = dict(
dataset_abbrs=[
'flores',
'humanevalxl',
'mgsm',
'mhellaswag',
'mifeval',
'mlogiqa',
'mmmlu',
'xnli'
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []
),
)


@ -0,0 +1,41 @@
NATURAL_LANGUAGE_FULLNAMES = ['English', 'Chinese', 'Arabic', 'Spanish', 'French', 'Japanese', 'Korean', 'Portuguese', 'Thai', 'Vietnamese']
NATURAL_LANGUAGE_FULLNAMES_FLORES = ['Chinese', 'Arabic', 'Spanish', 'French', 'Japanese', 'Korean', 'Portuguese', 'Thai', 'Vietnamese']
NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']
NATURAL_LANGUAGE_CODES_MMMLU = ['EN-US', 'ZH-CN', 'AR-XY', 'ES-LA', 'FR-FR', 'JA-JP', 'KO-KR', 'PT-BR', 'TH-TL', 'VI-VT']
PMMEval_summary_groups = [
{
'name': 'flores',
'subsets': [f'flores-{lang_fullname}' for lang_fullname in NATURAL_LANGUAGE_FULLNAMES_FLORES]
},
{
'name': 'humanevalxl',
'subsets': [f'humanevalxl-python-{lang_fullname}' for lang_fullname in NATURAL_LANGUAGE_FULLNAMES] + \
[f'humanevalxl-java-{lang_fullname}' for lang_fullname in NATURAL_LANGUAGE_FULLNAMES] + \
[f'humanevalxl-javascript-{lang_fullname}' for lang_fullname in NATURAL_LANGUAGE_FULLNAMES]
},
{
'name': 'mgsm',
'subsets': [f'mgsm-{lang_code}' for lang_code in NATURAL_LANGUAGE_CODES]
},
{
'name': 'mhellaswag',
'subsets': [f'mhellaswag-{lang_code}' for lang_code in NATURAL_LANGUAGE_CODES]
},
{
'name': 'mifeval',
'subsets': [f'mifeval-{lang_code}' for lang_code in NATURAL_LANGUAGE_CODES]
},
{
'name': 'mlogiqa',
'subsets': [f'mlogiqa-{lang_code}' for lang_code in NATURAL_LANGUAGE_CODES]
},
{
'name': 'mmmlu',
'subsets': [f'mmmlu-{lang_code}' for lang_code in NATURAL_LANGUAGE_CODES_MMMLU]
},
{
'name': 'xnli',
'subsets': [f'xnli-{lang_code}' for lang_code in NATURAL_LANGUAGE_CODES]
}
]
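# Each summary group aggregates (by default, averages) its per-language subsets,
# so the summarizer config can report one score per task name listed in its
# `dataset_abbrs` ('flores', 'humanevalxl', 'mgsm', ...).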


@ -0,0 +1,8 @@
from .flores import * # noqa: F401, F403
from .humanevalxl import * # noqa: F401, F403
from .mgsm import * # noqa: F401, F403
from .mhellaswag import * # noqa: F401, F403
from .mifeval import * # noqa: F401, F403
from .mlogiqa import * # noqa: F401, F403
from .mmmlu import * # noqa: F401, F403
from .xnli import * # noqa: F401, F403


@ -0,0 +1,162 @@
import json
import os
import re
from typing import Tuple
import numpy as np
from datasets import Dataset
from sacrebleu.metrics import BLEU
from sacrebleu.tokenizers.tokenizer_13a import Tokenizer13a
from sacrebleu.tokenizers.tokenizer_zh import TokenizerZh
from opencompass.datasets.base import BaseDataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from opencompass.utils import get_data_path
def wmt_postprocess(text: str, lang: str) -> str:
text = text.strip()
texts = list(x.strip() for x in text.split('\n'))
texts = list(x for x in texts if x != '')
text = '\n'.join(texts)
text = tokenize(text, lang)
return text
def compute_maximum_bleu_value(gen: str, ref: str, lang: str):
gens = list(x.strip() for x in gen.split('\n'))
gens = list(x for x in gens if x != '')
gens_tokens = list(wmt_postprocess(x, lang) for x in gens)
ref_tokens = wmt_postprocess(ref, lang)
scorer = BLEU(tokenize='13a', effective_order=True)
maximum_bleu_value = -100.0
maximum_bleu_object = None
for i in range(0, len(gens_tokens)):
for j in range(i, len(gens_tokens)):
gens_tokens_region = ' '.join(gens_tokens[i:j + 1])
sentence_bleu = scorer.sentence_score(gens_tokens_region,
[ref_tokens])
if sentence_bleu.score > maximum_bleu_value:
maximum_bleu_value = sentence_bleu.score
maximum_bleu_object = sentence_bleu
if maximum_bleu_object is None:
sentence_bleu = scorer.sentence_score('', [ref_tokens])
return sentence_bleu
else:
return maximum_bleu_object
def trim_multiple_space(tokens):
    return ''.join(tokens).strip().split()
class SpaceTokenizer(object):
def __call__(self, sent):
if type(sent) == list:
print(sent)
raise ValueError()
return ' '.join(sent.strip().split())
class NonASCIITokenizer(object):
def __init__(self):
        self.is_cjk = re.compile('([\u2e80-\u9fff]|'  # CJK ideographs
                                 '[\ua960-\ua97f]|'  # Hangul Jamo Extended-A
                                 '[\uac00-\ud7ff]|'  # Hangul syllables + Jamo Extended-B
                                 '[\u0E00-\u0E7F]'  # Thai
                                 ')')
def __call__(self, sent):
sent = sent.strip()
chs = list(sent)
line_chtok = []
for ch in chs:
if self.is_cjk.match(ch):
line_chtok.append(' ')
line_chtok.append(ch)
line_chtok.append(' ')
else:
line_chtok.append(ch)
line_chtok = trim_multiple_space(line_chtok)
return ' '.join(line_chtok)
def build_tokenizer(lang: str):
if lang == 'Chinese':
return TokenizerZh()
elif lang in {'Japanese', 'Korean', 'Thai'}:
return NonASCIITokenizer()
else:
return SpaceTokenizer()
def tokenize(sent, lang):
tokenizer = build_tokenizer(lang)
final_tokenizer = Tokenizer13a()
return final_tokenizer(tokenizer(sent))
@TEXT_POSTPROCESSORS.register_module('pmmeval_flores')
def pmmeval_flores_postprocess(text: str, lang_fullname: str) -> Tuple[str, str]:
return text, lang_fullname
@LOAD_DATASET.register_module()
class PMMEvalFloresDataset(BaseDataset):
@staticmethod
def load(path: str, lang_fullname: str):
data_path = get_data_path(path)
if os.environ.get('DATASET_SOURCE') == 'ModelScope':
from modelscope import MsDataset
dataset = MsDataset.load(dataset_name=data_path,
subset_name='flores',
split=f'test/{lang_fullname}')
else:
dataset = list()
filename = os.path.join(data_path,
f'flores/test/{lang_fullname}.jsonl')
with open(filename, mode='r', encoding='utf-8') as f:
for line in f:
line = json.loads(line.strip())
dataset.append(line)
dataset = Dataset.from_list(dataset)
return dataset
class PMMEvalFloresEvaluator(BaseEvaluator):
def score(self, predictions, references):
maximum_bleu_results = list()
for (pred, tgt_lang), ref in zip(predictions, references):
maximum_bleu_results.append(
compute_maximum_bleu_value(pred, ref, tgt_lang))
maximum_corpus_bleu_counts = sum(
np.array(x.counts) for x in maximum_bleu_results).tolist()
maximum_corpus_bleu_totals = sum(
np.array(x.totals) for x in maximum_bleu_results).tolist()
maximum_corpus_bleu_sys_len = sum(x.sys_len
for x in maximum_bleu_results)
maximum_corpus_bleu_ref_len = sum(x.ref_len
for x in maximum_bleu_results)
maximum_bleu_result = BLEU.compute_bleu(
correct=maximum_corpus_bleu_counts,
total=maximum_corpus_bleu_totals,
sys_len=maximum_corpus_bleu_sys_len,
ref_len=maximum_corpus_bleu_ref_len)
result = {'BLEU': round(maximum_bleu_result.score, 2)}
return result
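# Shape of the evaluator inputs (illustrative sketch): `pmmeval_flores_postprocess`
# turns each raw generation into a (text, lang_fullname) tuple, so `score` receives
# something like
#     predictions = [('你好，世界。', 'Chinese'), ...]
#     references = ['你好，世界。', ...]
# and reports the corpus BLEU assembled from the best-scoring contiguous line span
# of each generation.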


@ -0,0 +1,226 @@
import json
import os
import os.path as osp
import re
import subprocess
import tempfile
import time
from shutil import copyfile
from datasets import Dataset
from opencompass.datasets.base import BaseDataset
from opencompass.datasets.humaneval import humaneval_postprocess_v2
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path
_LANGUAGE_NAME_DICT = {
'java': 'Java',
'javascript': 'JavaScript',
'js': 'JavaScript',
'python': 'Python',
}
@LOAD_DATASET.register_module()
class PMMEvalHumanEvalXLDataset(BaseDataset):
@staticmethod
def load(path: str, lang: str, program_lang: str):
data_path = get_data_path(path)
if os.environ.get('DATASET_SOURCE') == 'ModelScope':
from modelscope import MsDataset
dataset = MsDataset.load(dataset_name=data_path,
subset_name='humaneval-xl',
split=f'test/{program_lang}/{lang}')
else:
dataset = list()
filename = os.path.join(
data_path, f'humaneval-xl/test/{program_lang}/{lang}.jsonl')
with open(filename, mode='r', encoding='utf-8') as f:
for line in f:
line = json.loads(line.strip())
dataset.append(line)
dataset = Dataset.from_list(dataset)
return dataset
class PMMEvalHumanEvalXLEvaluator(BaseEvaluator):
def __init__(self,
language,
ip_address='localhost',
text_language='',
port='',
retry=2,
timeout=600) -> None:
assert language in _LANGUAGE_NAME_DICT.keys(), (
f'language must be in {list(_LANGUAGE_NAME_DICT.keys())}')
if language == 'rust':
            timeout *= 10  # rust needs more time
self.language = language
self.text_language = text_language
self.ip_address = ip_address
self.port = port
self.retry = retry
self.timeout = timeout
super().__init__()
def score(self, predictions, references):
predictions = [{
'task_id':
f'{_LANGUAGE_NAME_DICT[self.language]}/{i}',
'generation':
_clean_up_code(pred, self.language, refer),
} for i, (pred, refer) in enumerate(zip(predictions, references))]
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_out_path = osp.join(
tmp_dir,
f'humanevalx_{self.language}_{self.text_language}.json')
with open(tmp_out_path, 'w') as f:
for pred in predictions:
f.write(json.dumps(pred) + '\n')
num_retry = 0
while num_retry < self.retry:
succeed, output = self._code_eval_service(
file_path=tmp_out_path)
if not succeed and '(56) Recv failure' in output:
# only retry when connection failed
num_retry += 1
# wait a min in case the service load is too high
time.sleep(60)
else:
break
if succeed:
if isinstance(output, str):
return json.loads(output)
elif isinstance(output, dict):
return output
ref_url = 'https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html' # noqa
if hasattr(self, '_out_dir'):
result_file_path = re.sub('results', 'mid_results',
self._out_dir) + '.json' # noqa
if not osp.exists(osp.dirname(result_file_path)):
os.makedirs(osp.dirname(result_file_path))
else:
result_file_path = os.path.join(
'outputs', f'humanevalx_{self.language}.json')
copyfile(tmp_out_path, result_file_path)
raise Exception(
f'Call CodeEvalService Error in `HumanevalXEvaluator`, The '
f"results have been saved in path '{result_file_path}', You "
'need to check that your code evaluate service is launched and'
f' the network to service is connected, you can also get '
f'results directly by using `curl` command refer to {ref_url}.'
f'\nError Information: {output}')
def _code_eval_service(self, file_path):
if self.port:
eval_server_url = f'{self.ip_address}:{self.port}/evaluate'
else:
eval_server_url = f'{self.ip_address}/evaluate'
exec_result = subprocess.run([
'curl', '-X', 'POST', '-F', f'file=@{file_path}', '-F',
f'dataset=humanevalx/{self.language}', f'{eval_server_url}'
],
timeout=self.timeout,
capture_output=True)
if exec_result.returncode == 0 and re.match(
"\"{.*:.*}\"", exec_result.stdout.decode('utf-8')):
return True, json.loads(exec_result.stdout.decode('utf-8'))
else:
if exec_result.stderr:
try:
err = exec_result.stderr.decode()
except Exception:
err = exec_result.stderr
else:
try:
err = exec_result.stdout.decode()
except Exception:
err = exec_result.stdout
return False, err
def _clean_up_code(text: str, language_type: str, reference) -> str:
"""Cleans up the generated code."""
try:
# for chatGLM related text
eval_text = eval(text)
except Exception:
pass
else:
if isinstance(eval_text, str):
text = eval_text
# extract code from code block
text = text.lstrip('\n')
if '```' in text:
blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
if len(blocks) == 0:
text = text.split('```')[1] # fall back to default strategy
else:
text = blocks[0] # fetch the first code block
if not text.startswith('\n'): # in case starting with ```xxx
text = text[max(text.find('\n') + 1, 0):]
if language_type.lower() == 'python':
text = humaneval_postprocess_v2(text)
# we need to take care of the first line
# append extra space for first line for correct indentation
text = ' ' + text.lstrip()
text_splits = text.split('\n')
is_empty_line = False
ind_empty_line = None
for i, line in enumerate(text_splits):
if len(line.strip()) > 0 and line[0] != ' ' and line[0] != '\t':
is_empty_line = True
ind_empty_line = i
break
if is_empty_line:
text = '\n'.join(text_splits[:ind_empty_line])
else:
end_words = [
'\ndef', '\nclass', '\n#', '\nassert', '\n"""', '\nprint',
'\nif', '\n\n\n'
]
for w in end_words:
if w in text:
text = text[:text.rfind(w)]
# strip function head for all other language
func_name = reference.strip().split('\n')[-1]
if func_name:
func_name = func_name.strip().strip('{')
if func_name in text:
text = '\n'.join(text[text.find(func_name):].split('\n')[1:])
if language_type.lower() == 'java':
main_pos = text.find('public static void main')
if main_pos != -1:
text = text[:main_pos] + '}'
if '}' in text:
text = text[:text.rfind('}')] + '}'
if text.count('{') + 1 == text.count('}'):
text += '\n}'
elif language_type.lower() == 'go':
if '\nfunc main(' in text:
text = text[:text.rfind('func main(')]
if '}' in text:
text = text[:text.rfind('}')] + '}'
elif language_type.lower() == 'cpp':
if '\nint main()' in text:
text = text[:text.rfind('int main()')]
if '}' in text:
text = text[:text.rfind('}')] + '}'
elif language_type.lower() == 'js':
if '}' in text:
text = text[:text.rfind('}')] + '}'
elif language_type.lower() == 'rust':
if '}' in text:
text = text[:text.rfind('}')] + '}'
return text
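# Flow summary (sketch): `score` writes the cleaned generations to a temporary
# JSONL file and POSTs it via `curl` to an external humanevalx-style code
# evaluation service (the dataset config above points it at localhost:5001);
# the service's JSON response is returned directly as the metric dict.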


@ -0,0 +1,79 @@
import json
import os
import re
from datasets import Dataset
from opencompass.datasets.base import BaseDataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path
def _get_last_digit(s):
_PAT_LAST_DIGIT = re.compile(
r'([+-])?(?=([0-9]|\.[0-9]))(0|([1-9](\d{0,2}(,\d{3})*)|\d*))?(\.\d*)?(?=\D|$)' # noqa E501
)
match = list(_PAT_LAST_DIGIT.finditer(s))
if match:
last_digit = match[-1].group().replace(',', '').replace(
'+', '').strip().strip('.')
# print(f"The last digit in {s} is {last_digit}")
else:
last_digit = None
# logger.warning(f"No digits found in {s!r}")
return last_digit
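# Worked example: _get_last_digit('The answer is 1,234.') returns '1234'.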
@LOAD_DATASET.register_module()
class PMMEvalMGSMDataset(BaseDataset):
@staticmethod
def load(path: str, lang: str):
data_path = get_data_path(path)
if os.environ.get('DATASET_SOURCE') == 'ModelScope':
from modelscope import MsDataset
dataset = MsDataset.load(dataset_name=data_path,
subset_name='mgsm',
split=f'test/{lang}')
else:
dataset = list()
filename = os.path.join(data_path, f'mgsm/test/{lang}.jsonl')
with open(filename, mode='r', encoding='utf-8') as f:
for line in f:
line = json.loads(line.strip())
dataset.append(line)
dataset = Dataset.from_list(dataset)
return dataset
class PMMEvalMGSMEvaluator(BaseEvaluator):
def score(self, predictions, references):
assert len(predictions) == len(references)
num_correct, total = 0, 0
details = {}
for index, (references_answer, predictions_answer) in enumerate(
zip(references, predictions)):
extracted_answer = _get_last_digit(predictions_answer)
references_answer = references_answer.replace(',', '')
if references_answer == extracted_answer:
is_correct = True
else:
is_correct = False
num_correct += is_correct
total += 1
details[str(index)] = {
'references': references_answer,
'predictions': predictions_answer,
'extracted': extracted_answer,
'correct': is_correct,
}
accuracy = round(num_correct / total * 100, 2)
final_result = {'accuracy': accuracy, 'details': details}
return final_result


@ -0,0 +1,151 @@
import json
import os
import re
from typing import Tuple
from datasets import Dataset
from opencompass.datasets.base import BaseDataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from opencompass.utils import get_data_path
langs_dict = {
'fr': ['La réponse est', 'la réponse est'],
'en': ['the answer is', 'The answer is'],
'vi': ['Câu trả lời là', 'câu trả lời là'],
'ar': ['الجواب هو'],
'th': ['คำตอบคือ'],
'zh': ['答案是'],
'ko': ['답변은'],
'pt': ['A resposta é'],
'ja': ['答えは'],
'es': ['La respuesta es']
}
def extract_choice(gen, lang):
r"""
{
"answer": "A|B|C|D"
}
"""
patterns = [
r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}",
r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}",
r"\"answer\"\s*:\s*\"?(A|B|C|D)\"?",
r"[\'\"]answer[\'\"]\s*:\s*[\'\"](A|B|C|D)[\'\"]"
]
for pattern in patterns:
res = re.findall(pattern, gen, flags=re.DOTALL)
if len(res) >= 1:
return res[-1]
else:
res = None
pattern = langs_dict[lang]
for p in pattern:
if p in gen and p != gen:
res = gen.split(p)
if len(res) > 1 and len(res[-1].strip()) > 0:
res = res[-1].strip()[0]
else:
res = None
break
temp = ['A', 'B', 'C', 'D', 'a', 'b', 'c', 'd']
if res in temp:
return res
else:
return None
def extract_choice_fuzzy(gen, lang):
    options = ['A', 'B', 'C', 'D']  # candidate options
    for option in options:
        if option in gen:  # check whether the option letter appears in the text
            return option  # return the first option that occurs
return None
@TEXT_POSTPROCESSORS.register_module('pmmeval_mhellaswag')
def pmmeval_mhellaswag_postprocess(text: str, lang_code: str) -> Tuple[str, str]:
return text, lang_code
@LOAD_DATASET.register_module()
class PMMEvalMHellaswagDataset(BaseDataset):
@staticmethod
def load(path: str, lang: str):
data_path = get_data_path(path)
if os.environ.get('DATASET_SOURCE') == 'ModelScope':
from modelscope import MsDataset
dataset = MsDataset.load(dataset_name=data_path,
subset_name='mhellaswag',
split=f'test/{lang}')
else:
dataset = list()
filename = os.path.join(data_path, f'mhellaswag/test/{lang}.jsonl')
with open(filename, mode='r', encoding='utf-8') as f:
for line in f:
line = json.loads(line.strip())
dataset.append(line)
dataset = Dataset.from_list(dataset)
return dataset
class PMMEvalMHellaswagEvaluator(BaseEvaluator):
def score(self, predictions, references):
assert len(predictions) == len(references)
all_results = list()
for (pred, lang), ref in zip(predictions, references):
answer = chr(int(ref) + 65)
choice = extract_choice(pred, lang)
acc = 0
failed_strict = 0
failed = 1
if choice is not None:
failed = 0
if answer.lower() == choice.lower():
acc = 1
else:
acc = 0
else:
choice = extract_choice_fuzzy(pred, lang)
if choice is None:
acc = 0
failed_strict = 1
else:
failed_strict = 0
if answer.lower() == choice.lower():
acc = 1
else:
acc = 0
all_results.append({
'acc':
float(acc),
'failed':
float(failed),
'failed_strict':
float(failed_strict),
'extracted_answer':
pred if pred else 'no answer',
})
final_result = {
'accuracy':
round(
sum(x['acc'] for x in all_results) / len(all_results) * 100,
2),
'details':
all_results
}
return final_result


@ -0,0 +1,147 @@
import json
import os
from typing import Tuple
from datasets import Dataset
from opencompass.datasets.base import BaseDataset
from opencompass.datasets.PMMEval.mifeval_utils import mifeval_class_map
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from opencompass.utils import get_data_path
def test_instruction_following_strict(inp, response, lang_code):
"""Tests response to see if instrutions are followed."""
instruction_list = inp['instruction_id_list']
is_following_list = []
for index, instruction_id in enumerate(instruction_list):
instruction_id_0, instruction_id_1 = tuple(instruction_id.split(':'))
        instruction_function_info = mifeval_class_map[instruction_id_0][
            instruction_id_1]
        instruction_function = instruction_function_info['function']
        instruction_function_args = dict()
        if instruction_function_info['required_lang_code']:
instruction_function_args['lang_code'] = lang_code
for kwarg_dict in inp['kwargs']:
for k, v in kwarg_dict.items():
if v is None:
continue
instruction_function_args[k] = v
instruction_function_args['input_string'] = response
if response.strip() and instruction_function(
**instruction_function_args):
is_following_list.append(True)
else:
is_following_list.append(False)
return 1.0 if all(is_following_list) else 0.0
def test_instruction_following_loose(inp, response, lang_code):
"""Tests response for an upper bound for following instructions."""
r = response.split('\n')
response_remove_first = '\n'.join(r[1:]).strip()
response_remove_last = '\n'.join(r[:-1]).strip()
response_remove_both = '\n'.join(r[1:-1]).strip()
revised_response = response.replace('*', '')
revised_response_remove_first = response_remove_first.replace('*', '')
revised_response_remove_last = response_remove_last.replace('*', '')
revised_response_remove_both = response_remove_both.replace('*', '')
all_responses = [
response,
revised_response,
response_remove_first,
response_remove_last,
response_remove_both,
revised_response_remove_first,
revised_response_remove_last,
revised_response_remove_both,
]
instruction_list = inp['instruction_id_list']
is_following_list = []
for index, instruction_id in enumerate(instruction_list):
instruction_id_0, instruction_id_1 = tuple(instruction_id.split(':'))
        instruction_function_info = mifeval_class_map[instruction_id_0][
            instruction_id_1]
        instruction_function = instruction_function_info['function']
        instruction_function_args = dict()
        if instruction_function_info['required_lang_code']:
instruction_function_args['lang_code'] = lang_code
for kwarg_dict in inp['kwargs']:
for k, v in kwarg_dict.items():
instruction_function_args[k] = v
        is_following = False
        for r in all_responses:
            if not r.strip():
                continue
            instruction_function_args['input_string'] = r
            if instruction_function(**instruction_function_args):
                is_following = True
                break
        is_following_list.append(is_following)
return 1.0 if all(is_following_list) else 0.0
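# The loose variant re-runs the checks on several variants of the response
# (first/last line removed, '*' markup stripped), so it gives an upper bound on
# the strict score, as the docstring above indicates.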
@TEXT_POSTPROCESSORS.register_module('pmmeval_mifeval')
def pmmeval_mifeval_postprocess(text: str, lang_code: str) -> Tuple[str, str]:
return text, lang_code
@LOAD_DATASET.register_module()
class PMMEvalMIFEvalDataset(BaseDataset):
@staticmethod
def load(path: str, lang: str):
data_path = get_data_path(path)
if os.environ.get('DATASET_SOURCE') == 'ModelScope':
from modelscope import MsDataset
dataset = MsDataset.load(dataset_name=data_path,
subset_name='mifeval',
split=f'test/{lang}')
else:
dataset = list()
filename = os.path.join(data_path, f'mifeval/test/{lang}.jsonl')
with open(filename, mode='r', encoding='utf-8') as f:
for line in f:
line = json.loads(line.strip())
dataset.append(line)
dataset = Dataset.from_list(dataset)
return dataset
class PMMEvalMIFEvalEvaluator(BaseEvaluator):
def score(self, predictions, references, test_set):
all_results = list()
for (pred, lang), example in zip(predictions, test_set):
temp_result = {
'strict_acc':
test_instruction_following_strict(example, pred, lang),
'loose_acc':
test_instruction_following_loose(example, pred, lang)
}
all_results.append(temp_result)
result = {
'strict_acc':
round(
sum(x['strict_acc']
for x in all_results) / len(all_results) * 100, 2),
'loose_acc':
round(
sum(x['loose_acc']
for x in all_results) / len(all_results) * 100, 2)
}
return result


@ -0,0 +1,17 @@
from .combination_checker import combination_checker
from .detectable_content_checker import detectable_content_checker
from .detectable_format_checker import detectable_format_checker
from .keywords_checker import keywords_checker
from .length_constraints_checker import length_constraints_checker
from .punctuation_checker import punctuation_checker
from .startend_checker import startend_checker
mifeval_class_map = {
'combination': combination_checker,
'detectable_content': detectable_content_checker,
'detectable_format': detectable_format_checker,
'keywords': keywords_checker,
'length_constraints': length_constraints_checker,
'punctuation': punctuation_checker,
'startend': startend_checker
}
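# Illustrative lookup (sketch): the MIFEval evaluator splits each instruction id
# on ':' and resolves it through this map, e.g.
#     checker = mifeval_class_map['punctuation']['no_comma']
#     checker['function'](input_string='No commas here', lang_code='en')  # -> True
# `required_lang_code` tells the caller whether `lang_code` must be supplied.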


@ -0,0 +1,32 @@
def repeat_prompt_checker(input_string: str, prompt_to_repeat: str, **kwargs):
if input_string.strip().lower().startswith(
prompt_to_repeat.strip().lower()):
return True
return False
def two_responses_checker(input_string: str, **kwargs):
valid_responses = list()
responses = input_string.split('******')
for index, response in enumerate(responses):
if not response.strip():
if index != 0 and index != len(responses) - 1:
return False
else:
valid_responses.append(response)
return (len(valid_responses) == 2
and valid_responses[0].strip() != valid_responses[1].strip())
combination_checker = {
'repeat_prompt': {
'function': repeat_prompt_checker,
'required_lang_code': False,
'num_of_params': 2
},
'two_responses': {
'function': two_responses_checker,
'required_lang_code': False,
'num_of_params': 1
}
}


@ -0,0 +1,30 @@
import re
def number_placeholders_checker(input_string: str, num_placeholders: int,
**kwargs):
placeholders = re.findall(r'\[.*?\]', input_string)
return len(placeholders) >= num_placeholders
def postscript_checker(input_string: str, postscript_marker: str, **kwargs):
input_string = input_string.lower()
postscript_pattern = r'\s*' + postscript_marker.lower() + r'.*$'
postscript = re.findall(postscript_pattern,
input_string,
flags=re.MULTILINE)
return True if postscript else False
detectable_content_checker = {
'number_placeholders': {
'function': number_placeholders_checker,
'required_lang_code': False,
'num_of_params': 2
},
'postscript': {
'function': postscript_checker,
'required_lang_code': False,
'num_of_params': 2
}
}


@ -0,0 +1,122 @@
import json
import re
def removeprefix(s, prefix):
if s.startswith(prefix):
return s[len(prefix):]
else:
return s
def removesuffix(s, suffix):
if s.endswith(suffix):
return s[:-len(suffix)]
else:
return s
constrained_response = {
'ar': ['إجابتي هي نعم.', 'إجابتي هي لا.', 'إجابتي هي ربما.'],
'es':
['Mi respuesta es sí.', 'Mi respuesta es no.', 'Mi respuesta es tal vez.'],
'fr': [
'Ma réponse est oui.', 'Ma réponse est non.',
'Ma réponse est peut-être.'
],
'ja': ['私の答えははいです。', '私の答えはいいえです。', '私の答えはたぶんです。'],
'ko': ['제 대답은 예입니다.', '제 대답은 아닙니다.', '제 대답은 아마도입니다.'],
'pt': [
'Minha resposta é sim.', 'Minha resposta é não.',
'Minha resposta é talvez.'
],
'th': ['คำตอบของฉันคือใช่', 'คำตอบของฉันคือไม่', 'คำตอบของฉันคืออาจจะ'],
'vi': [
'Câu trả lời của tôi là có.', 'Câu trả lời của tôi là không.',
'Câu trả lời của tôi là có thể.'
],
'en': ['My answer is yes.', 'My answer is no.', 'My answer is maybe.'],
'zh': ['我的答案是是。', '我的答案是否。', '我的答案是不确定。']
}
def constrained_response_checker(input_string: str, lang_code: str, **kwargs):
allowable_responses = constrained_response[lang_code]
return any(response in input_string for response in allowable_responses)
def number_bullet_lists_checker(input_string: str, num_bullets: int, **kwargs):
bullet_lists = re.findall(r'^\s*\*[^\*].*$',
input_string,
flags=re.MULTILINE)
bullet_lists_2 = re.findall(r'^\s*-.*$', input_string, flags=re.MULTILINE)
num_bullet_lists = len(bullet_lists) + len(bullet_lists_2)
return num_bullet_lists == num_bullets
def number_highlighted_sections_checker(input_string: str, num_highlights: int,
**kwargs):
temp_num_highlights = 0
highlights = re.findall(r'\*[^\n\*]*\*', input_string)
double_highlights = re.findall(r'\*\*[^\n\*]*\*\*', input_string)
for highlight in highlights:
if highlight.strip('*').strip():
temp_num_highlights += 1
for highlight in double_highlights:
if removesuffix(removeprefix(highlight, '**'), '**').strip():
temp_num_highlights += 1
return temp_num_highlights >= num_highlights
def title_checker(input_string: str, **kwargs):
pattern = r'<<[^\n]+>>'
re_pattern = re.compile(pattern)
titles = re.findall(re_pattern, input_string)
for title in titles:
if title.lstrip('<').rstrip('>').strip():
return True
return False
def json_format_checker(input_string: str, **kwargs):
value = (removesuffix(
removeprefix(
removeprefix(
removeprefix(removeprefix(input_string.strip(), '```json'),
'```Json'), '```JSON'), '```'), '```').strip())
try:
json.loads(value)
except ValueError as e: # noqa F841
return False
return True
detectable_format_checker = {
'constrained_response': {
'function': constrained_response_checker,
'required_lang_code': True,
'num_of_params': 2
},
'json_format': {
'function': json_format_checker,
'required_lang_code': False,
'num_of_params': 1
},
'number_bullet_lists': {
'function': number_bullet_lists_checker,
'required_lang_code': False,
        'num_of_params': 2
},
'number_highlighted_sections': {
'function': number_highlighted_sections_checker,
'required_lang_code': False,
'num_of_params': 2
},
'title': {
'function': title_checker,
'required_lang_code': False,
'num_of_params': 1
}
}
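# Example (sketch): json_format_checker('```json\n{"a": 1}\n```') returns True,
# because the code-fence markers are stripped before json.loads is attempted.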


@ -0,0 +1,12 @@
def forbidden_words_checker(input_string: str, forbidden_words: list,
**kwargs):
return not any(word in input_string for word in forbidden_words)
keywords_checker = {
'forbidden_words': {
'function': forbidden_words_checker,
'required_lang_code': False,
'num_of_params': 2
},
}


@ -0,0 +1,93 @@
import re
def nth_paragraph_first_word_checker(input_string: str, num_paragraphs: int,
nth_paragraph: int, first_word: str,
lang_code: str, **kwargs):
paragraphs = re.split(r'\n\n', input_string)
paragraphs = list(paragraph.strip() for paragraph in paragraphs
if paragraph.strip() != '')
if len(paragraphs) < num_paragraphs:
return False
if len(paragraphs) < nth_paragraph:
return False
    paragraph = paragraphs[nth_paragraph - 1].strip()
    if paragraph.lower().startswith(first_word.lower()):
return True
else:
return False
def number_paragraphs_checker(input_string: str, num_paragraphs: int,
**kwargs):
paragraphs = re.split(r'\s?\*\*\*\s?', input_string)
paragraphs = list(paragraph.strip() for paragraph in paragraphs
if paragraph.strip() != '')
return len(paragraphs) == num_paragraphs
def number_sentences_checker(input_string: str, relation: str,
num_sentences: int, lang_code: str, **kwargs):
sentences = list(x.strip() for x in input_string.strip().split('\n'))
sentences = list(x for x in sentences if x != '')
if relation == 'less than':
if len(sentences) <= num_sentences:
return True
else:
return False
elif relation == 'at least':
if len(sentences) >= num_sentences:
return True
else:
return False
def number_words_checker(input_string: str, relation: str, num_words: int,
lang_code: str, **kwargs):
if lang_code in ['en', 'es', 'fr', 'in', 'pt', 'ru', 'vi']:
words = input_string.split()
words = list(x for x in words if x != '')
else:
words = ''.join(input_string.split())
if relation == 'less than':
if len(words) <= num_words:
return True
else:
return False
elif relation == 'at least':
if len(words) >= num_words:
return True
else:
return False
length_constraints_checker = {
'nth_paragraph_first_word': {
'function': nth_paragraph_first_word_checker,
'required_lang_code': True,
'num_of_params': 5
},
'number_paragraphs': {
'function': number_paragraphs_checker,
'required_lang_code': False,
'num_of_params': 2
},
'number_sentences': {
'function': number_sentences_checker,
'required_lang_code': True,
'num_of_params': 3
},
'number_words': {
'function': number_words_checker,
'required_lang_code': True,
'num_of_params': 4
}
}


@ -0,0 +1,30 @@
import re
comma_unicode = {
'ar': re.compile(r'[\u060C]'),
'es': re.compile(r'[,\uFF0C]'),
'fr': re.compile(r'[,\u2026]'),
'ja': re.compile(r'[,\u3001]'),
'ko': re.compile(r'[,]'),
'pt': re.compile(r'[,\uFF0C]'),
'th': re.compile(r'[\u0E25]'),
'vi': re.compile(r'[,\uFF0C]'),
'en': re.compile(r'[,]'),
'zh': re.compile(r'[,]')
}
def no_comma_checker(input_string: str, lang_code: str, **kwargs):
if len(comma_unicode[lang_code].findall(input_string)) > 0:
return False
else:
return True
punctuation_checker = {
'no_comma': {
'function': no_comma_checker,
'required_lang_code': True,
'num_of_params': 2
}
}


@ -0,0 +1,38 @@
def end_checker_checker(input_string: str, end_phrase: str, **kwargs):
if input_string.strip().endswith(end_phrase):
return True
else:
return False
def quotation_checker(input_string: str, lang_code: str, **kwargs):
input_string = input_string.strip()
if input_string.startswith('"') and input_string.endswith('"'):
return True
elif lang_code in [
'ar', 'es', 'fr', 'pt', 'ru'
] and input_string.startswith('«') and input_string.endswith('»'):
return True
    elif lang_code in [
            'ar', 'es', 'fr', 'ko', 'pt', 'th', 'vi', 'zh'
    ] and input_string.startswith('“') and input_string.endswith('”'):
        return True
    elif lang_code == 'ja' and input_string.startswith(
            '「') and input_string.endswith('」'):
        return True
else:
return False
startend_checker = {
'end_checker': {
'function': end_checker_checker,
'required_lang_code': False,
'num_of_params': 2
},
'quotation': {
'function': quotation_checker,
'required_lang_code': True,
'num_of_params': 2
}
}


@ -0,0 +1,152 @@
import json
import os
import re
from typing import Tuple
from datasets import Dataset
from opencompass.datasets.base import BaseDataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from opencompass.utils import get_data_path
langs_dict = {
'fr': ['La réponse est', 'la réponse est'],
'en': ['the answer is', 'The answer is'],
'vi': ['Câu trả lời là', 'câu trả lời là'],
'ar': ['الجواب هو'],
'th': ['คำตอบคือ'],
'zh': ['答案是'],
'ko': ['답변은'],
'pt': ['A resposta é'],
'ja': ['答えは'],
'es': ['La respuesta es']
}
def extract_choice(gen, lang):
r"""
{
"answer": "A|B|C|D"
}
"""
patterns = [
r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}",
r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}",
r"\"answer\"\s*:\s*\"?(A|B|C|D)\"?",
r"[\'\"]answer[\'\"]\s*:\s*[\'\"](A|B|C|D)[\'\"]"
]
for pattern in patterns:
res = re.findall(pattern, gen, flags=re.DOTALL)
if len(res) >= 1:
return res[-1]
else:
res = None
pattern = langs_dict[lang]
for p in pattern:
if p in gen and p != gen:
res = gen.split(p)
if len(res) > 1 and len(res[-1].strip()) > 0:
res = res[-1].strip()[0]
else:
res = None
break
temp = ['A', 'B', 'C', 'D', 'a', 'b', 'c', 'd']
if res in temp:
return res
else:
return None
def extract_choice_fuzzy(gen):
options = ['A', 'B', 'C', 'D']
for option in options:
if option in gen:
return option
return None
@TEXT_POSTPROCESSORS.register_module('pmmeval_mlogiqa')
def pmmeval_mlogiqa_postprocess(text: str, lang_code: str) -> Tuple[str, str]:
return text, lang_code
@LOAD_DATASET.register_module()
class PMMEvalMLogiQADataset(BaseDataset):
@staticmethod
def load(path: str, lang: str):
data_path = get_data_path(path)
if os.environ.get('DATASET_SOURCE') == 'ModelScope':
from modelscope import MsDataset
dataset = MsDataset.load(dataset_name=data_path,
subset_name='mlogiqa',
split=f'test/{lang}')
else:
dataset = list()
filename = os.path.join(data_path, f'mlogiqa/test/{lang}.jsonl')
with open(filename, mode='r', encoding='utf-8') as f:
for line in f:
line = json.loads(line.strip())
dataset.append(line)
dataset = Dataset.from_list(dataset)
return dataset
class PMMEvalMLogiQAEvaluator(BaseEvaluator):
def score(self, predictions, references):
assert len(predictions) == len(references)
all_results = list()
for (pred, lang), ref in zip(predictions, references):
answer = chr(int(ref) + 65)
            choice = extract_choice(pred, lang)
            acc = 0
            failed_strict = 0
            failed = 1
            if choice is not None:
                failed = 0
                if answer.lower() == choice.lower():
acc = 1
else:
acc = 0
else:
pred_fuzzy = extract_choice_fuzzy(pred)
if pred_fuzzy is None:
acc = 0
failed_strict = 1
else:
failed_strict = 0
if answer.lower() == pred_fuzzy.lower():
acc = 1
else:
acc = 0
all_results.append({
'acc':
float(acc),
'failed':
float(failed),
'failed_strict':
float(failed_strict),
'extracted_answer':
pred if pred else 'no answer',
})
final_result = {
'accuracy':
round(
sum(x['acc'] for x in all_results) / len(all_results) * 100,
2),
'details':
all_results
}
return final_result


@ -0,0 +1,157 @@
import json
import os
import re
from typing import Tuple
from datasets import Dataset
from opencompass.datasets.base import BaseDataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from opencompass.utils import get_data_path
langs_dict = {
'FR-FR': ['La réponse est', 'la réponse est'],
'EN-US': ['the answer is', 'The answer is'],
'VI-VT': ['Câu trả lời là', 'câu trả lời là'],
'AR-XY': ['الجواب هو'],
'TH-TL': ['คำตอบคือ'],
'ZH-CN': ['答案是'],
'KO-KR': ['답변은'],
'PT-BR': ['A resposta é'],
'JA-JP': ['答えは'],
'ES-LA': ['La respuesta es']
}
def extract_choice(gen, lang):
r"""
{
"answer": "A|B|C|D"
}
"""
patterns = [
r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}",
r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}",
r"\"answer\"\s*:\s*\"?(A|B|C|D)\"?",
r"[\'\"]answer[\'\"]\s*:\s*[\'\"](A|B|C|D)[\'\"]"
]
for pattern in patterns:
res = re.findall(pattern, gen, flags=re.DOTALL)
if len(res) >= 1:
return res[-1]
else:
res = None
pattern = langs_dict[lang]
for p in pattern:
if p in gen and p != gen:
res = gen.split(p)
if len(res) > 1 and len(res[-1].strip()) > 0:
res = res[-1].strip()[0]
else:
res = None
break
temp = ['A', 'B', 'C', 'D', 'a', 'b', 'c', 'd']
if res in temp:
return res
else:
return None
def extract_choice_fuzzy(gen):
options = ['A', 'B', 'C', 'D']
for option in options:
if option in gen:
return option
return None
@TEXT_POSTPROCESSORS.register_module('pmmeval_mmmlu')
def pmmeval_mmmlu_postprocess(text: str, lang_code: str) -> Tuple[str, str]:
return text, lang_code
@LOAD_DATASET.register_module()
class PMMEvalMMMLUDataset(BaseDataset):
@staticmethod
def load(path: str, lang: str, difficulty: str):
assert difficulty in [
'easy', 'hard', 'all'
], '`difficulty` should be one choice among "easy", "hard", and "all"!'
data_path = get_data_path(path)
if os.environ.get('DATASET_SOURCE') == 'ModelScope':
dataset_list = list()
from modelscope import MsDataset
if difficulty == 'easy' or difficulty == 'all':
dataset_list.append(
MsDataset.load(dataset_name=data_path,
subset_name='mmmlu',
split=f'easy/test/mmlu_{lang}'))
if difficulty == 'hard' or difficulty == 'all':
dataset_list.append(
MsDataset.load(dataset_name=data_path,
subset_name='mmmlu',
split=f'hard/test/mmlu_{lang}'))
            # TODO: combine the two splits
dataset = dataset_list[0] + dataset_list[1] if len(
dataset_list) == 2 else dataset_list[0]
else:
dataset = list()
if difficulty == 'easy' or difficulty == 'all':
filename = os.path.join(data_path,
f'mmmlu/easy/test/mmlu_{lang}.jsonl')
with open(filename, mode='r', encoding='utf-8') as f:
for line in f:
line = json.loads(line.strip())
dataset.append(line)
if difficulty == 'hard' or difficulty == 'all':
filename = os.path.join(data_path,
f'mmmlu/hard/test/mmlu_{lang}.jsonl')
with open(filename, mode='r', encoding='utf-8') as f:
for line in f:
line = json.loads(line.strip())
dataset.append(line)
dataset = Dataset.from_list(dataset)
return dataset
class PMMEvalMMMLUEvaluator(BaseEvaluator):
def score(self, predictions, references):
all_results = list()
for (pred, lang), ref in zip(predictions, references):
answer = extract_choice(pred, lang)
if answer is None:
answer = extract_choice_fuzzy(pred)
if answer is None:
acc = 0.0
failed = 1.0
else:
acc = 1.0 if ref.lower() == answer.lower() else 0.0
failed = 0.0
all_results.append({
'acc':
acc,
'failed':
failed,
'extracted_answer':
pred if pred else 'no answer'
})
final_result = {
'accuracy':
round(
sum(x['acc'] for x in all_results) / len(all_results) * 100,
2),
'details':
all_results
}
return final_result


@ -0,0 +1,150 @@
import json
import os
import re
from typing import Tuple
from datasets import Dataset
from opencompass.datasets.base import BaseDataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from opencompass.utils import get_data_path
langs_dict = {
'fr': ['La réponse est', 'la réponse est'],
'en': ['the answer is', 'The answer is'],
'vi': ['Câu trả lời là', 'câu trả lời là'],
'ar': ['الجواب هو'],
'th': ['คำตอบคือ'],
'zh': ['答案是'],
'ko': ['답변은'],
'pt': ['A resposta é'],
'ja': ['答えは'],
'id': ['Jawaban adalah', 'jawaban adalah'],
'es': ['La respuesta es']
}
def extract_choice(gen, lang):
r"""
{
"answer": "A|B|C|D"
}
"""
patterns = [
r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}",
r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}",
r"\"answer\"\s*:\s*\"?(A|B|C|D)\"?",
r"[\'\"]answer[\'\"]\s*:\s*[\'\"](A|B|C|D)[\'\"]"
]
for pattern in patterns:
res = re.findall(pattern, gen, flags=re.DOTALL)
if len(res) >= 1:
return res[-1]
else:
res = None
pattern = langs_dict[lang]
for p in pattern:
if p in gen and p != gen:
res = gen.split(p)
if len(res) > 1 and len(res[-1].strip()) > 0:
res = res[-1].strip()[0]
else:
res = None
break
temp = ['A', 'B', 'C', 'D', 'a', 'b', 'c', 'd']
if res in temp:
return res
else:
return None
def extract_choice_fuzzy(gen, lang):
    options = ['A', 'B', 'C', 'D']  # candidate options
    for option in options:
        if option in gen:  # check whether the option letter appears in the text
            return option  # return the first option that occurs
return None
@TEXT_POSTPROCESSORS.register_module('pmmeval_xnli')
def pmmeval_xnli_postprocess(text: str, lang_code: str) -> Tuple[str, str]:
return text, lang_code
@LOAD_DATASET.register_module()
class PMMEvalXNLIDataset(BaseDataset):
@staticmethod
def load(path: str, lang: str):
data_path = get_data_path(path)
if os.environ.get('DATASET_SOURCE') == 'ModelScope':
from modelscope import MsDataset
dataset = MsDataset.load(dataset_name=data_path,
subset_name='xnli',
split=f'test/{lang}')
else:
dataset = list()
filename = os.path.join(data_path, f'xnli/test/{lang}.jsonl')
with open(filename, mode='r', encoding='utf-8') as f:
for line in f:
line = json.loads(line.strip())
dataset.append(line)
dataset = Dataset.from_list(dataset)
return dataset
class PMMEvalXNLIEvaluator(BaseEvaluator):
def score(self, predictions, references):
assert len(predictions) == len(references)
all_results = list()
for (pred, lang), ref in zip(predictions, references):
choice = extract_choice(pred, lang)
acc = 0
failed_strict = 0
failed = 1
if choice is not None:
failed = 0
if ref.lower() == choice.lower():
acc = 1
else:
acc = 0
else:
choice = extract_choice_fuzzy(pred, lang)
if choice is None:
acc = 0
failed_strict = 1
else:
failed_strict = 0
if ref.lower() == choice.lower():
acc = 1
else:
acc = 0
all_results.append({
'acc':
float(acc),
'failed':
float(failed),
'failed_strict':
float(failed_strict),
'extracted_answer':
choice if choice else 'no answer',
})
final_result = {
'accuracy':
round(
sum(x['acc'] for x in all_results) / len(all_results) * 100,
2),
'details':
all_results
}
return final_result


@ -343,6 +343,11 @@ DATASETS_MAPPING = {
"hf_id": "",
"local": "./data/babilong/data/",
},
"P-MMEval": {
"ms_id": "",
"hf_id": "",
"local": "./data/P-MMEval/",
},
"opencompass/arc_prize_public_evaluation": {
"ms_id": "",
"hf_id": "",
@ -530,7 +535,7 @@ DATASETS_URL = {
"/cmo": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmo.zip",
"md5": "fad52c81290506a8ca74f46b5400d8fc",
},
},
"/nq-open": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/nq-open.zip",
"md5": "a340521e5c9ec591227dcb367f718b25",
@ -566,5 +571,9 @@ DATASETS_URL = {
"/arc_prize_public_evaluation": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/arc_prize_public_evaluation.zip",
"md5": "367a33977651496efddba7670009807e"
},
"P-MMEval": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/pmmeval.zip",
"md5": "589c8be1551a609d94231f1410cf22eb",
}
}
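# How these entries are consumed (sketch): the dataset configs above pass
# path='P-MMEval'; get_data_path resolves that key through DATASETS_MAPPING to
# ./data/P-MMEval/, and the DATASETS_URL entry lets the pmmeval.zip archive be
# downloaded and md5-checked automatically when no local copy is present.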