OpenCompass/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen.py
Dongsheng Zhu 8a5029b121
[Feature] Add MultiPL-E & Code Evaluator (#1963)
* multiple_code develop

* multiple_code update

* comments upadate

* index upadate
2025-03-21 20:09:25 +08:00

57 lines
1.8 KiB
Python

# Select the 10 most popular programming languages from MultiPL-E to compose the test set.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MultiplEDataset, MultiplEEvaluator
_TOP_TEN_LANGUAGE_ = ['cpp', 'cs', 'go', 'java', 'rb', 'js', 'php', 'r', 'rs', 'sh']
multiple_reader_cfg = dict(input_columns=['language', 'prompt'], output_column='tests')
multiple_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template='Based on the provided {language} code snippet, complete the subsequent content. The initial part of the completed code must match the provided code snippet exactly:\n{prompt}'),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
multiple_eval_cfg = {
lang: dict(
evaluator=dict(
type=MultiplEEvaluator,
language=lang,
ip_address='https://opencompass-multiple-evaluator.hf.space',
),
pred_role='BOT',
) for lang in _TOP_TEN_LANGUAGE_
}
multiple_datasets = [
dict(
type=MultiplEDataset,
abbr=f'humaneval-multiple-{lang}',
language=lang,
num_repeats=1,
path='opencompass/multipl_e',
tag='humaneval',
reader_cfg=multiple_reader_cfg,
infer_cfg=multiple_infer_cfg,
eval_cfg=multiple_eval_cfg[lang],
) for lang in _TOP_TEN_LANGUAGE_
]
multiple_datasets += [
dict(
type=MultiplEDataset,
abbr=f'mbpp-multiple-{lang}',
language=lang,
num_repeats=1,
path='opencompass/multipl_e',
tag='mbpp',
reader_cfg=multiple_reader_cfg,
infer_cfg=multiple_infer_cfg,
eval_cfg=multiple_eval_cfg[lang],
) for lang in _TOP_TEN_LANGUAGE_
]