diff --git a/opencompass/configs/datasets/internsandbox/internsandbox_gen.py b/opencompass/configs/datasets/internsandbox/internsandbox_gen.py
new file mode 100644
index 00000000..1af0955c
--- /dev/null
+++ b/opencompass/configs/datasets/internsandbox/internsandbox_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .internsandbox_gen_44b982 import internsandbox_datasets
\ No newline at end of file
diff --git a/opencompass/configs/datasets/internsandbox/internsandbox_gen_44b982.py b/opencompass/configs/datasets/internsandbox/internsandbox_gen_44b982.py
new file mode 100644
index 00000000..368189a5
--- /dev/null
+++ b/opencompass/configs/datasets/internsandbox/internsandbox_gen_44b982.py
@@ -0,0 +1,59 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import InternSandboxDataset, InternSandboxEvaluator
+
+
+_SANDBOXS_ = ['aquarium', 'arc', 'arrowmaze', 'bbehboardgameqa', 'bbehbooleanexpressions', 'BbehDyckLanguages', 'BbehGeometricShapes', 'BbehMultistepArithmetic', 'bbehobjectcounting', 'bbehobjectproperties', 'bbehshuffobject', 'BbehWebOfLies', 'BbehWordSorting', 'binairo', 'calcudoku', 'campsite', 'cipher', 'cryptomath', 'dominosa', 'futoshiki', 'galaxies', 'game24', 'kakurasu', 'korLogicAnalogicalReasoning', 'korLogicCanonicalPropositions', 'korLogicCooperativePrinciple', 'korLogicDefinitions', 'korLogicDerivativeReasoningOfPropositionalLogic', 'korLogicDisjunctiveNormalFormAndConjunctiveNormalForm', 'korLogicDynamicLogic', 'korLogicEnumerativeInductiveReasoning', 'korLogicEpistemicLogic', 'korLogicEquivalenceCalculus', 'korLogicFigureOfTheSyllogism', 'korLogicFormalFallacies', 'korLogicInductionParadox', 'korLogicLogicalMethodsForExploringCauseAndEffectRelationships', 'korLogicPredicateLogicFormalization', 'korLogicPropositionalLogicConcepts', 'korLogicPropositionalLogicFormalization', 'korLogicResolution', 'korLogicSpeechActs', 'korLogicStatisticalReasoning', 'korLogicTemporalPropositions', 'korLogicTruthValueModalPropositions', 'korOperationUnicode20ac', 'korOperationUnicode2295', 'korOperationUnicode25a0', 'korOperationUnicode25a1', 'korOperationUnicode25b3', 'korOperationUnicode25bd', 'korOperationUnicode25cb', 'korOperationUnicode25ce', 'korOperationUnicode25cf', 'korOperationUnicode2605', 'korOperationUnicodeffe0', 'korOperationUnicodeffe1', 'korPuzzle24Points', 'korPuzzleArrowMaze', 'korPuzzleCalcudoko', 'korPuzzleCampsite', 'korPuzzleConnectWords', 'korPuzzleCryptoMath', 'korPuzzleKukurasu', 'korPuzzleLogicPuzzle', 'korPuzzleSkyscrapers', 'korPuzzleWordBrainTeasers', 'korPuzzleWordLadder', 'korPuzzleWordRootsAndAffixes', 'korPuzzleWordscapes', 'korPuzzleWordSearch', 'LightUp', 'maze', 'minesweeper', 'nonograms', 'starbattle', 'stitches', 'sudoku', 'tents', 'thermometers']
+
+internsandbox_reader_cfg = dict(
+    input_columns=['prompt'],
+    output_column='ground_truth'
+)
+
+internsandbox_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(
+                    role='SYSTEM',
+                    fallback_role='HUMAN',
+                    prompt='You are a helpful assistant.',
+                )
+            ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                ),
+            ],
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer)
+)
+
+internsandbox_eval_cfg = {
+    sandbox: dict(
+        evaluator=dict(
+            type=InternSandboxEvaluator,
+            short_penalty=False,
+            format_penalty=False,
+        ),
+        pred_role='BOT',
+    ) for sandbox in _SANDBOXS_
+}
+
+internsandbox_datasets = [
+    dict(
+        type=InternSandboxDataset,
+        abbr=f'internsandbox-{sandbox}',
+        path='./data/InternSandboxBenchmark_verified_V0.3.1/',
+        local_mode=True,
+        sandbox=sandbox,
+        reader_cfg=internsandbox_reader_cfg,
+        infer_cfg=internsandbox_infer_cfg,
+        eval_cfg=internsandbox_eval_cfg[sandbox],
+    ) for sandbox in _SANDBOXS_
+]
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index b00162d1..a7c037cf 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -68,6 +68,7 @@ from .hungarian_math import *  # noqa: F401, F403
 from .IFEval.ifeval import IFEvalDataset, IFEvaluator  # noqa: F401, F403
 from .inference_ppl import InferencePPLDataset  # noqa: F401, F403
 from .infinitebench import *  # noqa: F401, F403
+from .internsandbox import *  # noqa: F401, F403
 from .iwslt2017 import *  # noqa: F401, F403
 from .jigsawmultilingual import *  # noqa: F401, F403
 from .jsonl import JsonlDataset  # noqa: F401, F403
diff --git a/opencompass/datasets/internsandbox.py b/opencompass/datasets/internsandbox.py
index 1bdb5e2c..c71cc3f7 100644
--- a/opencompass/datasets/internsandbox.py
+++ b/opencompass/datasets/internsandbox.py
@@ -1,5 +1,12 @@
+import importlib
+import json
+import os.path as osp
+
+from datasets import Dataset
+
 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
+from opencompass.utils import get_data_path
 
 from .base import BaseDataset
 
@@ -8,12 +15,64 @@ from .base import BaseDataset
 class InternSandboxDataset(BaseDataset):
 
     @staticmethod
-    def load(path: str, local_mode: bool = False):
-        pass
+    def load(path: str, sandbox: str, local_mode: bool = False):
+        path = get_data_path(path, local_mode=local_mode)
+        file_path = osp.join(path, f'{sandbox}.jsonl')
+        data = []
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                origin_data = json.loads(line)
+                origin_data['ground_truth'] = json.dumps(
+                    origin_data['ground_truth'])
+                data.append(origin_data)
+        return Dataset.from_list(data)
 
 
 @ICL_EVALUATORS.register_module()
-class MBPPEvaluator(BaseEvaluator):
+class InternSandboxEvaluator(BaseEvaluator):
+
+    def __init__(self,
+                 short_penalty: bool = False,
+                 format_penalty: bool = False):
+        super().__init__()
+        self.short_penalty = short_penalty
+        self.format_penalty = format_penalty
 
     def score(self, predictions, references, test_set):
-        pass
+
+        if len(predictions) != len(references):
+            return {
+                'error':
+                'predictions and references have different '
+                f'length. len(predictions): {len(predictions)}, '
+                f'len(references): {len(references)}'
+            }
+
+        class_name = f"{test_set[0]['data_source']}Sandbox"
+
+        details = []
+        for pred, ref, ts in zip(predictions, references, test_set):
+            ref = json.loads(ref)
+            module = importlib.import_module('intern_sandbox')
+            score = getattr(module, class_name).verify_score(
+                pred,
+                ref,
+                short_penalty=self.short_penalty,
+                format_penalty=self.format_penalty)
+            try:
+                extracted = getattr(module, class_name).extract_output(pred)
+            except:  # noqa: E722
+                extracted = None
+
+            res = {
+                'prompt': ts['prompt'],
+                'score': score,
+                'extracted_output': extracted,
+                'ground_truth': ref,
+                'output': pred,
+            }
+            details.append(res)
+
+        avg_score = sum(r['score'] for r in details) / len(details)
+        results = {'accuracy': avg_score, 'details': details}
+        return results
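
For reference, a minimal sketch of how the per-sandbox datasets added above could be pulled into a run config. This file is not part of the diff: the model import is a placeholder (any model config shipped with the repo would do), and it assumes the benchmark data is laid out as one {sandbox}.jsonl file per sandbox under ./data/InternSandboxBenchmark_verified_V0.3.1/, each line carrying at least the 'prompt', 'ground_truth', and 'data_source' fields consumed by the loader and evaluator.

# eval_internsandbox.py -- hypothetical run config, not part of this diff
from mmengine.config import read_base

with read_base():
    # One dataset dict per entry in _SANDBOXS_ (see the config above).
    from opencompass.configs.datasets.internsandbox.internsandbox_gen import \
        internsandbox_datasets
    # Placeholder model config; substitute any model the repo provides.
    from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
        models

datasets = internsandbox_datasets

# Launched with: python run.py eval_internsandbox.py

Results would then appear as one internsandbox-<sandbox> row per task. Note that the evaluator resolves its scorer dynamically as f"{data_source}Sandbox" via importlib.import_module('intern_sandbox'), so the external intern_sandbox package must be importable at evaluation time.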