mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Feature] Support InternSandbox (#2049)
* internsandbox init * internsandbox * dataset_index * dataset_index_add
This commit is contained in:
parent
43b2c4ed76
commit
ba0e32292c
@ -1023,3 +1023,9 @@
|
|||||||
paper: https://arxiv.org/pdf/2402.09391
|
paper: https://arxiv.org/pdf/2402.09391
|
||||||
configpath: opencompass/configs/datasets/SmolInstruct/smolinstruct_gen.py
|
configpath: opencompass/configs/datasets/SmolInstruct/smolinstruct_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
|
- internsandbox:
|
||||||
|
name: InternSandbox
|
||||||
|
category: Reasoning/Code/Agent
|
||||||
|
paper: ''
|
||||||
|
configpath: opencompass/configs/datasets/internsandbox/internsandbox_gen_44b982.py
|
||||||
|
configpath_llmjudge: ''
|
@ -0,0 +1,4 @@
|
|||||||
|
from mmengine.config import read_base
|
||||||
|
|
||||||
|
# Import the hash-pinned InternSandbox config inside `read_base()` so that
# `internsandbox_datasets` becomes available from this module as well.
with read_base():
    from .internsandbox_gen_44b982 import internsandbox_datasets  # noqa: F401
|
@ -0,0 +1,59 @@
|
|||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
from opencompass.datasets import InternSandboxDataset, InternSandboxEvaluator
|
||||||
|
|
||||||
|
|
||||||
|
# Names of the individual sandboxes. Each entry is used verbatim both as the
# dataset abbreviation suffix and as the `sandbox` argument of the dataset
# entries built from this list, so spelling and letter case must be kept.
_SANDBOXS_ = [
    'aquarium',
    'arc',
    'arrowmaze',
    'bbehboardgameqa',
    'bbehbooleanexpressions',
    'BbehDyckLanguages',
    'BbehGeometricShapes',
    'BbehMultistepArithmetic',
    'bbehobjectcounting',
    'bbehobjectproperties',
    'bbehshuffobject',
    'BbehWebOfLies',
    'BbehWordSorting',
    'binairo',
    'calcudoku',
    'campsite',
    'cipher',
    'cryptomath',
    'dominosa',
    'futoshiki',
    'galaxies',
    'game24',
    'kakurasu',
    'korLogicAnalogicalReasoning',
    'korLogicCanonicalPropositions',
    'korLogicCooperativePrinciple',
    'korLogicDefinitions',
    'korLogicDerivativeReasoningOfPropositionalLogic',
    'korLogicDisjunctiveNormalFormAndConjunctiveNormalForm',
    'korLogicDynamicLogic',
    'korLogicEnumerativeInductiveReasoning',
    'korLogicEpistemicLogic',
    'korLogicEquivalenceCalculus',
    'korLogicFigureOfTheSyllogism',
    'korLogicFormalFallacies',
    'korLogicInductionParadox',
    'korLogicLogicalMethodsForExploringCauseAndEffectRelationships',
    'korLogicPredicateLogicFormalization',
    'korLogicPropositionalLogicConcepts',
    'korLogicPropositionalLogicFormalization',
    'korLogicResolution',
    'korLogicSpeechActs',
    'korLogicStatisticalReasoning',
    'korLogicTemporalPropositions',
    'korLogicTruthValueModalPropositions',
    'korOperationUnicode20ac',
    'korOperationUnicode2295',
    'korOperationUnicode25a0',
    'korOperationUnicode25a1',
    'korOperationUnicode25b3',
    'korOperationUnicode25bd',
    'korOperationUnicode25cb',
    'korOperationUnicode25ce',
    'korOperationUnicode25cf',
    'korOperationUnicode2605',
    'korOperationUnicodeffe0',
    'korOperationUnicodeffe1',
    'korPuzzle24Points',
    'korPuzzleArrowMaze',
    'korPuzzleCalcudoko',
    'korPuzzleCampsite',
    'korPuzzleConnectWords',
    'korPuzzleCryptoMath',
    'korPuzzleKukurasu',
    'korPuzzleLogicPuzzle',
    'korPuzzleSkyscrapers',
    'korPuzzleWordBrainTeasers',
    'korPuzzleWordLadder',
    'korPuzzleWordRootsAndAffixes',
    'korPuzzleWordscapes',
    'korPuzzleWordSearch',
    'LightUp',
    'maze',
    'minesweeper',
    'nonograms',
    'starbattle',
    'stitches',
    'sudoku',
    'tents',
    'thermometers',
]
|
||||||
|
|
||||||
|
# Reader settings: the `prompt` column is the model input and the
# `ground_truth` column is the reference handed to the evaluator.
internsandbox_reader_cfg = {
    'input_columns': ['prompt'],
    'output_column': 'ground_truth',
}
|
||||||
|
|
||||||
|
# Inference settings: a fixed system message (declared with
# fallback_role='HUMAN') followed by a single HUMAN turn whose text is the
# `{prompt}` placeholder, combined with ZeroRetriever and GenInferencer.
internsandbox_infer_cfg = {
    'prompt_template': {
        'type': PromptTemplate,
        'template': {
            'begin': [
                {
                    'role': 'SYSTEM',
                    'fallback_role': 'HUMAN',
                    'prompt': 'You are a helpful assistant.',
                },
            ],
            'round': [
                {
                    'role': 'HUMAN',
                    'prompt': '{prompt}',
                },
            ],
        },
    },
    'retriever': {'type': ZeroRetriever},
    'inferencer': {'type': GenInferencer},
}
|
||||||
|
|
||||||
|
# One evaluation config per sandbox name, keyed by that name. Every entry is
# a separate dict object with the same content: InternSandboxEvaluator with
# both penalties switched off, reading predictions from the BOT role.
# A comprehension is used so that the loop variable does not become a
# module-level name of this config file.
internsandbox_eval_cfg = {
    name: {
        'evaluator': {
            'type': InternSandboxEvaluator,
            'short_penalty': False,
            'format_penalty': False,
        },
        'pred_role': 'BOT',
    }
    for name in _SANDBOXS_
}
|
||||||
|
|
||||||
|
# One dataset entry per sandbox name, in the order of `_SANDBOXS_`. All
# entries share the same reader/infer configs and data directory; only the
# abbreviation, the `sandbox` argument and the eval config vary by name.
internsandbox_datasets = [
    {
        'type': InternSandboxDataset,
        'abbr': f'internsandbox-{name}',
        'path': './data/InternSandboxBenchmark_verified_V0.3.1/',
        'local_mode': True,
        'sandbox': name,
        'reader_cfg': internsandbox_reader_cfg,
        'infer_cfg': internsandbox_infer_cfg,
        'eval_cfg': internsandbox_eval_cfg[name],
    }
    for name in _SANDBOXS_
]
|
@ -68,6 +68,7 @@ from .hungarian_math import * # noqa: F401, F403
|
|||||||
from .IFEval.ifeval import IFEvalDataset, IFEvaluator # noqa: F401, F403
|
from .IFEval.ifeval import IFEvalDataset, IFEvaluator # noqa: F401, F403
|
||||||
from .inference_ppl import InferencePPLDataset # noqa: F401, F403
|
from .inference_ppl import InferencePPLDataset # noqa: F401, F403
|
||||||
from .infinitebench import * # noqa: F401, F403
|
from .infinitebench import * # noqa: F401, F403
|
||||||
|
from .internsandbox import * # noqa: F401, F403
|
||||||
from .iwslt2017 import * # noqa: F401, F403
|
from .iwslt2017 import * # noqa: F401, F403
|
||||||
from .jigsawmultilingual import * # noqa: F401, F403
|
from .jigsawmultilingual import * # noqa: F401, F403
|
||||||
from .jsonl import JsonlDataset # noqa: F401, F403
|
from .jsonl import JsonlDataset # noqa: F401, F403
|
||||||
|
78
opencompass/datasets/internsandbox.py
Normal file
78
opencompass/datasets/internsandbox.py
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
import importlib
|
||||||
|
import json
|
||||||
|
import os.path as osp
|
||||||
|
|
||||||
|
from datasets import Dataset
|
||||||
|
|
||||||
|
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
||||||
|
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
|
||||||
|
from opencompass.utils import get_data_path
|
||||||
|
|
||||||
|
from .base import BaseDataset
|
||||||
|
|
||||||
|
|
||||||
|
@LOAD_DATASET.register_module()
class InternSandboxDataset(BaseDataset):
    """Loader for one InternSandbox task stored as a JSON Lines file."""

    @staticmethod
    def load(path: str, sandbox: str, local_mode: bool = False):
        """Read ``<path>/<sandbox>.jsonl`` into a ``datasets.Dataset``.

        Args:
            path: Dataset directory; resolved through ``get_data_path``.
            sandbox: Sandbox name, used as the file stem of the ``.jsonl``
                file to read.
            local_mode: Forwarded to ``get_data_path``.

        Returns:
            A ``Dataset`` with one row per non-blank line of the file. The
            ``ground_truth`` field of every row is re-serialised to a JSON
            string; ``InternSandboxEvaluator.score`` decodes it again with
            ``json.loads``.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record has no ``ground_truth`` field.
        """
        path = get_data_path(path, local_mode=local_mode)
        file_path = osp.join(path, f'{sandbox}.jsonl')
        data = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Blank lines (e.g. a trailing empty line) carry no record
                # and would make json.loads raise, so skip them.
                if not line.strip():
                    continue
                origin_data = json.loads(line)
                origin_data['ground_truth'] = json.dumps(
                    origin_data['ground_truth'])
                data.append(origin_data)
        return Dataset.from_list(data)
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_EVALUATORS.register_module()
class InternSandboxEvaluator(BaseEvaluator):
    """Score predictions with the verifier classes of ``intern_sandbox``.

    Args:
        short_penalty: Forwarded as ``short_penalty`` to ``verify_score``.
        format_penalty: Forwarded as ``format_penalty`` to ``verify_score``.
    """

    def __init__(self,
                 short_penalty: bool = False,
                 format_penalty: bool = False):
        super().__init__()
        self.short_penalty = short_penalty
        self.format_penalty = format_penalty

    def score(self, predictions, references, test_set):
        """Compute the mean sandbox score over all samples.

        Args:
            predictions: Model outputs, one per sample.
            references: JSON-encoded ground truths, aligned with
                ``predictions``.
            test_set: Rows providing ``data_source`` and ``prompt``. The
                verifier class is chosen from the first row only, so all
                rows are assumed to share one ``data_source``.

        Returns:
            ``{'accuracy': <mean score>, 'details': [...]}`` on success, or
            ``{'error': <message>}`` when the inputs are empty or their
            lengths differ.

        Raises:
            ImportError: If the ``intern_sandbox`` package is not importable.
            AttributeError: If ``intern_sandbox`` has no attribute named
                ``<data_source>Sandbox``.
        """
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }

        # Nothing to score: report it instead of failing on test_set[0] or
        # dividing by zero when averaging.
        if len(predictions) == 0:
            return {'error': 'predictions and references are empty'}

        class_name = f"{test_set[0]['data_source']}Sandbox"

        # The module and the verifier class do not change between samples,
        # so resolve them once before the loop.
        try:
            module = importlib.import_module('intern_sandbox')
        except ImportError as err:
            raise ImportError(
                'InternSandboxEvaluator requires the `intern_sandbox` '
                'package to be importable.') from err
        sandbox_cls = getattr(module, class_name)

        details = []
        for pred, ref, ts in zip(predictions, references, test_set):
            ref = json.loads(ref)
            score = sandbox_cls.verify_score(
                pred,
                ref,
                short_penalty=self.short_penalty,
                format_penalty=self.format_penalty)
            # The extracted answer is only recorded for inspection, so an
            # extraction failure must not abort scoring. Catch Exception
            # rather than everything so that KeyboardInterrupt and
            # SystemExit still propagate.
            try:
                extracted = sandbox_cls.extract_output(pred)
            except Exception:
                extracted = None

            details.append({
                'prompt': ts['prompt'],
                'score': score,
                'extracted_output': extracted,
                'ground_truth': ref,
                'output': pred,
            })

        avg_score = sum(r['score'] for r in details) / len(details)
        return {'accuracy': avg_score, 'details': details}
|
Loading…
Reference in New Issue
Block a user