From fff2d51440010e7e50c6b1de59ac0e6b200e8916 Mon Sep 17 00:00:00 2001 From: Dongsheng Zhu <59612926+Zhudongsheng75@users.noreply.github.com> Date: Tue, 4 Mar 2025 18:49:38 +0800 Subject: [PATCH] [Update] Code evaluation alignment (#1909) * code alignment * update oss md5 * bigcodebench update * lint * lint_ * lint yapf --- .../bigcodebench_full_complete_gen_faf748.py | 58 +++--- .../bigcodebench_full_instruct_gen_8815eb.py | 58 +++--- .../bigcodebench_hard_complete_gen_faf748.py | 42 ++-- .../bigcodebench_hard_instruct_gen_8815eb.py | 42 ++-- .../livecodebench_time_split_gen.py | 132 ++++++++++++ .../datasets/bigcodebench/bigcodebench.py | 14 +- .../datasets/livecodebench/evaluator.py | 46 +++- .../datasets/livecodebench/livecodebench.py | 17 +- opencompass/utils/datasets_info.py | 196 ++++++++++++------ 9 files changed, 405 insertions(+), 200 deletions(-) create mode 100644 opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen.py diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_full_complete_gen_faf748.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_complete_gen_faf748.py index f2edc098..6ae8a218 100644 --- a/opencompass/configs/datasets/bigcodebench/bigcodebench_full_complete_gen_faf748.py +++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_complete_gen_faf748.py @@ -1,53 +1,43 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import ( - BigCodeBenchDataset, - BigCodeBenchEvaluator -) - +from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator) bigcodebench_full_reader_cfg = dict( - input_columns=['complete_prompt'], - output_column='test', + input_columns=['complete_prompt'], + output_column='test', ) - -bigcodebench_full_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin=[dict(role='system', - fallback_role='HUMAN', - prompt='')], - round=[ - dict(role='HUMAN', prompt='{complete_prompt}'), - ] - ) - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=1024) -) +bigcodebench_full_infer_cfg = dict(prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[dict(role='system', fallback_role='HUMAN', prompt='')], + round=[ + dict(role='HUMAN', prompt='{complete_prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, + max_out_len=1024)) bigcodebench_full_eval_cfg = dict( evaluator=dict( type=BigCodeBenchEvaluator, release_version='v0.1.2', eval_type='complete', - remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + remote_execute_api= + 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501 dataset_version='full', ), pred_role='BOT', ) bigcodebench_full_complete_datasets = [ - dict( - abbr='bigcodebench_full_complete', - type=BigCodeBenchDataset, - path='opencompass/bigcodebench', - reader_cfg=bigcodebench_full_reader_cfg, - infer_cfg=bigcodebench_full_infer_cfg, - eval_cfg=bigcodebench_full_eval_cfg, - release_version='v0.1.2' - ) -] \ No newline at end of file + dict(abbr='bigcodebench_full_complete', + type=BigCodeBenchDataset, + path='opencompass/bigcodebench', + reader_cfg=bigcodebench_full_reader_cfg, + infer_cfg=bigcodebench_full_infer_cfg, + eval_cfg=bigcodebench_full_eval_cfg, + release_version='v0.1.2') +] diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_gen_8815eb.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_gen_8815eb.py index 88b410ae..eed4d04d 100644 --- a/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_gen_8815eb.py +++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_gen_8815eb.py @@ -1,53 +1,43 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import ( - BigCodeBenchDataset, - BigCodeBenchEvaluator -) - +from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator) bigcodebench_full_reader_cfg = dict( - input_columns=['instruct_prompt'], - output_column='test', + input_columns=['instruct_prompt'], + output_column='test', ) - -bigcodebench_full_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin=[dict(role='system', - fallback_role='HUMAN', - prompt='')], - round=[ - dict(role='HUMAN', prompt='{instruct_prompt}'), - ] - ) - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=8192) -) +bigcodebench_full_infer_cfg = dict(prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[dict(role='system', fallback_role='HUMAN', prompt='')], + round=[ + dict(role='HUMAN', prompt='{instruct_prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, + max_out_len=8192)) bigcodebench_full_eval_cfg = dict( evaluator=dict( type=BigCodeBenchEvaluator, release_version='v0.1.2', eval_type='instruct', - remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + remote_execute_api= + 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501 dataset_version='full', ), pred_role='BOT', ) bigcodebench_full_instruct_datasets = [ - dict( - abbr='bigcodebench_full_instruct', - type=BigCodeBenchDataset, - path='opencompass/bigcodebench', - reader_cfg=bigcodebench_full_reader_cfg, - infer_cfg=bigcodebench_full_infer_cfg, - eval_cfg=bigcodebench_full_eval_cfg, - release_version='v0.1.2' - ) -] \ No newline at end of file + dict(abbr='bigcodebench_full_instruct', + type=BigCodeBenchDataset, + path='opencompass/bigcodebench', + reader_cfg=bigcodebench_full_reader_cfg, + infer_cfg=bigcodebench_full_infer_cfg, + eval_cfg=bigcodebench_full_eval_cfg, + release_version='v0.1.2') +] diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_complete_gen_faf748.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_complete_gen_faf748.py index c0419774..c411f411 100644 --- a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_complete_gen_faf748.py +++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_complete_gen_faf748.py @@ -1,40 +1,32 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import ( - BigCodeBenchDataset, - BigCodeBenchEvaluator -) - +from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator) bigcodebench_hard_reader_cfg = dict( - input_columns=['complete_prompt'], - output_column='test', + input_columns=['complete_prompt'], + output_column='test', ) - -bigcodebench_hard_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin=[dict(role='system', - fallback_role='HUMAN', - prompt='')], - round=[ - dict(role='HUMAN', prompt='{complete_prompt}'), - ] - ) - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=1024) -) +bigcodebench_hard_infer_cfg = dict(prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[dict(role='system', fallback_role='HUMAN', prompt='')], + round=[ + dict(role='HUMAN', prompt='{complete_prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, + max_out_len=1024)) bigcodebench_hard_eval_cfg = dict( evaluator=dict( type=BigCodeBenchEvaluator, release_version='v0.1.2', eval_type='complete', - remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + remote_execute_api= + 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501 dataset_version='hard', ), pred_role='BOT', @@ -51,4 +43,4 @@ bigcodebench_hard_complete_datasets = [ release_version='v0.1.2', dataset_version='hard', ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_8815eb.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_8815eb.py index 3d1cc82c..7187041e 100644 --- a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_8815eb.py +++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_8815eb.py @@ -1,40 +1,32 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import ( - BigCodeBenchDataset, - BigCodeBenchEvaluator -) - +from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator) bigcodebench_hard_reader_cfg = dict( - input_columns=['instruct_prompt'], - output_column='test', + input_columns=['instruct_prompt'], + output_column='test', ) - -bigcodebench_hard_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin=[dict(role='system', - fallback_role='HUMAN', - prompt='')], - round=[ - dict(role='HUMAN', prompt='{instruct_prompt}'), - ] - ) - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=8192) -) +bigcodebench_hard_infer_cfg = dict(prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[dict(role='system', fallback_role='HUMAN', prompt='')], + round=[ + dict(role='HUMAN', prompt='{instruct_prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, + max_out_len=8192)) bigcodebench_hard_eval_cfg = dict( evaluator=dict( type=BigCodeBenchEvaluator, release_version='v0.1.2', eval_type='instruct', - remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + remote_execute_api= + 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501 dataset_version='hard', ), pred_role='BOT', @@ -51,4 +43,4 @@ bigcodebench_hard_instruct_datasets = [ release_version='v0.1.2', dataset_version='hard', ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen.py b/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen.py new file mode 100644 index 00000000..89bd9eb1 --- /dev/null +++ b/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen.py @@ -0,0 +1,132 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import (LCBCodeGenerationDataset, + LCBCodeExecutionDataset, + LCBTestOutputPredictionDataset, + LCBCodeGenerationEvaluator, + LCBCodeExecutionEvaluator, + LCBTestOutputEvaluator) + +lcb_code_generation_reader_cfg = dict( + input_columns=[ + 'question_content', + 'format_prompt', + ], + # output_column='evaluation_sample', + output_column='question_id', +) + +SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' # noqa: E501 + +prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \ + '### Answer: (use the provided format with backticks)\n\n' + +# Code Generation Tasks +lcb_code_generation_infer_cfg = dict(prompt_template=dict( + type=PromptTemplate, + template=dict(round=[dict(role='HUMAN', prompt=prompt_template)])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +lcb_code_generation_eval_cfg = dict( + evaluator=dict(type=LCBCodeGenerationEvaluator, + num_process_evaluate=4, + timeout=6, + release_version='release_v5', + start_date='2024-08-01', + end_date='2025-02-01'), + pred_role='BOT', +) + +LCBCodeGeneration_dataset = dict( + type=LCBCodeGenerationDataset, + abbr='lcb_code_generation', + path='opencompass/code_generation_lite', + reader_cfg=lcb_code_generation_reader_cfg, + infer_cfg=lcb_code_generation_infer_cfg, + eval_cfg=lcb_code_generation_eval_cfg, + release_version='release_v5', +) + +# Code Execution Dataset +lcb_code_execution_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +lcb_code_execution_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt= + 'You are an expert at Python programming, code execution, test case generation, and fuzzing.' # noqa: E501 + ), + ], + round=[dict(role='HUMAN', prompt='{prompt}')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +lcb_code_execution_eval_cfg = dict( + evaluator=dict(type=LCBCodeExecutionEvaluator, ), + pred_role='BOT', +) + +LCBCodeExecution_dataset = dict( + type=LCBCodeExecutionDataset, + abbr='lcb_code_execution', + path='opencompass/execution-v2', + reader_cfg=lcb_code_execution_reader_cfg, + infer_cfg=lcb_code_execution_infer_cfg, + eval_cfg=lcb_code_execution_eval_cfg, +) + +# TestOuputput Dataset +lcb_test_output_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' # noqa: E501 + +lcb_test_output_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + # begin=[ + # dict( + # role='SYSTEM', + # prompt=system_prompt + # ), + # ], + round=[dict(role='HUMAN', prompt='{prompt}')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +lcb_test_output_eval_cfg = dict( + evaluator=dict(type=LCBTestOutputEvaluator, ), + pred_role='BOT', +) + +LCBTestOutput_dataset = dict( + type=LCBTestOutputPredictionDataset, + abbr='lcb_test_output', + path='opencompass/test_generation', + reader_cfg=lcb_test_output_reader_cfg, + infer_cfg=lcb_test_output_infer_cfg, + eval_cfg=lcb_test_output_eval_cfg, +) + +LCB_datasets = [ + LCBCodeGeneration_dataset, + LCBCodeExecution_dataset, + LCBTestOutput_dataset, +] diff --git a/opencompass/datasets/bigcodebench/bigcodebench.py b/opencompass/datasets/bigcodebench/bigcodebench.py index f347e9e2..9ce3d196 100644 --- a/opencompass/datasets/bigcodebench/bigcodebench.py +++ b/opencompass/datasets/bigcodebench/bigcodebench.py @@ -197,11 +197,21 @@ class BigCodeBenchEvaluator(BaseEvaluator): break except (httpx.ReadTimeout, CancelledError): logger.info('Read timeout error. Retrying in 4s...') - time.sleep(4) + time.sleep(10) if 'pass@1' in pass_at_k.keys(): pass_at_k['pass@1'] *= 100 - dump_results = {'details': results} + dump_results = {'details': self._results_processor(results)} dump_results.update(pass_at_k) return dump_results + + def _results_processor(self, results): + details = [] + for key, value in results['eval'].items(): + if value[0]['status'] == 'pass': + value[0]['correct'] = True + else: + value[0]['correct'] = False + details.append(value[0]) + return details diff --git a/opencompass/datasets/livecodebench/evaluator.py b/opencompass/datasets/livecodebench/evaluator.py index e9fb70d7..65867d47 100644 --- a/opencompass/datasets/livecodebench/evaluator.py +++ b/opencompass/datasets/livecodebench/evaluator.py @@ -146,9 +146,12 @@ def evaluate_generations( with ProcessPoolExecutor( max_workers=1 if debug else num_process_evaluate) as executor: futures = { - executor.submit(evaluate_generations_by_problem, - problem_generations, sample, debug, timeout): - index + executor.submit( + evaluate_generations_by_problem, # noqa: E501 + problem_generations, + sample, + debug, + timeout): index for (problem_generations, sample, debug, timeout), index in inputs } @@ -233,15 +236,27 @@ class LCBCodeGenerationEvaluator(BaseEvaluator): num_process_evaluate, timeout=6, release_version='release_v1', - extractor_version='v1'): + extractor_version='v1', + start_date=None, + end_date=None): super().__init__() self.num_process_evaluate = num_process_evaluate self.timeout = timeout self.dataset = LCBCodeGenerationDataset.load( - release_version=release_version)['test'] + release_version=release_version, + start_date=start_date, + end_date=end_date)['test'] self.extractor_version = extractor_version def score(self, predictions, references): + if len(predictions) != len(references): + return { + 'error': + 'predictions and references have different ' + f'length. len(predictions): {len(predictions)}, ' + f'len(references): {len(references)}' + } + if self.extractor_version == 'v1': predictions = [[extract_code_generation(item)] for item in predictions] @@ -254,19 +269,28 @@ class LCBCodeGenerationEvaluator(BaseEvaluator): evaluation_samples[self.dataset[idx][ 'question_id']] = self.dataset[idx]['evaluation_sample'] - references = [evaluation_samples[item] for item in references] + filtered_predictions = [] + filtered_references = [] + for idx, item in enumerate(references): + if item in self.dataset['question_id']: + filtered_predictions.append(predictions[idx]) + filtered_references.append(item) - references = [{'input_output': item} for item in references] + filtered_references = [ + evaluation_samples[item] for item in filtered_references + ] # noqa: E501 - BaseEvaluator.is_num_equal(predictions, references) + filtered_references = [{ + 'input_output': item + } for item in filtered_references] # noqa: E501 extracted_predictions = {} - for idx, content in enumerate(predictions): + for idx, content in enumerate(filtered_predictions): extracted_predictions[idx] = content metrics, eval_results, final_metadata = codegen_metrics( - references, - predictions, + filtered_references, + filtered_predictions, k_list=[1], num_process_evaluate=self.num_process_evaluate, timeout=self.timeout, diff --git a/opencompass/datasets/livecodebench/livecodebench.py b/opencompass/datasets/livecodebench/livecodebench.py index dbd76d71..9ad3f84c 100644 --- a/opencompass/datasets/livecodebench/livecodebench.py +++ b/opencompass/datasets/livecodebench/livecodebench.py @@ -6,6 +6,7 @@ import json import pickle import zlib from dataclasses import dataclass +from datetime import datetime from enum import Enum from datasets import DatasetDict, load_dataset, load_from_disk @@ -53,7 +54,9 @@ class LCBCodeGenerationDataset(BaseDataset): @staticmethod def load(path: str = 'opencompass/code_generation_lite', local_mode: bool = False, - release_version: str = 'release_v1'): + release_version: str = 'release_v1', + start_date: str = None, + end_date: str = None): def transform(item): # Define the dataitem mapping logic @@ -61,7 +64,7 @@ class LCBCodeGenerationDataset(BaseDataset): # starter_code if item['starter_code']: format_prompt = f'### Format: {CodeGenerationPromptConstants.FORMATTING_MESSAGE_WITH_STARTER_CODE}\n' # noqa: E501 - format_prompt += f"```python\n{item['starter_code']}\n```\n\n" + format_prompt += f"```python\n{item['starter_code']}\n```\n\n" # noqa: Q000, E501 else: format_prompt = f'### Format: {CodeGenerationPromptConstants.FORMATTING_WITHOUT_STARTER_CODE}\n' # noqa: E501 format_prompt += '```python\n# YOUR CODE HERE\n```\n\n' @@ -107,6 +110,16 @@ class LCBCodeGenerationDataset(BaseDataset): dataset = dataset.map(transform) + if start_date is not None: + p_start_date = datetime.strptime(start_date, '%Y-%m-%d') + dataset = dataset.filter( + lambda e: p_start_date <= datetime.fromisoformat(e[ + 'contest_date'])) # noqa: E501 + if end_date is not None: + p_end_date = datetime.strptime(end_date, '%Y-%m-%d') + dataset = dataset.filter(lambda e: datetime.fromisoformat(e[ + 'contest_date']) <= p_end_date) # noqa: E501 + return DatasetDict({'test': dataset, 'train': dataset}) diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 050d5983..79be5736 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -376,7 +376,7 @@ DATASETS_MAPPING = { "opencompass/LiveReasonBench": { "ms_id": "", "hf_id": "", - "local": "./data/LiveReasonBench/", + "local": "./data/LiveReasonBench/", }, "opencompass/bigcodebench": { "ms_id": "", @@ -407,251 +407,313 @@ DATASETS_MAPPING = { DATASETS_URL = { "/OlympiadBench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/OlympiadBench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/OlympiadBench.zip", "md5": "97e8b1ae7f6170d94817288a8930ef00", }, - "/longbenchv2":{ - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/longbenchv2.zip", + "/longbenchv2": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/longbenchv2.zip", "md5": "09b7e06e6f98c5cca8ad597b3d7b42f0", }, "/livestembench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/livestembench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/livestembench.zip", "md5": "0ff59d031c3dcff56a2e00e8c1489f5d", }, "/musr": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/musr.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/musr.zip", "md5": "7447d2a5bec4586035196102135e2af9", }, "/mmlu/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu.zip", "md5": "761310671509a239e41c4b717f7fab9c", }, "/mmmlu_lite": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmmlu_lite.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmmlu_lite.zip", "md5": "a776af1220e1826fd0608eda1bc4425e", }, "/simpleqa": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/simpleqa.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/simpleqa.zip", "md5": "1d83fc2e15798d39cb265c9a3cb5195a", }, "/chinese_simpleqa": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/chinese_simpleqa.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/chinese_simpleqa.zip", "md5": "4bdf854b291fc0ee29da57dc47ac47b5", }, "/gpqa/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/gpqa.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/gpqa.zip", "md5": "2e9657959030a765916f1f2aca29140d", }, "/CHARM/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/CHARM.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/CHARM.zip", "md5": "fdf51e955d1b8e0bb35bc1997eaf37cb", }, "/ifeval/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ifeval.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ifeval.zip", "md5": "64d98b6f36b42e7390c9cef76cace75f", }, "/mbpp/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mbpp.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mbpp.zip", "md5": "777739c90f04bce44096a5bc96c8f9e5", }, "/cmmlu/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmmlu.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmmlu.zip", "md5": "a59f4003d6918509a719ce3bc2a5d5bc", }, "/math/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/math.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/math.zip", "md5": "cb5b4c8378085929e20345174e731fdf", }, "/hellaswag/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/hellaswag.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/hellaswag.zip", "md5": "2b700a02ffb58571c7df8d8d0619256f", }, "/BBH/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/BBH.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/BBH.zip", "md5": "60c49f9bef5148aa7e1941328e96a554", }, "/compass_arena/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/compass_arena.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/compass_arena.zip", "md5": "cd59b54a179d16f2a858b359b60588f6", }, "/TheoremQA/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/TheoremQA.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/TheoremQA.zip", "md5": "f2793b07bc26510d507aa710d9bd8622", }, "/mathbench_v1/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mathbench_v1.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mathbench_v1.zip", "md5": "50257a910ca43d1f61a610a79fdb16b5", }, "/gsm8k/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/gsm8k.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/gsm8k.zip", "md5": "901e5dc93a2889789a469da9850cdca8", }, "/LCBench2023/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/LCBench2023.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/LCBench2023.zip", "md5": "e1a38c94a42ad1809e9e0650476a9306", }, "/humaneval/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humaneval.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humaneval.zip", "md5": "88b1b89dc47b7121c81da6bcd85a69c3", }, "/humanevalx": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humanevalx.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humanevalx.zip", "md5": "22930355c03fb73fb5bae14b50f1deb9", }, "/ds1000_data": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ds1000_data.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ds1000_data.zip", "md5": "1a4990aec04a2fd73ccfad12e2d43b43", }, "/drop_simple_eval/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/drop_simple_eval.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/drop_simple_eval.zip", "md5": "c912afe5b4a63509851cf16e6b91830e", }, "subjective/alignment_bench/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/alignment_bench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/alignment_bench.zip", "md5": "d8ae9a0398526479dbbcdb80fafabceb", }, "subjective/alpaca_eval": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/alpaca_eval.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/alpaca_eval.zip", "md5": "d7399d63cb46c82f089447160ef49b6a", }, "subjective/arena_hard": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/arena_hard.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/arena_hard.zip", "md5": "02cd09a482cb0f0cd9d2c2afe7a1697f", }, "subjective/mtbench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mtbench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mtbench.zip", "md5": "d1afc0787aeac7f1f24872742e161069", }, "subjective/fofo": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/fofo.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/fofo.zip", "md5": "8a302712e425e27e4292a9369df5b9d3", }, "subjective/followbench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/followbench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/followbench.zip", "md5": "da7a831817c969da15d1e78d4a245d8a", }, "subjective/mtbench101": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mtbench101.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mtbench101.zip", "md5": "5d80257bc9929ebe5cfbf6d11184b04c", }, "subjective/WildBench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/wildbench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/wildbench.zip", "md5": "b06252857f1f8f44a17b1bfca4888ff4", }, "/ruler/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ruler.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ruler.zip", "md5": "c60bdfff3d02358067104cc1dea7c0f7", }, "/scicode": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/scicode.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/scicode.zip", "md5": "9c6c64b8c70edc418f713419ea39989c", }, "/commonsenseqa": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/commonsenseqa.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/commonsenseqa.zip", "md5": "c4a82fc07c81ae1462605f5d7fd2bb2e", }, "FewCLUE": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/FewCLUE.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/FewCLUE.zip", "md5": "7976e2bb0e9d885ffd3c55f7c5d4021e", }, "/race": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/race.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/race.zip", "md5": "b758251764a264746cf45749c02363f9", }, "/ARC": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ARC.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ARC.zip", "md5": "d720629b69f1a51cfe78bf65b00b44f6", }, "/SuperGLUE": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/SuperGLUE.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/SuperGLUE.zip", "md5": "b60904915b0b61d1a04ea52280169936", }, "SQuAD2.0": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/SQuAD2.0.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/SQuAD2.0.zip", "md5": "1321cbf9349e1102a57d31d1b2bfdd7e", }, "mmlu_pro": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu_pro.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu_pro.zip", "md5": "e3200c7380f4cea5f13c768f2815fabb", }, "/Longbench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/Longbench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/Longbench.zip", "md5": "ab0cb9e520ae5cfb899bf38b564249bb", }, "/needlebench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/needlebench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/needlebench.zip", "md5": "dad5c903ebfea16eaf186b8997aeedad", }, "/teval": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/teval.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/teval.zip", "md5": "7628ab5891a26bf96ca17becfd044867", }, "/code_generation_lite": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/code_generation_lite.zip", - "md5": "60103a18ca63b05ea06e98d24170f23d", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/code_generation_lite.zip", + "md5": "ebcf8db56f5c817ca8202a542be30cb4", }, "/execution-v2": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/execution-v2.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/execution-v2.zip", "md5": "019ef1a0686ee6ca34f51c8af104fcd9", }, "/test_generation": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/test_generation.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/test_generation.zip", "md5": "918a6ea2b1eee6f2b1314db3c21cb4c7", }, "/aime": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/aime.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/aime.zip", "md5": "fbe2d0577fc210962a549f8cea1a00c8", }, "/cmo": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmo.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmo.zip", "md5": "fad52c81290506a8ca74f46b5400d8fc", - }, + }, "/nq-open": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/nq-open.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/nq-open.zip", "md5": "a340521e5c9ec591227dcb367f718b25", }, "/winogrande": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/winogrande.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/winogrande.zip", "md5": "9e949a75eacc26ed4fd2b9aa870b495b", }, "/triviaqa": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/triviaqa.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/triviaqa.zip", "md5": "e6a118d744236814926b2ec7ec66c034", }, "/GAOKAO-BENCH": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/GAOKAO-BENCH.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/GAOKAO-BENCH.zip", "md5": "ba3c71b8b9db96d2a0664b977c4f9784", }, "/WikiBench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/WikiBench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/WikiBench.zip", "md5": "6dac1d1a3133fe1effff185cbf71d928", }, "/babilong": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/babilong.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/babilong.zip", "md5": "e400864c31bc58d29eaa3e199751f99b", }, "/korbench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/korbench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/korbench.zip", "md5": "9107597d137e7362eaf7d218ddef7a6d", }, "subjective/judgerbench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/judgerbench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/judgerbench.zip", "md5": "60d605883aa8cac9755819140ab42c6b" }, "/arc_prize_public_evaluation": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/arc_prize_public_evaluation.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/arc_prize_public_evaluation.zip", "md5": "367a33977651496efddba7670009807e" }, "P-MMEval": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/pmmeval.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/pmmeval.zip", "md5": "09e401e6229a50647b9e13c429e634d1", }, "LiveMathBench": { - 'url': "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/LiveMathBench.zip", + 'url': + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/LiveMathBench.zip", "md5": "d0781f9185c9bb50e81e6e3ca8c59013", }, "bigcodebench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bigcodebench.zip", - "md5": "2c1c7956ca49a1124617e8c037ec57d8" + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bigcodebench.zip", + "md5": "270f399f4142b74f47ecff116cc3b21d" } }