From 14b4b735cb3e4ed0aa61be3db4862c75bd4974cb Mon Sep 17 00:00:00 2001
From: Hari Seldon <95674173+HariSeldon0@users.noreply.github.com>
Date: Thu, 22 Aug 2024 13:42:25 +0800
Subject: [PATCH] [Feature] Add support for SciCode (#1417)

* add SciCode
* add SciCode
* add SciCode
* add SciCode
* add SciCode
* add SciCode
* add SciCode
* add SciCode w/ bg
* add scicode
* Update README.md
* Update README.md
* Delete configs/eval_SciCode.py
* rename
* 1
* rename
* Update README.md
* Update scicode.py
* Update scicode.py
* fix some bugs
* Update
* Update

---------

Co-authored-by: root
Co-authored-by: tonysy
---
 README.md                                      |   1 +
 README_zh-CN.md                                |   1 +
 configs/datasets/scicode/README.md             |  31 ++
 configs/datasets/scicode/scicode_gen.py        |   4 +
 .../datasets/scicode/scicode_gen_085b98.py     |  29 ++
 .../configs/datasets/scicode/README.md         |  31 ++
 .../configs/datasets/scicode/scicode_gen.py    |   4 +
 .../datasets/scicode/scicode_gen_085b98.py     |  29 ++
 opencompass/datasets/__init__.py               |   1 +
 opencompass/datasets/scicode.py                | 349 ++++++++++++++++++
 .../icl_inferencer/icl_chat_inferencer.py      |   2 +-
 opencompass/utils/datasets_info.py             |   4 +
 requirements/runtime.txt                       |   1 +
 13 files changed, 486 insertions(+), 1 deletion(-)
 create mode 100644 configs/datasets/scicode/README.md
 create mode 100644 configs/datasets/scicode/scicode_gen.py
 create mode 100644 configs/datasets/scicode/scicode_gen_085b98.py
 create mode 100644 opencompass/configs/datasets/scicode/README.md
 create mode 100644 opencompass/configs/datasets/scicode/scicode_gen.py
 create mode 100644 opencompass/configs/datasets/scicode/scicode_gen_085b98.py
 create mode 100644 opencompass/datasets/scicode.py

diff --git a/README.md b/README.md
index ccee8471..df9c4dd4 100644
--- a/README.md
+++ b/README.md
@@ -70,6 +70,7 @@ Just like a compass guides us on our journey, OpenCompass will guide you through

 ## 🚀 What's New

+- **\[2024.08.20\]** OpenCompass now supports [SciCode](https://github.com/scicode-bench/SciCode): A Research Coding Benchmark Curated by Scientists. 🔥🔥🔥
 - **\[2024.08.16\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [RULER](https://arxiv.org/pdf/2404.06654). RULER provides an evaluation of long-context including retrieval, multi-hop tracing, aggregation, and question answering through flexible configurations. Check out the [RULER](configs/datasets/ruler/README.md) evaluation config now! 🔥🔥🔥
 - **\[2024.08.09\]** We have released the example data and configuration for the CompassBench-202408, welcome to [CompassBench](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/compassbench_intro.html) for more details. 🔥🔥🔥
 - **\[2024.08.01\]** We supported the [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315) models. Welcome to try! 🔥🔥🔥
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 12ed9d94..c4a7c6d2 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -69,6 +69,7 @@

 ## 🚀 最新进展

+- **\[2024.08.20\]** OpenCompass 现已支持 [SciCode](https://github.com/scicode-bench/SciCode): A Research Coding Benchmark Curated by Scientists。🔥🔥🔥
 - **\[2024.08.16\]** OpenCompass 现已支持全新的长上下文语言模型评估基准——[RULER](https://arxiv.org/pdf/2404.06654)。RULER 通过灵活的配置，提供了对长上下文包括检索、多跳追踪、聚合和问答等多种任务类型的评测，欢迎访问[RULER](configs/datasets/ruler/README.md)。🔥🔥🔥
 - **\[2024.07.23\]** 我们支持了[Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)模型，欢迎试用！🔥🔥🔥
 - **\[2024.07.23\]** 我们支持了[ModelScope](www.modelscope.cn)数据集，您可以按需加载，无需事先下载全部数据到本地，欢迎试用！🔥🔥🔥

diff --git a/configs/datasets/scicode/README.md b/configs/datasets/scicode/README.md
new file mode 100644
index 00000000..61a4fb13
--- /dev/null
+++ b/configs/datasets/scicode/README.md
@@ -0,0 +1,31 @@
+# SciCode: A Research Coding Benchmark Curated by Scientists
+
+## Introduction
+SciCode is a challenging benchmark designed to evaluate the capability of language models (LMs) to generate code for solving realistic scientific research problems. It offers diverse coverage of 16 subdomains across five domains: Physics, Math, Material Science, Biology, and Chemistry. Unlike previous benchmarks built from exam-like question-answer pairs, SciCode is converted from real research problems. SciCode problems naturally factorize into multiple subproblems, each involving knowledge recall, reasoning, and code synthesis. In total, SciCode contains 338 subproblems decomposed from 80 challenging main problems. It offers optional descriptions specifying useful scientific background information, as well as scientist-annotated gold-standard solutions and test cases for evaluation. Claude 3.5 Sonnet, the best-performing model among those tested, can solve only 4.6% of the problems in the most realistic setting. Broadly, SciCode captures the everyday workflow of scientists: identifying critical scientific concepts and facts and then transforming them into computation and simulation code. We believe SciCode not only demonstrates contemporary LLMs' progress towards becoming helpful assistants for scientists but also sheds light on the future development and evaluation of scientific AI. For more details, please refer to https://scicode-bench.github.io/.
+
+## How to Use
+Set the `with_bg` parameter in the configuration file to `True` to switch to the with-background (w/ BG) evaluation setup, in which prompts include the scientist-annotated background for each subproblem.
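+
+For example, here is a minimal sketch of the fields you would change in `scicode_gen_085b98.py`; the `abbr` value shown is just an illustrative name that keeps the result files of the two setups apart, and everything else matches the shipped config:
+
+```python
+with_bg = True  # False (default) evaluates without the background text
+
+SciCode_eval_cfg = dict(
+    evaluator=dict(type=SciCodeEvaluator,
+                   dataset_path='./data/scicode',
+                   with_bg=with_bg))
+
+SciCode_datasets = [
+    dict(
+        abbr='SciCode_with_background' if with_bg else 'SciCode',
+        type=SciCodeDataset,
+        path='./data/scicode',
+        with_bg=with_bg,
+        reader_cfg=SciCode_reader_cfg,
+        infer_cfg=SciCode_infer_cfg,
+        eval_cfg=SciCode_eval_cfg)
+]
+```
+
+Note that the dataset and the evaluator must agree on `with_bg`, since both use it to select the corresponding JSON file.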
+
+Then launch an evaluation, for example:
+
+```bash
+python run.py --datasets scicode_gen --hf-num-gpus 1 --hf-type chat --hf-path meta-llama/Meta-Llama-3-8B-Instruct --debug --model-kwargs device_map='auto' trust_remote_code=True --batch-size 1
+```
+
+## Reference Performance
+
+| Model                | Condition      | Subproblem Accuracy | Main Problem Accuracy |
+|----------------------|----------------|---------------------|-----------------------|
+| Llama-3-70B-Instruct | w/o Background | 21.53%              | 4.62%                 |
+| Llama-3-70B-Instruct | w/ Background  | 24.31%              | 7.69%                 |
+| Qwen2-72B-Instruct   | w/o Background | 16.67%              | 1.54%                 |
+| Qwen2-72B-Instruct   | w/ Background  | 19.79%              | 1.54%                 |
+
+## Citation
+
+```
+@misc{tian2024scicode,
+      title={SciCode: A Research Coding Benchmark Curated by Scientists},
+      author={Minyang Tian and Luyu Gao and Shizhuo Dylan Zhang and Xinan Chen and Cunwei Fan and Xuefei Guo and Roland Haas and Pan Ji and Kittithat Krongchon and Yao Li and Shengyan Liu and Di Luo and Yutao Ma and Hao Tong and Kha Trinh and Chenyu Tian and Zihan Wang and Bohao Wu and Yanyu Xiong and Shengzhu Yin and Minhui Zhu and Kilian Lieret and Yanxin Lu and Genglin Liu and Yufeng Du and Tianhua Tao and Ofir Press and Jamie Callan and Eliu Huerta and Hao Peng},
+      year={2024},
+      eprint={2407.13168},
+      archivePrefix={arXiv},
+      primaryClass={cs.AI}
+}
+```
diff --git a/configs/datasets/scicode/scicode_gen.py b/configs/datasets/scicode/scicode_gen.py
new file mode 100644
index 00000000..7cbc69fa
--- /dev/null
+++ b/configs/datasets/scicode/scicode_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .scicode_gen_085b98 import SciCode_datasets  # noqa: F401, F403
diff --git a/configs/datasets/scicode/scicode_gen_085b98.py b/configs/datasets/scicode/scicode_gen_085b98.py
new file mode 100644
index 00000000..10ed5e6b
--- /dev/null
+++ b/configs/datasets/scicode/scicode_gen_085b98.py
@@ -0,0 +1,29 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import ChatInferencer
+from opencompass.datasets import SciCodeDataset, SciCodeEvaluator
+
+
+SciCode_reader_cfg = dict(input_columns=['prompt'], output_column=None)
+
+SciCode_infer_cfg = dict(
+    ice_template=dict(
+        type=PromptTemplate,
+        template='',
+    ),
+
+    retriever=dict(type=ZeroRetriever),
+    # infer_mode='every' makes ChatInferencer produce a completion at every
+    # assistant turn, so each SciCode subproblem in the multi-turn prompt
+    # gets its own code answer rather than only the final one.
+    inferencer=dict(type=ChatInferencer, infer_mode='every', max_out_len=4096))
+
+SciCode_eval_cfg = dict(evaluator=dict(type=SciCodeEvaluator, dataset_path='./data/scicode', with_bg=False))
+
+SciCode_datasets = [
+    dict(
+        abbr='SciCode',
+        type=SciCodeDataset,
+        path='./data/scicode',
+        with_bg=False,
+        reader_cfg=SciCode_reader_cfg,
+        infer_cfg=SciCode_infer_cfg,
+        eval_cfg=SciCode_eval_cfg)
+]
diff --git a/opencompass/configs/datasets/scicode/README.md b/opencompass/configs/datasets/scicode/README.md
new file mode 100644
index 00000000..61a4fb13
--- /dev/null
+++ b/opencompass/configs/datasets/scicode/README.md
@@ -0,0 +1,31 @@
+# SciCode: A Research Coding Benchmark Curated by Scientists
+
+## Introduction
+SciCode is a challenging benchmark designed to evaluate the capability of language models (LMs) to generate code for solving realistic scientific research problems. It offers diverse coverage of 16 subdomains across five domains: Physics, Math, Material Science, Biology, and Chemistry. Unlike previous benchmarks built from exam-like question-answer pairs, SciCode is converted from real research problems.
+SciCode problems naturally factorize into multiple subproblems, each involving knowledge recall, reasoning, and code synthesis. In total, SciCode contains 338 subproblems decomposed from 80 challenging main problems. It offers optional descriptions specifying useful scientific background information, as well as scientist-annotated gold-standard solutions and test cases for evaluation. Claude 3.5 Sonnet, the best-performing model among those tested, can solve only 4.6% of the problems in the most realistic setting. Broadly, SciCode captures the everyday workflow of scientists: identifying critical scientific concepts and facts and then transforming them into computation and simulation code. We believe SciCode not only demonstrates contemporary LLMs' progress towards becoming helpful assistants for scientists but also sheds light on the future development and evaluation of scientific AI. For more details, please refer to https://scicode-bench.github.io/.
+
+## How to Use
+Set the `with_bg` parameter in the configuration file to `True` to switch to the with-background (w/ BG) evaluation setup, in which prompts include the scientist-annotated background for each subproblem.
+
+Then launch an evaluation, for example:
+
+```bash
+python run.py --datasets scicode_gen --hf-num-gpus 1 --hf-type chat --hf-path meta-llama/Meta-Llama-3-8B-Instruct --debug --model-kwargs device_map='auto' trust_remote_code=True --batch-size 1
+```
+
+## Reference Performance
+
+| Model                | Condition      | Subproblem Accuracy | Main Problem Accuracy |
+|----------------------|----------------|---------------------|-----------------------|
+| Llama-3-70B-Instruct | w/o Background | 21.53%              | 4.62%                 |
+| Llama-3-70B-Instruct | w/ Background  | 24.31%              | 7.69%                 |
+| Qwen2-72B-Instruct   | w/o Background | 16.67%              | 1.54%                 |
+| Qwen2-72B-Instruct   | w/ Background  | 19.79%              | 1.54%                 |
+
+## Citation
+
+```
+@misc{tian2024scicode,
+      title={SciCode: A Research Coding Benchmark Curated by Scientists},
+      author={Minyang Tian and Luyu Gao and Shizhuo Dylan Zhang and Xinan Chen and Cunwei Fan and Xuefei Guo and Roland Haas and Pan Ji and Kittithat Krongchon and Yao Li and Shengyan Liu and Di Luo and Yutao Ma and Hao Tong and Kha Trinh and Chenyu Tian and Zihan Wang and Bohao Wu and Yanyu Xiong and Shengzhu Yin and Minhui Zhu and Kilian Lieret and Yanxin Lu and Genglin Liu and Yufeng Du and Tianhua Tao and Ofir Press and Jamie Callan and Eliu Huerta and Hao Peng},
+      year={2024},
+      eprint={2407.13168},
+      archivePrefix={arXiv},
+      primaryClass={cs.AI}
+}
+```
diff --git a/opencompass/configs/datasets/scicode/scicode_gen.py b/opencompass/configs/datasets/scicode/scicode_gen.py
new file mode 100644
index 00000000..7cbc69fa
--- /dev/null
+++ b/opencompass/configs/datasets/scicode/scicode_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .scicode_gen_085b98 import SciCode_datasets  # noqa: F401, F403
diff --git a/opencompass/configs/datasets/scicode/scicode_gen_085b98.py b/opencompass/configs/datasets/scicode/scicode_gen_085b98.py
new file mode 100644
index 00000000..10ed5e6b
--- /dev/null
+++ b/opencompass/configs/datasets/scicode/scicode_gen_085b98.py
@@ -0,0 +1,29 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import ChatInferencer
+from opencompass.datasets import SciCodeDataset, SciCodeEvaluator
+
+
+SciCode_reader_cfg = dict(input_columns=['prompt'], output_column=None)
+
+SciCode_infer_cfg = dict(
+    ice_template=dict(
+        type=PromptTemplate,
+        template='',
+    ),
+
+    retriever=dict(type=ZeroRetriever),
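+    # infer_mode='every' makes ChatInferencer produce a completion at every
+    # assistant turn, so each SciCode subproblem in the multi-turn prompt
+    # gets its own code answer rather than only the final one.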
+    inferencer=dict(type=ChatInferencer, infer_mode='every', max_out_len=4096))
+
+SciCode_eval_cfg = dict(evaluator=dict(type=SciCodeEvaluator, dataset_path='./data/scicode', with_bg=False))
+
+SciCode_datasets = [
+    dict(
+        abbr='SciCode',
+        type=SciCodeDataset,
+        path='./data/scicode',
+        with_bg=False,
+        reader_cfg=SciCode_reader_cfg,
+        infer_cfg=SciCode_infer_cfg,
+        eval_cfg=SciCode_eval_cfg)
+]
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index 21f5c6f9..a1f201ef 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -99,6 +99,7 @@ from .record import *  # noqa: F401, F403
 from .ruler import *  # noqa: F401, F403
 from .safety import *  # noqa: F401, F403
 from .scibench import ScibenchDataset, scibench_postprocess  # noqa: F401, F403
+from .scicode import *  # noqa: F401, F403
 from .siqa import *  # noqa: F401, F403
 from .squad20 import SQuAD20Dataset, SQuAD20Evaluator  # noqa: F401, F403
 from .storycloze import *  # noqa: F401, F403
diff --git a/opencompass/datasets/scicode.py b/opencompass/datasets/scicode.py
new file mode 100644
index 00000000..00b0cd84
--- /dev/null
+++ b/opencompass/datasets/scicode.py
@@ -0,0 +1,349 @@
+import json
+import os
+import os.path as osp
+import re
+import subprocess
+
+import h5py
+import numpy as np
+import scipy
+import scipy.sparse
+import sympy
+from datasets import Dataset
+
+from opencompass.openicl.icl_evaluator import BaseEvaluator
+from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
+from opencompass.utils import get_data_path
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class SciCodeDataset(BaseDataset):
+
+    @staticmethod
+    def load(path, with_bg, **kwargs):
+        path = get_data_path(path, local_mode=True)
+        if with_bg:  # test with background
+            file_path = osp.join(path, 'SciCode_datasets_with_background.json')
+        else:  # test w/o background
+            file_path = osp.join(path, 'SciCode_datasets.json')
+
+        with open(file_path, 'r', encoding='utf-8') as file:
+            test_data = json.load(file)
+
+        dataset = Dataset.from_list(test_data)
+        return dataset
+
+    def return_dataset(self):
+        return self.dataset
+
+
+H5PY_FILE_FOLDER = './data/scicode/'
+H5PY_FILE_FOLDER = get_data_path(H5PY_FILE_FOLDER, local_mode=True)
+
+
+def process_hdf5_list(group):
+    # read every dataset in the group into a plain Python list
+    lst = []
+    for key in group.keys():
+        lst.append(group[key][()])
+    return lst
+
+
+def process_hdf5_dict(group):
+    # rebuild a dict from the group; `result` avoids shadowing the builtin
+    result = {}
+    for key, obj in group.items():
+        if isinstance(obj, h5py.Group):
+            result[key] = process_hdf5_sparse_matrix(obj['sparse_matrix'])
+        elif isinstance(obj[()], bytes):
+            result[key] = obj[()].decode('utf-8', errors='strict')
+        else:
+            try:
+                # numeric keys were serialized as strings; restore them
+                tmp = float(key)
+                result[tmp] = obj[()]
+            except ValueError:
+                result[key] = obj[()]
+    return result
+
+
+def process_hdf5_sparse_matrix(group):
+    # reconstruct a scipy sparse matrix from its stored components
+    data = group['data'][()]
+    shape = tuple(group['shape'][()])
+    if 'row' in group and 'col' in group:
+        row = group['row'][()]
+        col = group['col'][()]
+        return scipy.sparse.coo_matrix((data, (row, col)), shape=shape)
+    elif 'blocksize' in group:
+        indices = group['indices'][()]
+        indptr = group['indptr'][()]
+        blocksize = tuple(group['blocksize'][()])
+        return scipy.sparse.bsr_matrix((data, indices, indptr),
+                                       shape=shape,
+                                       blocksize=blocksize)
+    else:
+        indices = group['indices'][()]
+        indptr = group['indptr'][()]
+        return scipy.sparse.csr_matrix((data, indices, indptr), shape=shape)
+
+
+def process_hdf5_datagroup(group):
+    # each data group stores exactly one layout, so the first key decides
+    # how the group is decoded
+    for key in group.keys():
+        if key == 'list':
+            return process_hdf5_list(group[key])
+        elif key == 'sparse_matrix':
+            return process_hdf5_sparse_matrix(group[key])
+        else:
+            return process_hdf5_dict(group)
+
+
+def process_hdf5_to_tuple(step_id, test_num):
+    data_lst = []
+    H5PY_FILE = os.path.join(H5PY_FILE_FOLDER, 'test_data.h5')
+    assert os.path.exists(
+        H5PY_FILE
+    ), f"Please manually download 'test_data.h5' from https://github.com/open-compass/storage/releases/download/v0.1.0/scicode_test_data.zip and put the file at {H5PY_FILE}"  # noqa: E501
+
+    with h5py.File(H5PY_FILE, 'r') as f:
+        for test_id in range(test_num):
+            group_path = f'{step_id}/test{test_id + 1}'
+            if isinstance(f[group_path], h5py.Group):
+                group = f[group_path]  # test1, test2, test3
+                keys = list(group.keys())
+                if len(keys) == 1:  # only 1 var in the test
+                    subgroup = group[keys[0]]
+                    if isinstance(subgroup, h5py.Dataset):
+                        if isinstance(subgroup[()], bytes):
+                            data_lst.append(subgroup[()].decode(
+                                'utf-8', errors='strict'))
+                        else:
+                            data_lst.append(subgroup[()])
+                    elif isinstance(subgroup, h5py.Group):
+                        data_lst.append(process_hdf5_datagroup(subgroup))
+                else:
+                    var_lst = []
+                    for key in group.keys():  # var1, var2, var3
+                        subgroup = group[key]
+                        if isinstance(subgroup, h5py.Dataset):
+                            if isinstance(subgroup[()], bytes):
+                                var_lst.append(subgroup[()].decode(
+                                    'utf-8', errors='strict'))
+                            else:
+                                var_lst.append(subgroup[()])
+                        elif isinstance(subgroup, h5py.Group):
+                            var_lst.append(process_hdf5_datagroup(subgroup))
+                    data_lst.append(tuple(var_lst))
+            else:
+                raise FileNotFoundError(
+                    f'Path {group_path} not found in the file.')
+    return data_lst
+
+
+def are_dicts_close(dict1, dict2, atol=1e-8, rtol=1e-5):
+    dict1 = process_symbol_in_dict(dict1)
+    dict2 = process_symbol_in_dict(dict2)
+    # Check if both dictionaries have the same keys
+    if dict1.keys() != dict2.keys():
+        return False
+
+    # Check if the corresponding values are close
+    for key in dict1:
+        value1 = dict1[key]
+        value2 = dict2[key]
+        if isinstance(value1, (sympy.Symbol, str)):
+            if not value1 == value2:
+                return False
+        elif isinstance(value1,
+                        (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix,
+                         scipy.sparse.bsr_matrix, scipy.sparse.coo_matrix)):
+            value1 = value1.toarray()
+            value2 = value2.toarray()
+            if not np.allclose(value1, value2, atol=atol, rtol=rtol):
+                return False
+        else:
+            # Use np.allclose to compare values
+            try:
+                if not np.allclose(value1, value2, atol=atol, rtol=rtol):
+                    return False
+            except ValueError:
+                if not value1 == value2:
+                    return False
+
+    return True
+
+
+def process_symbol_in_dict(d):
+    # stringify sympy.Symbol keys and values so dicts can be compared with ==
+    new_dict = {}
+    for key, value in d.items():
+        new_dict[key] = value
+        if isinstance(value, sympy.Symbol):
+            new_dict[key] = str(value)
+        if isinstance(key, sympy.Symbol):
+            new_dict[str(key)] = d[key]
+            new_dict.pop(key)
+    return new_dict
+
+
+def are_csc_matrix_close(matrix1, matrix2):
+    dense1 = matrix1.toarray()
+    dense2 = matrix2.toarray()
+    return np.allclose(dense1, dense2)
+
+
+def cmp_tuple_or_list(var1, var2):
+    if len(var1) != len(var2):
+        return False
+    for v1, v2 in zip(var1, var2):
+        if isinstance(v1, dict):
+            if not are_dicts_close(v1, v2):
+                return False
+        elif isinstance(v1,
+                        (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)):
+            if not are_csc_matrix_close(v1, v2):
+                return False
+        elif isinstance(v1, bool):
+            if not v1 == v2:
+                return False
+        else:
+            try:
+                if not np.allclose(v1, v2):
+                    return False
+            except ValueError as e:
+                print(e)
+                if not v1 == v2:
+                    return False
+    return True
+
+
+@ICL_EVALUATORS.register_module()
+class SciCodeEvaluator(BaseEvaluator):
+
+    def __init__(self, dataset_path, with_bg, testcode_path='./tmp/scicode'):
+        super().__init__()
+        dataset_path = get_data_path(dataset_path, local_mode=True)
+        if with_bg:  # test with background
+            file_path = osp.join(dataset_path,
+                                 'SciCode_datasets_with_background.json')
+        else:  # test w/o background
+            file_path = osp.join(dataset_path, 'SciCode_datasets.json')
+        with open(file_path, 'r', encoding='utf-8') as file:
+            test_data = json.load(file)
+        self.dataset = Dataset.from_list(test_data)
+        self.testcode_path = testcode_path
+        H5PY_FILE = osp.join(dataset_path, 'test_data.h5')  # noqa: F841
+
+    def extract_python_script(self, response: str):
+        start_marker = '```python'
+        end_marker = '```'
+
+        if start_marker not in response or end_marker not in response:
+            # If the markers are not present, return an empty string
+            return ''
+
+        # Split the response at the start marker and take the second part
+        after_start = response.split(start_marker)
+        if len(after_start) < 2:
+            return ''  # No valid split was made
+
+        # Split the part after the start marker at the end marker
+        python_script = after_start[1].split(end_marker)[0]
+
+        # Remove leading import statements using regex
+        python_script = re.sub(r'^\s*(import .*|from .*\s+import\s+.*)',
+                               '',
+                               python_script,
+                               flags=re.MULTILINE)
+
+        return python_script
+
+    def run_script(self, script_path):
+        # return codes: 0 = all tests passed, 1 = runtime/assertion error,
+        # 2 = timeout
+        try:
+            subprocess.run(['python', script_path],
+                           check=True,
+                           capture_output=True,
+                           text=True,
+                           timeout=60)
+            return 0
+        except subprocess.CalledProcessError:
+            return 1
+        except subprocess.TimeoutExpired:
+            return 2
+
+    def score(self, predictions, references):
+        correct, sub_correct = 0, 0
+        count, sub_count = 0, 0
+        details = []
+
+        # generate all python test scripts, then run them
+        for idx, prediction_list in enumerate(predictions):
+            # traverse each test sample
+            problem_id = self.dataset[idx]['id']
+            num_of_subproblems = len(prediction_list)
+
+            # create dir for each test sample
+            testdir_path = os.path.join(self.testcode_path, str(problem_id))
+            os.makedirs(testdir_path, exist_ok=True)
+
+            python_code = ''
+            # add import statement
+            python_code += self.dataset[idx]['import']
+
+            is_all_correct = True
+            for sub_idx in range(num_of_subproblems):
+                # extract code
+                response = prediction_list[sub_idx]
+                python_code += self.extract_python_script(response)
+
+                # special cases: shift the test index for these problems
+                # (the shift only lasts for the current iteration; `range`
+                # reassigns `sub_idx` on the next one)
+                if (problem_id == '13' and sub_idx >= 5) or \
+                        (problem_id == '62' and sub_idx >= 0) or \
+                        (problem_id == '76' and sub_idx >= 2):
+                    sub_idx += 1
+
+                # test cases
+                test_lst = self.dataset[idx]['test'][sub_idx]
+
+                testfile_path = os.path.join(testdir_path,
+                                             f'{problem_id}-{sub_idx + 1}.py')
+                # write python code and test cases to a real python file
+                with open(testfile_path, 'w', encoding='utf-8') as f:
+                    f.write(python_code)
+                    f.write("""

from opencompass.datasets.scicode import process_hdf5_to_tuple

""")
+                    f.write('targets = process_hdf5_to_tuple(' +
+                            f"'{problem_id}.{sub_idx + 1}', {len(test_lst)})" +
+                            '\n')
+                    for idx2 in range(len(test_lst)):
+                        f.write(f'target = targets[{idx2}]\n\n')
+                        for line in test_lst[idx2].split('\n'):
+                            f.write(line + '\n')
+
+                # run the generated script
+                ret = self.run_script(testfile_path)
+                msg = {'problem': f'{problem_id}-{sub_idx + 1}'}
+                if ret == 0:  # correct
+                    sub_correct += 1
+                    msg['is_correct'] = True
+                elif ret == 1:  # error
+                    is_all_correct = False
+                    msg['is_correct'] = False
+                else:  # time out
+                    is_all_correct = False
+                    msg['is_correct'] = False
+                sub_count += 1
+                details.append(msg)
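+
+            # A main problem counts as solved only if every one of its
+            # subproblem scripts passed above.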
+            correct += is_all_correct
+            count += 1
+
+        result = {
+            'accuracy': 100 * correct / count,
+            'sub_accuracy': 100 * sub_correct / sub_count,
+            'details': details
+        }
+        return result
diff --git a/opencompass/openicl/icl_inferencer/icl_chat_inferencer.py b/opencompass/openicl/icl_inferencer/icl_chat_inferencer.py
index 681afc35..c91748fc 100644
--- a/opencompass/openicl/icl_inferencer/icl_chat_inferencer.py
+++ b/opencompass/openicl/icl_inferencer/icl_chat_inferencer.py
@@ -359,7 +359,7 @@ class ChatInferencer(BaseInferencer):
                     idx=index,
                     gold=chat[i]['content'],
                 )
-                index += 1
+                # index += 1
             if self.dialogue_mode:
                 # dialogue mode for subjective evaluation
                 assert len(chat) % 2 == 0
diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py
index a8acebb6..409c8136 100644
--- a/opencompass/utils/datasets_info.py
+++ b/opencompass/utils/datasets_info.py
@@ -356,4 +356,8 @@ DATASETS_URL = {
         "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ruler.zip",
         "md5": "c60bdfff3d02358067104cc1dea7c0f7",
     },
+    "scicode/": {
+        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/scicode.zip",
+        "md5": "06f64edad6680072e5bca3f0ce892d0c",
+    }
 }
diff --git a/requirements/runtime.txt b/requirements/runtime.txt
index 25a9b73f..320f07d9 100644
--- a/requirements/runtime.txt
+++ b/requirements/runtime.txt
@@ -9,6 +9,7 @@ evaluate>=0.3.0
 fairscale
 func_timeout
 fuzzywuzzy
+h5py
 immutabledict
 jieba
 json5