Merge branch 'main' into SeedBench

2025-05-30 16:03:24 +08:00 · 2025-04-25 17:04:25 +08:00 · 2025-04-25 17:04:25 +08:00 · d26e808c9f
commit d26e808c9f
parent 2ded84a70c e8bc8c1e8c
21 changed files with 477 additions and 32 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -8,6 +8,7 @@ exclude: |
      opencompass/datasets/lawbench/utils|
      opencompass/datasets/lawbench/evaluation_functions/|
      opencompass/datasets/medbench/|
      opencompass/datasets/matbench/|
      opencompass/datasets/teval/|
      opencompass/datasets/NPHardEval/|
      opencompass/datasets/TheoremQA|
--- a/dataset-index.yml
+++ b/dataset-index.yml
@ -110,6 +110,12 @@
    paper: ''
    configpath: opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py
    configpath_llmjudge: ''
 - matbench:
    name: matbench
    category: Science / Material
    paper: 'https://www.nature.com/articles/s41524-020-00406-3'
    configpath: opencompass/configs/datasets/matbench/matbench_gen_f71840.py
    configpath_llmjudge: ''
 - medbench:
    name: MedBench
    category: Knowledge / Medicine
--- a/examples/eval_rewardbench.py
+++ b/examples/eval_rewardbench.py
@ -0,0 +1,53 @@
 from mmengine.config import read_base
 with read_base():
    from opencompass.configs.datasets.judge.rewardbench import get_rewardbench_datasets
    from opencompass.configs.summarizers.rewardbench import summarizer
 from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
 from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
 from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
 from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
 from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
 from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
 from opencompass.runners import SlurmSequentialRunner
 from opencompass.tasks import OpenICLInferTask
 from opencompass.tasks.subjective_eval import SubjectiveEvalTask
 from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
 api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
 )
 datasets = [*get_rewardbench_datasets]
 from opencompass.models import TurboMindModelwithChatTemplate
 models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen-7b-hf',
        path='Qwen/Qwen-7B',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    ),
 ]
 infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=72,
        task=dict(type=OpenICLInferTask),
    ),
 )
 work_dir = './outputs/rewardbench/'
--- a/opencompass/configs/datasets/ChemBench/ChemBench_llmjudge_gen_c584cf.py
+++ b/opencompass/configs/datasets/ChemBench/ChemBench_llmjudge_gen_c584cf.py
@ -86,7 +86,7 @@ for _name in chembench_all_sets:
            ),
            dataset_cfg=dict(
                type=ChemBenchDataset,
-                path='/fs-computility/llm/xiaolinchen/opencompass_fork/data/ChemBench4K',
+                path='opencompass/ChemBench4K',
                name=_name,
                reader_cfg=chembench_reader_cfg,
            ),
--- a/opencompass/configs/datasets/judge/rewardbench.py
+++ b/opencompass/configs/datasets/judge/rewardbench.py
@ -0,0 +1,71 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import JudgeEvaluator
 from opencompass.datasets import RewardBenchDataset
 subjective_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='judge',
    )
 data_path = './data/judgeeval/rewardbench'
 subjective_all_sets = ['llmbar-natural.json', 'llmbar-adver-GPTInst.json', 'hep-go.json', 'refusals-dangerous.json', 'hep-cpp.json', 'mt-bench-easy.json', 'alpacaeval-length.json', 'llmbar-adver-neighbor.json', 'alpacaeval-easy.json', 'hep-java.json', 'llmbar-adver-GPTOut.json', 'mt-bench-hard.json', 'xstest-should-respond.json', 'xstest-should-refuse.json', 'hep-python.json', 'refusals-offensive.json', 'alpacaeval-hard.json', 'llmbar-adver-manual.json', 'hep-js.json', 'math-prm.json', 'hep-rust.json', 'mt-bench-med.json', 'donotanswer.json']
 get_rewardbench_datasets = []
 prompt_choice_prefix = """
 Please act as an impartial judge to evaluate the responses provided by two AI assistants to the user question below. Your evaluation should focus on the following criteria: helpfulness, relevance, accuracy, depth, creativity, and level of detail.
 - Do not let the order of presentation, response length, or assistant names influence your judgment.
 - Base your decision solely on how well each response addresses the user’s question and adheres to the instructions.
 Your final reply must be structured in the following format:
 {
  "Choice": "[Model A or Model B]"
 }
 """
 prompt_choice_en = """User Question: {question}
 Model A's Response: {answerA}
 Model B's Response: {answerB}
 Now it's your turn. Please provide selection result as required:
 """
 for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt=prompt_choice_prefix + prompt_choice_en
                    ),
                ]),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=4096),
        )
    rewardbench_eval_cfg = dict(
        evaluator=dict(
            type=JudgeEvaluator,
        ),
    )
    get_rewardbench_datasets.append(
        dict(
            abbr=f'{_name.split(".")[0]}',
            type=RewardBenchDataset,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=rewardbench_eval_cfg,
            mode='singlescore',
        ))
--- a/opencompass/configs/datasets/matbench/matbench_gen.py
+++ b/opencompass/configs/datasets/matbench/matbench_gen.py
@ -0,0 +1,4 @@
 from mmengine.config import read_base
 with read_base():
    from .matbench_gen_f71840 import matbench_datasets  # noqa: F401, F403
--- a/opencompass/configs/datasets/matbench/matbench_gen_f71840.py
+++ b/opencompass/configs/datasets/matbench/matbench_gen_f71840.py
@ -0,0 +1,55 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets.matbench.matbench import MatbenchDataset, MatbenchEvaluator_regression, MatbenchEvaluator_classification
 matbench_reader_cfg = dict(
    input_columns=['problem'], output_column='answer')
 matbench_tasks =  ['matbench_steels','matbench_expt_gap', 'matbench_expt_is_metal','matbench_glass']
 matbench_datasets = []
 for task in matbench_tasks:
    if task in ['matbench_expt_is_metal','matbench_glass']:
        matbench_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[dict(role='HUMAN', prompt=f'{{problem}} Please present your answer by yes or no, do not output anything else.')])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer))
        matbench_eval_cfg = dict(
            evaluator=dict(type=MatbenchEvaluator_classification),
            pred_role='BOT')
    elif task in ['matbench_steels','matbench_expt_gap']:
        matbench_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[dict(role='HUMAN', prompt=f'{{problem}} Please present your answer by one float number, do not output anything else.')])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer))
        matbench_eval_cfg = dict(
            evaluator=dict(type=MatbenchEvaluator_regression),
            pred_role='BOT')
    matbench_datasets.append(
        dict(
            type=MatbenchDataset,
            path=f'opencompass/Matbench',
            task=task,
            abbr=task,
            reader_cfg=matbench_reader_cfg,
            infer_cfg=matbench_infer_cfg,
            eval_cfg=matbench_eval_cfg))
--- a/opencompass/configs/summarizers/rewardbench.py
+++ b/opencompass/configs/summarizers/rewardbench.py
@ -0,0 +1,11 @@
 RewardBench_summary_groups = []
 _RewardBench_weights = {'alpacaeval-easy': 0.08088826366559486,'alpacaeval-length': 0.08088826366559486,'alpacaeval-hard': 0.08088826366559486,'mt-bench-easy': 0.0028135048231511255,'mt-bench-med': 0.004521704180064309,'mt-bench-hard': 0.024245689655172414,'llmbar-natural': 0.05387931034482758,'llmbar-adver-neighbor': 0.07219827586206896,'llmbar-adver-GPTInst': 0.04956896551724138,'llmbar-adver-GPTOut': 0.025323275862068964,'llmbar-adver-manual': 0.02478448275862069,'refusals-dangerous': 0.033783783783783786,'refusals-offensive': 0.033783783783783786,'xstest-should-refuse': 0.05202702702702703,'xstest-should-respond': 0.08445945945945946,'donotanswer': 0.04594594594594595,'math-prm': 0.07809224318658281,'hep-cpp': 0.0286512928022362,'hep-go': 0.0286512928022362,'hep-java': 0.0286512928022362,'hep-js': 0.0286512928022362,'hep-python': 0.0286512928022362,'hep-rust': 0.0286512928022362,}
 RewardBench_summary_groups.append({'name': 'RewardBench', 'subsets': list(_RewardBench_weights.keys()), 'weights': _RewardBench_weights})
 summarizer = dict(
    dataset_abbrs=[
        'RewardBench'
    ],
    summary_groups=RewardBench_summary_groups,
 )
--- a/opencompass/datasets/TheoremQA/utils.py
+++ b/opencompass/datasets/TheoremQA/utils.py
@ -33,7 +33,12 @@ def extract_theoremqa_answer(pred: str, answer_flag: bool = True):
            try:
                with time_limit(1):
                    tmp = str(latex2sympy(pred))
-                    pred = str(eval(tmp))
+                    pred = eval(tmp)
                    if isinstance(pred, tuple):
                        pred = str(list(pred))
                    else:
                        pred = str(pred)
            except Exception:
                if re.match(r'-?[\d\.]+\s\D+$', pred):
                    pred = pred.split(' ')[0]
--- a/opencompass/datasets/init.py
+++ b/opencompass/datasets/init.py
@ -71,6 +71,7 @@ from .infinitebench import *  # noqa: F401, F403
 from .iwslt2017 import *  # noqa: F401, F403
 from .jigsawmultilingual import *  # noqa: F401, F403
 from .jsonl import JsonlDataset  # noqa: F401, F403
 from .judge import *  # noqa: F401, F403
 from .kaoshi import KaoshiDataset, KaoshiEvaluator  # noqa: F401, F403
 from .korbench import *  # noqa: F401, F403
 from .lambada import *  # noqa: F401, F403
@ -87,6 +88,7 @@ from .longbench import *  # noqa: F401, F403
 from .longbenchv2 import *  # noqa: F401, F403
 from .lveval import *  # noqa: F401, F403
 from .mastermath2024v1 import *  # noqa: F401, F403
 from .matbench import *  # noqa: F401, F403
 from .math import *  # noqa: F401, F403
 from .math401 import *  # noqa: F401, F403
 from .math_intern import *  # noqa: F401, F403
--- a/opencompass/datasets/base.py
+++ b/opencompass/datasets/base.py
@ -1,7 +1,6 @@
 from copy import deepcopy
 from typing import Dict, List, Optional, Union
-from datasets import Dataset, DatasetDict
+from datasets import Dataset, DatasetDict, concatenate_datasets
 from opencompass.openicl import DatasetReader
@ -19,28 +18,25 @@ class BaseDataset:
        assert (max(k) if isinstance(k, List) else
                k) <= n, 'Maximum value of `k` must less than or equal to `n`'
        if isinstance(dataset, Dataset):
-            examples = []
+            dataset = dataset.map(lambda x, idx: {
-            for idx, example in enumerate(dataset):
+                'subdivision': abbr,
-                if 'subdivision' not in example:
+                'idx': idx
-                    example['subdivision'] = abbr
+            },
-                if 'idx' not in example:
+                                  with_indices=True,
-                    example['idx'] = idx
+                                  writer_batch_size=16)
-                examples.append(example)
+            dataset = concatenate_datasets([dataset] * n)
-            examples = sum([deepcopy(examples) for _ in range(n)], [])
+            self.dataset = dataset
            self.dataset = Dataset.from_list(examples)
        else:
            self.dataset = DatasetDict()
            for key in dataset:
-                examples = []
+                dataset[key] = dataset[key].map(lambda x, idx: {
-                for idx, example in enumerate(dataset[key]):
+                    'subdivision': f'{abbr}_{key}',
-                    if 'subdivision' not in example:
+                    'idx': idx
-                        example['subdivision'] = f'{abbr}_{key}'
+                },
-                    if 'idx' not in example:
+                                                with_indices=True,
-                        example['idx'] = idx
+                                                writer_batch_size=16)
-                    examples.append(example)
+                dataset[key] = concatenate_datasets([dataset[key]] * n)
-                print(abbr, key, len(examples))
+                self.dataset[key] = dataset[key]
                examples = sum([deepcopy(examples) for _ in range(n)], [])
                self.dataset[key] = Dataset.from_list(examples)
        self._init_reader(**reader_cfg)
    def _init_reader(self, **kwargs):
--- a/opencompass/datasets/judge/init.py
+++ b/opencompass/datasets/judge/init.py
@ -0,0 +1 @@
 from .rewardbench import RewardBenchDataset  # noqa: F401, F403
--- a/opencompass/datasets/judge/rewardbench.py
+++ b/opencompass/datasets/judge/rewardbench.py
@ -0,0 +1,57 @@
 # flake8: noqa
 import json
 import os.path as osp
 import re
 import numpy as np
 import pandas as pd
 from datasets import Dataset
 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import (DICT_POSTPROCESSORS, ICL_EVALUATORS,
                                  LOAD_DATASET)
 from opencompass.utils import get_data_path
 from ..base import BaseDataset
@LOAD_DATASET.register_module()
 class RewardBenchDataset(BaseDataset):
    def load(self, path: str, name: str, *args, **kwargs):
        path = get_data_path(path, local_mode=True)
        filename = osp.join(path, f'{name}')
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for item in data:
                conversation_a = item['chosen']
                conversation_b = item['rejected']
                model_a = item['chosen_model']
                model_b = item['rejected_model']
                question = item['prompt']
                winner = item['winner']
                if winner == 'B':
                    conversation_a, conversation_b = conversation_b, conversation_a
                    model_a, model_b = model_b, model_a
                subset = item['subset']
                lan = 'en'
                raw_data.append({
                    'question': question,
                    'answerA': conversation_a,
                    'answerB': conversation_b,
                    'judge': {
                        'prompt': item['prompt'],
                        'Answer_A': conversation_a,
                        'Answer_B': conversation_b,
                        'subset': subset,
                        'winner': winner,
                        'model_a': model_a,
                        'model_b': model_b,
                        'dataset_name': 'rewardbench',
                        'lan': lan
                    }
                })
        dataset = Dataset.from_list(raw_data)
        return dataset
--- a/opencompass/datasets/matbench/init.py
+++ b/opencompass/datasets/matbench/init.py
@ -0,0 +1,3 @@
 # flake8: noqa
 from .matbench import *  # noqa: F401, F403
--- a/opencompass/datasets/matbench/matbench.py
+++ b/opencompass/datasets/matbench/matbench.py
@ -0,0 +1,87 @@
 import json
 import os
 from datasets import Dataset
 from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
 from opencompass.datasets.matbench.post_process import (parse_float_answer,
                                                        parse_true_false_answer
                                                        )
 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
 from opencompass.utils import get_data_path
 from ..base import BaseDataset
@LOAD_DATASET.register_module()
 class MatbenchDataset(BaseDataset):
    @staticmethod
    def load(path, task):
        path = get_data_path(path)
        path = os.path.join(path,
                            'matbench_base_fold_0_' + task + '_test.json')
        dataset = []
        with open(path, 'r', encoding='utf-8') as file:
            data = json.load(file)
            for item in data:
                dataset.append({
                    'problem': item['problem'],
                    'answer': item['answer'],
                })
        dataset = Dataset.from_list(dataset)
        return dataset
@ICL_EVALUATORS.register_module()
 class MatbenchEvaluator_regression(BaseEvaluator):
    def score(self, predictions, references):
        mae_sum = 0
        count = 0
        details = []
        for pred, ref in zip(predictions, references):
            pred = parse_float_answer(pred)
            detail = {'pred': pred, 'answer': ref, 'error': None}
            count += 1
            try:
                error = abs(float(pred) - float(ref))
                mae_sum += error
                detail['error'] = error
            except Exception as e:
                detail['error'] = str(e)
            details.append(detail)
        mae = mae_sum / count if count > 0 else 0
        result = {'mae': mae, 'details': details}
        return result
@ICL_EVALUATORS.register_module()
 class MatbenchEvaluator_classification(BaseEvaluator):
    def score(self, predictions, references):
        details = []
        predictions_parsed = []
        for pred, ref in zip(predictions, references):
            pred = parse_true_false_answer(pred)
            detail = {'pred': pred, 'answer': ref, 'correct': False}
            if pred == ref:
                detail['correct'] = True
            details.append(detail)
            predictions_parsed.append(pred)
        accuracy = accuracy_score(references, predictions_parsed)
        precision = precision_score(references,
                                    predictions_parsed,
                                    average='binary')
        recall = recall_score(references, predictions_parsed, average='binary')
        f1 = f1_score(references, predictions_parsed, average='binary')
        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'details': details
        }
--- a/opencompass/datasets/matbench/post_process.py
+++ b/opencompass/datasets/matbench/post_process.py
@ -0,0 +1,25 @@
 # flake8: noqa
 import re
 def parse_float_answer(raw_string, option=''):
    number_pattern = re.compile(r'[-+]?\d+(\.\d+)?([eE][-+]?\d+)?')
    # Search for the first match
    match = number_pattern.search(raw_string)
    if match:
        # Extract the matched number and convert it to float
        return float(match.group())
    else:
        # Return None if no number is found
        return 0
 def parse_true_false_answer(raw_string, option=''):
    if 'yes' in raw_string.lower():
        return True
    elif 'no' in raw_string.lower():
        return False
    else:
        return True
--- a/opencompass/datasets/smolinstruct.py
+++ b/opencompass/datasets/smolinstruct.py
@ -4,10 +4,12 @@ from collections import defaultdict
 import numpy as np
 from datasets import Dataset, DatasetDict, load_dataset
 from nltk.translate.meteor_score import meteor_score
 from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
 from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
                                  TEXT_POSTPROCESSORS)
 from opencompass.utils import get_logger
 from .base import BaseDataset
@ -408,7 +410,14 @@ class MeteorEvaluator(BaseEvaluator):
        avg_score = 0
        details = []
        for pred, ans in zip(predictions, references):
-            score = meteor_score([ans.split()], pred.split())
+            try:
                score = (meteor_score([ans.split()], pred.split())
                         if ans and pred else 0.0)
            except AttributeError:
                self.logger = get_logger()
                self.logger.warning(f'Failed to compute METEOR'
                                    f"score:\npred='{pred}'\nans='{ans}'")
                score = 0.0
            avg_score += score
            detail = {'pred': pred, 'answer': ans, 'score': score}
            details.append(detail)
--- a/opencompass/models/openai_api.py
+++ b/opencompass/models/openai_api.py
@ -661,18 +661,32 @@ class OpenAISDK(OpenAI):
                        pass  # noqa F841
                # Check if response is empty or content is empty
-                if not responses.choices or not responses.choices[
+                if (not responses.choices or not responses.choices[0].message
-                        0].message.content:
+                        or not responses.choices[0].message.content):
                    self.logger.error(
-                        'API response is empty, it might be due to excessive '
+                        'Failed to extract content from the responses. '
-                        'input length or an internal server error '
+                        'Please check the API response for detail information.'
-                        'from your API provider.')
+                        'API responses: %s',
                        responses,
                    )
                    num_retries += 1
                    # Continue to retry instead of returning empty response
                    continue
-                # If the model has reasoning_content, concat it
+
-                # with the content
+                # Concat Reasoning Content and tags to content
-                if hasattr(responses.choices[0].message, 'reasoning_content'):
+                if (hasattr(responses.choices[0].message, 'reasoning_content')
                        and responses.choices[0].message.reasoning_content):
                    if self.verbose:
                        self.logger.info(
                            'Follow'
                            'vllm/reasoning/deepseek_r1_reasoning_parser'
                            'to parse the reasoning content and tags'
                            'Reasoning Content: %s, \n'
                            'Tags: %s, \n'
                            'Content: %s',
                            responses.choices[0].message.reasoning_content,
                            self.think_tag,
                            responses.choices[0].message.content)
                    return (responses.choices[0].message.reasoning_content +
                            self.think_tag +
                            responses.choices[0].message.content)
--- a/opencompass/openicl/icl_evaluator/init.py
+++ b/opencompass/openicl/icl_evaluator/init.py
@ -6,6 +6,7 @@ from .icl_circular_evaluator import CircularEvaluator  # noqa
 from .icl_em_evaluator import EMEvaluator  # noqa
 from .icl_hf_evaluator import *  # noqa
 from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator  # noqa
 from .icl_judge_evaluator import JudgeEvaluator  # noqa
 from .icl_misc_evaluator import AverageInferencePPLEvaluator  # noqa
 from .icl_misc_evaluator import AverageMinKEvaluator  # noqa
 from .icl_misc_evaluator import AveragePPLEvaluator  # noqa
--- a/opencompass/openicl/icl_evaluator/icl_judge_evaluator.py
+++ b/opencompass/openicl/icl_evaluator/icl_judge_evaluator.py
@ -0,0 +1,33 @@
 # flake8: noqa
 """KOR-Bench Evaluator."""
 import json
 import os
 import re
 from .icl_base_evaluator import BaseEvaluator
 class JudgeEvaluator(BaseEvaluator):
    def score(self, predictions, references):
        if len(predictions) != len(references):
            return {'error': 'preds and refrs have different length'}
        correct = 0
        count = 0
        details = []
        for prediction, reference in zip(predictions, references):
            choice = prediction.split("\"Choice\": \"Model ")[-1][0]
            gold_winner = reference.get('winner', '')
            detail = {
                'pred': prediction,
                'answer': gold_winner,
                'correct': False
            }
            count += 1
            if choice == gold_winner:
                correct += 1
                detail['correct'] = True
            details.append(detail)
        result = {'accuracy': 100 * correct / count, 'details': details}
        return result
--- a/opencompass/utils/datasets_info.py
+++ b/opencompass/utils/datasets_info.py
@ -27,6 +27,12 @@ DATASETS_MAPPING = {
        "hf_id": "opencompass/ai2_arc",
        "local": "./data/ARC/ARC-e/ARC-Easy-Dev.jsonl",
    },
    # Matbench
    "opencompass/Matbench": {
    # "ms_id": "opencompass/Matbench",
    "hf_id": "opencompass/Matbench",
    "local": "./data/Matbench",
    },
    # BBH
    "opencompass/bbh": {
        "ms_id": "opencompass/bbh",
@ -670,6 +676,11 @@ DATASETS_URL = {
        "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/SQuAD2.0.zip",
        "md5": "1321cbf9349e1102a57d31d1b2bfdd7e",
    },
    "/Matbench":{
        "url":
        "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/Matbench.zip",
        "md5": "99f9457f54f4f419da9556af56ac4c24",
    },
    "mmlu_pro": {
        "url":
        "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu_pro.zip",
		`@ -0,0 +1 @@`
							`from .rewardbench import RewardBenchDataset # noqa: F401, F403`
		`@ -0,0 +1,3 @@`
							`# flake8: noqa`

							`from .matbench import * # noqa: F401, F403`