From 8e6d2ab7e6a0ea6ee09ea49d2ce5a99cbb34121e Mon Sep 17 00:00:00 2001 From: zhanghaoyu <15981853295@163.com> Date: Mon, 14 Apr 2025 09:51:43 +0800 Subject: [PATCH] feat(datasets): add MaritimeBench dataset and related configuration Added MaritimeBench dataset, including dataset metadata, configuration files, data processing logic, and a text post-processing function. This dataset is designed to evaluate AI models' domain knowledge and reasoning ability in the maritime field. --- .../configs/datasets/maritimebench/README.md | 61 ++++++++++++++++++ .../maritimebench/maritimebench_gen.py | 42 ++++++++++++ opencompass/datasets/__init__.py | 1 + opencompass/datasets/maritime_bench.py | 64 +++++++++++++++++++ opencompass/utils/datasets_info.py | 5 ++ opencompass/utils/text_postprocessors.py | 7 ++ 6 files changed, 180 insertions(+) create mode 100644 opencompass/configs/datasets/maritimebench/README.md create mode 100644 opencompass/configs/datasets/maritimebench/maritimebench_gen.py create mode 100644 opencompass/datasets/maritime_bench.py diff --git a/opencompass/configs/datasets/maritimebench/README.md b/opencompass/configs/datasets/maritimebench/README.md new file mode 100644 index 00000000..4a2cab19 --- /dev/null +++ b/opencompass/configs/datasets/maritimebench/README.md @@ -0,0 +1,61 @@ +## 📘 About MaritimeBench + +**MaritimeBench** 是航运行业首个基于“学科(一级)- 子学科(二级)- 具体考点(三级)”分类体系构建的专业知识评测集。该数据集包含 **1888 道客观选择题**,覆盖以下核心领域: + +- 航海 +- 轮机 +- 电子电气员 +- GMDSS(全球海上遇险与安全系统) +- 船员培训 + +评测内容涵盖理论知识、操作技能及行业规范,旨在: + +- 提升 AI 模型在航运领域的 **理解与推理能力** +- 确保其在关键知识点上的 **准确性与适应性** +- 支持航运专业考试、船员培训及资质认证的 **自动化测评** +- 优化船舶管理、导航操作、海上通信等场景下的 **智能问答与决策系统** + +MaritimeBench 基于行业权威标准,构建了 **系统、科学的知识评测体系**,全面衡量模型在航运各专业领域的表现,助力其专业化发展。 + +--- + +## 🧪 示例 + +请回答单选题。要求只输出选项,不输出解释,将选项放在 `< >` 内,直接输出答案。 + +**题目 1:** +在船舶主推进动力装置中,传动轴系在运转中承受以下复杂的应力和负荷,但不包括______。 +选项: +A. 电磁力 +B. 压拉应力 +C. 弯曲应力 +D. 
扭应力 +**答:** `<A>` + +**题目 2:** +当船舶实行 PMS 检验时,应将 CCS 现行规范中规定的特别检验纳入在 PMS 计划表中,下列应包括______。 +① 每年应进行的确认性检查项目 +② 每年应进行的拆检项目 +③ 5 年内应拆检的项目 +④ 5 年内应进行的确认性检查项目 +选项: +A. ①④ +B. ②④ +C. ①③ +D. ①②③④ +**答:** `<D>` + +--- + +## 📂 Dataset Links + +- [MaritimeBench on Hugging Face](https://huggingface.co/datasets/Hi-Dolphin/MaritimeBench) +- [MaritimeBench on ModelScope](https://modelscope.cn/datasets/HiDolphin/MaritimeBench/summary) + +--- + +## 📊 模型测试结果 + +| dataset | version | metric | mode | Qwen2.5-32B | +|----- | ----- | ----- | ----- | -----| +| maritimebench | 6d56ec | accuracy | gen | 72.99 | diff --git a/opencompass/configs/datasets/maritimebench/maritimebench_gen.py b/opencompass/configs/datasets/maritimebench/maritimebench_gen.py new file mode 100644 index 00000000..6c237b08 --- /dev/null +++ b/opencompass/configs/datasets/maritimebench/maritimebench_gen.py @@ -0,0 +1,42 @@ +from opencompass.datasets import MaritimeBenchDataset +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.utils.text_postprocessors import parse_bracketed_answer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +maritimebench_reader_cfg = dict( + input_columns=['question', 'A', 'B', 'C', 'D'], + output_column='answer', + train_split='test' # 明确指定使用test分割 +) + +maritimebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='请回答单选题。要求只输出选项,不输出解释,将选项放在<>里,直接输出答案。示例:\n\n题目:在船舶主推进动力装置中,传动轴系在运转中承受以下复杂的应力和负荷,但不包括______。\n选项:\nA. 电磁力\nB. 压拉应力\nC. 弯曲应力\nD. 
扭应力\n答:<A> 当前题目:\n {question}\nA:{A}\nB:{B}\nC:{C}\nD:{D}') + ] + ), + ), + retriever=dict(type=ZeroRetriever), # no in-context examples are retrieved + inferencer=dict(type=GenInferencer) # inferencer configuration +) + +maritimebench_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=parse_bracketed_answer, options='A|B|C|D') +) + +maritimebench_datasets = [ + dict( + abbr='maritimebench', + type=MaritimeBenchDataset, + name='default', + path='opencompass/maritimebench', + reader_cfg=maritimebench_reader_cfg, + infer_cfg=maritimebench_infer_cfg, + eval_cfg=maritimebench_eval_cfg + ) +] diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 3e2d0eef..9cc5043c 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -85,6 +85,7 @@ from .llm_compression import LLMCompressionDataset # noqa: F401, F403 from .longbench import * # noqa: F401, F403 from .longbenchv2 import * # noqa: F401, F403 from .lveval import * # noqa: F401, F403 +from .maritime_bench import * # noqa: F401, F403 from .mastermath2024v1 import * # noqa: F401, F403 from .math import * # noqa: F401, F403 from .math401 import * # noqa: F401, F403 diff --git a/opencompass/datasets/maritime_bench.py b/opencompass/datasets/maritime_bench.py new file mode 100644 index 00000000..089f8c27 --- /dev/null +++ b/opencompass/datasets/maritime_bench.py @@ -0,0 +1,64 @@ +import json +import os.path as osp +from os import environ + +import datasets +from datasets import Dataset, DatasetDict + +from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class MaritimeBenchDataset(BaseDataset): + + @staticmethod + def load(path: str, name: str) -> datasets.DatasetDict: + path = get_data_path(path) + dataset = DatasetDict() + dataset_list = [] + + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + for split in 
['test']: + # Load the data from ModelScope + ms_dataset = MsDataset.load(path, + subset_name=name, + split=split) + + for line in ms_dataset: + question = line['question'] + A = line['A'] + B = line['B'] + C = line['C'] + D = line['D'] + answer = line['answer'] + dataset_list.append({ + 'question': question, + 'A': A, + 'B': B, + 'C': C, + 'D': D, + 'answer': answer, + }) + # dataset[split] = Dataset.from_list(dataset_list) + else: + for split in ['test']: + filename = osp.join(path, split, f'{name}_{split}.jsonl') + with open(filename, encoding='utf-8') as f: + for line in f: + data = json.loads(line) + dataset_list.append({ + 'question': data['question'], + 'A': data['A'], + 'B': data['B'], + 'C': data['C'], + 'D': data['D'], + 'answer': data['answer'] + }) + + dataset['test'] = Dataset.from_list(dataset_list) + + return dataset diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 00db25e8..0e30a04d 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -420,6 +420,11 @@ DATASETS_MAPPING = { "hf_id": "", "local": "./data/OlympiadBench", }, + "opencompass/maritimebench": { + "ms_id": "HiDolphin/MaritimeBench", + "hf_id": "Hi-Dolphin/MaritimeBench", + "local": "./data/maritimebench", + }, } DATASETS_URL = { diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py index d21a06ab..cfbd7f22 100644 --- a/opencompass/utils/text_postprocessors.py +++ b/opencompass/utils/text_postprocessors.py @@ -283,3 +283,10 @@ def extract_non_reasoning_content( re.DOTALL) non_reasoning_content = reasoning_regex.sub('', text).strip() return non_reasoning_content + + +def parse_bracketed_answer(text: str, options: str) -> str: + match = re.search(rf'<({options})>', text) + if match: + return match.group(1) + return ''