From c26ecdb1b05baea7bcf34c99ea245ae68a3ada83 Mon Sep 17 00:00:00 2001 From: Leymore Date: Mon, 28 Aug 2023 17:35:04 +0800 Subject: [PATCH] [Feature] Add and apply update suffix tool (#280) * add and apply update suffix tool * add dataset suffix updater as precommit hook * update workflow * update scripts * update ci * update * ci with py3.8 * run in serial * update bbh * use py 3.10 * update pre commit zh cn --- .github/workflows/lint.yml | 2 +- .pre-commit-config-zh-cn.yaml | 30 ++++- .pre-commit-config.yaml | 30 ++++- configs/datasets/game24/game24_gen.py | 2 +- ...e24_gen_8dfde3.py => game24_gen_52a460.py} | 0 configs/datasets/humanevalx/humanevalx_gen.py | 2 +- ...gen_fd5822.py => humanevalx_gen_620cfa.py} | 0 configs/datasets/z_bench/z_bench_gen.py | 2 +- ...ch_gen_5813ec.py => z_bench_gen_52ba2f.py} | 2 +- ...ch_gen_61db0a.py => z_bench_gen_d8c84c.py} | 2 +- docs/en/tools.md | 10 ++ docs/zh_cn/tools.md | 10 ++ tools/update_dataset_suffix.py | 123 ++++++++++++++++++ 13 files changed, 195 insertions(+), 20 deletions(-) rename configs/datasets/game24/{game24_gen_8dfde3.py => game24_gen_52a460.py} (100%) rename configs/datasets/humanevalx/{humanevalx_gen_fd5822.py => humanevalx_gen_620cfa.py} (100%) rename configs/datasets/z_bench/{z_bench_gen_5813ec.py => z_bench_gen_52ba2f.py} (97%) rename configs/datasets/z_bench/{z_bench_gen_61db0a.py => z_bench_gen_d8c84c.py} (97%) create mode 100755 tools/update_dataset_suffix.py diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 7d1d2aac..ae9a9bd2 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -17,7 +17,7 @@ jobs: python-version: '3.10' - name: Install pre-commit hook run: | - pip install pre-commit + pip install pre-commit mmengine pre-commit install - name: Linting run: pre-commit run --all-files diff --git a/.pre-commit-config-zh-cn.yaml b/.pre-commit-config-zh-cn.yaml index d5aff2c5..a464f527 100644 --- a/.pre-commit-config-zh-cn.yaml +++ 
b/.pre-commit-config-zh-cn.yaml @@ -2,30 +2,33 @@ exclude: | (?x)^( tests/data/| opencompass/models/internal/| - opencompass/utils/internal/| - configs/ + opencompass/utils/internal/ ) repos: - repo: https://gitee.com/openmmlab/mirrors-flake8 rev: 5.0.4 hooks: - id: flake8 + exclude: configs/ - repo: https://gitee.com/openmmlab/mirrors-isort rev: 5.11.5 hooks: - id: isort + exclude: configs/ - repo: https://gitee.com/openmmlab/mirrors-yapf rev: v0.32.0 hooks: - id: yapf + exclude: configs/ - repo: https://gitee.com/openmmlab/mirrors-codespell rev: v2.2.1 hooks: - id: codespell - exclude: > + exclude: | (?x)^( - .*\.jsonl - )$ + .*\.jsonl| + configs/ + ) - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks rev: v4.3.0 hooks: @@ -33,17 +36,20 @@ repos: exclude: | (?x)^( dicts/| - projects/.*?/dicts/ + projects/.*?/dicts/| + configs/ ) - id: check-yaml - id: end-of-file-fixer exclude: | (?x)^( dicts/| - projects/.*?/dicts/ + projects/.*?/dicts/| + configs/ ) - id: requirements-txt-fixer - id: double-quote-string-fixer + exclude: configs/ - id: check-merge-conflict - id: fix-encoding-pragma args: ["--remove"] @@ -60,11 +66,21 @@ repos: - mdformat-openmmlab - mdformat_frontmatter - linkify-it-py + exclude: configs/ - repo: https://gitee.com/openmmlab/mirrors-docformatter rev: v1.3.1 hooks: - id: docformatter args: ["--in-place", "--wrap-descriptions", "79"] + - repo: local + hooks: + - id: update-dataset-suffix + name: dataset suffix updater + entry: ./tools/update_dataset_suffix.py + language: script + pass_filenames: true + require_serial: true + files: ^configs/datasets # - repo: https://github.com/open-mmlab/pre-commit-hooks # rev: v0.2.0 # Use the ref you want to point at # hooks: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 187ea5fc..8158381e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,30 +2,33 @@ exclude: | (?x)^( tests/data/| opencompass/models/internal/| - opencompass/utils/internal/| - configs/ + 
opencompass/utils/internal/ ) repos: - repo: https://github.com/PyCQA/flake8 rev: 5.0.4 hooks: - id: flake8 + exclude: configs/ - repo: https://github.com/PyCQA/isort rev: 5.11.5 hooks: - id: isort + exclude: configs/ - repo: https://github.com/pre-commit/mirrors-yapf rev: v0.32.0 hooks: - id: yapf + exclude: configs/ - repo: https://github.com/codespell-project/codespell rev: v2.2.1 hooks: - id: codespell - exclude: > + exclude: | (?x)^( - .*\.jsonl - )$ + .*\.jsonl| + configs/ + ) - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.3.0 hooks: @@ -33,17 +36,20 @@ repos: exclude: | (?x)^( dicts/| - projects/.*?/dicts/ + projects/.*?/dicts/| + configs/ ) - id: check-yaml - id: end-of-file-fixer exclude: | (?x)^( dicts/| - projects/.*?/dicts/ + projects/.*?/dicts/| + configs/ ) - id: requirements-txt-fixer - id: double-quote-string-fixer + exclude: configs/ - id: check-merge-conflict - id: fix-encoding-pragma args: ["--remove"] @@ -60,11 +66,21 @@ repos: - mdformat-openmmlab - mdformat_frontmatter - linkify-it-py + exclude: configs/ - repo: https://github.com/myint/docformatter rev: v1.3.1 hooks: - id: docformatter args: ["--in-place", "--wrap-descriptions", "79"] + - repo: local + hooks: + - id: update-dataset-suffix + name: dataset suffix updater + entry: ./tools/update_dataset_suffix.py + language: script + pass_filenames: true + require_serial: true + files: ^configs/datasets # - repo: https://github.com/open-mmlab/pre-commit-hooks # rev: v0.2.0 # Use the ref you want to point at # hooks: diff --git a/configs/datasets/game24/game24_gen.py b/configs/datasets/game24/game24_gen.py index 89de8559..25defc80 100644 --- a/configs/datasets/game24/game24_gen.py +++ b/configs/datasets/game24/game24_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .game24_gen_8dfde3 import game24_datasets # noqa: F401, F403 + from .game24_gen_52a460 import game24_datasets # noqa: F401, F403 diff --git 
a/configs/datasets/game24/game24_gen_8dfde3.py b/configs/datasets/game24/game24_gen_52a460.py similarity index 100% rename from configs/datasets/game24/game24_gen_8dfde3.py rename to configs/datasets/game24/game24_gen_52a460.py diff --git a/configs/datasets/humanevalx/humanevalx_gen.py b/configs/datasets/humanevalx/humanevalx_gen.py index ac2b34e7..52238a17 100644 --- a/configs/datasets/humanevalx/humanevalx_gen.py +++ b/configs/datasets/humanevalx/humanevalx_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .humanevalx_gen_fd5822 import humanevalx_datasets # noqa: F401, F403 \ No newline at end of file + from .humanevalx_gen_620cfa import humanevalx_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/configs/datasets/humanevalx/humanevalx_gen_fd5822.py b/configs/datasets/humanevalx/humanevalx_gen_620cfa.py similarity index 100% rename from configs/datasets/humanevalx/humanevalx_gen_fd5822.py rename to configs/datasets/humanevalx/humanevalx_gen_620cfa.py diff --git a/configs/datasets/z_bench/z_bench_gen.py b/configs/datasets/z_bench/z_bench_gen.py index 5d71997a..a30a1a12 100644 --- a/configs/datasets/z_bench/z_bench_gen.py +++ b/configs/datasets/z_bench/z_bench_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .z_bench_gen_5813ec import z_bench_dataset # noqa: F401, F403 + from .z_bench_gen_52ba2f import z_bench_datasets # noqa: F401, F403 diff --git a/configs/datasets/z_bench/z_bench_gen_5813ec.py b/configs/datasets/z_bench/z_bench_gen_52ba2f.py similarity index 97% rename from configs/datasets/z_bench/z_bench_gen_5813ec.py rename to configs/datasets/z_bench/z_bench_gen_52ba2f.py index 49077976..39e3f1f9 100644 --- a/configs/datasets/z_bench/z_bench_gen_5813ec.py +++ b/configs/datasets/z_bench/z_bench_gen_52ba2f.py @@ -14,7 +14,7 @@ z_bench_infer_cfg = dict( retriever=dict(type=ZeroRetriever), inferencer=dict(type=GenInferencer)) -z_bench_dataset = dict( +z_bench_datasets = 
dict( type=HFDataset, path= '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench', diff --git a/configs/datasets/z_bench/z_bench_gen_61db0a.py b/configs/datasets/z_bench/z_bench_gen_d8c84c.py similarity index 97% rename from configs/datasets/z_bench/z_bench_gen_61db0a.py rename to configs/datasets/z_bench/z_bench_gen_d8c84c.py index 63cfded8..33149155 100644 --- a/configs/datasets/z_bench/z_bench_gen_61db0a.py +++ b/configs/datasets/z_bench/z_bench_gen_d8c84c.py @@ -17,7 +17,7 @@ z_bench_infer_cfg = dict( retriever=dict(type=ZeroRetriever), inferencer=dict(type=GenInferencer)) -z_bench_dataset = dict( +z_bench_datasets = dict( type=HFDataset, path= '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench', diff --git a/docs/en/tools.md b/docs/en/tools.md index 28872285..51438f9e 100644 --- a/docs/en/tools.md +++ b/docs/en/tools.md @@ -69,3 +69,13 @@ python tools/prediction_merger.py CONFIG_PATH [-w WORK_DIR] ``` - `-w`: Work path, default is `'./outputs/default'`. + +## Dataset Suffix Updater + +This tool can quickly rename the configuration files located under the `configs/datasets` directory, aligning their suffixes with the prompt-hash-based naming convention.
+ +How to run: + +```bash +python tools/update_dataset_suffix.py +``` diff --git a/docs/zh_cn/tools.md b/docs/zh_cn/tools.md index ffb371b1..4a9c7bb6 100644 --- a/docs/zh_cn/tools.md +++ b/docs/zh_cn/tools.md @@ -78,3 +78,13 @@ python tools/prediction_merger.py CONFIG_PATH [-w WORK_DIR] ``` - `-w`:工作路径,默认为 `'./outputs/default'`。 + +## Dataset Suffix Updater + +本工具可以快速修改 `configs/datasets` 目录下的配置文件后缀,使其符合提示词哈希命名规范。 + +运行方式: + +```bash +python tools/update_dataset_suffix.py +``` diff --git a/tools/update_dataset_suffix.py b/tools/update_dataset_suffix.py new file mode 100755 index 00000000..4bf3c9a3 --- /dev/null +++ b/tools/update_dataset_suffix.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +import argparse +import glob +import hashlib +import json +import os +import re +from multiprocessing import Pool +from typing import List, Union + +from mmengine.config import Config, ConfigDict + + +# from opencompass.utils import get_prompt_hash +# copied from opencompass.utils.get_prompt_hash, for easy use in ci +def get_prompt_hash(dataset_cfg: Union[ConfigDict, List[ConfigDict]]) -> str: + """Get the hash of the prompt configuration. + + Args: + dataset_cfg (ConfigDict or list[ConfigDict]): The dataset + configuration. + + Returns: + str: The hash of the prompt configuration.
+ """ + if isinstance(dataset_cfg, list): + if len(dataset_cfg) == 1: + dataset_cfg = dataset_cfg[0] + else: + hashes = ','.join([get_prompt_hash(cfg) for cfg in dataset_cfg]) + hash_object = hashlib.sha256(hashes.encode()) + return hash_object.hexdigest() + if 'reader_cfg' in dataset_cfg.infer_cfg: + # new config + reader_cfg = dict(type='DatasetReader', + input_columns=dataset_cfg.reader_cfg.input_columns, + output_column=dataset_cfg.reader_cfg.output_column) + dataset_cfg.infer_cfg.reader = reader_cfg + if 'train_split' in dataset_cfg.infer_cfg.reader_cfg: + dataset_cfg.infer_cfg.retriever[ + 'index_split'] = dataset_cfg.infer_cfg['reader_cfg'][ + 'train_split'] + if 'test_split' in dataset_cfg.infer_cfg.reader_cfg: + dataset_cfg.infer_cfg.retriever[ + 'test_split'] = dataset_cfg.infer_cfg.reader_cfg.test_split + for k, v in dataset_cfg.infer_cfg.items(): + dataset_cfg.infer_cfg[k]['type'] = v['type'].split('.')[-1] + d_json = json.dumps(dataset_cfg.infer_cfg.to_dict(), sort_keys=True) + hash_object = hashlib.sha256(d_json.encode()) + return hash_object.hexdigest() + + +# Assuming get_hash is a function that computes the hash of a file +# from get_hash import get_hash +def get_hash(path): + cfg = Config.fromfile(path) + for k in cfg.keys(): + if k.endswith('_datasets'): + return get_prompt_hash(cfg[k])[:6] + print(f'Could not find *_datasets in {path}') + return None + + +def check_and_rename(filepath): + base_name = os.path.basename(filepath) + match = re.match(r'(.*)_(gen|ppl)_(.*).py', base_name) + if match: + dataset, mode, old_hash = match.groups() + new_hash = get_hash(filepath) + if not new_hash: + return None, None + if old_hash != new_hash: + new_name = f'{dataset}_{mode}_{new_hash}.py' + new_file = os.path.join(os.path.dirname(filepath), new_name) + print(f'Rename {filepath} to {new_file}') + return filepath, new_file + return None, None + + +def update_imports(data): + python_file, name_pairs = data + for filepath, new_file in name_pairs: + old_name = 
os.path.basename(filepath)[:-3] + new_name = os.path.basename(new_file)[:-3] + if not os.path.exists(python_file): + return + with open(python_file, 'r') as file: + filedata = file.read() + # Replace the old name with new name + new_data = filedata.replace(old_name, new_name) + if filedata != new_data: + with open(python_file, 'w') as file: + file.write(new_data) + # print(f"Updated imports in {python_file}") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('python_files', nargs='*') + args = parser.parse_args() + + root_folder = 'configs/datasets' + if args.python_files: + python_files = [ + i for i in args.python_files if i.startswith(root_folder) + ] + else: + python_files = glob.glob(f'{root_folder}/**/*.py', recursive=True) + + # Use multiprocessing to speed up the check and rename process + with Pool(16) as p: + name_pairs = p.map(check_and_rename, python_files) + name_pairs = [pair for pair in name_pairs if pair[0] is not None] + with Pool(16) as p: + p.starmap(os.rename, name_pairs) + # python_files = glob.glob(f'{root_folder}/**/*.py', recursive=True) + update_data = [(python_file, name_pairs) for python_file in python_files] + with Pool(16) as p: + p.map(update_imports, update_data) + + +if __name__ == '__main__': + main()