[Feature] Add and apply update suffix tool (#280)

* add and apply update suffix tool

* add dataset suffix updater as precommit hook

* update workflow

* update scripts

* update ci

* update

* ci with py3.8

* run in serial

* update bbh

* use py 3.10

* update pre commit zh cn
Leymore 2023-08-28 17:35:04 +08:00 committed by GitHub
parent b2d602f42b
commit c26ecdb1b0
13 changed files with 195 additions and 20 deletions


@@ -17,7 +17,7 @@ jobs:
           python-version: '3.10'
       - name: Install pre-commit hook
         run: |
-          pip install pre-commit
+          pip install pre-commit mmengine
           pre-commit install
       - name: Linting
         run: pre-commit run --all-files
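For reference, the updated CI steps can be mirrored locally before pushing; a minimal sketch (mmengine is needed because the new suffix hook imports `mmengine.config`):

```bash
pip install pre-commit mmengine
pre-commit install
pre-commit run --all-files
```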


@@ -2,30 +2,33 @@ exclude: |
     (?x)^(
         tests/data/|
         opencompass/models/internal/|
-        opencompass/utils/internal/|
-        configs/
+        opencompass/utils/internal/
     )
 repos:
   - repo: https://gitee.com/openmmlab/mirrors-flake8
     rev: 5.0.4
     hooks:
       - id: flake8
+        exclude: configs/
   - repo: https://gitee.com/openmmlab/mirrors-isort
     rev: 5.11.5
     hooks:
       - id: isort
+        exclude: configs/
   - repo: https://gitee.com/openmmlab/mirrors-yapf
     rev: v0.32.0
     hooks:
       - id: yapf
+        exclude: configs/
   - repo: https://gitee.com/openmmlab/mirrors-codespell
     rev: v2.2.1
     hooks:
       - id: codespell
-        exclude: >
+        exclude: |
           (?x)^(
-              .*\.jsonl
-          )$
+              .*\.jsonl|
+              configs/
+          )
   - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks
     rev: v4.3.0
     hooks:
@@ -33,17 +36,20 @@ repos:
         exclude: |
           (?x)^(
               dicts/|
-              projects/.*?/dicts/
+              projects/.*?/dicts/|
+              configs/
           )
       - id: check-yaml
       - id: end-of-file-fixer
         exclude: |
           (?x)^(
               dicts/|
-              projects/.*?/dicts/
+              projects/.*?/dicts/|
+              configs/
           )
       - id: requirements-txt-fixer
       - id: double-quote-string-fixer
+        exclude: configs/
       - id: check-merge-conflict
       - id: fix-encoding-pragma
         args: ["--remove"]
@@ -60,11 +66,21 @@ repos:
           - mdformat-openmmlab
           - mdformat_frontmatter
           - linkify-it-py
+        exclude: configs/
   - repo: https://gitee.com/openmmlab/mirrors-docformatter
     rev: v1.3.1
     hooks:
       - id: docformatter
         args: ["--in-place", "--wrap-descriptions", "79"]
+  - repo: local
+    hooks:
+      - id: update-dataset-suffix
+        name: dataset suffix updater
+        entry: ./tools/update_dataset_suffix.py
+        language: script
+        pass_filenames: true
+        require_serial: true
+        files: ^configs/datasets
 #  - repo: https://github.com/open-mmlab/pre-commit-hooks
 #    rev: v0.2.0  # Use the ref you want to point at
 #    hooks:
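With the `repo: local` entry above, the new hook can also be run on its own through pre-commit, for example:

```bash
pre-commit run update-dataset-suffix --all-files
```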


@@ -2,30 +2,33 @@ exclude: |
     (?x)^(
         tests/data/|
         opencompass/models/internal/|
-        opencompass/utils/internal/|
-        configs/
+        opencompass/utils/internal/
     )
 repos:
   - repo: https://github.com/PyCQA/flake8
     rev: 5.0.4
     hooks:
       - id: flake8
+        exclude: configs/
   - repo: https://github.com/PyCQA/isort
     rev: 5.11.5
     hooks:
       - id: isort
+        exclude: configs/
   - repo: https://github.com/pre-commit/mirrors-yapf
     rev: v0.32.0
     hooks:
       - id: yapf
+        exclude: configs/
   - repo: https://github.com/codespell-project/codespell
     rev: v2.2.1
     hooks:
       - id: codespell
-        exclude: >
+        exclude: |
           (?x)^(
-              .*\.jsonl
-          )$
+              .*\.jsonl|
+              configs/
+          )
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.3.0
     hooks:
@@ -33,17 +36,20 @@ repos:
         exclude: |
           (?x)^(
               dicts/|
-              projects/.*?/dicts/
+              projects/.*?/dicts/|
+              configs/
           )
       - id: check-yaml
       - id: end-of-file-fixer
         exclude: |
           (?x)^(
               dicts/|
-              projects/.*?/dicts/
+              projects/.*?/dicts/|
+              configs/
           )
       - id: requirements-txt-fixer
       - id: double-quote-string-fixer
+        exclude: configs/
       - id: check-merge-conflict
       - id: fix-encoding-pragma
         args: ["--remove"]
@@ -60,11 +66,21 @@ repos:
           - mdformat-openmmlab
           - mdformat_frontmatter
           - linkify-it-py
+        exclude: configs/
   - repo: https://github.com/myint/docformatter
     rev: v1.3.1
     hooks:
       - id: docformatter
         args: ["--in-place", "--wrap-descriptions", "79"]
+  - repo: local
+    hooks:
+      - id: update-dataset-suffix
+        name: dataset suffix updater
+        entry: ./tools/update_dataset_suffix.py
+        language: script
+        pass_filenames: true
+        require_serial: true
+        files: ^configs/datasets
 #  - repo: https://github.com/open-mmlab/pre-commit-hooks
 #    rev: v0.2.0  # Use the ref you want to point at
 #    hooks:
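To limit the hook to specific configs instead of the whole tree, pre-commit's `--files` option works as well. The path below is illustrative, assuming the game24 aggregate config edited in this commit lives at the usual `configs/datasets/<name>/` location:

```bash
pre-commit run update-dataset-suffix --files configs/datasets/game24/game24_gen.py
```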


@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .game24_gen_8dfde3 import game24_datasets  # noqa: F401, F403
+    from .game24_gen_52a460 import game24_datasets  # noqa: F401, F403


@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .humanevalx_gen_fd5822 import humanevalx_datasets  # noqa: F401, F403
+    from .humanevalx_gen_620cfa import humanevalx_datasets  # noqa: F401, F403


@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .z_bench_gen_5813ec import z_bench_dataset  # noqa: F401, F403
+    from .z_bench_gen_52ba2f import z_bench_datasets  # noqa: F401, F403


@@ -14,7 +14,7 @@ z_bench_infer_cfg = dict(
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=GenInferencer))

-z_bench_dataset = dict(
+z_bench_datasets = dict(
     type=HFDataset,
     path=
     '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench',


@@ -17,7 +17,7 @@ z_bench_infer_cfg = dict(
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=GenInferencer))

-z_bench_dataset = dict(
+z_bench_datasets = dict(
     type=HFDataset,
     path=
     '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench',
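These renames matter because the suffix tool only hashes top-level config variables ending in `_datasets` (see `get_hash` in the new script below). A quick, illustrative way to check for configs still using the old singular name:

```bash
grep -rn 'z_bench_dataset\b' configs/
```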


@@ -69,3 +69,13 @@ python tools/prediction_merger.py CONFIG_PATH [-w WORK_DIR]
 ```
 
 - `-w`: Work path, default is `'./outputs/default'`.
+
+## Dataset Suffix Updater
+
+This tool can quickly rename the suffixes of configuration files under the `configs/datasets` directory, aligning them with the prompt-hash-based naming convention.
+
+How to run:
+
+```bash
+python tools/update_dataset_suffix.py
+```
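Since the hook sets `pass_filenames: true` and the script's argument parser accepts explicit paths, the tool can also be pointed at individual files; for example, reusing the renamed game24 config from this commit (assuming the usual `configs/datasets/<name>/` layout):

```bash
python tools/update_dataset_suffix.py configs/datasets/game24/game24_gen_52a460.py
```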


@@ -78,3 +78,13 @@ python tools/prediction_merger.py CONFIG_PATH [-w WORK_DIR]
 ```
 
 - `-w`: Work path, defaults to `'./outputs/default'`.
+
+## Dataset Suffix Updater
+
+This tool quickly renames the suffixes of configuration files under the `configs/datasets` directory so that they follow the prompt-hash naming convention.
+
+How to run:
+
+```bash
+python tools/update_dataset_suffix.py
+```

tools/update_dataset_suffix.py (new executable file, 123 lines)

@@ -0,0 +1,123 @@
#!/usr/bin/env python3
import argparse
import glob
import hashlib
import json
import os
import re
from multiprocessing import Pool
from typing import List, Union

from mmengine.config import Config, ConfigDict

# from opencompass.utils import get_prompt_hash


# copied from opencompass.utils.get_prompt_hash, for easy use in ci
def get_prompt_hash(dataset_cfg: Union[ConfigDict, List[ConfigDict]]) -> str:
    """Get the hash of the prompt configuration.

    Args:
        dataset_cfg (ConfigDict or list[ConfigDict]): The dataset
            configuration.

    Returns:
        str: The hash of the prompt configuration.
    """
    if isinstance(dataset_cfg, list):
        if len(dataset_cfg) == 1:
            dataset_cfg = dataset_cfg[0]
        else:
            # hash of a dataset list is the hash of its members' hashes
            hashes = ','.join([get_prompt_hash(cfg) for cfg in dataset_cfg])
            hash_object = hashlib.sha256(hashes.encode())
            return hash_object.hexdigest()
    if 'reader_cfg' in dataset_cfg.infer_cfg:
        # new config: normalize reader/retriever fields before hashing
        reader_cfg = dict(type='DatasetReader',
                          input_columns=dataset_cfg.reader_cfg.input_columns,
                          output_column=dataset_cfg.reader_cfg.output_column)
        dataset_cfg.infer_cfg.reader = reader_cfg
        if 'train_split' in dataset_cfg.infer_cfg.reader_cfg:
            dataset_cfg.infer_cfg.retriever[
                'index_split'] = dataset_cfg.infer_cfg['reader_cfg'][
                    'train_split']
        if 'test_split' in dataset_cfg.infer_cfg.reader_cfg:
            dataset_cfg.infer_cfg.retriever[
                'test_split'] = dataset_cfg.infer_cfg.reader_cfg.test_split
        for k, v in dataset_cfg.infer_cfg.items():
            dataset_cfg.infer_cfg[k]['type'] = v['type'].split('.')[-1]
    d_json = json.dumps(dataset_cfg.infer_cfg.to_dict(), sort_keys=True)
    hash_object = hashlib.sha256(d_json.encode())
    return hash_object.hexdigest()


def get_hash(path):
    """Return the first 6 chars of the prompt hash of a dataset config."""
    cfg = Config.fromfile(path)
    for k in cfg.keys():
        if k.endswith('_datasets'):
            return get_prompt_hash(cfg[k])[:6]
    print(f'Could not find *_datasets in {path}')
    return None


def check_and_rename(filepath):
    """Return (old_path, new_path) when the stored suffix is stale, else
    (None, None)."""
    base_name = os.path.basename(filepath)
    match = re.match(r'(.*)_(gen|ppl)_(.*)\.py$', base_name)
    if match:
        dataset, mode, old_hash = match.groups()
        new_hash = get_hash(filepath)
        if not new_hash:
            return None, None
        if old_hash != new_hash:
            new_name = f'{dataset}_{mode}_{new_hash}.py'
            new_file = os.path.join(os.path.dirname(filepath), new_name)
            print(f'Rename {filepath} to {new_file}')
            return filepath, new_file
    return None, None


def update_imports(data):
    """Rewrite imports inside a config file after the renames."""
    python_file, name_pairs = data
    for filepath, new_file in name_pairs:
        old_name = os.path.basename(filepath)[:-3]
        new_name = os.path.basename(new_file)[:-3]
        if not os.path.exists(python_file):
            return
        with open(python_file, 'r') as file:
            filedata = file.read()
        # Replace the old module name with the new one
        new_data = filedata.replace(old_name, new_name)
        if filedata != new_data:
            with open(python_file, 'w') as file:
                file.write(new_data)
            # print(f"Updated imports in {python_file}")


def main():
    parser = argparse.ArgumentParser()
    # pre-commit passes the changed files as positional arguments
    parser.add_argument('python_files', nargs='*')
    args = parser.parse_args()

    root_folder = 'configs/datasets'
    if args.python_files:
        python_files = [
            i for i in args.python_files if i.startswith(root_folder)
        ]
    else:
        python_files = glob.glob(f'{root_folder}/**/*.py', recursive=True)

    # Use multiprocessing to speed up the check and rename process
    with Pool(16) as p:
        name_pairs = p.map(check_and_rename, python_files)
    name_pairs = [pair for pair in name_pairs if pair[0] is not None]
    with Pool(16) as p:
        p.starmap(os.rename, name_pairs)
    # python_files = glob.glob(f'{root_folder}/**/*.py', recursive=True)
    update_data = [(python_file, name_pairs) for python_file in python_files]
    with Pool(16) as p:
        p.map(update_imports, update_data)


if __name__ == '__main__':
    main()