From c26ecdb1b05baea7bcf34c99ea245ae68a3ada83 Mon Sep 17 00:00:00 2001 From: Leymore Date: Mon, 28 Aug 2023 17:35:04 +0800 Subject: [PATCH] [Feature] Add and apply update suffix tool (#280) * add and apply update suffix tool * add dataset suffix updater as precommit hook * update workflow * update scripts * update ci * update * ci with py3.8 * run in serial * update bbh * use py 3.10 * update pre commit zh cn --- .github/workflows/lint.yml | 2 +- .pre-commit-config-zh-cn.yaml | 30 ++++- .pre-commit-config.yaml | 30 ++++- configs/datasets/game24/game24_gen.py | 2 +- ...e24_gen_8dfde3.py => game24_gen_52a460.py} | 0 configs/datasets/humanevalx/humanevalx_gen.py | 2 +- ...gen_fd5822.py => humanevalx_gen_620cfa.py} | 0 configs/datasets/z_bench/z_bench_gen.py | 2 +- ...ch_gen_5813ec.py => z_bench_gen_52ba2f.py} | 2 +- ...ch_gen_61db0a.py => z_bench_gen_d8c84c.py} | 2 +- docs/en/tools.md | 10 ++ docs/zh_cn/tools.md | 10 ++ tools/update_dataset_suffix.py | 123 ++++++++++++++++++ 13 files changed, 195 insertions(+), 20 deletions(-) rename configs/datasets/game24/{game24_gen_8dfde3.py => game24_gen_52a460.py} (100%) rename configs/datasets/humanevalx/{humanevalx_gen_fd5822.py => humanevalx_gen_620cfa.py} (100%) rename configs/datasets/z_bench/{z_bench_gen_5813ec.py => z_bench_gen_52ba2f.py} (97%) rename configs/datasets/z_bench/{z_bench_gen_61db0a.py => z_bench_gen_d8c84c.py} (97%) create mode 100755 tools/update_dataset_suffix.py diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 7d1d2aac..ae9a9bd2 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -17,7 +17,7 @@ jobs: python-version: '3.10' - name: Install pre-commit hook run: | - pip install pre-commit + pip install pre-commit mmengine pre-commit install - name: Linting run: pre-commit run --all-files diff --git a/.pre-commit-config-zh-cn.yaml b/.pre-commit-config-zh-cn.yaml index d5aff2c5..a464f527 100644 --- a/.pre-commit-config-zh-cn.yaml +++ 
b/.pre-commit-config-zh-cn.yaml @@ -2,30 +2,33 @@ exclude: | (?x)^( tests/data/| opencompass/models/internal/| - opencompass/utils/internal/| - configs/ + opencompass/utils/internal/ ) repos: - repo: https://gitee.com/openmmlab/mirrors-flake8 rev: 5.0.4 hooks: - id: flake8 + exclude: configs/ - repo: https://gitee.com/openmmlab/mirrors-isort rev: 5.11.5 hooks: - id: isort + exclude: configs/ - repo: https://gitee.com/openmmlab/mirrors-yapf rev: v0.32.0 hooks: - id: yapf + exclude: configs/ - repo: https://gitee.com/openmmlab/mirrors-codespell rev: v2.2.1 hooks: - id: codespell - exclude: > + exclude: | (?x)^( - .*\.jsonl - )$ + .*\.jsonl| + configs/ + ) - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks rev: v4.3.0 hooks: @@ -33,17 +36,20 @@ repos: exclude: | (?x)^( dicts/| - projects/.*?/dicts/ + projects/.*?/dicts/| + configs/ ) - id: check-yaml - id: end-of-file-fixer exclude: | (?x)^( dicts/| - projects/.*?/dicts/ + projects/.*?/dicts/| + configs/ ) - id: requirements-txt-fixer - id: double-quote-string-fixer + exclude: configs/ - id: check-merge-conflict - id: fix-encoding-pragma args: ["--remove"] @@ -60,11 +66,21 @@ repos: - mdformat-openmmlab - mdformat_frontmatter - linkify-it-py + exclude: configs/ - repo: https://gitee.com/openmmlab/mirrors-docformatter rev: v1.3.1 hooks: - id: docformatter args: ["--in-place", "--wrap-descriptions", "79"] + - repo: local + hooks: + - id: update-dataset-suffix + name: dataset suffix updater + entry: ./tools/update_dataset_suffix.py + language: script + pass_filenames: true + require_serial: true + files: ^configs/datasets # - repo: https://github.com/open-mmlab/pre-commit-hooks # rev: v0.2.0 # Use the ref you want to point at # hooks: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 187ea5fc..8158381e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,30 +2,33 @@ exclude: | (?x)^( tests/data/| opencompass/models/internal/| - opencompass/utils/internal/| - configs/ + 
opencompass/utils/internal/ ) repos: - repo: https://github.com/PyCQA/flake8 rev: 5.0.4 hooks: - id: flake8 + exclude: configs/ - repo: https://github.com/PyCQA/isort rev: 5.11.5 hooks: - id: isort + exclude: configs/ - repo: https://github.com/pre-commit/mirrors-yapf rev: v0.32.0 hooks: - id: yapf + exclude: configs/ - repo: https://github.com/codespell-project/codespell rev: v2.2.1 hooks: - id: codespell - exclude: > + exclude: | (?x)^( - .*\.jsonl - )$ + .*\.jsonl| + configs/ + ) - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.3.0 hooks: @@ -33,17 +36,20 @@ repos: exclude: | (?x)^( dicts/| - projects/.*?/dicts/ + projects/.*?/dicts/| + configs/ ) - id: check-yaml - id: end-of-file-fixer exclude: | (?x)^( dicts/| - projects/.*?/dicts/ + projects/.*?/dicts/| + configs/ ) - id: requirements-txt-fixer - id: double-quote-string-fixer + exclude: configs/ - id: check-merge-conflict - id: fix-encoding-pragma args: ["--remove"] @@ -60,11 +66,21 @@ repos: - mdformat-openmmlab - mdformat_frontmatter - linkify-it-py + exclude: configs/ - repo: https://github.com/myint/docformatter rev: v1.3.1 hooks: - id: docformatter args: ["--in-place", "--wrap-descriptions", "79"] + - repo: local + hooks: + - id: update-dataset-suffix + name: dataset suffix updater + entry: ./tools/update_dataset_suffix.py + language: script + pass_filenames: true + require_serial: true + files: ^configs/datasets # - repo: https://github.com/open-mmlab/pre-commit-hooks # rev: v0.2.0 # Use the ref you want to point at # hooks: diff --git a/configs/datasets/game24/game24_gen.py b/configs/datasets/game24/game24_gen.py index 89de8559..25defc80 100644 --- a/configs/datasets/game24/game24_gen.py +++ b/configs/datasets/game24/game24_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .game24_gen_8dfde3 import game24_datasets # noqa: F401, F403 + from .game24_gen_52a460 import game24_datasets # noqa: F401, F403 diff --git 
a/configs/datasets/game24/game24_gen_8dfde3.py b/configs/datasets/game24/game24_gen_52a460.py similarity index 100% rename from configs/datasets/game24/game24_gen_8dfde3.py rename to configs/datasets/game24/game24_gen_52a460.py diff --git a/configs/datasets/humanevalx/humanevalx_gen.py b/configs/datasets/humanevalx/humanevalx_gen.py index ac2b34e7..52238a17 100644 --- a/configs/datasets/humanevalx/humanevalx_gen.py +++ b/configs/datasets/humanevalx/humanevalx_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .humanevalx_gen_fd5822 import humanevalx_datasets # noqa: F401, F403 \ No newline at end of file + from .humanevalx_gen_620cfa import humanevalx_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/configs/datasets/humanevalx/humanevalx_gen_fd5822.py b/configs/datasets/humanevalx/humanevalx_gen_620cfa.py similarity index 100% rename from configs/datasets/humanevalx/humanevalx_gen_fd5822.py rename to configs/datasets/humanevalx/humanevalx_gen_620cfa.py diff --git a/configs/datasets/z_bench/z_bench_gen.py b/configs/datasets/z_bench/z_bench_gen.py index 5d71997a..a30a1a12 100644 --- a/configs/datasets/z_bench/z_bench_gen.py +++ b/configs/datasets/z_bench/z_bench_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .z_bench_gen_5813ec import z_bench_dataset # noqa: F401, F403 + from .z_bench_gen_52ba2f import z_bench_datasets # noqa: F401, F403 diff --git a/configs/datasets/z_bench/z_bench_gen_5813ec.py b/configs/datasets/z_bench/z_bench_gen_52ba2f.py similarity index 97% rename from configs/datasets/z_bench/z_bench_gen_5813ec.py rename to configs/datasets/z_bench/z_bench_gen_52ba2f.py index 49077976..39e3f1f9 100644 --- a/configs/datasets/z_bench/z_bench_gen_5813ec.py +++ b/configs/datasets/z_bench/z_bench_gen_52ba2f.py @@ -14,7 +14,7 @@ z_bench_infer_cfg = dict( retriever=dict(type=ZeroRetriever), inferencer=dict(type=GenInferencer)) -z_bench_dataset = dict( +z_bench_datasets = 
dict( type=HFDataset, path= '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench', diff --git a/configs/datasets/z_bench/z_bench_gen_61db0a.py b/configs/datasets/z_bench/z_bench_gen_d8c84c.py similarity index 97% rename from configs/datasets/z_bench/z_bench_gen_61db0a.py rename to configs/datasets/z_bench/z_bench_gen_d8c84c.py index 63cfded8..33149155 100644 --- a/configs/datasets/z_bench/z_bench_gen_61db0a.py +++ b/configs/datasets/z_bench/z_bench_gen_d8c84c.py @@ -17,7 +17,7 @@ z_bench_infer_cfg = dict( retriever=dict(type=ZeroRetriever), inferencer=dict(type=GenInferencer)) -z_bench_dataset = dict( +z_bench_datasets = dict( type=HFDataset, path= '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench', diff --git a/docs/en/tools.md b/docs/en/tools.md index 28872285..51438f9e 100644 --- a/docs/en/tools.md +++ b/docs/en/tools.md @@ -69,3 +69,13 @@ python tools/prediction_merger.py CONFIG_PATH [-w WORK_DIR] ``` - `-w`: Work path, default is `'./outputs/default'`. + +## Dataset Suffix Updater + +This tool can quickly rename the configuration files located under the `configs/datasets` directory, aligning their suffixes with the prompt-hash-based naming convention.
+ +How to run: + +```bash +python tools/update_dataset_suffix.py +``` diff --git a/docs/zh_cn/tools.md b/docs/zh_cn/tools.md index ffb371b1..4a9c7bb6 100644 --- a/docs/zh_cn/tools.md +++ b/docs/zh_cn/tools.md @@ -78,3 +78,13 @@ python tools/prediction_merger.py CONFIG_PATH [-w WORK_DIR] ``` - `-w`:工作路径,默认为 `'./outputs/default'`。 + +## Dataset Suffix Updater + +本工具可以快速修改 `configs/datasets` 目录下的配置文件后缀,使其符合提示词哈希命名规范。 + +运行方式: + +```bash +python tools/update_dataset_suffix.py +``` diff --git a/tools/update_dataset_suffix.py b/tools/update_dataset_suffix.py new file mode 100755 index 00000000..4bf3c9a3 --- /dev/null +++ b/tools/update_dataset_suffix.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +import argparse +import glob +import hashlib +import json +import os +import re +from multiprocessing import Pool +from typing import List, Union + +from mmengine.config import Config, ConfigDict + + +# from opencompass.utils import get_prompt_hash +# copied from opencompass.utils.get_prompt_hash, for easy use in ci +def get_prompt_hash(dataset_cfg: Union[ConfigDict, List[ConfigDict]]) -> str: + """Get the hash of the prompt configuration. + + Args: + dataset_cfg (ConfigDict or list[ConfigDict]): The dataset + configuration. + + Returns: + str: The hash of the prompt configuration.
+ """ + if isinstance(dataset_cfg, list): + if len(dataset_cfg) == 1: + dataset_cfg = dataset_cfg[0] + else: + hashes = ','.join([get_prompt_hash(cfg) for cfg in dataset_cfg]) + hash_object = hashlib.sha256(hashes.encode()) + return hash_object.hexdigest() + if 'reader_cfg' in dataset_cfg.infer_cfg: + # new config + reader_cfg = dict(type='DatasetReader', + input_columns=dataset_cfg.reader_cfg.input_columns, + output_column=dataset_cfg.reader_cfg.output_column) + dataset_cfg.infer_cfg.reader = reader_cfg + if 'train_split' in dataset_cfg.infer_cfg.reader_cfg: + dataset_cfg.infer_cfg.retriever[ + 'index_split'] = dataset_cfg.infer_cfg['reader_cfg'][ + 'train_split'] + if 'test_split' in dataset_cfg.infer_cfg.reader_cfg: + dataset_cfg.infer_cfg.retriever[ + 'test_split'] = dataset_cfg.infer_cfg.reader_cfg.test_split + for k, v in dataset_cfg.infer_cfg.items(): + dataset_cfg.infer_cfg[k]['type'] = v['type'].split('.')[-1] + d_json = json.dumps(dataset_cfg.infer_cfg.to_dict(), sort_keys=True) + hash_object = hashlib.sha256(d_json.encode()) + return hash_object.hexdigest() + + +# Assuming get_hash is a function that computes the hash of a file +# from get_hash import get_hash +def get_hash(path): + cfg = Config.fromfile(path) + for k in cfg.keys(): + if k.endswith('_datasets'): + return get_prompt_hash(cfg[k])[:6] + print(f'Could not find *_datasets in {path}') + return None + + +def check_and_rename(filepath): + base_name = os.path.basename(filepath) + match = re.match(r'(.*)_(gen|ppl)_(.*).py', base_name) + if match: + dataset, mode, old_hash = match.groups() + new_hash = get_hash(filepath) + if not new_hash: + return None, None + if old_hash != new_hash: + new_name = f'{dataset}_{mode}_{new_hash}.py' + new_file = os.path.join(os.path.dirname(filepath), new_name) + print(f'Rename {filepath} to {new_file}') + return filepath, new_file + return None, None + + +def update_imports(data): + python_file, name_pairs = data + for filepath, new_file in name_pairs: + old_name = 
os.path.basename(filepath)[:-3] + new_name = os.path.basename(new_file)[:-3] + if not os.path.exists(python_file): + return + with open(python_file, 'r') as file: + filedata = file.read() + # Replace the old name with new name + new_data = filedata.replace(old_name, new_name) + if filedata != new_data: + with open(python_file, 'w') as file: + file.write(new_data) + # print(f"Updated imports in {python_file}") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('python_files', nargs='*') + args = parser.parse_args() + + root_folder = 'configs/datasets' + if args.python_files: + python_files = [ + i for i in args.python_files if i.startswith(root_folder) + ] + else: + python_files = glob.glob(f'{root_folder}/**/*.py', recursive=True) + + # Use multiprocessing to speed up the check and rename process + with Pool(16) as p: + name_pairs = p.map(check_and_rename, python_files) + name_pairs = [pair for pair in name_pairs if pair[0] is not None] + with Pool(16) as p: + p.starmap(os.rename, name_pairs) + # python_files = glob.glob(f'{root_folder}/**/*.py', recursive=True) + update_data = [(python_file, name_pairs) for python_file in python_files] + with Pool(16) as p: + p.map(update_imports, update_data) + + +if __name__ == '__main__': + main()