[Feature] Add and apply update suffix tool (#280)

* add and apply update suffix tool

* add dataset suffix updater as precommit hook

* update workflow

* update scripts

* update ci

* update

* ci with py3.8

* run in serial

* update bbh

* use py 3.10

* update pre commit zh cn
Leymore 2023-08-28 17:35:04 +08:00 committed by GitHub
parent b2d602f42b
commit c26ecdb1b0
13 changed files with 195 additions and 20 deletions


@@ -17,7 +17,7 @@ jobs:
           python-version: '3.10'
       - name: Install pre-commit hook
         run: |
-          pip install pre-commit
+          pip install pre-commit mmengine
           pre-commit install
       - name: Linting
         run: pre-commit run --all-files
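For reference, the updated CI steps can be mirrored locally before pushing; a minimal sketch (mmengine is needed because the new suffix hook imports `mmengine.config`):

```bash
pip install pre-commit mmengine
pre-commit install
pre-commit run --all-files
```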


@@ -2,30 +2,33 @@ exclude: |
     (?x)^(
         tests/data/|
         opencompass/models/internal/|
-        opencompass/utils/internal/|
-        configs/
+        opencompass/utils/internal/
     )
 repos:
   - repo: https://gitee.com/openmmlab/mirrors-flake8
     rev: 5.0.4
     hooks:
       - id: flake8
+        exclude: configs/
   - repo: https://gitee.com/openmmlab/mirrors-isort
     rev: 5.11.5
     hooks:
       - id: isort
+        exclude: configs/
   - repo: https://gitee.com/openmmlab/mirrors-yapf
     rev: v0.32.0
     hooks:
       - id: yapf
+        exclude: configs/
   - repo: https://gitee.com/openmmlab/mirrors-codespell
     rev: v2.2.1
     hooks:
       - id: codespell
-        exclude: >
+        exclude: |
           (?x)^(
-              .*\.jsonl
-          )$
+              .*\.jsonl|
+              configs/
+          )
   - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks
     rev: v4.3.0
     hooks:
@@ -33,17 +36,20 @@ repos:
         exclude: |
           (?x)^(
               dicts/|
-              projects/.*?/dicts/
+              projects/.*?/dicts/|
+              configs/
           )
       - id: check-yaml
       - id: end-of-file-fixer
         exclude: |
           (?x)^(
               dicts/|
-              projects/.*?/dicts/
+              projects/.*?/dicts/|
+              configs/
           )
       - id: requirements-txt-fixer
       - id: double-quote-string-fixer
+        exclude: configs/
       - id: check-merge-conflict
       - id: fix-encoding-pragma
         args: ["--remove"]
@@ -60,11 +66,21 @@ repos:
           - mdformat-openmmlab
           - mdformat_frontmatter
           - linkify-it-py
+        exclude: configs/
   - repo: https://gitee.com/openmmlab/mirrors-docformatter
     rev: v1.3.1
     hooks:
       - id: docformatter
         args: ["--in-place", "--wrap-descriptions", "79"]
+  - repo: local
+    hooks:
+      - id: update-dataset-suffix
+        name: dataset suffix updater
+        entry: ./tools/update_dataset_suffix.py
+        language: script
+        pass_filenames: true
+        require_serial: true
+        files: ^configs/datasets
 #  - repo: https://github.com/open-mmlab/pre-commit-hooks
 #    rev: v0.2.0  # Use the ref you want to point at
 #    hooks:
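With the `repo: local` entry above, the new hook can also be run on its own through pre-commit, for example:

```bash
pre-commit run update-dataset-suffix --all-files
```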


@@ -2,30 +2,33 @@ exclude: |
     (?x)^(
         tests/data/|
         opencompass/models/internal/|
-        opencompass/utils/internal/|
-        configs/
+        opencompass/utils/internal/
     )
 repos:
   - repo: https://github.com/PyCQA/flake8
     rev: 5.0.4
     hooks:
       - id: flake8
+        exclude: configs/
   - repo: https://github.com/PyCQA/isort
     rev: 5.11.5
     hooks:
       - id: isort
+        exclude: configs/
   - repo: https://github.com/pre-commit/mirrors-yapf
     rev: v0.32.0
     hooks:
       - id: yapf
+        exclude: configs/
   - repo: https://github.com/codespell-project/codespell
     rev: v2.2.1
     hooks:
       - id: codespell
-        exclude: >
+        exclude: |
           (?x)^(
-              .*\.jsonl
-          )$
+              .*\.jsonl|
+              configs/
+          )
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.3.0
     hooks:
@@ -33,17 +36,20 @@ repos:
         exclude: |
           (?x)^(
               dicts/|
-              projects/.*?/dicts/
+              projects/.*?/dicts/|
+              configs/
           )
       - id: check-yaml
       - id: end-of-file-fixer
         exclude: |
           (?x)^(
               dicts/|
-              projects/.*?/dicts/
+              projects/.*?/dicts/|
+              configs/
           )
       - id: requirements-txt-fixer
       - id: double-quote-string-fixer
+        exclude: configs/
       - id: check-merge-conflict
       - id: fix-encoding-pragma
         args: ["--remove"]
@@ -60,11 +66,21 @@ repos:
           - mdformat-openmmlab
           - mdformat_frontmatter
           - linkify-it-py
+        exclude: configs/
   - repo: https://github.com/myint/docformatter
     rev: v1.3.1
     hooks:
       - id: docformatter
         args: ["--in-place", "--wrap-descriptions", "79"]
+  - repo: local
+    hooks:
+      - id: update-dataset-suffix
+        name: dataset suffix updater
+        entry: ./tools/update_dataset_suffix.py
+        language: script
+        pass_filenames: true
+        require_serial: true
+        files: ^configs/datasets
 #  - repo: https://github.com/open-mmlab/pre-commit-hooks
 #    rev: v0.2.0  # Use the ref you want to point at
 #    hooks:
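To limit the hook to specific configs instead of the whole tree, pre-commit's `--files` option works as well. The path below is illustrative, assuming the game24 aggregate config edited in this commit lives at the usual `configs/datasets/<name>/` location:

```bash
pre-commit run update-dataset-suffix --files configs/datasets/game24/game24_gen.py
```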


@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .game24_gen_8dfde3 import game24_datasets  # noqa: F401, F403
+    from .game24_gen_52a460 import game24_datasets  # noqa: F401, F403


@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .humanevalx_gen_fd5822 import humanevalx_datasets  # noqa: F401, F403
+    from .humanevalx_gen_620cfa import humanevalx_datasets  # noqa: F401, F403


@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .z_bench_gen_5813ec import z_bench_dataset  # noqa: F401, F403
+    from .z_bench_gen_52ba2f import z_bench_datasets  # noqa: F401, F403


@@ -14,7 +14,7 @@ z_bench_infer_cfg = dict(
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=GenInferencer))

-z_bench_dataset = dict(
+z_bench_datasets = dict(
     type=HFDataset,
     path=
     '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench',


@@ -17,7 +17,7 @@ z_bench_infer_cfg = dict(
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=GenInferencer))

-z_bench_dataset = dict(
+z_bench_datasets = dict(
     type=HFDataset,
     path=
     '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench',
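These renames matter because the suffix tool only hashes top-level config variables ending in `_datasets` (see `get_hash` in the new script below). A quick, illustrative way to check for configs still using the old singular name:

```bash
grep -rn 'z_bench_dataset\b' configs/
```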


@@ -69,3 +69,13 @@ python tools/prediction_merger.py CONFIG_PATH [-w WORK_DIR]
 ```
 
 - `-w`: Work path, default is `'./outputs/default'`.
+
+## Dataset Suffix Updater
+
+This tool can quickly rename the suffixes of configuration files under the `configs/datasets` directory, aligning them with the prompt-hash-based naming convention.
+
+How to run:
+
+```bash
+python tools/update_dataset_suffix.py
+```
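Since the hook sets `pass_filenames: true` and the script's argument parser accepts explicit paths, the tool can also be pointed at individual files; for example, reusing the renamed game24 config from this commit (assuming the usual `configs/datasets/<name>/` layout):

```bash
python tools/update_dataset_suffix.py configs/datasets/game24/game24_gen_52a460.py
```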


@@ -78,3 +78,13 @@ python tools/prediction_merger.py CONFIG_PATH [-w WORK_DIR]
 ```
 
 - `-w`: Work path, defaults to `'./outputs/default'`.
+
+## Dataset Suffix Updater
+
+This tool quickly renames the suffixes of configuration files under the `configs/datasets` directory so that they follow the prompt-hash naming convention.
+
+How to run:
+
+```bash
+python tools/update_dataset_suffix.py
+```

tools/update_dataset_suffix.py (new executable file, 123 lines)

@@ -0,0 +1,123 @@
#!/usr/bin/env python3
import argparse
import glob
import hashlib
import json
import os
import re
from multiprocessing import Pool
from typing import List, Union

from mmengine.config import Config, ConfigDict

# from opencompass.utils import get_prompt_hash


# copied from opencompass.utils.get_prompt_hash, for easy use in ci
def get_prompt_hash(dataset_cfg: Union[ConfigDict, List[ConfigDict]]) -> str:
    """Get the hash of the prompt configuration.

    Args:
        dataset_cfg (ConfigDict or list[ConfigDict]): The dataset
            configuration.

    Returns:
        str: The hash of the prompt configuration.
    """
    if isinstance(dataset_cfg, list):
        if len(dataset_cfg) == 1:
            dataset_cfg = dataset_cfg[0]
        else:
            # hash of a dataset list is the hash of its members' hashes
            hashes = ','.join([get_prompt_hash(cfg) for cfg in dataset_cfg])
            hash_object = hashlib.sha256(hashes.encode())
            return hash_object.hexdigest()
    if 'reader_cfg' in dataset_cfg.infer_cfg:
        # new config: normalize reader/retriever fields before hashing
        reader_cfg = dict(type='DatasetReader',
                          input_columns=dataset_cfg.reader_cfg.input_columns,
                          output_column=dataset_cfg.reader_cfg.output_column)
        dataset_cfg.infer_cfg.reader = reader_cfg
        if 'train_split' in dataset_cfg.infer_cfg.reader_cfg:
            dataset_cfg.infer_cfg.retriever[
                'index_split'] = dataset_cfg.infer_cfg['reader_cfg'][
                    'train_split']
        if 'test_split' in dataset_cfg.infer_cfg.reader_cfg:
            dataset_cfg.infer_cfg.retriever[
                'test_split'] = dataset_cfg.infer_cfg.reader_cfg.test_split
        for k, v in dataset_cfg.infer_cfg.items():
            dataset_cfg.infer_cfg[k]['type'] = v['type'].split('.')[-1]
    d_json = json.dumps(dataset_cfg.infer_cfg.to_dict(), sort_keys=True)
    hash_object = hashlib.sha256(d_json.encode())
    return hash_object.hexdigest()


def get_hash(path):
    """Return the first 6 chars of the prompt hash of a dataset config."""
    cfg = Config.fromfile(path)
    for k in cfg.keys():
        if k.endswith('_datasets'):
            return get_prompt_hash(cfg[k])[:6]
    print(f'Could not find *_datasets in {path}')
    return None


def check_and_rename(filepath):
    """Return (old_path, new_path) when the stored suffix is stale, else
    (None, None)."""
    base_name = os.path.basename(filepath)
    match = re.match(r'(.*)_(gen|ppl)_(.*)\.py$', base_name)
    if match:
        dataset, mode, old_hash = match.groups()
        new_hash = get_hash(filepath)
        if not new_hash:
            return None, None
        if old_hash != new_hash:
            new_name = f'{dataset}_{mode}_{new_hash}.py'
            new_file = os.path.join(os.path.dirname(filepath), new_name)
            print(f'Rename {filepath} to {new_file}')
            return filepath, new_file
    return None, None


def update_imports(data):
    """Rewrite imports inside a config file after the renames."""
    python_file, name_pairs = data
    for filepath, new_file in name_pairs:
        old_name = os.path.basename(filepath)[:-3]
        new_name = os.path.basename(new_file)[:-3]
        if not os.path.exists(python_file):
            return
        with open(python_file, 'r') as file:
            filedata = file.read()
        # Replace the old module name with the new one
        new_data = filedata.replace(old_name, new_name)
        if filedata != new_data:
            with open(python_file, 'w') as file:
                file.write(new_data)
            # print(f"Updated imports in {python_file}")


def main():
    parser = argparse.ArgumentParser()
    # pre-commit passes the changed files as positional arguments
    parser.add_argument('python_files', nargs='*')
    args = parser.parse_args()

    root_folder = 'configs/datasets'
    if args.python_files:
        python_files = [
            i for i in args.python_files if i.startswith(root_folder)
        ]
    else:
        python_files = glob.glob(f'{root_folder}/**/*.py', recursive=True)

    # Use multiprocessing to speed up the check and rename process
    with Pool(16) as p:
        name_pairs = p.map(check_and_rename, python_files)
    name_pairs = [pair for pair in name_pairs if pair[0] is not None]
    with Pool(16) as p:
        p.starmap(os.rename, name_pairs)
    # python_files = glob.glob(f'{root_folder}/**/*.py', recursive=True)
    update_data = [(python_file, name_pairs) for python_file in python_files]
    with Pool(16) as p:
        p.map(update_imports, update_data)


if __name__ == '__main__':
    main()