Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
[Feature] Add and apply update suffix tool (#280)
* add and apply update suffix tool
* add dataset suffix updater as precommit hook
* update workflow
* update scripts
* update ci
* update
* ci with py3.8
* run in serial
* update bbh
* use py 3.10
* update pre commit zh cn
parent b2d602f42b
commit c26ecdb1b0

.github/workflows/lint.yml (vendored, 2 changes)
@@ -17,7 +17,7 @@ jobs:
           python-version: '3.10'
       - name: Install pre-commit hook
         run: |
-          pip install pre-commit
+          pip install pre-commit mmengine
           pre-commit install
       - name: Linting
         run: pre-commit run --all-files
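The added `mmengine` install is what lets the new local hook (below) run in CI: pre-commit executes `tools/update_dataset_suffix.py` as a plain script, so its imports must already be available in the job's interpreter. A minimal sketch of that dependency, assuming only the imports the committed tool actually makes:

```python
# First import executed by tools/update_dataset_suffix.py; without
# `pip install mmengine` in the lint job this raises ModuleNotFoundError
# before any configs/datasets file is checked.
from mmengine.config import Config, ConfigDict  # noqa: F401
```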
.pre-commit-config-zh-cn.yaml
@@ -2,30 +2,33 @@ exclude: |
     (?x)^(
         tests/data/|
         opencompass/models/internal/|
-        opencompass/utils/internal/|
-        configs/
+        opencompass/utils/internal/
     )
 repos:
   - repo: https://gitee.com/openmmlab/mirrors-flake8
     rev: 5.0.4
     hooks:
       - id: flake8
+        exclude: configs/
   - repo: https://gitee.com/openmmlab/mirrors-isort
     rev: 5.11.5
     hooks:
       - id: isort
+        exclude: configs/
   - repo: https://gitee.com/openmmlab/mirrors-yapf
     rev: v0.32.0
     hooks:
       - id: yapf
+        exclude: configs/
   - repo: https://gitee.com/openmmlab/mirrors-codespell
     rev: v2.2.1
     hooks:
       - id: codespell
-        exclude: >
+        exclude: |
             (?x)^(
-                .*\.jsonl
-            )$
+                .*\.jsonl|
+                configs/
+            )
   - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks
     rev: v4.3.0
     hooks:
@@ -33,17 +36,20 @@ repos:
         exclude: |
             (?x)^(
                 dicts/|
-                projects/.*?/dicts/
+                projects/.*?/dicts/|
+                configs/
             )
       - id: check-yaml
       - id: end-of-file-fixer
         exclude: |
             (?x)^(
                 dicts/|
-                projects/.*?/dicts/
+                projects/.*?/dicts/|
+                configs/
             )
       - id: requirements-txt-fixer
       - id: double-quote-string-fixer
+        exclude: configs/
       - id: check-merge-conflict
       - id: fix-encoding-pragma
         args: ["--remove"]
@@ -60,11 +66,21 @@ repos:
           - mdformat-openmmlab
           - mdformat_frontmatter
           - linkify-it-py
+        exclude: configs/
   - repo: https://gitee.com/openmmlab/mirrors-docformatter
     rev: v1.3.1
     hooks:
       - id: docformatter
         args: ["--in-place", "--wrap-descriptions", "79"]
+  - repo: local
+    hooks:
+      - id: update-dataset-suffix
+        name: dataset suffix updater
+        entry: ./tools/update_dataset_suffix.py
+        language: script
+        pass_filenames: true
+        require_serial: true
+        files: ^configs/datasets
   # - repo: https://github.com/open-mmlab/pre-commit-hooks
   #   rev: v0.2.0  # Use the ref you want to point at
   #   hooks:
.pre-commit-config.yaml
@@ -2,30 +2,33 @@ exclude: |
     (?x)^(
         tests/data/|
         opencompass/models/internal/|
-        opencompass/utils/internal/|
-        configs/
+        opencompass/utils/internal/
     )
 repos:
   - repo: https://github.com/PyCQA/flake8
     rev: 5.0.4
     hooks:
       - id: flake8
+        exclude: configs/
   - repo: https://github.com/PyCQA/isort
     rev: 5.11.5
     hooks:
       - id: isort
+        exclude: configs/
   - repo: https://github.com/pre-commit/mirrors-yapf
     rev: v0.32.0
     hooks:
       - id: yapf
+        exclude: configs/
   - repo: https://github.com/codespell-project/codespell
     rev: v2.2.1
     hooks:
       - id: codespell
-        exclude: >
+        exclude: |
             (?x)^(
-                .*\.jsonl
-            )$
+                .*\.jsonl|
+                configs/
+            )
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.3.0
     hooks:
@@ -33,17 +36,20 @@ repos:
         exclude: |
             (?x)^(
                 dicts/|
-                projects/.*?/dicts/
+                projects/.*?/dicts/|
+                configs/
             )
       - id: check-yaml
       - id: end-of-file-fixer
         exclude: |
             (?x)^(
                 dicts/|
-                projects/.*?/dicts/
+                projects/.*?/dicts/|
+                configs/
             )
       - id: requirements-txt-fixer
       - id: double-quote-string-fixer
+        exclude: configs/
       - id: check-merge-conflict
       - id: fix-encoding-pragma
         args: ["--remove"]
@@ -60,11 +66,21 @@ repos:
           - mdformat-openmmlab
           - mdformat_frontmatter
           - linkify-it-py
+        exclude: configs/
   - repo: https://github.com/myint/docformatter
     rev: v1.3.1
     hooks:
       - id: docformatter
         args: ["--in-place", "--wrap-descriptions", "79"]
+  - repo: local
+    hooks:
+      - id: update-dataset-suffix
+        name: dataset suffix updater
+        entry: ./tools/update_dataset_suffix.py
+        language: script
+        pass_filenames: true
+        require_serial: true
+        files: ^configs/datasets
   # - repo: https://github.com/open-mmlab/pre-commit-hooks
   #   rev: v0.2.0  # Use the ref you want to point at
   #   hooks:
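With `configs/` lifted out of the global exclude, the new `local` hook can actually see dataset configs: pre-commit passes each staged path matching `files: ^configs/datasets` to the script as an argument (`pass_filenames: true`), and `require_serial: true` keeps everything in one process so the renames cannot race. A rough sketch of the selection step, with a hypothetical set of staged files:

```python
import re

# Hypothetical staged paths; only those under configs/datasets reach the hook.
staged = [
    'configs/datasets/game24/game24_gen_8dfde3.py',
    'opencompass/utils/prompt.py',
    'README.md',
]
matched = [p for p in staged if re.search(r'^configs/datasets', p)]
# pre-commit then runs: ./tools/update_dataset_suffix.py <matched paths...>
print(matched)
```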
configs/datasets/game24/game24_gen.py
@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .game24_gen_8dfde3 import game24_datasets  # noqa: F401, F403
+    from .game24_gen_52a460 import game24_datasets  # noqa: F401, F403
configs/datasets/humanevalx/humanevalx_gen.py
@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .humanevalx_gen_fd5822 import humanevalx_datasets  # noqa: F401, F403
+    from .humanevalx_gen_620cfa import humanevalx_datasets  # noqa: F401, F403
configs/datasets/z_bench/z_bench_gen.py
@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .z_bench_gen_5813ec import z_bench_dataset  # noqa: F401, F403
+    from .z_bench_gen_52ba2f import z_bench_datasets  # noqa: F401, F403
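Each `*_gen.py` above is a thin aggregator that re-exports a concrete config whose filename carries a 6-character hash suffix, so whenever a prompt changes the import has to follow the renamed file. The suffix is a SHA-256 digest of the JSON-serialized inference config, truncated to six hex digits; a toy illustration with a hypothetical config dict (not a real OpenCompass one):

```python
import hashlib
import json

# sort_keys=True makes the digest independent of key order, so identical
# prompt settings always map to the same 6-character suffix.
infer_cfg = {'inferencer': 'GenInferencer', 'prompt': 'Solve 24: {input}'}
d_json = json.dumps(infer_cfg, sort_keys=True)
print(hashlib.sha256(d_json.encode()).hexdigest()[:6])
```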
@@ -14,7 +14,7 @@ z_bench_infer_cfg = dict(
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=GenInferencer))

-z_bench_dataset = dict(
+z_bench_datasets = dict(
     type=HFDataset,
     path=
     '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench',
@@ -17,7 +17,7 @@ z_bench_infer_cfg = dict(
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=GenInferencer))

-z_bench_dataset = dict(
+z_bench_datasets = dict(
     type=HFDataset,
     path=
     '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench',
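The `z_bench_dataset` to `z_bench_datasets` rename in the two hunks above is not cosmetic: `get_hash` in the new tool only hashes config variables whose names end in `_datasets`, so under the old singular name these files were invisible to the updater. A minimal sketch of that key scan, using a plain dict in place of a loaded mmengine `Config`:

```python
# Stand-in for Config.fromfile(...).keys(); keys are hypothetical.
cfg = {'z_bench_reader_cfg': {}, 'z_bench_datasets': {}}
hits = [k for k in cfg if k.endswith('_datasets')]
print(hits)  # ['z_bench_datasets']; empty under the old name z_bench_dataset
```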
docs/en/tools.md
@@ -69,3 +69,13 @@ python tools/prediction_merger.py CONFIG_PATH [-w WORK_DIR]
 ```

 - `-w`: Work path, default is `'./outputs/default'`.
+
+## Dataset Suffix Updater
+
+This tool quickly updates the suffixes of the configuration files under the `configs/datasets` directory, aligning them with the prompt-hash-based naming convention.
+
+How to run:
+
+```bash
+python tools/update_dataset_suffix.py
+```
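Besides the bare invocation documented above, the script also accepts explicit file paths, which is how the pre-commit hook drives it; anything outside `configs/datasets` is silently filtered out. A sketch of calling it on a single file from the repository root (hypothetical target path):

```python
import subprocess

# Pass specific configs instead of scanning the whole tree.
subprocess.run(
    ['python', 'tools/update_dataset_suffix.py',
     'configs/datasets/game24/game24_gen_8dfde3.py'],
    check=True)
```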
docs/zh_cn/tools.md
@@ -78,3 +78,13 @@ python tools/prediction_merger.py CONFIG_PATH [-w WORK_DIR]
 ```

 - `-w`: Work path, defaults to `'./outputs/default'`.
+
+## Dataset Suffix Updater
+
+This tool quickly updates the suffixes of the configuration files under the `configs/datasets` directory so that they conform to the prompt-hash naming convention.
+
+How to run:
+
+```bash
+python tools/update_dataset_suffix.py
+```
tools/update_dataset_suffix.py (new executable file, 123 lines)
#!/usr/bin/env python3
import argparse
import glob
import hashlib
import json
import os
import re
from multiprocessing import Pool
from typing import List, Union

from mmengine.config import Config, ConfigDict


# from opencompass.utils import get_prompt_hash
# copied from opencompass.utils.get_prompt_hash, for easy use in ci
def get_prompt_hash(dataset_cfg: Union[ConfigDict, List[ConfigDict]]) -> str:
    """Get the hash of the prompt configuration.

    Args:
        dataset_cfg (ConfigDict or list[ConfigDict]): The dataset
            configuration.

    Returns:
        str: The hash of the prompt configuration.
    """
    if isinstance(dataset_cfg, list):
        if len(dataset_cfg) == 1:
            dataset_cfg = dataset_cfg[0]
        else:
            hashes = ','.join([get_prompt_hash(cfg) for cfg in dataset_cfg])
            hash_object = hashlib.sha256(hashes.encode())
            return hash_object.hexdigest()
    if 'reader_cfg' in dataset_cfg.infer_cfg:
        # new config
        reader_cfg = dict(type='DatasetReader',
                          input_columns=dataset_cfg.reader_cfg.input_columns,
                          output_column=dataset_cfg.reader_cfg.output_column)
        dataset_cfg.infer_cfg.reader = reader_cfg
        if 'train_split' in dataset_cfg.infer_cfg.reader_cfg:
            dataset_cfg.infer_cfg.retriever[
                'index_split'] = dataset_cfg.infer_cfg['reader_cfg'][
                    'train_split']
        if 'test_split' in dataset_cfg.infer_cfg.reader_cfg:
            dataset_cfg.infer_cfg.retriever[
                'test_split'] = dataset_cfg.infer_cfg.reader_cfg.test_split
    for k, v in dataset_cfg.infer_cfg.items():
        dataset_cfg.infer_cfg[k]['type'] = v['type'].split('.')[-1]
    d_json = json.dumps(dataset_cfg.infer_cfg.to_dict(), sort_keys=True)
    hash_object = hashlib.sha256(d_json.encode())
    return hash_object.hexdigest()


# Assuming get_hash is a function that computes the hash of a file
# from get_hash import get_hash
def get_hash(path):
    cfg = Config.fromfile(path)
    for k in cfg.keys():
        if k.endswith('_datasets'):
            return get_prompt_hash(cfg[k])[:6]
    print(f'Could not find *_datasets in {path}')
    return None


def check_and_rename(filepath):
    base_name = os.path.basename(filepath)
    match = re.match(r'(.*)_(gen|ppl)_(.*).py', base_name)
    if match:
        dataset, mode, old_hash = match.groups()
        new_hash = get_hash(filepath)
        if not new_hash:
            return None, None
        if old_hash != new_hash:
            new_name = f'{dataset}_{mode}_{new_hash}.py'
            new_file = os.path.join(os.path.dirname(filepath), new_name)
            print(f'Rename {filepath} to {new_file}')
            return filepath, new_file
    return None, None


def update_imports(data):
    python_file, name_pairs = data
    for filepath, new_file in name_pairs:
        old_name = os.path.basename(filepath)[:-3]
        new_name = os.path.basename(new_file)[:-3]
        if not os.path.exists(python_file):
            return
        with open(python_file, 'r') as file:
            filedata = file.read()
        # Replace the old name with new name
        new_data = filedata.replace(old_name, new_name)
        if filedata != new_data:
            with open(python_file, 'w') as file:
                file.write(new_data)
            # print(f"Updated imports in {python_file}")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('python_files', nargs='*')
    args = parser.parse_args()

    root_folder = 'configs/datasets'
    if args.python_files:
        python_files = [
            i for i in args.python_files if i.startswith(root_folder)
        ]
    else:
        python_files = glob.glob(f'{root_folder}/**/*.py', recursive=True)

    # Use multiprocessing to speed up the check and rename process
    with Pool(16) as p:
        name_pairs = p.map(check_and_rename, python_files)
    name_pairs = [pair for pair in name_pairs if pair[0] is not None]
    with Pool(16) as p:
        p.starmap(os.rename, name_pairs)
    # python_files = glob.glob(f'{root_folder}/**/*.py', recursive=True)
    update_data = [(python_file, name_pairs) for python_file in python_files]
    with Pool(16) as p:
        p.map(update_imports, update_data)


if __name__ == '__main__':
    main()
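After the renames, `update_imports` patches every config that referenced an old module stem with a plain string replacement, which is why the `*_gen.py` diffs above are single-line import changes. A minimal illustration on hypothetical file contents:

```python
# What update_imports effectively does to each file's text.
filedata = 'from .game24_gen_8dfde3 import game24_datasets  # noqa: F401, F403'
print(filedata.replace('game24_gen_8dfde3', 'game24_gen_52a460'))
```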