mirror of https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00

[Doc] Update logo icon (#32)

* update logo_icon and fix typo in docs
* rebase
* update get_started
* update .gitignore
* remove extra lines
* remove extra 'S'
* update docs

Co-authored-by: gaotongxiao <gaotongxiao@gmail.com>

This commit is contained in:
parent 72e6fc7756
commit 0c6fb6cf67

3 .gitignore (vendored)
@@ -82,3 +82,6 @@ instance/
# Auto generate documentation
docs/en/_build/
docs/zh_cn/_build/

# .zip
*.zip
@@ -9,8 +9,8 @@
<!-- [](https://pypi.org/project/opencompass/) -->

[🌐Website](https://opencompass.org.cn/) |
[📘Documentation](https://opencompass.readthedocs.io/en/latest/) |
[🛠️Installation](https://opencompass.readthedocs.io/en/latest/get_started/install.html) |
[📘Documentation](https://opencompass.readthedocs.io/zh_CN/latest/index.html) |
[🛠️Installation](https://opencompass.readthedocs.io/zh_CN/latest/get_started.html) |
[🤔Reporting Issues](https://github.com/InternLM/opencompass/issues/new/choose)

[English](/README.md) | 简体中文

@@ -280,7 +280,7 @@ OpenCompass is a one-stop platform for large model evaluation. Its main features are as follows
</tbody>
</table>

# Installation
## Installation

The quick installation steps are shown below. Some third-party features may require extra steps to work properly; for details, see the [installation guide](https://opencompass.readthedocs.io/zh_cn/latest/get_started.html).
@@ -1,29 +1,51 @@
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM

from mmengine.config import read_base

with read_base():
    from .datasets.piqa.piqa_ppl import piqa_datasets
    from .datasets.winograd.winograd_ppl import winograd_datasets
    from .datasets.siqa.siqa_gen import siqa_datasets

datasets = piqa_datasets
datasets = [*siqa_datasets, *winograd_datasets]

models = [
    # OPT-1.3b
    dict(
from opencompass.models import HuggingFaceCausalLM

# OPT-350M
opt350m = dict(
    type=HuggingFaceCausalLM,
    path='facebook/opt-1.3b',
    tokenizer_path='facebook/opt-1.3b',
    # the following are HuggingFaceCausalLM init parameters
    path='facebook/opt-350m',
    tokenizer_path='facebook/opt-350m',
    tokenizer_kwargs=dict(
        padding_side='left',
        truncation_side='left',
        proxies=None,
        trust_remote_code=True,
    ),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=16,
        trust_remote_code=True),
    model_kwargs=dict(device_map='auto'),
    run_cfg=dict(num_gpus=1, num_procs=1),
    max_seq_len=2048,
    # the following are not HuggingFaceCausalLM init parameters
    abbr='opt350m',  # Model abbreviation
    max_out_len=100,  # Maximum number of generated tokens
    batch_size=64,
    run_cfg=dict(num_gpus=1),  # Run configuration for specifying resource requirements
)
]

# OPT-125M
opt125m = dict(
    type=HuggingFaceCausalLM,
    # the following are HuggingFaceCausalLM init parameters
    path='facebook/opt-125m',
    tokenizer_path='facebook/opt-125m',
    tokenizer_kwargs=dict(
        padding_side='left',
        truncation_side='left',
        proxies=None,
        trust_remote_code=True),
    model_kwargs=dict(device_map='auto'),
    max_seq_len=2048,
    # the following are not HuggingFaceCausalLM init parameters
    abbr='opt125m',  # Model abbreviation
    max_out_len=100,  # Maximum number of generated tokens
    batch_size=128,
    run_cfg=dict(num_gpus=1),  # Run configuration for specifying resource requirements
)

models = [opt350m, opt125m]
31 docs/en/_static/image/logo_icon.svg (new file)

@@ -0,0 +1,31 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg id="_图层_2" data-name="图层 2" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 34.59 36">
<defs>
<style>
.cls-1 {
fill: #36569b;
}

.cls-2 {
fill: #1b3882;
}

.cls-3 {
fill: #5878b4;
}
</style>
</defs>
<g id="_图层_1-2" data-name="图层 1">
<g>
<g id="_3" data-name="3">
<path class="cls-3" d="m16.53,22.65l-6.37,3.07,5.27-.16,1.1-2.91Zm-4.19,10.95l1.12-2.91-5.27.17,4.15,2.74Zm9.3-.29l6.37-3.07-5.27.16-1.1,2.91Zm4.19-10.95l-1.12,2.91,5.27-.17-4.15-2.74Zm5.72,3.81l-7.08.23-1.73-1.14,1.5-3.95-2.06-1.36-3.16,1.53-1.48,3.89-2.67,1.29-7.14.23-3.16,1.53,2.07,1.36,7.13-.23h0s1.69,1.11,1.69,1.11l-1.51,3.98,2.06,1.36,3.16-1.53,1.5-3.95h0s2.56-1.24,2.56-1.24h0s7.23-.24,7.23-.24l3.16-1.53-2.06-1.36Zm-11.29,2.56c-.99.48-2.31.52-2.96.1-.65-.42-.37-1.15.62-1.63.99-.48,2.31-.52,2.96-.1.65.42.37,1.15-.62,1.63Z"/>
</g>
<g id="_2" data-name="2">
<path class="cls-1" d="m33.5,19.84l-1.26-6.51-1.46,1.88,2.72,4.63Zm-6.05-14.69l-4.16-2.74,2.71,4.64,1.45-1.89Zm-6.73.58l1.26,6.51,1.46-1.88-2.72-4.63Zm6.05,14.69l4.16,2.74-2.71-4.64-1.45,1.89Zm7.19,1.91l-3.63-6.2h0s-.53-2.74-.53-2.74l1.96-2.56-.63-3.23-2.07-1.36-1.96,2.56-1.69-1.11-3.71-6.33-2.07-1.36.63,3.23,3.68,6.28h0s.51,2.62.51,2.62h0s-1.99,2.6-1.99,2.6l.63,3.23,2.06,1.36,1.95-2.54,1.73,1.14,3.69,6.29,2.07,1.36-.63-3.23Zm-6.47-7.7c-.65-.42-1.33-1.59-1.52-2.6-.2-1.01.17-1.49.81-1.06.65.42,1.33,1.59,1.52,2.6.2,1.01-.17,1.49-.81,1.06Z"/>
</g>
<g id="_1" data-name="1">
<path class="cls-2" d="m11.96,2.82l-6.37,3.07,3.81,1.74,2.55-4.81ZM1.07,14.37l1.26,6.53,2.56-4.8-3.82-1.73Zm7.99,9.59l6.37-3.07-3.81-1.74-2.55,4.81Zm10.89-11.55l-1.26-6.53-2.56,4.8,3.82,1.73Zm.45,2.53l-5.13-2.32h0s-.53-2.71-.53-2.71l3.47-6.53-.63-3.24-3.16,1.53-3.42,6.43-2.67,1.29h0s-5.17-2.34-5.17-2.34l-3.16,1.53.63,3.24,5.17,2.33.51,2.65h0s-3.49,6.57-3.49,6.57l.63,3.24,3.16-1.53,3.46-6.52,2.56-1.24h0s5.24,2.37,5.24,2.37l3.16-1.53-.63-3.24Zm-9.52.24c-.99.48-1.95.04-2.14-.97-.2-1.01.44-2.22,1.43-2.69.99-.48,1.95-.04,2.14.97.2,1.01-.44,2.22-1.43,2.7Z"/>
</g>
</g>
</g>
</svg>
@@ -1,6 +1,6 @@
# Installation

1. Use the following commands to set up the OpenCompass environment:
1. Set up the OpenCompass environment:

```bash
conda create --name opencompass python=3.10 pytorch torchvision pytorch-cuda -c nvidia -c pytorch -y

@@ -12,14 +12,17 @@ If you want to customize the PyTorch version or related CUDA version, please ref
2. Install OpenCompass:

```bash
git clone https://github.com/opencompass/opencompass
git clone https://github.com/InternLM/opencompass.git
cd opencompass
pip install -e .
```

3. Install humaneval (Optional)

If you want to perform evaluations on the humaneval dataset, follow these steps.
If you want to **evaluate your models' coding ability on the humaneval dataset**, execute this step; otherwise skip it.

<details>
<summary><b>click to show the details</b></summary>

```bash
git clone https://github.com/openai/human-eval.git

@@ -31,100 +34,139 @@ cd ..

Please read the comments in `human_eval/execution.py` **lines 48-57** to understand the potential risks of executing the model generation code. If you accept these risks, uncomment **line 58** to enable code execution evaluation.

# Quick Start
</details>

In this section, we will use the example of testing LLaMA-7B on SIQA and PIQA to familiarize you with some basic features of OpenCompass. Before running, make sure you have installed OpenCompass and have GPU computing resources that meet the minimum requirements for LLaMA-7B.
# Dataset Preparation

## Prepare the Dataset
The datasets supported by OpenCompass mainly include two parts:

To start a simple evaluation task using OpenCompass, you generally need to follow three steps:
1. Huggingface datasets: The [Huggingface Datasets](https://huggingface.co/datasets) provide a large number of datasets, which will **automatically download** when running with this option.
2. Custom datasets: OpenCompass also provides some Chinese custom **self-built** datasets. Please run the following command to **manually download and extract** them.

1. **Prepare dataset configurations** - [`configs/datasets`](https://github.com/open-mmlab/OpenCompass/tree/main/configs/datasets) provides over 50 datasets supported by OpenCompass.
2. **Prepare model configurations** - [`configs/models`](https://github.com/open-mmlab/OpenCompass/tree/main/configs/models) contains sample configuration files for already supported large models, including those based on HuggingFace as well as API models like ChatGPT.
3. **Use the 'run' script to launch** - Supported commands include running locally or on Slurm, testing multiple datasets and models at once.

In this example, we will demonstrate how to test the performance of pre-trained base models from LLaMA-7B on two benchmark tasks, SIQA and PIQA. Before proceeding, ensure that you have installed OpenCompass and have access to sufficient GPU computing resources that meet the minimum requirements for LLaMA-7B.

To initiate the evaluation task on your local machine, use the following command:
Run the following commands to download the datasets and place them in the '${OpenCompass}/data' directory to complete dataset preparation.

```bash
python run.py configs/eval_llama_7b.py --debug
# Run in the OpenCompass directory
wget https://github.com/InternLM/opencompass/releases/download/0.1.0/OpenCompassData.zip
unzip OpenCompassData.zip
```

Here's a detailed step-by-step explanation of this case study:
OpenCompass already supports most of the datasets commonly used for performance comparison; please refer to `configs/datasets` for the specific list of supported datasets.
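If you want to browse that list from a local checkout, one quick way (just a convenience sketch, run from the repository root) is to list the folder:

```bash
# Each subfolder under configs/datasets corresponds to one supported dataset
ls configs/datasets/
```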
# Quick Start

The evaluation of OpenCompass relies on configuration files which must contain the fields **`datasets`** and **`models`**.
The configurations specify the models and datasets to evaluate using **"run.py"**.

We will demonstrate some basic features of OpenCompass by evaluating the pretrained models [OPT-125M](https://huggingface.co/facebook/opt-125m) and [OPT-350M](https://huggingface.co/facebook/opt-350m) on both the [SIQA](https://huggingface.co/datasets/social_i_qa) and [Winograd](https://huggingface.co/datasets/winogrande) benchmark tasks, with their config file located at [configs/eval_demo.py](https://github.com/InternLM/opencompass/blob/main/configs/eval_demo.py).

Before running this experiment, please make sure you have installed OpenCompass locally; the experiment should run successfully on a single _GTX-1660-6G_ GPU.
For models with larger parameter counts, such as Llama-7B, refer to the other examples provided in the [configs directory](https://github.com/InternLM/opencompass/tree/main/configs).

To start the evaluation task, use the following command:

```bash
python run.py configs/eval_demo.py --debug
```

While running the demo, let's go over the details of the configuration content and launch options used in this case.

## Step by step

<details>
<summary>prepare datasets</summary>

The SiQA and PiQA benchmarks can be automatically downloaded through their respective links, so no manual downloading is required here. However, some other datasets may require manual downloads. Please refer to the documentation [Prepare Datasets](./user_guides/dataset_prepare.md) for more information.

Create a '.py' configuration file and add the following content:
<summary><b>Learn about `datasets`</b></summary>

```python
from mmengine.config import read_base

with read_base():
    # Read the required dataset configurations directly from the preset dataset configurations
    from .datasets.piqa.piqa_ppl import piqa_datasets
    from .datasets.siqa.siqa_gen import siqa_datasets
    from .datasets.winograd.winograd_ppl import winograd_datasets  # ppl inference
    from .datasets.siqa.siqa_gen import siqa_datasets  # gen inference

# Concatenate the datasets to be evaluated into the datasets field
datasets = [*piqa_datasets, *siqa_datasets]
datasets = [*siqa_datasets, *winograd_datasets]  # Concatenate the datasets to be evaluated into the datasets field
```

Various dataset configurations are available in [configs/datasets](https://github.com/InternLM/OpenCompass/blob/main/configs/datasets).
Some datasets have two types of configuration files within their folders, named `'ppl'` and `'gen'`, representing different evaluation methods. Specifically, `'ppl'` represents discriminative evaluation, while `'gen'` stands for generative evaluation.

[configs/datasets/collections](https://github.com/InternLM/OpenCompass/blob/main/configs/datasets/collections) contains various collections of datasets for comprehensive evaluation purposes.

</details>

<details>
<summary>prepare models</summary>
<summary><b>Learn about `models`</b></summary>

The pretrained model 'huggyllama/llama-7b' from HuggingFace supports automatic downloading. Add the following line to your configuration file:
The pretrained models 'facebook/opt-350m' and 'facebook/opt-125m' from HuggingFace support automatic downloading.

```python
# Evaluate models supported by HuggingFace's `AutoModelForCausalLM` using `HuggingFaceCausalLM`
from opencompass.models import HuggingFaceCausalLM

llama_7b = dict(
# OPT-350M
opt350m = dict(
    type=HuggingFaceCausalLM,
    # Initialization parameters for `HuggingFaceCausalLM`
    path='huggyllama/llama-7b',
    tokenizer_path='huggyllama/llama-7b',
    tokenizer_kwargs=dict(padding_side='left', truncation_side='left'),
    path='facebook/opt-350m',
    tokenizer_path='facebook/opt-350m',
    tokenizer_kwargs=dict(
        padding_side='left',
        truncation_side='left',
        proxies=None,
        trust_remote_code=True),
    model_kwargs=dict(device_map='auto'),
    max_seq_len=2048,
    # Common parameters for all models, not specific to HuggingFaceCausalLM's initialization parameters
    abbr='llama-7b',  # Model abbreviation for result display
    abbr='opt350m',  # Model abbreviation for result display
    max_out_len=100,  # Maximum number of generated tokens
    batch_size=16,
    batch_size=64,  # batch size
    run_cfg=dict(num_gpus=1),  # Run configuration for specifying resource requirements
)

models = [llama_7b]
# OPT-125M
opt125m = dict(
    type=HuggingFaceCausalLM,
    # Initialization parameters for `HuggingFaceCausalLM`
    path='facebook/opt-125m',
    tokenizer_path='facebook/opt-125m',
    tokenizer_kwargs=dict(
        padding_side='left',
        truncation_side='left',
        proxies=None,
        trust_remote_code=True),
    model_kwargs=dict(device_map='auto'),
    max_seq_len=2048,
    # Common parameters for all models, not specific to HuggingFaceCausalLM's initialization parameters
    abbr='opt125m',  # Model abbreviation for result display
    max_out_len=100,  # Maximum number of generated tokens
    batch_size=128,  # batch size
    run_cfg=dict(num_gpus=1),  # Run configuration for specifying resource requirements
)

models = [opt350m, opt125m]
```

</details>

<details>
<summary>Launch Evaluation</summary>
<summary><b>Launch Evaluation</b></summary>

First, we can start the task in **debug mode** to check for any exceptions in model loading, dataset reading, or incorrect cache usage.

```shell
python run.py configs/llama.py -w outputs/llama --debug
python run.py configs/eval_demo.py -w outputs/demo --debug
```

However, in `--debug` mode, tasks are executed sequentially. After confirming that everything is correct, you can disable the `--debug` mode to fully utilize multiple GPUs.

```shell
python run.py configs/llama.py -w outputs/llama
python run.py configs/eval_demo.py -w outputs/demo
```

Here are some parameters related to evaluation that can help you configure more efficient inference tasks based on your environment:

- `-w outputs/llama`: Directory to save evaluation logs and results.
- `-w outputs/demo`: Directory to save evaluation logs and results.
- `-r`: Restart the previous (interrupted) evaluation.
- `--mode all`: Specify a specific stage of the task.
  - all: Perform a complete evaluation, including inference and evaluation.

@@ -137,9 +179,13 @@ Here are some parameters related to evaluation that can help you configure more
If you are not performing the evaluation on your local machine but using a Slurm cluster, you can specify the following parameters (an example submission follows the list):

- `--slurm`: Submit tasks using Slurm on the cluster.
- `--partition my_part`: Slurm cluster partition.
- `--partition(-p) my_part`: Slurm cluster partition.
- `--retry 2`: Number of retries for failed tasks.
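For example, combining these flags, a Slurm submission of the demo config could look like the following (a sketch only; `my_part` stands in for whatever partition your cluster provides):

```bash
# Submit the demo evaluation to Slurm and retry failed tasks up to twice
python run.py configs/eval_demo.py --slurm -p my_part --retry 2 -w outputs/demo
```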
```{tip}
The entry also supports submitting tasks to the Alibaba Deep Learning Center (DLC), as well as more customized evaluation strategies. Please refer to [Launching an Evaluation Task](./user_guides/experimentation.md#launching-an-evaluation-task) for details.
```

</details>

## Obtaining Evaluation Results

@@ -147,32 +193,39 @@ If you are not performing the evaluation on your local machine but using a Slurm
After the evaluation is complete, the evaluation results table will be printed as follows:

```text
dataset    version    metric    mode    llama-7b
---------  ---------  --------  ------  ----------
piqa       1cf9f0     accuracy  ppl     77.75
siqa       e78df3     accuracy  gen     36.08
dataset    version    metric    mode    opt350m    opt125m
---------  ---------  --------  ------  ---------  ---------
siqa       e78df3     accuracy  gen     21.55      12.44
winograd   b6c7ed     accuracy  ppl     51.23      49.82
```

All run outputs will default to the `outputs/default/` directory with the following structure:

```markdown
```text
outputs/default/
├── 20200220_120000
├── ...
├── 20230220_183030
│   ├── configs
│   ├── logs
├── 20230220_183030     # one experiment per folder
│   ├── configs         # replicable config files
│   ├── logs            # log files for both inference and evaluation stages
│   │   ├── eval
│   │   └── infer
│   ├── predictions
│   │   └── MODEL1
│   └── results
│       └── MODEL1
│   ├── predictions     # json format of per data point inference result
│   └── results         # numerical conclusions of each evaluation session
├── ...
```

Inside each timestamp folder there would be below items:
Each timestamp folder represents one experiment with the following contents:

- configs folder, used for storing configuration files corresponding to this output dir using the current time stamp;
- logs folder, used for storing inference and evaluation log files of different models;
- predictions folder, used for storing inference json result file(s), grouped by model;
- results folder, used for storing evaluation json result file(s), grouped by model.
- `configs`: configuration file storage;
- `logs`: log file storage for both **inference** and **evaluation** stages;
- `predictions`: json format output of the inference result per data point;
- `results`: json format output of the numerical conclusion of each evaluation session.
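To quickly locate the numbers from the most recent run, a small shell sketch such as the following can help (the timestamp folder name will differ on your machine):

```bash
# Find the newest experiment folder and list its per-model result files
latest=$(ls -t outputs/default/ | head -n 1)
ls "outputs/default/${latest}/results/"
```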
## Additional Tutorials

To learn more about using OpenCompass, explore the following tutorials:

- [Preparing Datasets](./user_guides/dataset_prepare.md)
- [Customizing Models](./user_guides/models.md)
- [Exploring Experimentation Workflows](./user_guides/experimentation.md)
- [Understanding Prompts](./prompt/overview.md)
@@ -1,4 +1,4 @@
# Preparing and Selecting Datasets
# Configure Datasets

This section of the tutorial mainly focuses on how to prepare the datasets supported by OpenCompass and build configuration files to complete dataset selection.
@@ -1,28 +1,35 @@
# Task Execution and Monitoring

## Initiation of Assessment Task
## Launching an Evaluation Task

The program entry for the assessment task is `run.py`, its usage is as follows:
The program entry for the evaluation task is `run.py`, its usage is as follows:

```shell
run.py [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l LARK] config
python run.py $Config {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l]
```

Here are some examples for launching the task in different environments (concrete commands are sketched after this list):

- Running locally: `run.py $Config`, where `$Config` does not contain the fields 'eval' and 'infer'.
- Running with Slurm: `run.py $Config --slurm -p $PARTITION_name`.
- Running on Aliyun DLC: `run.py $Config --dlc --aliyun-cfg $AliYun_Cfg`; a tutorial will come later.
- Customized run: `run.py $Config`, where `$Config` contains the fields 'eval' and 'infer', and you are able to customize how each task will be split and launched. See the [Evaluation document](./evaluation.md).
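For instance, using the demo config from the quick-start guide in place of `$Config` (a sketch only; the partition name and DLC config path are placeholders):

```bash
# Local run
python run.py configs/eval_demo.py

# Slurm run on partition "my_part"
python run.py configs/eval_demo.py --slurm -p my_part

# Aliyun DLC run, pointing at your own DLC configuration file
python run.py configs/eval_demo.py --dlc --aliyun-cfg /path/to/aliyun_cfg
```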
The parameter explanation is as follows (an example combining several of these flags follows the list):

- -p Specify the slurm partition;
- -q Specify the slurm quotatype (default is auto), with optional values being reserved, auto, spot;
- --debug When enabled, inference and evaluation tasks will run in single-process mode, and output will be echoed in real time for debugging;
- -m Run mode, default is all. It can be specified as infer to only run inference and obtain output results; if there are already model outputs in {WORKDIR}, it can be specified as eval to only run evaluation and obtain evaluation results; if there are individual evaluation results in results, it can be specified as viz to only run visualization; if specified as all, both inference and evaluation tasks run at the same time.
- -r Reuse existing inference results. If followed by a timestamp, the result under that timestamp in the workspace path will be reused; otherwise, the latest result in the specified workspace path will be reused.
- -w Specify the working path, default is ./outputs/default
- -l Enable status reporting via Lark bot.
- `-p`: Specify the slurm partition;
- `-q`: Specify the slurm quotatype (default is None), with optional values being reserved, auto, spot. This parameter may only be used in some slurm variants;
- `--debug`: When enabled, inference and evaluation tasks will run in single-process mode, and output will be echoed in real time for debugging;
- `-m`: Running mode, default is `all`. It can be specified as `infer` to only run inference and obtain output results; if there are already model outputs in `{WORKDIR}`, it can be specified as `eval` to only run evaluation and obtain evaluation results; if the evaluation results are ready, it can be specified as `viz` to only run visualization, which summarizes the results in tables; if specified as `all`, a full run will be performed, which includes inference, evaluation, and visualization.
- `-r`: Reuse existing inference results and skip the finished tasks. If followed by a timestamp, the result under that timestamp in the workspace path will be reused; otherwise, the latest result in the specified workspace path will be reused.
- `-w`: Specify the working path, default is `./outputs/default`.
- `-l`: Enable status reporting via Lark bot.
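To illustrate how the mode and reuse flags combine in practice (a sketch; the timestamp is a placeholder for a folder created by an earlier run):

```bash
# Only rerun the evaluation stage, reusing inference outputs already in the work dir
python run.py configs/eval_demo.py -w outputs/demo -m eval

# Resume from the latest results under outputs/demo, skipping finished tasks
python run.py configs/eval_demo.py -w outputs/demo -r

# Reuse a specific earlier experiment by its timestamp
python run.py configs/eval_demo.py -w outputs/demo -r 20230220_183030
```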
Using run mode `-m all` as an example, the overall execution flow is as follows:

1. Read the configuration file, and parse out the model, dataset, evaluator, and other configuration information.
2. The evaluation task mainly includes three stages: inference infer, evaluation eval, and visualization viz. After task division by Partitioner, they are handed over to Runner for parallel execution. Individual inference and evaluation tasks are abstracted into OpenICLInferTask and OpenICLEvalTask respectively.
3. After each stage ends, the visualization stage will read the evaluation results in results to generate a visualization report.
2. The evaluation task mainly includes three stages: inference `infer`, evaluation `eval`, and visualization `viz`. After task division by Partitioner, they are handed over to Runner for parallel execution. Individual inference and evaluation tasks are abstracted into `OpenICLInferTask` and `OpenICLEvalTask` respectively.
3. After each stage ends, the visualization stage will read the evaluation results in `results/` to generate a table.

## Task Monitoring: Lark Bot

@@ -54,10 +61,6 @@ Configuration method:
python run.py configs/eval_demo.py -p {PARTITION} -l
```

## Introduction of the Summarizer

It is mainly used to visualize evaluation results.

## Run Results

All run results will be placed in the `outputs/default/` directory by default; the directory structure is shown below:

@@ -78,9 +81,12 @@ outputs/default/
```

Each timestamp contains the following content:

- configs folder, which stores the configuration files corresponding to each run with this timestamp as the output directory;
- logs folder, which stores the output log files of the inference and evaluation phases; each folder stores logs in subfolders by model;
- predictions folder, which stores the inferred json results, with a model subfolder;
- results folder, which stores the evaluated json results, with a model subfolder.

Also, any `-r` without a specified timestamp will select the newest folder (by sorting) as the output directory.

## Introduction of the Summarizer (to be updated)
31 docs/zh_cn/_static/image/logo_icon.svg (new file)

@@ -0,0 +1,31 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg id="_图层_2" data-name="图层 2" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 34.59 36">
<defs>
<style>
.cls-1 {
fill: #36569b;
}

.cls-2 {
fill: #1b3882;
}

.cls-3 {
fill: #5878b4;
}
</style>
</defs>
<g id="_图层_1-2" data-name="图层 1">
<g>
<g id="_3" data-name="3">
<path class="cls-3" d="m16.53,22.65l-6.37,3.07,5.27-.16,1.1-2.91Zm-4.19,10.95l1.12-2.91-5.27.17,4.15,2.74Zm9.3-.29l6.37-3.07-5.27.16-1.1,2.91Zm4.19-10.95l-1.12,2.91,5.27-.17-4.15-2.74Zm5.72,3.81l-7.08.23-1.73-1.14,1.5-3.95-2.06-1.36-3.16,1.53-1.48,3.89-2.67,1.29-7.14.23-3.16,1.53,2.07,1.36,7.13-.23h0s1.69,1.11,1.69,1.11l-1.51,3.98,2.06,1.36,3.16-1.53,1.5-3.95h0s2.56-1.24,2.56-1.24h0s7.23-.24,7.23-.24l3.16-1.53-2.06-1.36Zm-11.29,2.56c-.99.48-2.31.52-2.96.1-.65-.42-.37-1.15.62-1.63.99-.48,2.31-.52,2.96-.1.65.42.37,1.15-.62,1.63Z"/>
</g>
<g id="_2" data-name="2">
<path class="cls-1" d="m33.5,19.84l-1.26-6.51-1.46,1.88,2.72,4.63Zm-6.05-14.69l-4.16-2.74,2.71,4.64,1.45-1.89Zm-6.73.58l1.26,6.51,1.46-1.88-2.72-4.63Zm6.05,14.69l4.16,2.74-2.71-4.64-1.45,1.89Zm7.19,1.91l-3.63-6.2h0s-.53-2.74-.53-2.74l1.96-2.56-.63-3.23-2.07-1.36-1.96,2.56-1.69-1.11-3.71-6.33-2.07-1.36.63,3.23,3.68,6.28h0s.51,2.62.51,2.62h0s-1.99,2.6-1.99,2.6l.63,3.23,2.06,1.36,1.95-2.54,1.73,1.14,3.69,6.29,2.07,1.36-.63-3.23Zm-6.47-7.7c-.65-.42-1.33-1.59-1.52-2.6-.2-1.01.17-1.49.81-1.06.65.42,1.33,1.59,1.52,2.6.2,1.01-.17,1.49-.81,1.06Z"/>
</g>
<g id="_1" data-name="1">
<path class="cls-2" d="m11.96,2.82l-6.37,3.07,3.81,1.74,2.55-4.81ZM1.07,14.37l1.26,6.53,2.56-4.8-3.82-1.73Zm7.99,9.59l6.37-3.07-3.81-1.74-2.55,4.81Zm10.89-11.55l-1.26-6.53-2.56,4.8,3.82,1.73Zm.45,2.53l-5.13-2.32h0s-.53-2.71-.53-2.71l3.47-6.53-.63-3.24-3.16,1.53-3.42,6.43-2.67,1.29h0s-5.17-2.34-5.17-2.34l-3.16,1.53.63,3.24,5.17,2.33.51,2.65h0s-3.49,6.57-3.49,6.57l.63,3.24,3.16-1.53,3.46-6.52,2.56-1.24h0s5.24,2.37,5.24,2.37l3.16-1.53-.63-3.24Zm-9.52.24c-.99.48-1.95.04-2.14-.97-.2-1.01.44-2.22,1.43-2.69.99-.48,1.95-.04,2.14.97.2,1.01-.44,2.22-1.43,2.7Z"/>
</g>
</g>
</g>
</svg>
@@ -1,6 +1,6 @@
# Installation

1. Use the following commands to prepare the OpenCompass environment:
1. Prepare the OpenCompass runtime environment:

```bash
conda create --name opencompass python=3.10 pytorch torchvision pytorch-cuda -c nvidia -c pytorch -y

@@ -12,14 +12,17 @@ conda activate opencompass

2. Install OpenCompass:

```bash
git clone https://github.com/opencompass/opencompass
git clone https://github.com/InternLM/opencompass.git
cd opencompass
pip install -e .
```

3. Install humaneval (optional)
3. Install humaneval (optional):

If you need to run evaluation on the humaneval dataset, execute this step; otherwise skip it.
If you need to **evaluate your model's coding ability on the humaneval dataset**, execute this step; otherwise skip it.

<details>
<summary><b>click to show the details</b></summary>

```bash
git clone https://github.com/openai/human-eval.git

@@ -31,96 +34,138 @@ cd ..

Please read the comments in `human_eval/execution.py` **lines 48-57** carefully to understand the potential risks of executing model-generated code. If you accept these risks, uncomment **line 58** to enable the code-execution evaluation.

</details>

# Dataset Preparation

The datasets supported by OpenCompass mainly include two parts:

1. Huggingface datasets: [Huggingface Datasets](https://huggingface.co/datasets) provides a large number of datasets, which will be **downloaded automatically** at runtime.

2. Self-built and third-party datasets: OpenCompass also provides some third-party datasets and self-built **Chinese** datasets. Run the following commands to **download and extract** them manually.

Run the following commands in the OpenCompass project root directory to place the datasets in the '${OpenCompass}/data' directory:

```bash
wget https://github.com/InternLM/opencompass/releases/download/0.1.0/OpenCompassData.zip
unzip OpenCompassData.zip
```

OpenCompass already supports most of the datasets commonly used for performance comparison; for the specific list of supported datasets, look directly under `configs/datasets`.
# Quick Start

Starting a simple evaluation task generally requires three steps:
OpenCompass evaluation is centered on configuration files, which must contain the `datasets` and `models` fields to specify the models and datasets to be evaluated; the entry point 'run.py' is used to launch the task.

1. **Prepare datasets and their configurations** - [`configs/datasets`](https://github.com/open-mmlab/OpenCompass/tree/main/configs/datasets) provides more than 50 datasets already supported by OpenCompass.
2. **Prepare model configurations** - [`configs/models`](https://github.com/open-mmlab/OpenCompass/tree/main/configs/models) provides examples of the supported large models, including HuggingFace-based models and API models such as ChatGPT.
3. **Launch with the `run` script** - a single command can start an evaluation locally or on Slurm, and multiple datasets and multiple models can be tested at once.
We will use testing the pretrained base models [OPT-125M](https://huggingface.co/facebook/opt-125m) and [OPT-350M](https://huggingface.co/facebook/opt-350m) on [SIQA](https://huggingface.co/datasets/social_i_qa) and [Winograd](https://huggingface.co/datasets/winogrande) as an example to walk you through some basic features of OpenCompass.
The configuration file for this test is [configs/eval_demo.py](https://github.com/InternLM/opencompass/blob/main/configs/eval_demo.py).

We will use testing the LLaMA-7B pretrained base model on SIQA and PIQA as an example to walk you through some basic features of OpenCompass. Before running,
please make sure you have installed OpenCompass and that your machine or cluster has GPU computing resources meeting the minimum requirements of LLaMA-7B.
Make sure OpenCompass is installed before running; this experiment can run successfully on a single _GTX-1660-6G_ GPU.
For models with more parameters, such as Llama-7B, refer to the other examples in [configs](https://github.com/InternLM/opencompass/tree/main/configs).

Use the following command to start the evaluation task locally (the run needs internet access to automatically download the datasets and models; the model download can be slow):

```bash
python run.py configs/eval_llama_7b.py --debug
python run.py configs/eval_demo.py
```

Below is a detailed step-by-step explanation of this case.
While the demo is running, let's take a closer look at the configuration content and launch options used in this case.

## Detailed Steps
## Step by Step

<details>
<summary>prepare datasets and their configurations</summary>

Since [siqa](https://huggingface.co/datasets/siqa) and [piqa](https://huggingface.co/datasets/piqa) support automatic downloading, there is no need to download the datasets manually here, but some other datasets may require manual downloading; see the documentation [Prepare Datasets](./user_guides/dataset_prepare.md) for details.

Create a '.py' configuration file and add the following content:
<summary><b>Dataset list `datasets`</b></summary>

```python
from mmengine.config import read_base  # use mmengine's config mechanism

with read_base():
    # Read the required dataset configurations directly from the preset dataset configurations
    from .datasets.piqa.piqa_ppl import piqa_datasets
    from .datasets.winograd.winograd_ppl import winograd_datasets
    from .datasets.siqa.siqa_gen import siqa_datasets

datasets = [*piqa_datasets, *siqa_datasets]  # the final config must contain the list of datasets to be evaluated
datasets = [*siqa_datasets, *winograd_datasets]  # the final config must contain the list of datasets to be evaluated
```

[configs/datasets](https://github.com/InternLM/OpenCompass/blob/main/configs/datasets) contains predefined configuration files for various datasets; for example, the [piqa](https://github.com/InternLM/OpenCompass/blob/main/configs/) folder holds piqa definitions with different prompt versions, where `ppl` means discriminative evaluation and `gen` means generative evaluation. [configs/datasets/collections](https://github.com/InternLM/OpenCompass/blob/main/configs/datasets/collections) stores various dataset collections for convenient comprehensive evaluation.
[configs/datasets](https://github.com/InternLM/OpenCompass/blob/main/configs/datasets) contains predefined configuration files for various datasets;
some dataset folders contain two kinds of configuration files, 'ppl' and 'gen', indicating the evaluation method used, where `ppl` means discriminative evaluation and `gen` means generative evaluation.
[configs/datasets/collections](https://github.com/InternLM/OpenCompass/blob/main/configs/datasets/collections) stores various dataset collections for convenient comprehensive evaluation.

For more information, see [Configure Datasets](./user_guides/dataset_prepare.md).

</details>

<details>
<summary>prepare models</summary>
<summary><b>Model list `models`</b></summary>

[configs/models](https://github.com/InternLM/OpenCompass/blob/main/configs/models) contains examples of many supported models, such as gpt3.5 and hf_llama.

The HuggingFace model 'huggyllama/llama-7b' supports automatic downloading; add the following content to the configuration file:
The HuggingFace models 'facebook/opt-350m' and 'facebook/opt-125m' support automatic weight downloading, so there is no need to download the weights separately:

```python
from opencompass.models import HuggingFaceCausalLM  # interface for using HuggingFaceCausalLM models directly

llama_7b = dict(
# OPT-350M
opt350m = dict(
    type=HuggingFaceCausalLM,
    # the following are initialization parameters for HuggingFaceCausalLM
    path='huggyllama/llama-7b',
    tokenizer_path='huggyllama/llama-7b',
    tokenizer_kwargs=dict(padding_side='left', truncation_side='left'),
    path='facebook/opt-350m',
    tokenizer_path='facebook/opt-350m',
    tokenizer_kwargs=dict(
        padding_side='left',
        truncation_side='left',
        proxies=None,
        trust_remote_code=True),
    model_kwargs=dict(device_map='auto'),
    max_seq_len=2048,
    # the following parameters are common to all models and are not HuggingFaceCausalLM initialization parameters
    abbr='llama-7b',  # model abbreviation, used for result display
    # the following parameters are common to all models and are not HuggingFaceCausalLM initialization parameters
    abbr='opt350m',  # model abbreviation, used for result display
    max_out_len=100,  # maximum number of generated tokens
    batch_size=16,  # batch size
    batch_size=64,  # batch size
    run_cfg=dict(num_gpus=1),  # run configuration, used to specify resource requirements
)

models = [llama_7b]  # the final config must contain the list of models
# OPT-125M
opt125m = dict(
    type=HuggingFaceCausalLM,
    # the following are initialization parameters for HuggingFaceCausalLM
    path='facebook/opt-125m',
    tokenizer_path='facebook/opt-125m',
    tokenizer_kwargs=dict(
        padding_side='left',
        truncation_side='left',
        proxies=None,
        trust_remote_code=True),
    model_kwargs=dict(device_map='auto'),
    max_seq_len=2048,
    # the following parameters are common to all models and are not HuggingFaceCausalLM initialization parameters
    abbr='opt125m',  # model abbreviation, used for result display
    max_out_len=100,  # maximum number of generated tokens
    batch_size=128,  # batch size
    run_cfg=dict(num_gpus=1),  # run configuration, used to specify resource requirements
)

models = [opt350m, opt125m]
```
</details>

<details>
<summary>Launch Evaluation</summary>
<summary><b>Launch Evaluation</b></summary>

First, we can start the task in debug mode to check whether any exceptions occur in model loading or dataset reading, such as the cache not being read correctly.

```shell
python run.py configs/eval_llama_7b.py -w outputs/llama --debug
python run.py configs/eval_demo.py -w outputs/demo --debug
```

However, in `--debug` mode tasks are executed one by one in sequence, so once everything checks out you can turn off `--debug` mode to let the program make full use of multiple GPUs.

```shell
python run.py configs/eval_llama_7b.py -w outputs/llama
python run.py configs/eval_demo.py -w outputs/demo
```

Here are some evaluation-related parameters that can help you configure more efficient inference tasks based on your environment:

- `-w outputs/llama`: directory for saving evaluation logs and results
- `-w outputs/demo`: directory for saving evaluation logs and results
- `-r`: restart the previous (interrupted) evaluation
- `--mode all`: specify a particular stage of the task
  - all: run all stages of the evaluation, including inference and evaluation

@@ -136,6 +181,12 @@ python run.py configs/eval_llama_7b.py -w outputs/llama
- `--partition(-p) my_part`: Slurm cluster partition
- `--retry 2`: number of retries for failed tasks

The entry also supports submitting tasks to Alibaba Deep Learning Center (DLC), and more customized evaluation strategies. Please refer to [Launching an Evaluation Task](./user_guides/experimentation.md#launching-an-evaluation-task) for details.

```{tip}
This script also supports submitting tasks to the Alibaba Cloud Deep Learning Center (DLC), as well as more customized evaluation strategies. Please refer to [Launching an Evaluation Task](./user_guides/experimentation.md#评测任务发起) for more details.
```

</details>

## Evaluation Results

@@ -143,32 +194,39 @@ python run.py configs/eval_llama_7b.py -w outputs/llama

After the evaluation is complete, the evaluation results table will be printed as follows:

```text
dataset    version    metric    mode    llama-7b
---------  ---------  --------  ------  ----------
piqa       1cf9f0     accuracy  ppl     77.75
siqa       e78df3     accuracy  gen     36.08
dataset    version    metric    mode    opt350m    opt125m
---------  ---------  --------  ------  ---------  ---------
siqa       e78df3     accuracy  gen     21.55      12.44
winograd   b6c7ed     accuracy  ppl     51.23      49.82
```

All run results are placed in the `outputs/default/` directory by default; the directory structure is shown below:
All logs, predictions, and final results of each run are placed in the `outputs/default/` directory by default. The directory structure is shown below:

```text
outputs/default/
├── 20200220_120000
├── ...
├── 20230220_183030
│   ├── configs
│   ├── logs
├── 20230220_183030     # one experiment
│   ├── configs         # reproducible configs
│   ├── logs            # logs
│   │   ├── eval
│   │   └── infer
│   ├── predictions
│   │   └── MODEL1
│   └── results
│       └── MODEL1
│   ├── predictions     # inference results, one entry per data point
│   └── results         # evaluation conclusions, the numerical results of one evaluation experiment
├── ...
```

Each timestamp folder contains the following:
Each timestamp folder represents one experiment and contains the following:

- configs folder, which stores the configuration files corresponding to each run that uses this timestamp as its output directory;
- logs folder, which stores the output log files of the inference and evaluation stages; each folder stores logs in subfolders by model;
- predictions folder, which stores the inference json results, with a subfolder per model;
- results folder, which stores the evaluation json results, with a subfolder per model
- 'configs': stores the reproducible configuration files;
- 'logs': stores the log files of the **inference** and **evaluation** stages;
- 'predictions': stores the inference results in json format;
- 'results': stores the final summary of the evaluation results.

## More Tutorials

To learn more about OpenCompass, follow the links below.

- [How to configure datasets](./user_guides/dataset_prepare.md)
- [How to customize models](./user_guides/models.md)
- [A deeper look at launching experiments](./user_guides/experimentation.md)
- [How to tune prompts](./prompt/overview.md)
@@ -71,8 +71,8 @@ OpenCompass getting-started roadmap
.. toctree::
   :caption: Switch Language

   English <https://mmpretrain.readthedocs.io/en/latest/>
   简体中文 <https://mmpretrain.readthedocs.io/zh_CN/latest/>
   English <https://OpenCompass.readthedocs.io/en/latest/>
   简体中文 <https://OpenCompass.readthedocs.io/zh_CN/latest/>


Indexes and Tables
@@ -1,12 +1,12 @@
# Dataset Preparation and Selection
# Configure Datasets

This section of the tutorial mainly focuses on how to prepare the datasets supported by OpenCompass and how to build the configuration files needed to complete dataset selection.
This section of the tutorial mainly focuses on how to build the configuration files needed to complete dataset selection.

## Directory Structure of Dataset Configuration Files

First, let us briefly introduce the structure under the OpenCompass `configs/datasets` directory, as shown below:

```
```text
configs/datasets/
├── agieval
├── apps

@@ -31,26 +31,6 @@ configs/datasets/

In addition, files without a version number, e.g. `CLUE_afqmc_gen.py`, point to the latest prompt configuration file for that evaluation method, which is usually the prompt with the highest accuracy.

## Dataset Preparation

The datasets supported by OpenCompass mainly include two parts:

1. Huggingface datasets

[Huggingface Datasets](https://huggingface.co/datasets) provides a large number of datasets. OpenCompass already supports most of the datasets commonly used for performance comparison; for the specific list of supported datasets, look directly under `configs/dataset`.

2. Third-party datasets

In addition to the datasets already available on Huggingface, OpenCompass also provides some third-party datasets and self-built Chinese datasets. Run the following commands to download them all and place them in the `./data` directory to complete dataset preparation.

```bash
# Run in the OpenCompass directory
wget https://github.com/InternLM/opencompass/releases/download/0.1.0/OpenCompassData.zip
unzip OpenCompassData.zip
```

Note that the repo contains not only the self-built datasets but also, for convenience, some datasets already supported by HF to make testing easier.

## Dataset Selection

In each dataset configuration file, the datasets are defined in a `{}_datasets` variable, for example `afqmc_datasets` in `CLUE_afqmc/CLUE_afqmc_gen_db509b.py` below.
@@ -5,7 +5,7 @@

The program entry for the evaluation task is `run.py`; its usage is as follows:

```shell
run.py {--slrum | --dlc | None} $Config [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l LARK]
python run.py $Config {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l]
```

Launch methods:

@@ -17,19 +17,19 @@ run.py {--slrum | --dlc | None} $Config [-p PARTITION] [-q QUOTATYPE] [--debug]

The parameters are explained as follows:

- -p specify the slurm partition;
- -q specify the slurm quotatype (default is auto), with optional values reserved, auto, spot;
- --debug when enabled, inference and evaluation tasks run in single-process mode and the output is echoed in real time, which is convenient for debugging;
- -m run mode, default is all. It can be set to infer to only run inference and obtain output results; if there are already model outputs in {WORKDIR}, it can be set to eval to only run evaluation and obtain evaluation results; if individual evaluation results already exist in results, it can be set to viz to only run visualization; if set to all, inference and evaluation both run.
- -r reuse existing inference results. If followed by a timestamp, the result under that timestamp in the working path is reused; otherwise, the latest result in the specified working path is reused.
- -w specify the working path, default is ./outputs/default
- -l enable status reporting via the Lark bot.
- `-p`: specify the slurm partition;
- `-q`: specify the slurm quotatype (default is None), with optional values reserved, auto, spot. This parameter may only apply to some slurm variants;
- `--debug`: when enabled, inference and evaluation tasks run in single-process mode and the output is echoed in real time, which is convenient for debugging;
- `-m`: run mode, default is `all`. It can be set to `infer` to only run inference and obtain output results; if there are already model outputs in `{WORKDIR}`, it can be set to `eval` to only run evaluation and obtain evaluation results; if individual evaluation results already exist in `results/`, it can be set to `viz` to only run visualization; if set to `all`, inference and evaluation both run.
- `-r`: reuse existing inference results. If followed by a timestamp, the result under that timestamp in the working path is reused; otherwise, the latest result in the specified working path is reused.
- `-w`: specify the working path, default is `./outputs/default`
- `-l`: enable status reporting via the Lark bot.

Taking run mode `-m all` as an example, the overall execution flow is as follows:

1. Read the configuration file and parse out the model, dataset, evaluator, and other configuration information
2. The evaluation task is mainly divided into three stages: inference infer, evaluation eval, and visualization viz; after being split by the Partitioner, the inference and evaluation tasks are handed over to the Runner for parallel execution. Individual inference and evaluation tasks are abstracted into OpenICLInferTask and OpenICLEvalTask.
3. After the two stages finish, the visualization stage reads the evaluation results in results and generates a visualization report.
2. The evaluation task is mainly divided into three stages: inference `infer`, evaluation `eval`, and visualization `viz`; after being split by the Partitioner, the inference and evaluation tasks are handed over to the Runner for parallel execution. Individual inference and evaluation tasks are abstracted into `OpenICLInferTask` and `OpenICLEvalTask`.
3. After the two stages finish, the visualization stage reads the evaluation results in `results/` and generates a visualization report.
## Task Monitoring: Lark Bot

@@ -39,31 +39,27 @@ run.py {--slrum | --dlc | None} $Config [-p PARTITION] [-q QUOTATYPE] [--debug]

1. Open the `configs/lark.py` file and add the following line to it:

```python
lark_bot_url = 'YOUR_WEBHOOK_URL'
```
```python
lark_bot_url = 'YOUR_WEBHOOK_URL'
```

Typically, the Webhook URL looks like https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxxxxxxxxx .
Typically, the Webhook URL looks like https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxxxxxxxxx .

2. Inherit this file in the complete evaluation configuration:

```python
```python
from mmengine.config import read_base

with read_base():
    from .lark import lark_bot_url

```
```

3. To keep the bot from becoming a nuisance by sending messages too frequently, run-time status is not reported automatically by default. When needed, status reporting can be enabled with `-l` or `--lark`:

```bash
python run.py configs/eval_demo.py -p {PARTITION} -l
```

## Introduction of the Summarizer

It is mainly used to visualize evaluation results.
```bash
python run.py configs/eval_demo.py -p {PARTITION} -l
```

## Run Results

@@ -86,9 +82,11 @@ outputs/default/

Each timestamp folder contains the following:

- configs folder, which stores the configuration files corresponding to each run that uses this timestamp as its output directory;
- logs folder, which stores the output log files of the inference and evaluation stages; each folder stores logs in subfolders by model;
- predictions folder, which stores the inference json results, with a subfolder per model;
- results folder, which stores the evaluation json results, with a subfolder per model
- configs folder, which stores the configuration files corresponding to each run that uses this timestamp as its output directory;
- logs folder, which stores the output log files of the inference and evaluation stages; each folder stores logs in subfolders by model;
- predictions folder, which stores the inference json results, with a subfolder per model;
- results folder, which stores the evaluation json results, with a subfolder per model

In addition, any `-r` without a corresponding timestamp will select the newest folder (by sorting) as the output directory.

## Introduction of the Summarizer (to be updated)