Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
[Update] Update OlympiadBench and Update LLM Judge (#1954)
This commit is contained in:
parent 5d2d253d83
commit c98599271b
@@ -34,6 +34,23 @@ problem,answer
 
 ## Configuration
 
+### Using LLM for Evaluation via Command Line
+
+Some datasets in OpenCompass already include LLM judge configurations.
+You need to use a model service (such as the official OpenAI or DeepSeek API) or start a model service locally with tools such as LMDeploy, vLLM, or SGLang.
+
+Then you can set the environment variables for the evaluation service and evaluate models with the following commands:
+
+```bash
+export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct
+export OC_JUDGE_API_KEY=sk-1234
+export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
+```
+
+Note that OpenCompass uses these three environment variables by default; if you configure the evaluation service through configuration files instead, these environment variables will not take effect.
+
+### Using LLM for Evaluation via Configuration Files
+
 To set up an LLM judge evaluation, you'll need to configure three main components:
 
 1. Dataset Reader Configuration
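As a sketch of that first component, a reader configuration matching the `problem,answer` CSV layout referenced in the hunk header could look like this (a minimal illustration, not part of this commit; the variable name is arbitrary):

```python
# Minimal dataset reader sketch: 'problem' is fed to the model as input,
# 'answer' is kept as the gold reference for the judge to compare against.
reader_cfg = dict(
    input_columns=['problem'],
    output_column='answer',
)
```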
@@ -34,7 +34,24 @@ problem,answer
 
 ## Configuration
 
-To set up an LLM judge evaluation, you need to configure three main components:
+### Using LLM for Evaluation via Command Line
+
+Some datasets in OpenCompass already include LLM judge configurations.
+You need to use a model service (such as the official OpenAI or DeepSeek API) or start a model service locally with tools such as LMDeploy, vLLM, or SGLang.
+
+Then you can set the environment variables for the evaluation service and evaluate models with the following commands:
+
+```bash
+export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct
+export OC_JUDGE_API_KEY=sk-1234
+export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
+```
+
+Note that OpenCompass uses these three environment variables by default; if you configure the evaluation service through configuration files instead, these environment variables will not take effect.
+
+### Using LLM for Evaluation via Configuration Files
+
+To set up an LLM judge evaluation for a dataset, you need to configure three main components:
 
 1. Dataset Reader Configuration
 
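Connecting these docs to the code changes below, an evaluation config that wires in the LLM judge might be sketched as follows; the import paths, prompt text, and placeholder names are assumptions for illustration, not taken from this commit:

```python
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.evaluator import GenericLLMEvaluator  # assumed import path

# Sketch of an eval config: GenericLLMEvaluator scores each prediction
# against the gold answer. judge_cfg is omitted on purpose so that the
# new OC_JUDGE_* environment-variable fallback (see below) kicks in.
eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN',
                     prompt='Judge whether the prediction answers the problem correctly.\n'
                            'Problem: {problem}\nGold answer: {answer}\nPrediction: {prediction}'),
            ]),
        ),
        # judge_cfg omitted: falls back to default_judge_cfg
    ),
)
```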
@@ -16,7 +16,17 @@ math_categories = [
     'OE_TO_maths_zh_CEE',  # OpenEnded - TextOnly - maths - CEE
 ]
 
+physics_categories = [
+    'OE_TO_physics_en_COMP',  # OpenEnded - TextOnly - physics - COMP
+    'OE_TO_physics_zh_CEE',  # OpenEnded - TextOnly - physics - CEE
+]
+
 
 OlympiadBenchMath_summary_groups = [
     {'name': 'OlympiadBenchMath', 'subsets': ['OlympiadBench_' + c.replace(' ', '_') for c in math_categories]},
 ]
+
+
+OlympiadBenchPhysics_summary_groups = [
+    {'name': 'OlympiadBenchPhysics', 'subsets': ['OlympiadBench_' + c.replace(' ', '_') for c in physics_categories]},
+]
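For reference, the new groups would typically be consumed by a summarizer config along these lines (an illustrative sketch, not part of this commit, assuming the two group lists above are in scope):

```python
# Sketch: aggregate the per-subset OlympiadBench results into the two
# new group scores when summarizing a run.
summarizer = dict(
    dataset_abbrs=['OlympiadBenchMath', 'OlympiadBenchPhysics'],
    summary_groups=OlympiadBenchMath_summary_groups + OlympiadBenchPhysics_summary_groups,
)
```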
@@ -1,3 +1,4 @@
+import os
 import os.path as osp
 from typing import Dict, List, Optional
 
@@ -36,7 +37,11 @@ class GenericLLMEvaluator(BaseEvaluator):
     ) -> None:
 
         self.logger = get_logger()
-        self.judge_cfg = judge_cfg
+        # If judge_cfg is not provided, fall back to the default configuration
+        if not judge_cfg:
+            self.judge_cfg = self.default_judge_cfg
+        else:
+            self.judge_cfg = judge_cfg
         self.output_path = ''
 
         self.prompt_template = ICL_PROMPT_TEMPLATES.build(prompt_template)
@@ -141,3 +146,30 @@ class GenericLLMEvaluator(BaseEvaluator):
         kwargs = self.dict_postprocessor
         proc = DICT_POSTPROCESSORS.get(kwargs.pop('type'))
         return proc(output, self.output_path, **kwargs)
+
+    @property
+    def default_judge_cfg(self):
+        from opencompass.models import OpenAISDK
+
+        DEFAULT_JUDGE_CFG = dict(
+            type=OpenAISDK,
+            path=os.environ['OC_JUDGE_MODEL'],
+            key=os.environ['OC_JUDGE_API_KEY'],
+            openai_api_base=[
+                os.environ.get('OC_JUDGE_API_BASE',
+                               'https://api.openai.com/v1/')
+            ],
+            meta_template=dict(round=[
+                dict(role='HUMAN', api_role='HUMAN'),
+                dict(role='BOT', api_role='BOT', generate=True),
+            ], ),
+            query_per_second=16,
+            batch_size=1024,
+            temperature=0.001,
+            tokenizer_path='gpt-4o-2024-05-13',
+            verbose=True,
+            max_out_len=16384,
+            max_seq_len=49152,
+        )
+
+        return DEFAULT_JUDGE_CFG
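Since `path` and `key` are read with `os.environ[...]` rather than `.get(...)`, `OC_JUDGE_MODEL` and `OC_JUDGE_API_KEY` must be exported whenever no `judge_cfg` is supplied; only `OC_JUDGE_API_BASE` has a built-in default. A standalone sketch of that behavior (not from the commit):

```python
import os

# Only the API base falls back to a default; the other two raise KeyError.
base = os.environ.get('OC_JUDGE_API_BASE', 'https://api.openai.com/v1/')
try:
    model = os.environ['OC_JUDGE_MODEL']
    key = os.environ['OC_JUDGE_API_KEY']
except KeyError as missing:
    raise SystemExit(f'export {missing.args[0]} before running an LLM-judge evaluation')
```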
@@ -313,6 +313,14 @@ def change_accelerator(models, accelerator):
                 stop_words=model.get('stop_words', []),
             )
         elif accelerator == 'lmdeploy':
+
+            if model.get('generation_kwargs') is not None:
+                logger.warning('LMDeploy uses do_sample=False by default; set do_sample=True to enable sampling mode')
+                gen_config = model['generation_kwargs'].copy()
+            else:
+                logger.info('OpenCompass uses greedy decoding by default; set generation_kwargs to change this')
+                gen_config = dict(top_k=1, temperature=1e-6, top_p=0.9)
+
             mod = TurboMindModelwithChatTemplate
             acc_model = dict(
                 type=f'{mod.__module__}.{mod.__name__}',
@@ -324,7 +332,7 @@ def change_accelerator(models, accelerator):
                 session_len=model.get('max_seq_len', None),
                 max_new_tokens=model['max_out_len']
             ),
-            gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9),
+            gen_config=gen_config,
             max_seq_len=model.get('max_seq_len', None),
             max_out_len=model['max_out_len'],
             batch_size=16,
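With this change, a model entry that carries `generation_kwargs` keeps its sampling settings when converted to the LMDeploy backend instead of being forced into the greedy `gen_config`. An illustrative entry (names and values are placeholders, not from this commit):

```python
# Sketch: because generation_kwargs is present, change_accelerator copies
# it into gen_config; without it, greedy decoding (top_k=1) is used.
model = dict(
    abbr='qwen2.5-7b-instruct-hf',  # placeholder abbreviation
    path='Qwen/Qwen2.5-7B-Instruct',
    max_out_len=2048,
    generation_kwargs=dict(do_sample=True, temperature=0.7, top_p=0.95),
)
```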