[Update] Update OlympiadBench and Update LLM Judge (#1954)
parent 5d2d253d83
commit c98599271b
@@ -34,6 +34,23 @@ problem,answer

## Configuration

### Using LLM for Evaluation via Command Line

Some datasets in OpenCompass already include LLM judge configurations. You need a model service (such as the official OpenAI or DeepSeek API) or a model service started locally with tools like LMDeploy, vLLM, or SGLang.

Then you can set the environment variables for the evaluation service and evaluate models with the following commands:

```bash
export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct
export OC_JUDGE_API_KEY=sk-1234
export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
```

Note that by default OpenCompass reads these three environment variables; if you configure the evaluation service through a configuration file instead, the environment variables will not take effect.

### Using LLM for Evaluation via Configuration Files

To set up an LLM judge evaluation, you'll need to configure three main components (a hedged sketch follows this excerpt):

1. Dataset Reader Configuration
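To make the three components concrete, here is a minimal sketch of a dataset config wired to `GenericLLMEvaluator`. The module paths, the `CustomDataset` usage, and the judge prompt are assumptions based on common OpenCompass conventions, not part of this commit; adjust them to your version:

```python
# Hedged sketch of the three components; paths and prompt are assumptions.
from opencompass.datasets import CustomDataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever

# 1. Dataset reader: maps CSV columns to model input and reference answer.
reader_cfg = dict(input_columns=['problem'], output_column='answer')

# 2. Inference: the prompt sent to the model under evaluation.
infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt='{problem}')]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# 3. Evaluation: GenericLLMEvaluator sends a judge prompt; with no judge_cfg
# given, the OC_JUDGE_* environment variables supply the judge model.
eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[dict(
                role='HUMAN',
                prompt=('Question: {problem}\n'
                        'Reference answer: {answer}\n'
                        'Prediction: {prediction}\n'
                        'Is the prediction correct? Answer "A" for yes '
                        'or "B" for no.'),
            )]),
        ),
    ),
)

datasets = [dict(
    abbr='my-csv-dataset',           # hypothetical name
    type=CustomDataset,
    path='data/my_dataset.csv',      # hypothetical path
    reader_cfg=reader_cfg,
    infer_cfg=infer_cfg,
    eval_cfg=eval_cfg,
)]
```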
@@ -34,7 +34,24 @@ problem,answer

## Configuration

To set up an LLM judge evaluation, you need to configure three main components:

### Using LLM for Evaluation via Command Line

Some datasets in OpenCompass already include LLM judge configurations. You need a model service (such as the official OpenAI or DeepSeek API) or a model service started locally with tools like LMDeploy, vLLM, or SGLang.

Then you can set the environment variables for the evaluation service with the following commands and evaluate models:

```bash
export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct
export OC_JUDGE_API_KEY=sk-1234
export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
```

Note that by default OpenCompass reads these three environment variables; if you configure the evaluation service through a configuration file instead, the environment variables will not take effect.

### Using LLM for Evaluation via Configuration Files

To set up an LLM judge evaluation for a dataset, you need to configure three main components:

1. Dataset Reader Configuration
@@ -16,7 +16,17 @@ math_categories = [
    'OE_TO_maths_zh_CEE',    # OpenEnded - TextOnly - maths - CEE
]

physics_categories = [
    'OE_TO_physics_en_COMP', # OpenEnded - TextOnly - physics - COMP
    'OE_TO_physics_zh_CEE',  # OpenEnded - TextOnly - physics - CEE
]

OlympiadBenchMath_summary_groups = [
    {'name': 'OlympiadBenchMath',
     'subsets': ['OlympiadBench_' + c.replace(' ', '_') for c in math_categories]},
]

OlympiadBenchPhysics_summary_groups = [
    {'name': 'OlympiadBenchPhysics',
     'subsets': ['OlympiadBench_' + c.replace(' ', '_') for c in physics_categories]},
]
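For context, summary groups like these are consumed by a summarizer config, which averages the per-subset scores into one row per group. A minimal sketch, assuming the two group lists above are in scope:

```python
# Minimal sketch: report the grouped OlympiadBench scores.
summarizer = dict(
    dataset_abbrs=['OlympiadBenchMath', 'OlympiadBenchPhysics'],
    summary_groups=(OlympiadBenchMath_summary_groups +
                    OlympiadBenchPhysics_summary_groups),
)
```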
@@ -1,3 +1,4 @@
import os
import os.path as osp
from typing import Dict, List, Optional

@@ -36,7 +37,11 @@ class GenericLLMEvaluator(BaseEvaluator):
    ) -> None:

        self.logger = get_logger()
        # If judge_cfg is not provided, fall back to the default
        # configuration, built from the OC_JUDGE_* environment variables.
        if not judge_cfg:
            self.judge_cfg = self.default_judge_cfg
        else:
            self.judge_cfg = judge_cfg
        self.output_path = ''

        self.prompt_template = ICL_PROMPT_TEMPLATES.build(prompt_template)
@@ -141,3 +146,30 @@ class GenericLLMEvaluator(BaseEvaluator):
        kwargs = self.dict_postprocessor
        proc = DICT_POSTPROCESSORS.get(kwargs.pop('type'))
        return proc(output, self.output_path, **kwargs)

    @property
    def default_judge_cfg(self):
        from opencompass.models import OpenAISDK

        # Judge model, API key, and endpoint come from the OC_JUDGE_*
        # environment variables; only the API base has a fallback default.
        DEFAULT_JUDGE_CFG = dict(
            type=OpenAISDK,
            path=os.environ['OC_JUDGE_MODEL'],
            key=os.environ['OC_JUDGE_API_KEY'],
            openai_api_base=[
                os.environ.get('OC_JUDGE_API_BASE',
                               'https://api.openai.com/v1/')
            ],
            meta_template=dict(round=[
                dict(role='HUMAN', api_role='HUMAN'),
                dict(role='BOT', api_role='BOT', generate=True),
            ]),
            query_per_second=16,
            batch_size=1024,
            temperature=0.001,
            tokenizer_path='gpt-4o-2024-05-13',
            verbose=True,
            max_out_len=16384,
            max_seq_len=49152,
        )

        return DEFAULT_JUDGE_CFG
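If you configure the judge in a config file rather than through the environment variables, you pass an explicit `judge_cfg` mirroring the default above. A hedged sketch pointing at a locally served model; the endpoint URL, key, and model name are placeholders:

```python
from opencompass.models import OpenAISDK

# Hedged sketch: an explicit judge_cfg bypasses the OC_JUDGE_* variables.
judge_cfg = dict(
    type=OpenAISDK,
    path='Qwen/Qwen2.5-32B-Instruct',              # served model name (placeholder)
    key='EMPTY',                                   # local servers often ignore the key
    openai_api_base=['http://127.0.0.1:8000/v1'],  # placeholder endpoint
    meta_template=dict(round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]),
    query_per_second=4,
    batch_size=128,
    temperature=0.001,
    max_out_len=16384,
)
```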
@@ -313,6 +313,14 @@ def change_accelerator(models, accelerator):
            stop_words=model.get('stop_words', []),
        )
    elif accelerator == 'lmdeploy':
        if model.get('generation_kwargs') is not None:
            logger.warning('LMDeploy uses do_sample=False by default; set do_sample=True in generation_kwargs to enable sampling')
            gen_config = model['generation_kwargs'].copy()
        else:
            logger.info('OpenCompass uses greedy decoding by default; set generation_kwargs to change this')
            gen_config = dict(top_k=1, temperature=1e-6, top_p=0.9)

        mod = TurboMindModelwithChatTemplate
        acc_model = dict(
            type=f'{mod.__module__}.{mod.__name__}',
@@ -324,7 +332,7 @@ def change_accelerator(models, accelerator):
                session_len=model.get('max_seq_len', None),
                max_new_tokens=model['max_out_len']
            ),
            gen_config=gen_config,
            max_seq_len=model.get('max_seq_len', None),
            max_out_len=model['max_out_len'],
            batch_size=16,
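As the warning above implies, a model entry that should sample under the `lmdeploy` accelerator must carry explicit `generation_kwargs`, which this function copies into LMDeploy's `gen_config`. A hedged sketch; the entry's fields are illustrative:

```python
# Hedged sketch: generation_kwargs is copied into LMDeploy's gen_config,
# so do_sample=True here opts this model into sampling mode.
models = [dict(
    abbr='qwen2.5-7b-instruct',          # illustrative entry
    path='Qwen/Qwen2.5-7B-Instruct',
    max_out_len=2048,
    batch_size=16,
    generation_kwargs=dict(do_sample=True, temperature=0.7, top_p=0.9),
)]
```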