[Update] Update OlympiadBench and Update LLM Judge (#1954)

Songyang Zhang 2025-03-18 20:15:20 +08:00 committed by GitHub
parent 5d2d253d83
commit c98599271b
5 changed files with 87 additions and 3 deletions


@@ -34,6 +34,23 @@ problem,answer
## Configuration
### Using LLM for Evaluation via Command Line
Some datasets in OpenCompass already include LLM judge configurations.
You need a model service (such as the official OpenAI or DeepSeek API) or a model service deployed locally with a tool such as LMDeploy, vLLM, or SGLang.
Then, set the environment variables for the evaluation service and evaluate models with the following commands:
```bash
export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct
export OC_JUDGE_API_KEY=sk-1234
export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
```
Note that OpenCompass uses these three environment variables by default; if you configure the evaluation service through configuration files instead, they will not take effect.
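With the variables exported, an evaluation run is launched as usual. For example (a hypothetical invocation; the model and dataset config names below are placeholders):
```bash
# hypothetical run: substitute your own model and dataset configs
opencompass --models hf_internlm2_5_7b_chat --datasets olympiadbench_llmjudge_gen
```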
### Using LLM for Evaluation via Configuration Files
To set up an LLM judge evaluation, you'll need to configure three main components:
1. Dataset Reader Configuration


@@ -34,7 +34,24 @@ problem,answer
## Configuration
To set up an LLM judge evaluation, you need to configure three main components:
### Using LLM for Evaluation via Command Line
Some datasets in OpenCompass already include LLM judge configurations.
You need a model service (such as the official OpenAI or DeepSeek API) or a model service deployed locally with a tool such as LMDeploy, vLLM, or SGLang.
Then, set the environment variables for the evaluation service and evaluate models with the following commands:
```bash
export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct
export OC_JUDGE_API_KEY=sk-1234
export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
```
Note that OpenCompass uses these three environment variables by default; if you configure the evaluation service through configuration files instead, they will not take effect.
### Using LLM for Evaluation via Configuration Files
To set up an LLM judge evaluation for a dataset, you need to configure three main components:
1. Dataset reader configuration


@@ -16,7 +16,17 @@ math_categories = [
    'OE_TO_maths_zh_CEE',  # OpenEnded - TextOnly - maths - CEE
]
physics_categories = [
    'OE_TO_physics_en_COMP',  # OpenEnded - TextOnly - physics - COMP
    'OE_TO_physics_zh_CEE'  # OpenEnded - TextOnly - physics - CEE
]
OlympiadBenchMath_summary_groups = [
    {'name': 'OlympiadBenchMath', 'subsets': ['OlympiadBench_' + c.replace(' ', '_') for c in math_categories]},
]
OlympiadBenchPhysics_summary_groups = [
    {'name': 'OlympiadBenchPhysics', 'subsets': ['OlympiadBench_' + c.replace(' ', '_') for c in physics_categories]},
]
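A summarizer can then reference these groups to fold the per-subset accuracies into the two aggregate scores. A minimal sketch of that wiring (assumed for illustration; it is not part of this diff):
```python
# assumed usage: report only the two aggregates, computed from the groups above
summarizer = dict(
    dataset_abbrs=['OlympiadBenchMath', 'OlympiadBenchPhysics'],
    summary_groups=(OlympiadBenchMath_summary_groups +
                    OlympiadBenchPhysics_summary_groups),
)
```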


@@ -1,3 +1,4 @@
import os
import os.path as osp
from typing import Dict, List, Optional
@@ -36,7 +37,11 @@ class GenericLLMEvaluator(BaseEvaluator):
    ) -> None:
        self.logger = get_logger()
        # If judge_cfg is not provided, fall back to the default configuration
        if not judge_cfg:
            self.judge_cfg = self.default_judge_cfg
        else:
            self.judge_cfg = judge_cfg
        self.output_path = ''
        self.prompt_template = ICL_PROMPT_TEMPLATES.build(prompt_template)
@@ -141,3 +146,30 @@ class GenericLLMEvaluator(BaseEvaluator):
        kwargs = self.dict_postprocessor
        proc = DICT_POSTPROCESSORS.get(kwargs.pop('type'))
        return proc(output, self.output_path, **kwargs)
    @property
    def default_judge_cfg(self):
        from opencompass.models import OpenAISDK
        DEFAULT_JUDGE_CFG = dict(
            type=OpenAISDK,
            path=os.environ['OC_JUDGE_MODEL'],
            key=os.environ['OC_JUDGE_API_KEY'],
            openai_api_base=[
                os.environ.get('OC_JUDGE_API_BASE',
                               'https://api.openai.com/v1/')
            ],
            meta_template=dict(round=[
                dict(role='HUMAN', api_role='HUMAN'),
                dict(role='BOT', api_role='BOT', generate=True),
            ], ),
            query_per_second=16,
            batch_size=1024,
            temperature=0.001,
            tokenizer_path='gpt-4o-2024-05-13',
            verbose=True,
            max_out_len=16384,
            max_seq_len=49152,
        )
        return DEFAULT_JUDGE_CFG
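The fallback is driven entirely by the `OC_JUDGE_*` variables documented above; a minimal sketch of exercising it (values are placeholders, and the surrounding evaluator construction is elided):
```python
import os

# placeholders: point these at a real judge service before evaluating
os.environ.setdefault('OC_JUDGE_MODEL', 'Qwen/Qwen2.5-32B-Instruct')
os.environ.setdefault('OC_JUDGE_API_KEY', 'sk-1234')
os.environ.setdefault('OC_JUDGE_API_BASE', 'http://172.30.56.1:4000/v1')

# building GenericLLMEvaluator with no judge_cfg now resolves the judge
# from default_judge_cfg, which reads the three variables set above
```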


@@ -313,6 +313,14 @@ def change_accelerator(models, accelerator):
                stop_words=model.get('stop_words', []),
            )
        elif accelerator == 'lmdeploy':
            if model.get('generation_kwargs') is not None:
                logger.warning('LMDeploy uses do_sample=False by default; set do_sample=True in generation_kwargs to enable sampling')
                gen_config = model['generation_kwargs'].copy()
            else:
                logger.info('OpenCompass uses greedy decoding by default; set generation_kwargs to override it')
                gen_config = dict(top_k=1, temperature=1e-6, top_p=0.9)
            mod = TurboMindModelwithChatTemplate
            acc_model = dict(
                type=f'{mod.__module__}.{mod.__name__}',
@@ -324,7 +332,7 @@ def change_accelerator(models, accelerator):
                    session_len=model.get('max_seq_len', None),
                    max_new_tokens=model['max_out_len']
                ),
                gen_config=gen_config,
                max_seq_len=model.get('max_seq_len', None),
                max_out_len=model['max_out_len'],
                batch_size=16,
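With this change, a model entry opts into sampling by carrying its own generation_kwargs, which are copied into gen_config verbatim. A hypothetical entry (names and values are illustrative):
```python
# hypothetical model entry: do_sample/temperature/top_p here override the
# greedy defaults (top_k=1, temperature=1e-6) used when no
# generation_kwargs are given
model = dict(
    abbr='qwen2.5-7b-instruct',
    path='Qwen/Qwen2.5-7B-Instruct',
    max_seq_len=8192,
    max_out_len=2048,
    generation_kwargs=dict(do_sample=True, temperature=0.7, top_p=0.9),
)
```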