From c98599271be058e4c39f2e1a57b07e3f001caddd Mon Sep 17 00:00:00 2001
From: Songyang Zhang
Date: Tue, 18 Mar 2025 20:15:20 +0800
Subject: [PATCH] [Update] Update OlympiadBench and Update LLM Judge (#1954)

---
 docs/en/advanced_guides/llm_judge.md           | 17 ++++++++++
 docs/zh_cn/advanced_guides/llm_judge.md        | 19 ++++++++++-
 .../summarizers/groups/OlympiadBench.py        | 10 ++++++
 .../evaluator/generic_llm_evaluator.py         | 34 ++++++++++++++++++-
 opencompass/utils/run.py                       | 10 +++++-
 5 files changed, 87 insertions(+), 3 deletions(-)

diff --git a/docs/en/advanced_guides/llm_judge.md b/docs/en/advanced_guides/llm_judge.md
index 91a1a5bf..1d9e9760 100644
--- a/docs/en/advanced_guides/llm_judge.md
+++ b/docs/en/advanced_guides/llm_judge.md
@@ -34,6 +34,23 @@ problem,answer
 
 ## Configuration
 
+### Using LLM for Evaluation via Command Line
+
+Some datasets in OpenCompass already include LLM judge configurations.
+You need to use a model service (such as the official OpenAI or DeepSeek API) or start a model service locally with tools such as LMDeploy, vLLM, or SGLang.
+
+Then, you can set the environment variables for the judge service and evaluate models with the following commands:
+
+```bash
+export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct
+export OC_JUDGE_API_KEY=sk-1234
+export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
+```
+
+Note that OpenCompass uses these three environment variables by default; if you configure the judge service through configuration files instead, these variables will not take effect.
+
+### Using LLM for Evaluation via Configuration Files
+
 To set up an LLM judge evaluation, you'll need to configure three main components:
 
 1. Dataset Reader Configuration
diff --git a/docs/zh_cn/advanced_guides/llm_judge.md b/docs/zh_cn/advanced_guides/llm_judge.md
index 66d288a8..bc49696e 100644
--- a/docs/zh_cn/advanced_guides/llm_judge.md
+++ b/docs/zh_cn/advanced_guides/llm_judge.md
@@ -34,7 +34,24 @@ problem,answer
 
 ## Configuration
 
-To set up an LLM judge evaluation, you need to configure three main components:
+### Using LLM for Evaluation via Command Line
+
+Some datasets in OpenCompass already include LLM judge configurations.
+You need to use a model service (such as the official OpenAI or DeepSeek API) or start a model service locally with tools such as LMDeploy, vLLM, or SGLang.
+
+Then, you can set the environment variables for the judge service with the following commands and evaluate models:
+
+```bash
+export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct
+export OC_JUDGE_API_KEY=sk-1234
+export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
+```
+
+Note that OpenCompass uses these three environment variables by default; if you configure the judge service through configuration files instead, these variables will not take effect.
+
+### Using LLM for Evaluation via Configuration Files
+
+To set up an LLM judge evaluation for a dataset, you need to configure three main components:
 
 1. Dataset Reader Configuration
diff --git a/opencompass/configs/summarizers/groups/OlympiadBench.py b/opencompass/configs/summarizers/groups/OlympiadBench.py
index fc57f603..e30831ff 100644
--- a/opencompass/configs/summarizers/groups/OlympiadBench.py
+++ b/opencompass/configs/summarizers/groups/OlympiadBench.py
@@ -16,7 +16,17 @@ math_categories = [
     'OE_TO_maths_zh_CEE', # OpenEnded - TextOnly - maths - CEE
 ]
 
+physics_categories = [
+    'OE_TO_physics_en_COMP', # OpenEnded - TextOnly - physics - COMP
+    'OE_TO_physics_zh_CEE', # OpenEnded - TextOnly - physics - CEE
+]
+
 OlympiadBenchMath_summary_groups = [
     {'name': 'OlympiadBenchMath', 'subsets': ['OlympiadBench_' + c.replace(' ', '_') for c in math_categories]},
 ]
+
+
+OlympiadBenchPhysics_summary_groups = [
+    {'name': 'OlympiadBenchPhysics', 'subsets': ['OlympiadBench_' + c.replace(' ', '_') for c in physics_categories]},
+]
diff --git a/opencompass/evaluator/generic_llm_evaluator.py b/opencompass/evaluator/generic_llm_evaluator.py
index 131c2e75..c0b33a69 100644
--- a/opencompass/evaluator/generic_llm_evaluator.py
+++ b/opencompass/evaluator/generic_llm_evaluator.py
@@ -1,3 +1,4 @@
+import os
 import os.path as osp
 from typing import Dict, List, Optional
 
@@ -36,7 +37,11 @@ class GenericLLMEvaluator(BaseEvaluator):
     ) -> None:
         self.logger = get_logger()
 
-        self.judge_cfg = judge_cfg
+        # If judge_cfg is not provided, fall back to the default configuration.
+        if not judge_cfg:
+            self.judge_cfg = self.default_judge_cfg
+        else:
+            self.judge_cfg = judge_cfg
         self.output_path = ''
 
         self.prompt_template = ICL_PROMPT_TEMPLATES.build(prompt_template)
@@ -141,3 +146,30 @@
             kwargs = self.dict_postprocessor
         proc = DICT_POSTPROCESSORS.get(kwargs.pop('type'))
         return proc(output, self.output_path, **kwargs)
+
+    @property
+    def default_judge_cfg(self):
+        from opencompass.models import OpenAISDK
+
+        DEFAULT_JUDGE_CFG = dict(
+            type=OpenAISDK,
+            path=os.environ['OC_JUDGE_MODEL'],
+            key=os.environ['OC_JUDGE_API_KEY'],
+            openai_api_base=[
+                os.environ.get('OC_JUDGE_API_BASE',
+                               'https://api.openai.com/v1/')
+            ],
+            meta_template=dict(round=[
+                dict(role='HUMAN', api_role='HUMAN'),
+                dict(role='BOT', api_role='BOT', generate=True),
+            ], ),
+            query_per_second=16,
+            batch_size=1024,
+            temperature=0.001,
+            tokenizer_path='gpt-4o-2024-05-13',
+            verbose=True,
+            max_out_len=16384,
+            max_seq_len=49152,
+        )
+
+        return DEFAULT_JUDGE_CFG
diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py
index 2443b829..772c0a8a 100644
--- a/opencompass/utils/run.py
+++ b/opencompass/utils/run.py
@@ -313,6 +313,14 @@ def change_accelerator(models, accelerator):
                     stop_words=model.get('stop_words', []),
                 )
             elif accelerator == 'lmdeploy':
+
+                if model.get('generation_kwargs') is not None:
+                    logger.warning('LMDeploy uses do_sample=False by default; set do_sample=True in generation_kwargs to enable sampling.')
+                    gen_config = model['generation_kwargs'].copy()
+                else:
+                    logger.info('OpenCompass uses greedy decoding by default; set generation_kwargs to change this behavior.')
+                    gen_config = dict(top_k=1, temperature=1e-6, top_p=0.9)
+
                 mod = TurboMindModelwithChatTemplate
                 acc_model = dict(
                     type=f'{mod.__module__}.{mod.__name__}',
@@ -324,7 +332,7 @@
                         session_len=model.get('max_seq_len', None),
                         max_new_tokens=model['max_out_len']
                     ),
-                    gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9),
+                    gen_config=gen_config,
                     max_seq_len=model.get('max_seq_len', None),
                     max_out_len=model['max_out_len'],
                     batch_size=16,
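
For reference, below is a minimal sketch of what the configuration-file route described in the documentation changes above can look like. The prompt wording and the `GRADER_TEMPLATE` name are illustrative, not part of this patch; only the fallback behaviour comes from the change itself: an empty `judge_cfg` makes `GenericLLMEvaluator` use `default_judge_cfg`, i.e. the `OC_JUDGE_MODEL`, `OC_JUDGE_API_KEY`, and `OC_JUDGE_API_BASE` environment variables.

```python
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate

# Hypothetical judge prompt; real dataset configs ship their own grading templates.
GRADER_TEMPLATE = (
    'Question: {problem}\n'
    'Reference answer: {answer}\n'
    'Candidate answer: {prediction}\n'
    'Reply with "A" if the candidate answer is correct, otherwise reply with "B".'
)

# Evaluator part of a dataset config. judge_cfg is left empty, so the evaluator
# falls back to default_judge_cfg, which builds an OpenAISDK judge from the
# OC_JUDGE_MODEL / OC_JUDGE_API_KEY / OC_JUDGE_API_BASE environment variables.
eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN', prompt=GRADER_TEMPLATE),
            ]),
        ),
        judge_cfg=dict(),
    ),
)
```

Conversely, a fully specified `judge_cfg` in the configuration file takes precedence, which is why the documentation notes that the `OC_JUDGE_*` variables have no effect in that case.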