[Update] Update OlympiadBench and Update LLM Judge (#1954)

Songyang Zhang 2025-03-18 20:15:20 +08:00 committed by GitHub
parent 5d2d253d83
commit c98599271b
5 changed files with 87 additions and 3 deletions


@@ -34,6 +34,23 @@ problem,answer
## Configuration
### Using LLM for Evaluation via Command Line
Some datasets in OpenCompass already include LLM judge configurations.
You need a model service (such as the official OpenAI or DeepSeek API) or a model service deployed locally with a tool such as LMDeploy, vLLM, or SGLang.
Then, set the environment variables for the evaluation service and evaluate models with the following commands:
```bash
export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct
export OC_JUDGE_API_KEY=sk-1234
export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
```
Note that OpenCompass uses these three environment variables by default; if you configure the evaluation service through configuration files instead, they will not take effect.
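With the variables exported, an evaluation run is launched as usual. For example (a hypothetical invocation; the model and dataset config names below are placeholders):
```bash
# hypothetical run: substitute your own model and dataset configs
opencompass --models hf_internlm2_5_7b_chat --datasets olympiadbench_llmjudge_gen
```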
### Using LLM for Evaluation via Configuration Files
To set up an LLM judge evaluation, you'll need to configure three main components:
1. Dataset Reader Configuration


@@ -34,7 +34,24 @@ problem,answer
## Configuration
To set up an LLM judge evaluation, you need to configure three main components:
### Using LLM for Evaluation via Command Line
Some datasets in OpenCompass already include LLM judge configurations.
You need a model service (such as the official OpenAI or DeepSeek API) or a model service deployed locally with a tool such as LMDeploy, vLLM, or SGLang.
Then, set the environment variables for the evaluation service and evaluate models with the following commands:
```bash
export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct
export OC_JUDGE_API_KEY=sk-1234
export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
```
Note that OpenCompass uses these three environment variables by default; if you configure the evaluation service through configuration files instead, they will not take effect.
### Using LLM for Evaluation via Configuration Files
To set up an LLM judge evaluation for a dataset, you need to configure three main components:
1. Dataset reader configuration


@@ -16,7 +16,17 @@ math_categories = [
    'OE_TO_maths_zh_CEE',  # OpenEnded - TextOnly - maths - CEE
]
physics_categories = [
    'OE_TO_physics_en_COMP',  # OpenEnded - TextOnly - physics - COMP
    'OE_TO_physics_zh_CEE'  # OpenEnded - TextOnly - physics - CEE
]
OlympiadBenchMath_summary_groups = [
    {'name': 'OlympiadBenchMath', 'subsets': ['OlympiadBench_' + c.replace(' ', '_') for c in math_categories]},
]
OlympiadBenchPhysics_summary_groups = [
    {'name': 'OlympiadBenchPhysics', 'subsets': ['OlympiadBench_' + c.replace(' ', '_') for c in physics_categories]},
]
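A summarizer can then reference these groups to fold the per-subset accuracies into the two aggregate scores. A minimal sketch of that wiring (assumed for illustration; it is not part of this diff):
```python
# assumed usage: report only the two aggregates, computed from the groups above
summarizer = dict(
    dataset_abbrs=['OlympiadBenchMath', 'OlympiadBenchPhysics'],
    summary_groups=(OlympiadBenchMath_summary_groups +
                    OlympiadBenchPhysics_summary_groups),
)
```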


@@ -1,3 +1,4 @@
import os
import os.path as osp
from typing import Dict, List, Optional
@@ -36,7 +37,11 @@ class GenericLLMEvaluator(BaseEvaluator):
    ) -> None:
        self.logger = get_logger()
        # If judge_cfg is not provided, fall back to the default configuration
        if not judge_cfg:
            self.judge_cfg = self.default_judge_cfg
        else:
            self.judge_cfg = judge_cfg
        self.output_path = ''
        self.prompt_template = ICL_PROMPT_TEMPLATES.build(prompt_template)
@@ -141,3 +146,30 @@ class GenericLLMEvaluator(BaseEvaluator):
        kwargs = self.dict_postprocessor
        proc = DICT_POSTPROCESSORS.get(kwargs.pop('type'))
        return proc(output, self.output_path, **kwargs)
    @property
    def default_judge_cfg(self):
        from opencompass.models import OpenAISDK
        DEFAULT_JUDGE_CFG = dict(
            type=OpenAISDK,
            path=os.environ['OC_JUDGE_MODEL'],
            key=os.environ['OC_JUDGE_API_KEY'],
            openai_api_base=[
                os.environ.get('OC_JUDGE_API_BASE',
                               'https://api.openai.com/v1/')
            ],
            meta_template=dict(round=[
                dict(role='HUMAN', api_role='HUMAN'),
                dict(role='BOT', api_role='BOT', generate=True),
            ], ),
            query_per_second=16,
            batch_size=1024,
            temperature=0.001,
            tokenizer_path='gpt-4o-2024-05-13',
            verbose=True,
            max_out_len=16384,
            max_seq_len=49152,
        )
        return DEFAULT_JUDGE_CFG
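The fallback is driven entirely by the `OC_JUDGE_*` variables documented above; a minimal sketch of exercising it (values are placeholders, and the surrounding evaluator construction is elided):
```python
import os

# placeholders: point these at a real judge service before evaluating
os.environ.setdefault('OC_JUDGE_MODEL', 'Qwen/Qwen2.5-32B-Instruct')
os.environ.setdefault('OC_JUDGE_API_KEY', 'sk-1234')
os.environ.setdefault('OC_JUDGE_API_BASE', 'http://172.30.56.1:4000/v1')

# building GenericLLMEvaluator with no judge_cfg now resolves the judge
# from default_judge_cfg, which reads the three variables set above
```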


@@ -313,6 +313,14 @@ def change_accelerator(models, accelerator):
                stop_words=model.get('stop_words', []),
            )
        elif accelerator == 'lmdeploy':
            if model.get('generation_kwargs') is not None:
                logger.warning('LMDeploy uses do_sample=False by default; set do_sample=True in generation_kwargs to enable sampling')
                gen_config = model['generation_kwargs'].copy()
            else:
                logger.info('OpenCompass uses greedy decoding by default; set generation_kwargs to override it')
                gen_config = dict(top_k=1, temperature=1e-6, top_p=0.9)
            mod = TurboMindModelwithChatTemplate
            acc_model = dict(
                type=f'{mod.__module__}.{mod.__name__}',
@@ -324,7 +332,7 @@ def change_accelerator(models, accelerator):
                    session_len=model.get('max_seq_len', None),
                    max_new_tokens=model['max_out_len']
                ),
                gen_config=gen_config,
                max_seq_len=model.get('max_seq_len', None),
                max_out_len=model['max_out_len'],
                batch_size=16,
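With this change, a model entry opts into sampling by carrying its own generation_kwargs, which are copied into gen_config verbatim. A hypothetical entry (names and values are illustrative):
```python
# hypothetical model entry: do_sample/temperature/top_p here override the
# greedy defaults (top_k=1, temperature=1e-6) used when no
# generation_kwargs are given
model = dict(
    abbr='qwen2.5-7b-instruct',
    path='Qwen/Qwen2.5-7B-Instruct',
    max_seq_len=8192,
    max_out_len=2048,
    generation_kwargs=dict(do_sample=True, temperature=0.7, top_p=0.9),
)
```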