Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
[Update] Update OlympiadBench and Update LLM Judge (#1954)
This commit is contained in:
parent 5d2d253d83
commit c98599271b
@@ -34,6 +34,23 @@ problem,answer
 
 ## Configuration
 
+### Using LLM for Evaluation via Command Line
+
+Some datasets in OpenCompass already include LLM judge configurations.
+You need to use a model service (such as the official OpenAI or DeepSeek API) or start a model service locally with tools such as LMDeploy, vLLM, or SGLang.
+
+Then you can set the environment variables for the evaluation service and evaluate models with the following commands:
+
+```bash
+export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct
+export OC_JUDGE_API_KEY=sk-1234
+export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
+```
+
+Note that OpenCompass uses these three environment variables by default; if you configure the evaluation service through configuration files instead, these environment variables will not take effect.
+
+### Using LLM for Evaluation via Configuration Files
+
 To set up an LLM judge evaluation, you'll need to configure three main components:
 
 1. Dataset Reader Configuration
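As a sketch of that first component, a reader configuration matching the `problem,answer` CSV layout referenced in the hunk header could look like this (a minimal illustration, not part of this commit; the variable name is arbitrary):

```python
# Minimal dataset reader sketch: 'problem' is fed to the model as input,
# 'answer' is kept as the gold reference for the judge to compare against.
reader_cfg = dict(
    input_columns=['problem'],
    output_column='answer',
)
```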
@@ -34,7 +34,24 @@ problem,answer
 
 ## Configuration
 
-To set up an LLM judge evaluation, you need to configure three main components:
+### Using LLM for Evaluation via Command Line
+
+Some datasets in OpenCompass already include LLM judge configurations.
+You need to use a model service (such as the official OpenAI or DeepSeek API) or start a model service locally with tools such as LMDeploy, vLLM, or SGLang.
+
+Then you can set the environment variables for the evaluation service and evaluate models with the following commands:
+
+```bash
+export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct
+export OC_JUDGE_API_KEY=sk-1234
+export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
+```
+
+Note that OpenCompass uses these three environment variables by default; if you configure the evaluation service through configuration files instead, these environment variables will not take effect.
+
+### Using LLM for Evaluation via Configuration Files
+
+To set up an LLM judge evaluation for a dataset, you need to configure three main components:
 
 1. Dataset Reader Configuration
 
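Connecting these docs to the code changes below, an evaluation config that wires in the LLM judge might be sketched as follows; the import paths, prompt text, and placeholder names are assumptions for illustration, not taken from this commit:

```python
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.evaluator import GenericLLMEvaluator  # assumed import path

# Sketch of an eval config: GenericLLMEvaluator scores each prediction
# against the gold answer. judge_cfg is omitted on purpose so that the
# new OC_JUDGE_* environment-variable fallback (see below) kicks in.
eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN',
                     prompt='Judge whether the prediction answers the problem correctly.\n'
                            'Problem: {problem}\nGold answer: {answer}\nPrediction: {prediction}'),
            ]),
        ),
        # judge_cfg omitted: falls back to default_judge_cfg
    ),
)
```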
@@ -16,7 +16,17 @@ math_categories = [
     'OE_TO_maths_zh_CEE',  # OpenEnded - TextOnly - maths - CEE
 ]
 
+physics_categories = [
+    'OE_TO_physics_en_COMP',  # OpenEnded - TextOnly - physics - COMP
+    'OE_TO_physics_zh_CEE',  # OpenEnded - TextOnly - physics - CEE
+]
+
 
 OlympiadBenchMath_summary_groups = [
     {'name': 'OlympiadBenchMath', 'subsets': ['OlympiadBench_' + c.replace(' ', '_') for c in math_categories]},
 ]
+
+
+OlympiadBenchPhysics_summary_groups = [
+    {'name': 'OlympiadBenchPhysics', 'subsets': ['OlympiadBench_' + c.replace(' ', '_') for c in physics_categories]},
+]
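For reference, the new groups would typically be consumed by a summarizer config along these lines (an illustrative sketch, not part of this commit, assuming the two group lists above are in scope):

```python
# Sketch: aggregate the per-subset OlympiadBench results into the two
# new group scores when summarizing a run.
summarizer = dict(
    dataset_abbrs=['OlympiadBenchMath', 'OlympiadBenchPhysics'],
    summary_groups=OlympiadBenchMath_summary_groups + OlympiadBenchPhysics_summary_groups,
)
```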
@@ -1,3 +1,4 @@
+import os
 import os.path as osp
 from typing import Dict, List, Optional
 
@@ -36,7 +37,11 @@ class GenericLLMEvaluator(BaseEvaluator):
     ) -> None:
 
         self.logger = get_logger()
-        self.judge_cfg = judge_cfg
+        # If judge_cfg is not provided, fall back to the default configuration
+        if not judge_cfg:
+            self.judge_cfg = self.default_judge_cfg
+        else:
+            self.judge_cfg = judge_cfg
         self.output_path = ''
 
         self.prompt_template = ICL_PROMPT_TEMPLATES.build(prompt_template)
@@ -141,3 +146,30 @@ class GenericLLMEvaluator(BaseEvaluator):
         kwargs = self.dict_postprocessor
         proc = DICT_POSTPROCESSORS.get(kwargs.pop('type'))
         return proc(output, self.output_path, **kwargs)
+
+    @property
+    def default_judge_cfg(self):
+        from opencompass.models import OpenAISDK
+
+        DEFAULT_JUDGE_CFG = dict(
+            type=OpenAISDK,
+            path=os.environ['OC_JUDGE_MODEL'],
+            key=os.environ['OC_JUDGE_API_KEY'],
+            openai_api_base=[
+                os.environ.get('OC_JUDGE_API_BASE',
+                               'https://api.openai.com/v1/')
+            ],
+            meta_template=dict(round=[
+                dict(role='HUMAN', api_role='HUMAN'),
+                dict(role='BOT', api_role='BOT', generate=True),
+            ], ),
+            query_per_second=16,
+            batch_size=1024,
+            temperature=0.001,
+            tokenizer_path='gpt-4o-2024-05-13',
+            verbose=True,
+            max_out_len=16384,
+            max_seq_len=49152,
+        )
+
+        return DEFAULT_JUDGE_CFG
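Since `path` and `key` are read with `os.environ[...]` rather than `.get(...)`, `OC_JUDGE_MODEL` and `OC_JUDGE_API_KEY` must be exported whenever no `judge_cfg` is supplied; only `OC_JUDGE_API_BASE` has a built-in default. A standalone sketch of that behavior (not from the commit):

```python
import os

# Only the API base falls back to a default; the other two raise KeyError.
base = os.environ.get('OC_JUDGE_API_BASE', 'https://api.openai.com/v1/')
try:
    model = os.environ['OC_JUDGE_MODEL']
    key = os.environ['OC_JUDGE_API_KEY']
except KeyError as missing:
    raise SystemExit(f'export {missing.args[0]} before running an LLM-judge evaluation')
```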
@@ -313,6 +313,14 @@ def change_accelerator(models, accelerator):
                 stop_words=model.get('stop_words', []),
             )
         elif accelerator == 'lmdeploy':
+
+            if model.get('generation_kwargs') is not None:
+                logger.warning('LMDeploy uses do_sample=False by default; set do_sample=True to enable sampling mode')
+                gen_config = model['generation_kwargs'].copy()
+            else:
+                logger.info('OpenCompass uses greedy decoding by default; set generation_kwargs to change this')
+                gen_config = dict(top_k=1, temperature=1e-6, top_p=0.9)
+
             mod = TurboMindModelwithChatTemplate
             acc_model = dict(
                 type=f'{mod.__module__}.{mod.__name__}',
@@ -324,7 +332,7 @@ def change_accelerator(models, accelerator):
                 session_len=model.get('max_seq_len', None),
                 max_new_tokens=model['max_out_len']
             ),
-            gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9),
+            gen_config=gen_config,
             max_seq_len=model.get('max_seq_len', None),
             max_out_len=model['max_out_len'],
             batch_size=16,
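With this change, a model entry that carries `generation_kwargs` keeps its sampling settings when converted to the LMDeploy backend instead of being forced into the greedy `gen_config`. An illustrative entry (names and values are placeholders, not from this commit):

```python
# Sketch: because generation_kwargs is present, change_accelerator copies
# it into gen_config; without it, greedy decoding (top_k=1) is used.
model = dict(
    abbr='qwen2.5-7b-instruct-hf',  # placeholder abbreviation
    path='Qwen/Qwen2.5-7B-Instruct',
    max_out_len=2048,
    generation_kwargs=dict(do_sample=True, temperature=0.7, top_p=0.95),
)
```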