From 6aabba778da7a31ffb4258c3b5bd19c9e9c76327 Mon Sep 17 00:00:00 2001
From: bio-mlhui
Date: Thu, 29 May 2025 07:12:42 +0000
Subject: [PATCH] fix bench

Read the HealthBench grader (judge) model name from the OC_JUDGE_MODEL
environment variable instead of hard-coding it in HealthBenchEvaluator.

When OC_JUDGE_MODEL is unset, fall back to the previously hard-coded
'gpt-4.1-2025-04-14' so that constructing the evaluator does not raise
KeyError in environments that never exported the variable.
---
Review notes (ignored by git am):
- Line structure and indentation of this patch were reconstructed from a
  whitespace-collapsed copy; verify the context lines against the target
  file before applying.
- The blob ids on the "index" line predate the os.environ.get() fallback
  and no longer match the post-image exactly.

 opencompass/datasets/healthbench/healthbench.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/opencompass/datasets/healthbench/healthbench.py b/opencompass/datasets/healthbench/healthbench.py
index 8e12bbc4..2ec84608 100644
--- a/opencompass/datasets/healthbench/healthbench.py
+++ b/opencompass/datasets/healthbench/healthbench.py
@@ -1,5 +1,6 @@
 import hashlib
 import json
+import os
 import re
 from collections import defaultdict
 from typing import Literal
@@ -260,7 +261,7 @@ class HealthBenchEvaluator(BaseEvaluator):
         self.n_repeats = n_repeats
         self.n_threads = n_threads
         self.subset_name = subset_name
-        self.grader_model = ChatCompletionSampler(model='gpt-4.1-2025-04-14', system_message=OPENAI_SYSTEM_MESSAGE_API, max_tokens=2048,)  # noqa: E501
+        self.grader_model = ChatCompletionSampler(model=os.environ.get('OC_JUDGE_MODEL', 'gpt-4.1-2025-04-14'), system_message=OPENAI_SYSTEM_MESSAGE_API, max_tokens=2048,)  # noqa: E501
 
     def grade_sample(self, prompt: list[dict[str, str]], response_text: str, example_tags: list[str], rubric_items: list[RubricItem], ) -> tuple[dict, str, list[dict]]:  # noqa: E501
         # construct and grade the sample