diff --git a/examples/eval_OpenHuEval_HuSimpleQA.py b/examples/eval_OpenHuEval_HuSimpleQA.py
index 68a80de0..05e3cb3a 100644
--- a/examples/eval_OpenHuEval_HuSimpleQA.py
+++ b/examples/eval_OpenHuEval_HuSimpleQA.py
@@ -19,7 +19,7 @@ with read_base():
     from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
     from opencompass.configs.models.deepseek.deepseek_r1_api_aliyun import models as deepseek_r1_api_aliyun_model
     from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
-    from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model
+    # from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model
 
 from opencompass.models import OpenAI
 from opencompass.partitioners import (
diff --git a/opencompass/summarizers/subjective/husimpleqa.py b/opencompass/summarizers/subjective/husimpleqa.py
index 709a6f22..03d2f594 100644
--- a/opencompass/summarizers/subjective/husimpleqa.py
+++ b/opencompass/summarizers/subjective/husimpleqa.py
@@ -44,12 +44,12 @@ def get_capability_results(
 
     col_name = ['model']
     column = [model_abbr]
-    # for dim, judges in chain({"total": dim_judges.pop('total')}.items(), dim_judges.items()):
     for dim, judges in dim_judges.items():
         c = Counter(judges)
         dim_count = dim_counts[dim]
-        for judge, count in c.items():
-            col_name.append(dim + '-' + judge)
+        for judge in ['correct', 'incorrect', 'not_attempted']:
+            count = c[judge]
+            col_name.append(dim + ' ' + judge)
             column.append(round(count / dim_count, 2))
         col_name.append(dim + ' count')
         column.append(dim_count)