[Refactor] Change HuSimpleQA to subjective evaluation

hoteryoung 2025-02-11 16:55:07 +08:00 committed by jxd
parent 60ab611ecd
commit 23210e089a
8 changed files with 417 additions and 260 deletions

View File

@@ -1,11 +1,13 @@
from mmengine.config import read_base
from opencompass.summarizers.subjective.husimpleqa import HuSimpleQASummarizer
with read_base():
from opencompass.configs.datasets.OpenHuEval.HuSimpleQA.HuSimpleQA import HuSimpleQA_datasets
# from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
# from opencompass.configs.models.openai.gpt_4o_2024_11_20 import models as gpt_4o_2024_11_20_model
# from opencompass.configs.models.deepseek.deepseek_v3_api_aliyun import models as deepseek_v3_api_aliyun_model
from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
from opencompass.configs.models.openai.gpt_4o_2024_11_20 import models as gpt_4o_2024_11_20_model
from opencompass.configs.models.deepseek.deepseek_v3_api_aliyun import models as deepseek_v3_api_aliyun_model
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import models as lmdeploy_qwen2_5_72b_instruct_model
@@ -15,11 +17,24 @@ with read_base():
from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model
from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
# from opencompass.configs.models.deepseek.deepseek_r1_api_aliyun import models as deepseek_r1_api_aliyun_model
# from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
# from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model
from opencompass.configs.models.deepseek.deepseek_r1_api_aliyun import models as deepseek_r1_api_aliyun_model
from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model
from opencompass.models import OpenAI
from opencompass.partitioners import (
NumWorkerPartitioner,
SubjectiveNumWorkerPartitioner,
)
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
datasets = HuSimpleQA_datasets
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for model in models:
@@ -32,4 +47,53 @@ for model in models:
}
del model
work_dir = './outputs/' + __file__.split('/')[-1].split('.')[0] + '/' # do NOT modify this line, yapf: disable, pylint: disable
judge_models = [
dict(
abbr='GPT-4o-2024-08-06',
type=OpenAI,
path='gpt-4o-2024-08-06',
key='ENV',
openai_proxy_url='ENV',
verbose=True,
meta_template=api_meta_template,
query_per_second=2,
max_out_len=8192,
max_seq_len=16384,
batch_size=8,
temperature=0,
)
]
datasets = HuSimpleQA_datasets
del HuSimpleQA_datasets
infer = dict(
partitioner=dict(
type=NumWorkerPartitioner,
num_worker=8,
),
runner=dict(
type=SlurmSequentialRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask),
),
)
eval = dict(
partitioner=dict(
type=SubjectiveNumWorkerPartitioner,
num_worker=8,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=HuSimpleQASummarizer)
work_dir = (
'./outputs/' + __file__.split('/')[-1].split('.')[0] + '/'
) # do NOT modify this line, yapf: disable, pylint: disable
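
Note: the refactored config above now carries a `judge_models` list and routes evaluation through `SubjectiveNumWorkerPartitioner` and `SubjectiveEvalTask`. Below is a minimal sketch for sanity-checking that the config resolves before launching a run; the config path is a placeholder and not part of this commit.

```
# Sanity-check sketch; the config path below is hypothetical.
from mmengine.config import Config

cfg = Config.fromfile('eval_OpenHuEval_HuSimpleQA.py')  # placeholder path

# `models` is collected from every `*_model` import above; `judge_models`
# and the subjective partitioner/task drive the LLM-as-judge stage.
print(len(cfg['models']), 'candidate models')
print([judge['abbr'] for judge in cfg['judge_models']])
print(cfg['eval']['partitioner']['type'])
```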

View File

@@ -1,20 +1,20 @@
from mmengine.config import read_base
from opencompass.datasets.OpenHuEval.HuSimpleQA import HuSimpleQADataset
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.OpenHuEval.HuSimpleQA import HuSimpleQADataset, HuSimpleQAEvaluator
with read_base():
from .HuSimpleQA_setting import INSTRUCTIONS, DATA_PATH, DATA_VERSION, JUDGE_PROMPT
from .HuSimpleQA_setting import DATA_PATH, DATA_VERSION, INSTRUCTIONS
PROMPT_LANGUAGES = [
'en',
'hu',
]
PROMPT_LANGUAGES = ['en', 'hu']
HuSimpleQA_reader_cfg = dict(input_columns=['question', 'hu_specific_dim'],
output_column='reference')
HuSimpleQA_reader_cfg = dict(
input_columns=['question', 'prompt'],
output_column='references',
)
HuSimpleQA_datasets = []
for lang in PROMPT_LANGUAGES:
@@ -32,13 +32,23 @@ for lang in PROMPT_LANGUAGES:
ice_token='</E>',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
inferencer=dict(
type=GenInferencer,
max_seq_len=8192,
max_out_len=8192,
),
)
HuSimpleQA_eval_cfg = dict(evaluator=dict(
type=HuSimpleQAEvaluator,
judge_prompt_template=JUDGE_PROMPT,
))
HuSimpleQA_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template="""{prompt}"""
),
),
pred_role='BOT',
)
HuSimpleQA_datasets.append(
dict(
@@ -48,4 +58,5 @@ for lang in PROMPT_LANGUAGES:
reader_cfg=HuSimpleQA_reader_cfg,
infer_cfg=HuSimpleQA_infer_cfg,
eval_cfg=HuSimpleQA_eval_cfg,
mode='singlescore'
))
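
Note: with this change the reader exposes a pre-built `prompt` column and the evaluator becomes a generic `LMEvaluator` whose template is just `{prompt}`. The sketch below illustrates the assumed flow (it is not code from the commit): the judge receives each sample's prompt essentially verbatim, with the model's answer slotted into the remaining `{prediction}` placeholder.

```
# Assumed-flow sketch: the LMEvaluator template "{prompt}" passes the
# per-sample prompt through, and the model answer fills {prediction}.
prompt_column = (
    'Judge instructions ...\n'
    '{\n'
    '  "question": Melyik olimpián vett részt Bogár Pál kosárlabda-játékosként?,\n'
    '  "gold_target": 1952-es Helsinki nyári olimpia,\n'
    '  "predicted_answer": {prediction}\n'
    '}\n'
)
prediction = '1952 Helsinki olimpián.'

judge_input = """{prompt}""".format(prompt=prompt_column)      # template fill
judge_input = judge_input.replace('{prediction}', prediction)  # answer fill
print(judge_input)
```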

View File

@@ -17,119 +17,6 @@ Kérjük, magyar nyelven adja meg a legjobb választ erre a kérdésre, és 0-t
"""
}
JUDGE_PROMPT = {
'system_prompt':
"""
- Role: Hungarian General Knowledge Quiz Review Expert
- Background: Hungarian General Knowledge Quiz involves many aspects of Hungarian history, culture, geography, politics, people, etc., and a professional review expert is needed to ensure the accuracy of the answers.
- Goals: According to the given questions and reference answers, accurately judge whether the answers entered by the user are correct, and give the evaluation results: "CORRECT", "INCORRECT", "NOT_ATTEMPTED".
- Evaluation criteria:
- "Correct":
- The predicted answer fully contains the important information in the gold_target.
- The predicted answer does not contain any information that contradicts the gold_target.
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
- Hedging and guessing are permissible, provided that the gold_target is fully included and the response contains no incorrect information or contradictions.
- The answer must be consistent with the scope of the question. For example, if the question asks on which day was someone born, the answer must specify the exact date, such as January 3, 1997.
- "Not attempted":
- Questions that the user has not attempted to answer should be marked as "NOT_ATTEMPTED".
- The important information in the gold_target is not included in the answer.
- No statements in the answer contradict the gold_target.
- Workflow:
1. Receive questions, reference answers, and user answers.
2. Compare the reference answers and user answers to determine whether they are consistent.
3. Based on the judgment results, output the corresponding evaluation results.
- Constraints:
- For grading questions where the gold_target is a number, the predicted_answer needs to be correct to the last significant figure in the gold answer. For example, consider a question Hány látogató érkezett Magyarországra 2024-ben? with gold_target 440k.
- predicted_answers 440k, 444k, and 435k are all CORRECT.
- predicted_answers 400k and 413k are INCORRECT.
- predicted_answers körülbelül 400k and több mint 300k are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold_target.
- The gold_target may contain more information than the question. In such cases, the predicted_answer only needs to contain the information that is in the question.
- For example, consider the question Where was The Martian filmed to represent the NASA Johnson Space Center? with the gold_target Budapesti Bálna (HU). Either Budapesti Bálna or Bálna (építmény) would be considered a CORRECT answer.
- Do not punish predicted_answers if they omit information that would be clearly inferred from the question.
- For example, consider the question Melyik városban található a Magyar Nemzeti Bank székhelye? with the gold_target Budapest, Magyarország. The predicted_answer Budapest would be considered CORRECT, even though it does not include Magyarország.
- Consider the question Milyen díjat nyert a Napsugár című magyar film a 2018-as Velencei Filmfesztiválon? with the gold_target Legjobb Forgatókönyv Díja. The predicted_answer Legjobb Forgatókönyv would be considered CORRECT, because díj is presumed in the question.
- For the question Mi Dominik Szoboszlai magassága méterben? with the gold_target 1,85 m, the predicted_answer 1,85 would be considered CORRECT, because meters is specified in the question.
- For the question Mi Magyarország első női miniszterelnökének neve? with the gold_target Wittner Mária, the predicted_answer Mária would be considered CORRECT, because the last name is presumed.
- Do not punish typos in people's names if it is clearly the same name.
- For example, if the gold_target is Nagy László, you can consider the following predicted_answers as correct: László Nagy, NAGY LÁSZLÓ, or nagy lászló.
Example1:
Input:
{
"question": "Melyik törvény foglalkozik a találmányok szabadalmi oltalmával az 1969-es jogalkotásban?",
"gold_target": "1969. évi II. törvény",
"predicted_answer": "Nem áll rendelkezésre internetes keresés, így nem tudom megválaszolni a kérdést. Azonban 1969-ben valóban elfogadták a szabadalmi védelmi törvényt."
}
Output:
{
"evaluation": "NOT_ATTEMPTED"
}
Example2:
Input:
{
"question": "Melyik városban született Tisza Lajos 1832. szeptember 12-én?",
"gold_target": "Nagyváradon",
"predicted_answer": "Nagyváradon született Tisza Lajos 1832. szeptember 12-én."
}
Output: # Although the answer is long, it accurately answers the question
{
"evaluation": "CORRECT"
}
Example3:
Input:
{
"question": "Melyik olimpián vett részt Bogár Pál kosárlabda-játékosként?",
"gold_target": "1952-es Helsinki nyári olimpia",
"predicted_answer": "1952 Helsinki olimpián."
}
Output: # The descriptions are slightly different, but they all refer to the same Olympic Games, so they are considered correct
{
"evaluation": "CORRECT"
}
Example4:
Input:
{
"question": "Melyik labdarúgócsapat kötődik Budapest XIX. kerületéhez, amely 14-szeres magyar bajnok?",
"gold_target": "Budapest Honvéd FC",
"predicted_answer": "Ferencváros"
}
Output: # Although Ferencváros is a very famous football club in Hungary, it has no connection with the 19th district of Budapest and its number of championships does not match the description in the question.
{
"evaluation": "INCORRECT"
}
Example5:
Input:
{
"question": "Milyen biztosítás bevezetését szabályozta egy 1952-es törvényerejű rendelet Magyarországon?",
"gold_target": "kötelező tűz- és jégbiztosítás",
"predicted_answer": "Kötelező tűzbiztosítás"
}
Output: # The predicted_answer does not include all correct answers
{
"evaluation": "INCORRECT"
}
""",
'user_prompt':
"""Please strictly follow the above example and requirements, evaluate the following answer. Input:
{{
"question": {question},
"gold_target": {answer},
"predicted_answer": {pred_answer}
}}
Please respond strictly in JSON format. Do not include any additional text outside the JSON structure.
Output:
{{
"evaluation":"Correct"/"Incorrect"/"NOT_ATTEMPTED"
}}
"""
}
OpenHuEval_Path = '/mnt/hwfile/opendatalab/weixingjian/OpenHuEval'
DATA_VERSION = '250208'
DATA_PATH = f'{OpenHuEval_Path}/data/HuSimpleQA/HuSimpleQA_{DATA_VERSION}/HuSimpleQA.jsonl'
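
Note: `DATA_PATH` points at a JSONL file. Below is a hypothetical example of one record, with field names inferred from the loader (`question`, `answer`, `hu_specific_dim`) and from the old evaluator and the summarizer (`qid`); real records may carry additional fields.

```
# Hypothetical HuSimpleQA.jsonl record; field names are inferred from the
# surrounding code, values are illustrative only.
import json

record = {
    'qid': 'husimpleqa-0001',   # id used by the (removed) evaluator's details dict
    'question': 'Melyik városban született Tisza Lajos 1832. szeptember 12-én?',
    'answer': 'Nagyváradon',
    'hu_specific_dim': 'people',  # dimension label used by the summarizer
}
print(json.dumps(record, ensure_ascii=False))
```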

View File

@@ -1,141 +1,220 @@
import json
import os
import re
from datasets import Dataset, DatasetDict
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.utils.prompt import PromptList
from ..base import BaseDataset
JUDGE_PROMPT = {
'system_prompt':
"""
- Role: Hungarian General Knowledge Quiz Review Expert
- Background: Hungarian General Knowledge Quiz involves many aspects of \
Hungarian history, culture, geography, politics, people, etc., \
and a professional review expert is needed to ensure the accuracy \
of the answers.
- Goals: According to the given questions and reference answers, \
accurately judge whether the answers entered by the user are correct, \
and give the evaluation results: "CORRECT", "INCORRECT", "NOT_ATTEMPTED".
- Evaluation criteria:
- "Correct":
    - The predicted answer fully contains the important information \
in the gold_target.
    - The predicted answer does not contain any information that contradicts \
the gold_target.
- Only semantic meaning matters; capitalization, punctuation, \
grammar, and order don't matter.
- Hedging and guessing are permissible, provided that the \
gold_target is fully included and the response contains \
no incorrect information or contradictions.
- The answer must be consistent with the scope of the question. \
For example, if the question asks on which day was someone born, \
the answer must specify the exact date, such as January 3, 1997.
- "Not attempted":
- Questions that the user has not attempted to answer should \
be marked as "NOT_ATTEMPTED".
- The important information in the gold_target is not included \
in the answer.
- No statements in the answer contradict the gold_target.
- Workflow:
1. Receive questions, reference answers, and user answers.
2. Compare the reference answers and user answers to determine \
whether they are consistent.
3. Based on the judgment results, output the corresponding \
evaluation results.
- Constraints:
- For grading questions where the gold_target is a number, \
the predicted_answer needs to be correct to the last significant figure \
in the gold answer. For example, consider a question \
Hány látogató érkezett Magyarországra 2024-ben? with gold_target 440k.
- predicted_answers 440k, 444k, and 435k are all CORRECT.
- predicted_answers 400k and 413k are INCORRECT.
- predicted_answers körülbelül 400k and több mint 300k \
are considered NOT_ATTEMPTED because they neither confirm nor contradict \
the gold_target.
- The gold_target may contain more information than the question. \
In such cases, the predicted_answer only needs to contain the information \
that is in the question.
- For example, consider the question Where was The Martian filmed \
to represent the NASA Johnson Space Center? with the gold_target \
Budapesti Bálna (HU). Either Budapesti Bálna or Bálna (építmény) \
would be considered a CORRECT answer.
- Do not punish predicted_answers if they omit information that \
would be clearly inferred from the question.
- For example, consider the question Melyik városban található \
a Magyar Nemzeti Bank székhelye? with the gold_target \
Budapest, Magyarország. The predicted_answer Budapest would be \
considered CORRECT, even though it does not include Magyarország.
- Consider the question Milyen díjat nyert a Napsugár \
című magyar film a 2018-as Velencei Filmfesztiválon? with the gold_target \
Legjobb Forgatókönyv Díja. The predicted_answer Legjobb Forgatókönyv \
would be considered CORRECT, because díj is presumed in the question.
- For the question Mi Dominik Szoboszlai magassága méterben? \
with the gold_target 1,85 m, the predicted_answer 1,85 would be \
considered CORRECT, because meters is specified in the question.
    - For the question Mi Magyarország első női miniszterelnökének neve? \
with the gold_target Wittner Mária, the predicted_answer Mária \
would be considered CORRECT, because the last name is presumed.
    - Do not punish typos in people's names if it is clearly the same name.
- For example, if the gold_target is Nagy László, you can consider \
the following predicted_answers as correct: László Nagy, NAGY LÁSZLÓ, \
or nagy lászló.
Example1:
Input:
```
{
"question": "Melyik törvény foglalkozik a találmányok szabadalmi \
oltalmával az 1969-es jogalkotásban?",
"gold_target": "1969. évi II. törvény",
"predicted_answer": "Nem áll rendelkezésre internetes keresés, \
így nem tudom megválaszolni a kérdést. Azonban 1969-ben valóban elfogadták \
a szabadalmi védelmi törvényt."
}
```
Output:
```
{
"evaluation": "NOT_ATTEMPTED"
}
```
Example2:
Input:
```
{
"question": "Melyik városban született Tisza Lajos 1832. \
szeptember 12-én?",
"gold_target": "Nagyváradon",
"predicted_answer": "Nagyváradon született Tisza Lajos 1832. \
szeptember 12-én."
}
```
Output: # Although the answer is long, it accurately answers the question
```
{
"evaluation": "CORRECT"
}
```
Example3:
Input:
```
{
"question": "Melyik olimpián vett részt Bogár Pál kosárlabda-játékosként?",
"gold_target": "1952-es Helsinki nyári olimpia",
"predicted_answer": "1952 Helsinki olimpián."
}
```
Output: # The descriptions are slightly different, but they all refer to \
the same Olympic Games, so they are considered correct
```
{
"evaluation": "CORRECT"
}
```
Example4:
Input:
```
{
"question": "Melyik labdarúgócsapat kötődik Budapest XIX. kerületéhez, \
amely 14-szeres magyar bajnok?",
"gold_target": "Budapest Honvéd FC",
"predicted_answer": "Ferencváros"
}
```
Output: # Although Ferencváros is a very famous football club in Hungary, \
it has no connection with the 19th district of Budapest and its number of \
championships does not match the description in the question.
```
{
"evaluation": "INCORRECT"
}
```
Example5:
Input:
```
{
"question": "Milyen biztosítás bevezetését szabályozta egy 1952-es \
törvényerejű rendelet Magyarországon?",
"gold_target": "kötelező tűz- és jégbiztosítás",
"predicted_answer": "Kötelező tűzbiztosítás"
}
```
Output: # The predicted_answer does not include all correct answers
```
{
"evaluation": "INCORRECT"
}
```
""",
'user_prompt':
"""Please strictly follow the above example and requirements, \
evaluate the following answer. Input:
```
{
"question": {question},
"gold_target": {answer},
"predicted_answer": {prediction}
}
```
Please respond strictly in JSON format. Do not include any additional text \
outside the JSON structure.
Output:
Please provide your evaluation results in the following json format by \
filling in the placeholders in []:
```
{
"evaluation": ["CORRECT"/"INCORRECT"/"NOT_ATTEMPTED" ]
}
```"""
}
class HuSimpleQADataset(BaseDataset):
@staticmethod
def load(filepath):
def load(filepath, *args, **kwargs):
assert os.path.isfile(filepath)
assert filepath.endswith('.jsonl')
dataset = DatasetDict()
f = open(filepath, 'r', encoding='utf-8')
lines = f.readlines()
objs = []
for line in lines:
obj = json.loads(line)
objs.append(obj)
out_dict_list = []
with open(filepath, 'r', encoding='utf-8') as fp:
objs = [json.loads(line) for line in fp.readlines()]
raw_data = []
for obj in objs:
question = obj['question']
hu_specific_dim = obj['hu_specific_dim']
tmp = obj
new_obj = dict(question=question,
hu_specific_dim=hu_specific_dim,
reference=tmp)
out_dict_list.append(new_obj)
dataset = Dataset.from_list(out_dict_list)
answer = obj['answer']
user_prompt = JUDGE_PROMPT['user_prompt']
user_prompt = user_prompt.replace('{question}', question)
user_prompt = user_prompt.replace('{answer}', answer)
raw_data.append(
dict(question=question,
prompt=JUDGE_PROMPT['system_prompt'] + user_prompt,
references=obj))
dataset = Dataset.from_list(raw_data)
return dataset
class HuSimpleQAEvaluator(BaseEvaluator):
def __init__(self,
judge_prompt_template,
openai_key='ENV',
openai_proxy_url='ENV',
**kwargs):
super().__init__(**kwargs)
self.judge_prompt_template = judge_prompt_template
self.openai_key = openai_key
self.openai_proxy_url = openai_proxy_url
def score(self, predictions, references, origin_prompt) -> dict:
if len(predictions) != len(references):
return {'error': 'preds and refs have different lengths.'}
details = {}
total, correct, wrong, not_attempted, failed_to_parse = 0, 0, 0, 0, 0
from opencompass.models import OpenAI
model = OpenAI(path='gpt-4o-2024-08-06',
key=self.openai_key,
openai_proxy_url=self.openai_proxy_url,
max_seq_len=8192,
retry=2,
temperature=0,
verbose=True)
confidence_scores = []
for raw_pred, detail in zip(predictions, references):
total += 1
qid = detail['qid']
details[qid] = {
'question': detail['question'],
'answer': detail['answer'],
'raw_pred': raw_pred,
'correctness': False,
'failed_to_parse': False
}
# parse raw_pred
try:
raw_pred = re.sub(r'^```json\n|\n```$', '', raw_pred)
raw_pred_json = json.loads(raw_pred)
confidence_score = raw_pred_json.get('confidence_score', None)
except json.JSONDecodeError:
confidence_score = None
details[qid]['confidence_score'] = confidence_score
# ------------------------ involve openai gpt4o as judge
user_prompt = self.judge_prompt_template['user_prompt'].format(
question=detail['question'],
answer=detail['answer'],
pred_answer=raw_pred)
system_prompt = self.judge_prompt_template['system_prompt']
details[qid]['judge_user_prompt'] = user_prompt
messages = PromptList([{
'role': 'SYSTEM',
'prompt': system_prompt,
}, {
'role': 'HUMAN',
'prompt': user_prompt,
}])
response = model._generate(input=messages,
max_out_len=8192,
temperature=0.1)
details[qid]['judge_resp'] = response
try:
response = re.sub(r'^```json\n|\n```$', '', response)
evaluation_result = json.loads(response)
evaluation = evaluation_result.get('evaluation', '').lower()
details[qid]['correctness'] = (evaluation == 'correct')
details[qid]['failed_to_parse'] = False
if evaluation == 'correct':
correct += 1
elif evaluation == 'incorrect':
wrong += 1
elif evaluation == 'not_attempted':
not_attempted += 1
else:
failed_to_parse += 1
except json.JSONDecodeError:
details[qid]['failed_to_parse'] = True
failed_to_parse += 1
confidence_scores.append(
(confidence_score, details[qid]['correctness']))
accuracy = correct / total if total > 0 else 0
results = {
'accuracy': accuracy,
'total': total,
'correct': correct,
'wrong': wrong,
'not_attempted': not_attempted,
'failed_to_parse': failed_to_parse,
'details': details,
'confidence_scores': confidence_scores
}
return results
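
Note: the refactored loader replaces the in-dataset `HuSimpleQAEvaluator` with prompt construction at load time. The following is a stand-alone sketch of that construction, using a stubbed `JUDGE_PROMPT` and an illustrative record; it is an assumption-laden illustration, not code from the commit.

```
# Stand-alone sketch of the new load() prompt construction. JUDGE_PROMPT is
# a stub here; the real dict is defined in the module above.
JUDGE_PROMPT = {
    'system_prompt': '- Role: Hungarian General Knowledge Quiz Review Expert\n',
    'user_prompt': (
        'Input:\n'
        '{\n'
        '  "question": {question},\n'
        '  "gold_target": {answer},\n'
        '  "predicted_answer": {prediction}\n'
        '}\n'
    ),
}

obj = {  # one parsed JSONL line (illustrative values)
    'question': 'Melyik városban született Tisza Lajos 1832. szeptember 12-én?',
    'answer': 'Nagyváradon',
}

user_prompt = JUDGE_PROMPT['user_prompt']
user_prompt = user_prompt.replace('{question}', obj['question'])
user_prompt = user_prompt.replace('{answer}', obj['answer'])

sample = dict(question=obj['question'],
              prompt=JUDGE_PROMPT['system_prompt'] + user_prompt,
              references=obj)
# {prediction} is intentionally left in place for the subjective-eval side.
print(sample['prompt'])
```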

View File

@@ -1,3 +1,4 @@
from .HuMatchingFIB import * # noqa: F401, F403
from .HuProverbRea import * # noqa: F401, F403
from .HuSimpleQA import * # noqa: F401, F403
from .HuStandardFIB import * # noqa: F401, F403

View File

@@ -104,6 +104,7 @@ from .natural_question_cn import * # noqa: F401, F403
from .NPHardEval import * # noqa: F401, F403
from .obqa import * # noqa: F401, F403
from .OpenFinData import * # noqa: F401, F403
from .OpenHuEval import * # noqa: F401, F403
from .piqa import * # noqa: F401, F403
from .py150 import * # noqa: F401, F403
from .qasper import * # noqa: F401, F403
@@ -145,4 +146,3 @@ from .xcopa import * # noqa: F401, F403
from .xiezhi import XiezhiDataset, XiezhiRetriever # noqa: F401, F403
from .xlsum import * # noqa: F401, F403
from .xsum import * # noqa: F401, F403
from .OpenHuEval import * # noqa: F401, F403

View File

@@ -13,6 +13,7 @@ from .creationbench import CreationBenchSummarizer
from .flames import FlamesSummarizer
from .fofo import FofoSummarizer
from .followbench import FollowBenchSummarizer
from .husimpleqa import HuSimpleQASummarizer
from .mtbench import MTBenchSummarizer
from .mtbench101 import MTBench101Summarizer
from .multiround import MultiroundSummarizer

View File

@@ -0,0 +1,114 @@
# flake8: noqa
# yapf: disable
import csv
import os
import os.path as osp
import re
from collections import Counter, defaultdict
from datetime import datetime
from mmengine import ConfigDict
from opencompass.utils import model_abbr_from_cfg
from .compass_arena import model_abbr_from_cfg_used_in_summarizer
from .utils import get_judgeanswer_and_reference, get_outdir
def post_process_husimpleqa(judgement: str):
pattern = r'\"evaluation\": \"(.*?)\"'
matched_result = re.findall(pattern, judgement)
try:
judge = matched_result[0].lower()
return {'judge': judge}
except (ValueError, IndexError):
return None
def get_capability_results(
judged_answers,
references,
fout,
fout_flag,
model_abbr,
):
dim_judges = defaultdict(list)
dim_counts = defaultdict(float)
for ans, ref in zip(judged_answers, references):
dim_judges['total'].append(ans)
dim_counts['total'] += 1
dim = ref['hu_specific_dim']
dim_judges[dim].append(ans)
dim_counts[dim] += 1
col_name = ['model']
column = [model_abbr]
# for dim, judges in chain({"total": dim_judges.pop('total')}.items(), dim_judges.items()):
for dim, judges in dim_judges.items():
c = Counter(judges)
dim_count = dim_counts[dim]
for judge, count in c.items():
col_name.append(dim + '-' + judge)
column.append(round(count / dim_count, 2))
col_name.append(dim + ' count')
column.append(dim_count)
with open(fout, 'a+', newline='') as csvfile:
writer = csv.writer(csvfile)
if fout_flag == 0:
writer.writerow(col_name)
writer.writerow(column)
class HuSimpleQASummarizer:
"""Do the subjectivity analyze based on evaluation results.
Args:
config (ConfigDict): The configuration object of the evaluation task.
"""
def __init__(self, config: ConfigDict) -> None:
self.judge_type = 'single'
self.tasks = []
self.cfg = config
self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
self.judge_function = post_process_husimpleqa
def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
"""Summarize the subjectivity analysis based on evaluation results.
Args:
time_str (str): Timestamp for file naming.
Returns:
pd.DataFrame: The summary results.
"""
dataset_cfgs = self.cfg['datasets']
output_dir, results_folder = get_outdir(self.cfg, time_str)
fout_flag = 0
for eval_model_cfg in self.eval_model_cfgs:
eval_model_abbr = model_abbr_from_cfg(eval_model_cfg)
show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg)
subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + self.judge_abbr)
if os.path.isdir(subdir_path):
fout = osp.join(output_dir, 'judged-by--' + self.judge_abbr + '-capability.csv')
overall_judged_answers, overall_references = [], []
for dataset in dataset_cfgs:
judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
judged_answers = [item['judge'] for item in judged_answers]
overall_judged_answers += judged_answers
overall_references += references
get_capability_results(
overall_judged_answers,
overall_references,
fout,
fout_flag,
show_model_abbr,
)
fout_flag += 1
else:
print(subdir_path + ' does not exist! Please check!')
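
Note: a quick illustration (not part of the commit) of how a judge reply flows through `post_process_husimpleqa` and how `get_capability_results`-style aggregation groups judgements per `hu_specific_dim`; the data below is illustrative only.

```
# Illustration of the judge post-processing and per-dimension aggregation.
import re
from collections import Counter, defaultdict

def post_process_husimpleqa(judgement: str):
    # same regex as above: pull the value of the "evaluation" field
    matched = re.findall(r'\"evaluation\": \"(.*?)\"', judgement)
    try:
        return {'judge': matched[0].lower()}
    except (ValueError, IndexError):
        return None

print(post_process_husimpleqa('{\n  "evaluation": "CORRECT"\n}'))  # {'judge': 'correct'}

# Aggregation in the spirit of get_capability_results, with toy data.
judged = ['correct', 'incorrect', 'correct']
refs = [{'hu_specific_dim': 'history'},
        {'hu_specific_dim': 'culture'},
        {'hu_specific_dim': 'history'}]

dim_judges = defaultdict(list)
for ans, ref in zip(judged, refs):
    dim_judges['total'].append(ans)
    dim_judges[ref['hu_specific_dim']].append(ans)

for dim, judges in dim_judges.items():
    ratios = {j: round(c / len(judges), 2) for j, c in Counter(judges).items()}
    print(dim, ratios)
```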