mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
fix dataset-index.yml
parent 5e8bfee3f4 · commit 021c0d896a
@@ -1,6 +1,5 @@
-from opencompass.datasets import SciKnowEvalDataset, SciKnowEvalEvaluator
+from opencompass.datasets import SciKnowEvalDataset, SciKnowEvalEvaluator, SciKnowEval_llmjudge_postprocess
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.datasets import generic_llmjudge_postprocess
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.evaluator import GenericLLMEvaluator
@@ -91,7 +90,7 @@ eval_cfg_biology = dict(
             reader_cfg=reader_cfg,
         ),
         judge_cfg=dict(),
-        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        dict_postprocessor=dict(type=SciKnowEval_llmjudge_postprocess),
     ),
 )

@@ -121,7 +120,7 @@ eval_cfg_chemistry = dict(
             subset='chemistry',
         ),
         judge_cfg=dict(),
-        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        dict_postprocessor=dict(type=SciKnowEval_llmjudge_postprocess),
     ),
 )

@@ -151,7 +150,7 @@ eval_cfg_material = dict(
             subset='material',
         ),
         judge_cfg=dict(),
-        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        dict_postprocessor=dict(type=SciKnowEval_llmjudge_postprocess),
     ),
 )

@@ -181,7 +180,7 @@ eval_cfg_physics = dict(
             subset='physics',
         ),
         judge_cfg=dict(),
-        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        dict_postprocessor=dict(type=SciKnowEval_llmjudge_postprocess),
     ),
 )

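The four hunks above make the same one-line change in each subject's eval config: GenericLLMEvaluator's dict_postprocessor now points at the SciKnowEval-specific postprocessor instead of the generic one, and the hunk below adds that postprocessor to the dataset module. As a minimal sketch of how one subject's config is wired after this commit (the prompt setup, path, and reader_cfg details are illustrative, not taken from the diff):

from opencompass.datasets import SciKnowEvalDataset, SciKnowEval_llmjudge_postprocess
from opencompass.evaluator import GenericLLMEvaluator

eval_cfg_biology = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        dataset_cfg=dict(
            type=SciKnowEvalDataset,
            path='path/to/SciKnowEval',  # illustrative
            subset='biology',
            reader_cfg=reader_cfg,       # defined earlier in the config
        ),
        judge_cfg=dict(),                # judge model is injected at runtime
        # The changed line: subject-aware accuracy instead of the generic parse
        dict_postprocessor=dict(type=SciKnowEval_llmjudge_postprocess),
    ),
)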
@@ -106,3 +106,75 @@ def answer_cleansing(
         prediction[0] = prediction[0][:-1]

     return prediction[0]
+
+
+def _generic_llmjudge_postprocess(judgement: str):
+    match = re.search(r'(A|B)', judgement)
+    grade_letter = (match.group(0) if match else 'B'
+                    )  # Default to "INCORRECT" if no match
+    return grade_letter
+
+
+def SciKnowEval_llmjudge_postprocess(
+    output: dict,
+    output_path: str,
+    dataset: Dataset,
+) -> dict:
+    # Get the original dataset
+    original_dataset = dataset.reader.dataset['test']
+
+    judged_answers = []
+    original_responses = []
+    references = []
+    details = []
+
+    total_correct = 0
+    total_count = 0
+
+    for k, v in output.items():
+        idx = int(k)  # Convert key to integer for indexing
+        original_responses.append(v['prediction'])
+        processed_judge = _generic_llmjudge_postprocess(v['prediction'])
+
+        sample = original_dataset[idx]
+        # Record the judgment
+        if processed_judge is not None:
+            judged_answers.append(processed_judge)
+            try:
+                gold = v['gold']
+                references.append(gold)
+            except KeyError:
+                get_logger().warning(
+                    f'No gold answer for {k}, use empty string as reference!')
+                gold = ''
+                references.append('')
+
+            # Check if the answer is correct (A means correct)
+            is_correct = processed_judge == 'A'
+            total_count += 1
+
+            if is_correct:
+                total_correct += 1
+
+            # Add to details
+            details.append({
+                'id': k,
+                'question': sample['question'],
+                'origin_prompt': v['origin_prompt'],
+                'llm_judge': processed_judge,
+                'gold': gold,
+                'is_correct': is_correct,
+            })
+
+    # Calculate overall accuracy with two decimal places
+    overall_accuracy = (round(
+        (total_correct / total_count * 100), 2) if total_count > 0 else 0.00)
+
+    # Initialize results dictionary
+    results = {
+        'accuracy': overall_accuracy,
+        'total_correct': total_correct,
+        'total_count': total_count,
+        'details': details,
+    }
+    return results
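For reference, a minimal sketch of what the new postprocessor computes, using hypothetical judge records and a stub in place of the OpenCompass dataset object (only the reader.dataset['test'] access the function performs is modeled):

from types import SimpleNamespace

from opencompass.datasets import SciKnowEval_llmjudge_postprocess

# Hypothetical judge outputs keyed by sample index; the evaluator supplies
# records with 'prediction' (judge verdict text), 'gold', and 'origin_prompt'.
output = {
    '0': {'prediction': 'A', 'gold': 'yes', 'origin_prompt': '...'},
    '1': {'prediction': 'B', 'gold': 'no', 'origin_prompt': '...'},
}

# Stub dataset exposing reader.dataset['test'] with one record per sample.
dataset = SimpleNamespace(reader=SimpleNamespace(
    dataset={'test': [{'question': 'Q0?'}, {'question': 'Q1?'}]}))

results = SciKnowEval_llmjudge_postprocess(output, output_path='', dataset=dataset)
print(results['accuracy'])       # 50.0 -- one 'A' (correct) out of two verdicts
print(results['total_correct'])  # 1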