mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
32 lines
1.5 KiB
Python
32 lines
1.5 KiB
Python
![]() |
import os
|
||
|
|
||
|
# Task identifiers covered by this CHARM memory configuration.
# Three of these names are also used as keys of `task_specific_prompts`;
# 'Chinese_Movie_and_Music_Recommendation' is not.
charm_memory_tasks = [
    'Chinese_Anachronisms_Judgment',
    'Chinese_Movie_and_Music_Recommendation',
    'Chinese_Sport_Understanding',
    'Chinese_Time_Understanding',
]
|
||
|
|
||
|
# Relative location of the CHARM memorization data.
# NOTE(review): presumably resolved against the evaluation working
# directory / data root by the dataset loader -- confirm with the caller.
dataset_path: str = 'data/CHARM/memorization'
|
||
|
|
||
|
# Skeleton of the system prompt given to the judge model.  The single
# placeholder `{task_specific_prompt}` is filled in per task via
# `str.format` when `judge_system_prompts` is built.
# NOTE(review): "answers is correct" is a grammar slip in the prompt text; it
# is kept verbatim here because editing a judge prompt can shift evaluation
# results.  The curly quotes around the verdict tokens are part of the prompt.
system_prompt_template = """Please act as an impartial judge, comparing the responses of the AI assistants to the reference answer and determining if the answers are correct.
You will receive the reference answer provided by a human and the responses of the AI assistants.
Your task is to judge whether the AI assistant's answers is correct.
{task_specific_prompt}
After providing your explanation, strictly output your final judgment in the following format: “[正确]” if the AI assistant's response is correct, “[错误]” if the AI assistant's response is incorrect.
"""
|
||
|
|
||
|
# Extra judging rule per task, substituted into the
# `{task_specific_prompt}` slot of `system_prompt_template`.
# NOTE(review): 'Chinese_Movie_and_Music_Recommendation' appears in
# `charm_memory_tasks` but has no entry here -- presumably that task is not
# scored by the LLM judge; confirm against the config that consumes this.
task_specific_prompts = {
    'Chinese_Anachronisms_Judgment':
    "If the provided reference answer is a list, the model's prediction is considered correct if it matches any item in the list.",
    'Chinese_Time_Understanding':
    "When evaluating the AI assistant's response regarding Chinese solar terms, as long as the AI assistant's response falls within the time frame provided in the reference answer, consider it correct.",
    'Chinese_Sport_Understanding':
    "If the provided reference answer is a list, the model's prediction is considered correct if it matches any item in the list.",
}
|
||
|
|
||
|
# Fully rendered judge system prompt for each task: the shared template with
# that task's rule substituted in.  Only tasks that have an entry in
# `task_specific_prompts` get a key here.
judge_system_prompts = {
    task_name: system_prompt_template.format(task_specific_prompt=task_rule)
    for task_name, task_rule in task_specific_prompts.items()
}
|