Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

* update gemini api and add gemini models
* add openai models
* update CHARM evaluation
* add CHARM memorization tasks
* add CharmMemSummarizer (output eval details for memorization-independent reasoning analysis)
* update CHARM readme

Co-authored-by: wujiang <wujiang@pjlab.org.cn>
32 lines · 1.5 KiB · Python
import os

# The four CHARM tasks whose memorization counterparts are evaluated here.
charm_memory_tasks = [
    'Chinese_Anachronisms_Judgment',
    'Chinese_Movie_and_Music_Recommendation',
    'Chinese_Sport_Understanding',
    'Chinese_Time_Understanding',
]

# Root directory of the CHARM memorization data.
dataset_path = 'data/CHARM/memorization'
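
# Illustrative sketch only (not part of the upstream config): one plausible way
# per-task data files could be resolved from dataset_path, using the
# `import os` above. The one-file-per-task layout and the '.jsonl' suffix are
# assumptions for illustration, not taken from this file.
def _memory_task_file(task_name):
    return os.path.join(dataset_path, f'{task_name}.jsonl')
# e.g. _memory_task_file('Chinese_Time_Understanding')
#  -> 'data/CHARM/memorization/Chinese_Time_Understanding.jsonl'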

# System prompt template for the LLM judge; {task_specific_prompt} is filled in
# per task below.
system_prompt_template = """Please act as an impartial judge, comparing the responses of the AI assistants to the reference answer and determining if the answers are correct.
You will receive the reference answer provided by a human and the responses of the AI assistants.
Your task is to judge whether the AI assistant's answer is correct.
{task_specific_prompt}
After providing your explanation, strictly output your final judgment in the following format: “[正确]” if the AI assistant's response is correct, “[错误]” if the AI assistant's response is incorrect.
"""

# Extra judging rules appended for tasks that need task-specific instructions.
task_specific_prompts = {
    'Chinese_Anachronisms_Judgment':
    "If the provided reference answer is a list, the model's prediction is considered correct if it matches any item in the list.",
    'Chinese_Time_Understanding':
    "When evaluating the AI assistant's response regarding Chinese solar terms, as long as the AI assistant's response falls within the time frame provided in the reference answer, consider it correct.",
    'Chinese_Sport_Understanding':
    "If the provided reference answer is a list, the model's prediction is considered correct if it matches any item in the list.",
}

# Complete judge system prompts, keyed by task name. Only the tasks listed in
# task_specific_prompts get an entry.
judge_system_prompts = {
    k: system_prompt_template.format(task_specific_prompt=v)
    for k, v in task_specific_prompts.items()
}
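
# Illustrative sketch only (not part of the upstream config): how the prompts
# built above might be assembled into chat messages for a judge model. The
# user-message layout and role names are assumptions for illustration.
def _build_judge_messages(task_name, reference_answer, model_response):
    return [
        {'role': 'system', 'content': judge_system_prompts[task_name]},
        {'role': 'user', 'content': (
            f'[Reference answer]\n{reference_answer}\n\n'
            f"[AI assistant's response]\n{model_response}")},
    ]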