MaiziXiao 2025-03-11 09:32:35 +00:00
parent 7938f352d7
commit 7f31ef7357
8 changed files with 205 additions and 196 deletions

View File

@@ -57,7 +57,7 @@
 ## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>

 - **\[2025.03.11\]** `SuperGPQA`, a knowledge-ability benchmark for LLMs, is now supported. Welcome to try it! 🔥🔥🔥
 - **\[2025.02.28\]** We have added a tutorial for the `DeepSeek-R1` series of models; see [Evaluating Reasoning Models](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
 - **\[2025.02.15\]** We have added two practical evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluation and `MATHEvaluator` for mathematical reasoning evaluation. See the [LLM Judge](docs/zh_cn/advanced_guides/llm_judge.md) and [Math Evaluation](docs/zh_cn/advanced_guides/general_math.md) docs for more details! 🔥🔥🔥
 - **\[2025.01.16\]** We now support [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct), which achieves state-of-the-art performance on reasoning and knowledge tasks among models of comparable size. Welcome to try it.

View File

@@ -11,13 +11,13 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 reader_cfg = dict(
     input_columns=[
         'question',
-        "options",
+        'options',
         'discipline',
         'field',
         'subfield',
         'difficulty',
-        "infer_prompt",
-        "prompt_mode",
+        'infer_prompt',
+        'prompt_mode',
     ],
     output_column='answer_letter',
 )
@@ -47,7 +47,7 @@ eval_cfg = dict(
 supergpqa_dataset = dict(
     type=SuperGPQADataset,
     abbr='supergpqa',
-    path="m-a-p/SuperGPQA",
+    path='m-a-p/SuperGPQA',
     prompt_mode='zero-shot',
     reader_cfg=reader_cfg,
     infer_cfg=infer_cfg,

View File

@@ -127,6 +127,7 @@ from .strategyqa import *  # noqa: F401, F403
 from .subjective import *  # noqa: F401, F403
 from .summedits import *  # noqa: F401, F403
 from .summscreen import *  # noqa: F401, F403
+from .supergpqa import *  # noqa: F401, F403
 from .svamp import *  # noqa: F401, F403
 from .tabmwp import *  # noqa: F401, F403
 from .taco import *  # noqa: F401, F403
@@ -147,4 +148,3 @@ from .xcopa import *  # noqa: F401, F403
 from .xiezhi import XiezhiDataset, XiezhiRetriever  # noqa: F401, F403
 from .xlsum import *  # noqa: F401, F403
 from .xsum import *  # noqa: F401, F403
-from .supergpqa import *
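
With the `from .supergpqa import *` entry moved into its alphabetical place in the registry above, the new dataset class should be importable from the package root. A minimal sketch, assuming the `supergpqa` subpackage re-exports `SuperGPQADataset` and that the SuperGPQA data and prompt templates are available locally; arguments mirror the dataset config earlier in this commit:

from opencompass.datasets import SuperGPQADataset

# Load the zero-shot variant of the benchmark (sketch only; requires local data).
ds = SuperGPQADataset.load(path='m-a-p/SuperGPQA', prompt_mode='zero-shot')
print(ds[0]['infer_prompt'])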

View File

@@ -1,38 +1,23 @@
-import csv
-import json
-import os.path as osp
-from os import environ
-from datasets import load_dataset
 import os
-from datasets import Dataset, DatasetDict
-from opencompass.datasets.supergpqa.supergpqa_utils import (
-    evaluate_responses,
-    find_file,
-    load_json_or_jsonl,
-    load_json_or_jsonl_with_idx,
-    load_yaml,
-)
+
+from datasets import Dataset, load_dataset
+
+from opencompass.datasets.supergpqa.supergpqa_eval import (
+    extract_option_content, extract_option_labels)
+from opencompass.datasets.supergpqa.supergpqa_utils import load_yaml
 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
-import unittest
 from opencompass.utils import get_data_path
-from opencompass.datasets.supergpqa.supergpqa_eval import (
-    extract_option_labels,
-    extract_option_content,
-)
+
 from ..base import BaseDataset


 def _parse(item, template, prompt_mode):
     prompt_format = [
-        item['question']
-        + '\n'
-        + '\n'.join(
-            [
-                f'{chr(65+i)}) {option}'
-                for i, option in enumerate(item['options'])
-            ]
-        )
+        item['question'] + '\n' + '\n'.join([
+            f'{chr(65+i)}) {option}'
+            for i, option in enumerate(item['options'])
+        ])
     ]
     item['infer_prompt'] = template['prompt_format'][0].format(*prompt_format)
     item['prompt_mode'] = prompt_mode

@@ -41,6 +26,7 @@ def _parse(item, template, prompt_mode):

 @LOAD_DATASET.register_module()
 class SuperGPQADataset(BaseDataset):
+
     @staticmethod
     def load(path: str, prompt_mode: str, **kwargs):
         path = get_data_path(path, local_mode=True)

@@ -80,126 +66,119 @@ class SuperGPQAEvaluator(BaseEvaluator):
         count = 0
         err = 0
         miss = 0
-        acc_difficulty = {"hard": 0, "middle": 0, "easy": 0}
-        count_difficulty = {"hard": 0, "middle": 0, "easy": 0}
+        acc_difficulty = {'hard': 0, 'middle': 0, 'easy': 0}
+        count_difficulty = {'hard': 0, 'middle': 0, 'easy': 0}
         stats = {'discipline': {}, 'field': {}, 'subfield': {}}
         details = []
         for i, sample in enumerate(test_set):
-            sample["pred"] = prediction = predictions[i]
+            sample['pred'] = prediction = predictions[i]
             gold = references[i]
             if mode == 'zero-shot':
                 predict = extract_option_labels(prediction, 'ABCDEFGHIJ')
-                if predict == None:
-                    predict = extract_option_content(
-                        prediction, sample["options"]
-                    )
-                    predict = (
-                        chr(sample["options"].index(predict) + 65)
-                        if predict
-                        else None
-                    )
-                sample["extracted_answer"] = predict
+                if predict is None:
+                    predict = extract_option_content(prediction,
                                                      sample['options'])
+                    predict = (chr(sample['options'].index(predict) +
+                                   65) if predict else None)
+                sample['extracted_answer'] = predict
             elif mode == 'five-shot':
                 response = prediction.split('Question:')[0]
                 predict = extract_option_labels(response, 'ABCDEFGHIJ')
-                if predict == None:
-                    predict = extract_option_content(
-                        response, sample["options"]
-                    )
-                    predict = (
-                        chr(sample["options"].index(predict) + 65)
-                        if predict
-                        else None
-                    )
-                if predict == None:
+                if predict is None:
+                    predict = extract_option_content(response,
                                                      sample['options'])
+                    predict = (chr(sample['options'].index(predict) +
+                                   65) if predict else None)
+                if predict is None:
                     predict = extract_option_labels(prediction, 'ABCDEFGHIJ')
-                    if predict == None:
-                        predict = extract_option_content(
-                            prediction, sample["options"]
-                        )
-                        predict = (
-                            chr(sample["options"].index(predict) + 65)
-                            if predict
-                            else None
-                        )
-                sample["extracted_answer"] = predict
+                    if predict is None:
+                        predict = extract_option_content(
+                            prediction, sample['options'])
+                        predict = (chr(sample['options'].index(predict) +
+                                       65) if predict else None)
+                sample['extracted_answer'] = predict

-            discipline = sample.get("discipline", "unknown")
-            field = sample.get("field", "unknown")
-            subfield = sample.get("subfield", "unknown")
-            difficulty = sample.get("difficulty", "unknown")
+            discipline = sample.get('discipline', 'unknown')
+            field = sample.get('field', 'unknown')
+            subfield = sample.get('subfield', 'unknown')
+            difficulty = sample.get('difficulty', 'unknown')

             for level, key in [
                 ('discipline', discipline),
                 # ('field', f"{discipline}/{field}"),
                 # ('subfield', f"{discipline}/{field}/{subfield}"),
             ]:
                 if key not in stats[level]:
                     stats[level][key] = {
-                        "correct": 0,
-                        "total": 0,
-                        "miss": 0,
-                        "error": 0,
-                        "discipline": discipline,
-                        "field": field,
-                        "subfield": subfield,
-                        "difficulty": {
-                            "easy": {"correct": 0, "total": 0},
-                            "middle": {"correct": 0, "total": 0},
-                            "hard": {"correct": 0, "total": 0},
+                        'correct': 0,
+                        'total': 0,
+                        'miss': 0,
+                        'error': 0,
+                        'discipline': discipline,
+                        'field': field,
+                        'subfield': subfield,
+                        'difficulty': {
+                            'easy': {
+                                'correct': 0,
+                                'total': 0
+                            },
+                            'middle': {
+                                'correct': 0,
+                                'total': 0
+                            },
+                            'hard': {
+                                'correct': 0,
+                                'total': 0
+                            },
                         },
                     }
-                stats[level][key]["total"] += 1
-                stats[level][key]["difficulty"][difficulty]["total"] += 1
+                stats[level][key]['total'] += 1
+                stats[level][key]['difficulty'][difficulty]['total'] += 1

-                answer_letter = sample["answer_letter"]
+                answer_letter = sample['answer_letter']
                 assert answer_letter == gold
                 if predict and answer_letter == predict:
                     acc += 1
                     acc_difficulty[difficulty] += 1
-                    sample["status"] = "correct"
-                    stats[level][key]["correct"] += 1
-                    stats[level][key]["difficulty"][difficulty]["correct"] += 1
-                elif predict == None or predict == "":
+                    sample['status'] = 'correct'
+                    stats[level][key]['correct'] += 1
+                    stats[level][key]['difficulty'][difficulty]['correct'] += 1
+                elif predict == None or predict == '':
                     miss += 1
-                    sample["status"] = "miss"
-                    stats[level][key]["miss"] += 1
+                    sample['status'] = 'miss'
+                    stats[level][key]['miss'] += 1
                 elif predict == 'error':
                     err += 1
-                    sample["status"] = "error"
-                    stats[level][key]["error"] += 1
+                    sample['status'] = 'error'
+                    stats[level][key]['error'] += 1
                 else:
-                    sample["status"] = "incorrect"
+                    sample['status'] = 'incorrect'
                 count += 1
                 count_difficulty[difficulty] += 1

-            details.append(
-                {
-                    'pred': sample['pred'],
-                    'answer': sample['answer'],
-                    'parsed_answer': sample['extracted_answer'],
-                    'correct': True if sample['status'] else False,
-                }
-            )
+            details.append({
+                'pred': sample['pred'],
+                'answer': sample['answer'],
+                'parsed_answer': sample['extracted_answer'],
+                'correct': True if sample['status'] else False,
+            })

         return {
-            'accuracy': acc / count if count > 0 else 0,
-            'error_rate': err / count if count > 0 else 0,
-            'miss_rate': miss / count if count > 0 else 0,
-            'hard_accuracy': (
-                acc_difficulty["hard"] / count_difficulty["hard"]
-                if count_difficulty["hard"] > 0
-                else 0
-            ),
-            'middle_accuracy': (
-                acc_difficulty["middle"] / count_difficulty["middle"]
-                if count_difficulty["middle"] > 0
-                else 0
-            ),
-            'easy_accuracy': (
-                acc_difficulty["easy"] / count_difficulty["easy"]
-                if count_difficulty["easy"] > 0
-                else 0
-            ),
-            'details': details,
+            'accuracy':
+            acc / count if count > 0 else 0,
+            'error_rate':
+            err / count if count > 0 else 0,
+            'miss_rate':
+            miss / count if count > 0 else 0,
+            'hard_accuracy':
+            (acc_difficulty['hard'] /
+             count_difficulty['hard'] if count_difficulty['hard'] > 0 else 0),
+            'middle_accuracy':
+            (acc_difficulty['middle'] / count_difficulty['middle']
+             if count_difficulty['middle'] > 0 else 0),
+            'easy_accuracy':
+            (acc_difficulty['easy'] /
+             count_difficulty['easy'] if count_difficulty['easy'] > 0 else 0),
+            'details':
+            details,
         }
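
For intuition, a standalone sketch of the bookkeeping the evaluator performs, assuming one prediction per sample; it mirrors the accuracy / miss-rate logic above rather than calling the class directly:

samples = [
    {'answer_letter': 'A', 'difficulty': 'easy', 'pred': 'A'},
    {'answer_letter': 'B', 'difficulty': 'hard', 'pred': 'C'},
    {'answer_letter': 'D', 'difficulty': 'hard', 'pred': None},  # extraction miss
]
count = len(samples)
acc = sum(1 for s in samples if s['pred'] == s['answer_letter'])
miss = sum(1 for s in samples if not s['pred'])
hard = [s for s in samples if s['difficulty'] == 'hard']
hard_acc = sum(1 for s in hard if s['pred'] == s['answer_letter'])
print({
    'accuracy': acc / count,                # 1/3
    'miss_rate': miss / count,              # 1/3
    'hard_accuracy': hard_acc / len(hard),  # 0/2
})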

View File

@@ -1,7 +1,8 @@
 import yaml
+import uuid


 class ConfigWrapper:
     def __init__(self, config_path):
         self._config = {}
         with open(config_path, 'r') as file:

@@ -15,37 +16,73 @@ class ConfigWrapper:
             else:
                 self._config[key] = value
         super().__setattr__(key, value)

     def __getattr__(self, key):
         if key in self._config:
             return self._config[key]
-        raise AttributeError(f"'ConfigWrapper' object has no attribute '{key}'")
+        raise AttributeError(
+            f"'ConfigWrapper' object has no attribute '{key}'")

     def get_id(self, data):
         if isinstance(self._config.get('id_key'), str):
             return data.get(self._config.get('id_key'), None)
         elif isinstance(self._config.get('id_key'), list):
-            return '_'.join([str(data[key]) for key in self._config.get('id_key') if key in data])
+            return '_'.join([
+                str(data[key]) for key in self._config.get('id_key')
+                if key in data
+            ])

     def print_all_keys(self):
-        print("config keys:")
+        print('config keys:')
         for key, value in self._config.items():
-            print(f" - {key}: {value}")
+            print(f' - {key}: {value}')


 config_wrapper = None


 def initialize_config(config_path):
     global config_wrapper
     config_wrapper = ConfigWrapper(config_path)


 def get_config_wrapper():
     global config_wrapper
     if config_wrapper is None:
-        raise RuntimeError("ConfigWrapper not initialized. Call initialize_config first.")
+        raise RuntimeError(
+            'ConfigWrapper not initialized. Call initialize_config first.')
     return config_wrapper


 if __name__ == '__main__':
     config_path = 'config/config.yaml'
     initialize_config(config_path)
-    data = {'idx': '50', 'step':21, 'question': 'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"\n\nPlease provide the decrypted answer, encapsulated in double square brackets. For example, the format should be: [[decrypted answer]].', 'answer': '[[P]]', 'category': 'Decryption', 'rule_id': '23', 'input': 'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"', 'steps_num': 23, 'description': 'For a number c=228 in the ciphertext:\nCalculate z = c^e mod n. Here ^ means multiplication.\nz is 80.\nBased on the decimal number represented by z, use the ascii code to find the corresponding letter as the plaintext letter p.\nPlease give the letter p in [[...]] format.\n', 'atom': 80}
-    print(config_wrapper.get_id(data))
+    data = {
+        'idx':
+        '50',
+        'step':
+        21,
+        'question':
+        'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"\n\n'
+        'Please provide the decrypted answer, encapsulated in double square'
+        ' brackets. For example, the format should be: [[decrypted answer]].',
+        'answer':
+        '[[P]]',
+        'category':
+        'Decryption',
+        'rule_id':
+        '23',
+        'input':
+        'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"',
+        'steps_num':
+        23,
+        'description':
+        'For a number c=228 in the ciphertext:\n'
+        'Calculate z = c^e mod n. Here ^ means multiplication.\nz is 80.'
+        '\nBased on the decimal number represented by z, use the ascii '
+        'code to find the corresponding letter as the plaintext letter p.'
+        '\nPlease give the letter p in [[...]] format.\n',
+        'atom':
+        80,
+    }
+    print(config_wrapper.get_id(data))
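
A quick sketch of how `get_id` resolves an example id, assuming `__init__` simply loads the YAML mapping into `_config` and that `config.yaml` provides an `id_key` entry (the temp-file setup below is only for illustration):

import tempfile

with tempfile.NamedTemporaryFile('w', suffix='.yaml', delete=False) as f:
    f.write("id_key: ['idx', 'step']\n")

initialize_config(f.name)
print(get_config_wrapper().get_id({'idx': '50', 'step': 21}))  # -> '50_21'
# With id_key: 'idx' the same call would return '50'; with no id_key it returns None.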

View File

@@ -1,36 +1,31 @@
-import json
+# flake8: noqa: W605
 import re
-import argparse
-import os
-from prettytable import PrettyTable
-import pandas as pd
-from tqdm import tqdm
+
 import timeout_decorator
-import multiprocessing
-import time
-from functools import partial


 @timeout_decorator.timeout(5)  # 5 seconds timeout
 def safe_regex_search(pattern, text, flags=0):
     try:
         return re.search(pattern, text, flags)
     except timeout_decorator.TimeoutError:
-        print(f"Regex match timeout: pattern={pattern}, text={text[:100]}...")
+        print(f'Regex match timeout: pattern={pattern}, text={text[:100]}...')
         return None
     except Exception as e:
-        print(f"Regex match error: {str(e)}")
+        print(f'Regex match error: {str(e)}')
         return None


 def extract_option_labels(text, options='ABCDEFGHIJ'):
     if not isinstance(text, str) or not isinstance(options, str):
         return 'error'
     text = text.rstrip()
     last_line = text.split('\n')[-1]

-    option_str = ''.join([chr(65 + i) for i in range(len(options))]) if options else 'ABCDEFGHIJ'
+    option_str = ''.join([chr(65 + i) for i in range(len(options))
+                          ]) if options else 'ABCDEFGHIJ'

     patterns = [
         # e.g. "The final answer to this question is: A."
         # "The best option is $\boxed{B}:"

@@ -41,7 +36,7 @@ def extract_option_labels(text, options='ABCDEFGHIJ'):
         # "Answer: $\boxed{B}."
         # "ANSWER: (C):"
         f'(?i:Answer)[\*\s]*:\s*(?:[\*\$\\{{(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*([{option_str}])(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
         # e.g. "A"
         # "$\boxed{B}$"
         # "(C)."

@@ -53,46 +48,49 @@ def extract_option_labels(text, options='ABCDEFGHIJ'):
         match = safe_regex_search(pattern, last_line, re.IGNORECASE)
         if match:
             return match.group(1)

     for pattern in patterns:
         match = safe_regex_search(pattern, text, re.IGNORECASE)
         if match:
             return match.group(1)

     return None


 def extract_option_content(text, options_content=None):
     if not isinstance(text, str) or not isinstance(options_content, list):
         return 'error'
-    escaped_options_content = [re.escape(option_content) for option_content in options_content]
+    escaped_options_content = [
+        re.escape(option_content) for option_content in options_content
+    ]
     escaped_options_content_str = '|'.join(escaped_options_content)
     text = text.rstrip()
     last_line = text.split('\n')[-1]

     patterns = [
         f'[Tt]he\s+(?:\w+\s+)?(?:answer|option)(?:\w+\s+)?\s+is:?\s*(?:[\*\$\\{{\(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*({escaped_options_content_str})(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
         f'(?i:Answer)\s*(?:[\*\$\\{{\(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*({escaped_options_content_str})(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
         f'^[^\w\r\n]*(?:[\*\$\\{{\(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*({escaped_options_content_str})(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
     ]

     for pattern in patterns:
         match = safe_regex_search(pattern, last_line)
         if match:
             if match.group(1) in escaped_options_content:
-                return options_content[escaped_options_content.index(match.group(1))]
+                return options_content[escaped_options_content.index(
+                    match.group(1))]
             else:
                 return match.group(1)

     for pattern in patterns:
         match = safe_regex_search(pattern, text)
         if match:
             if match.group(1) in escaped_options_content:
-                return options_content[escaped_options_content.index(match.group(1))]
+                return options_content[escaped_options_content.index(
+                    match.group(1))]
             else:
                 return match.group(1)

     return None
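
For reference, a few hedged examples of what these extractors are expected to return, assuming the regexes behave as the inline comments above describe:

print(extract_option_labels('The final answer to this question is: A.'))   # -> 'A'
print(extract_option_labels('Answer: $\\boxed{B}$.', 'ABCDEFGHIJ'))        # -> 'B'
print(extract_option_content('The answer is 3.14', ['2.71', '3.14']))      # -> '3.14'
print(extract_option_labels(12345))                                        # -> 'error' (non-string input)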

View File

@@ -6,6 +6,7 @@ import sympy as sp
 import yaml
 from sympy.parsing.latex import parse_latex
+

 def load_yaml(yaml_path):
     """Load a YAML file."""
     if not os.path.exists(yaml_path):

@@ -670,8 +671,7 @@ def evaluate_responses(data, mode, base_path=None):
         answer = record.get('gold', '')
         rule_id = record.get('rule_id', '')
         is_correct = evaluate_response_vs_answer(response, answer,
-                                                 question_type, rule_id,
-                                                 idx)
+                                                 question_type, rule_id, idx)
         result_dict = {
             'idx': idx,
             'response': response,

@@ -681,8 +681,10 @@
         }
         if question_type == 'counterfactual':
             real_life_answer = record.get('real_life_answer', '')
-            is_real_life = evaluate_response_vs_answer(
-                response, real_life_answer, question_type, rule_id, idx)
+            is_real_life = evaluate_response_vs_answer(response,
+                                                       real_life_answer,
+                                                       question_type, rule_id,
+                                                       idx)
             result_dict['real_life_answer'] = real_life_answer
             result_dict['is_real_life'] = is_real_life
         if question_type == 'cipher' and mode == 'subquestions':

View File

@@ -47,9 +47,8 @@
         # please see opencompass/opencompass/tasks/openicl_eval.py Line 197-200
         return self._out_dir

-    def group(
-        self, n: int, details: List[Dict[str, Any]], test_set: Dataset
-    ) -> Dict[str, Any]:
+    def group(self, n: int, details: List[Dict[str, Any]],
+              test_set: Dataset) -> Dict[str, Any]:
         example2replications = {}
         for detail, example in zip(details, test_set):
             example_abbr = f"{example['subdivision']}_{example['idx']}"

@@ -64,28 +63,23 @@
     def reduce(self, details: List[Dict[str, Any]]) -> Dict[str, Any]:
         g_passk_details = OrderedDict()
         all_subdivisions = set(
-            [detail['example_abbr'].split('_')[0] for detail in details]
-        )
+            [detail['example_abbr'].split('_')[0] for detail in details])
         all_metrics = list(details[0].keys())

         for subdivision in sorted(list(all_subdivisions)):
             for metric in all_metrics:
                 if metric in ['predictions', 'example_abbr']:
                     continue
-                g_passk_details[f'{subdivision}/{metric}'] = 100 * np.mean(
-                    [
-                        detail[metric]
-                        for detail in details
-                        if detail['example_abbr'].split('_')[0] == subdivision
-                    ]
-                )
+                g_passk_details[f'{subdivision}/{metric}'] = 100 * np.mean([
+                    detail[metric] for detail in details
+                    if detail['example_abbr'].split('_')[0] == subdivision
+                ])

         for metric in all_metrics:
             if metric in ['predictions', 'example_abbr']:
                 continue
             g_passk_details[metric] = 100.0 * np.mean(
-                [detail[metric] for detail in details]
-            )
+                [detail[metric] for detail in details])
         return g_passk_details

@@ -104,7 +98,7 @@
             if isinstance(x, Dataset):
                 return x.select(range(i * real_size, (i + 1) * real_size))
             elif isinstance(x, Iterable):
-                return x[i * real_size : (i + 1) * real_size]
+                return x[i * real_size:(i + 1) * real_size]
             else:
                 return x

@@ -112,8 +106,7 @@
                 **{
                     key: select_fn(i, real_size, value)
                     for key, value in score_kwargs.items()
-                }
-            )
+                })
             details = results.pop('details', None)
             if details is not None:
                 if isinstance(details, Dict):

@@ -129,12 +122,10 @@
                 eval_results[key].append(single_results[key])
         for key in deepcopy(eval_results):
             if isinstance(eval_results[key][0], float) or isinstance(
-                eval_results[key][0], int
-            ):
+                    eval_results[key][0], int):
                 if n > 1:
                     eval_results[key + f' ({n} runs average)'] = np.mean(
-                        eval_results[key]
-                    )
+                        eval_results[key])
                     eval_results.pop(key)
                 else:
                     eval_results[key] = np.mean(eval_results[key])

@@ -163,22 +154,23 @@
                 thresholds = [0.0, 0.25, 0.5, 0.75, 1.0]
                 for _k in [k] if isinstance(k, int) else k:
                     for threshold in thresholds:
-                        g_pass = compute_g_pass_at_k(
-                            n=n, c=c, k=_k, t=threshold
-                        )
+                        g_pass = compute_g_pass_at_k(n=n,
+                                                     c=c,
+                                                     k=_k,
+                                                     t=threshold)
                         detail[f'G-Pass@{_k}_{threshold}'] = g_pass
-                    detail[f'mG-Pass@{_k}'] = compute_mg_pass_at_k(
-                        n=n, c=c, k=_k
-                    )
+                    detail[f'mG-Pass@{_k}'] = compute_mg_pass_at_k(n=n,
+                                                                   c=c,
+                                                                   k=_k)
             eval_details.append(detail)

         if can_calculate and n > 1 and k > 1:
             eval_results.update(self.reduce(eval_details))

         # Store eval_details in eval_results
         eval_results['details'] = eval_details

         # Process details to flatten the predictions
         for detail in eval_details:
             # Extract all prediction fields and flatten them

@@ -189,16 +181,17 @@
                         flattened_predictions[k] = [v]
                     else:
                         flattened_predictions[k].append(v)

             # Replace the predictions list with the flattened dictionary
             for k, v in flattened_predictions.items():
                 detail[k] = v

             # Remove the original predictions field
             detail.pop('predictions')

-        import ipdb; ipdb.set_trace()
+        import ipdb
+        ipdb.set_trace()
         return eval_results

         # If there are no details, return an empty dictionary
         return {}
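
The G-Pass@k hooks above aggregate per-example pass statistics. A sketch of the commonly used hypergeometric formulation (n samples, c of them correct, draw k, require at least ceil(t*k) correct) is given below for orientation; the actual `compute_g_pass_at_k` implementation remains the source of truth, and this snippet is only an illustration.

from math import ceil, comb

def g_pass_at_k(n: int, c: int, k: int, t: float) -> float:
    # Probability that, drawing k of the n samples without replacement,
    # at least ceil(t*k) of them are among the c correct ones.
    need = max(ceil(t * k), 1)
    total = comb(n, k)
    hits = sum(comb(c, j) * comb(n - c, k - j)
               for j in range(need, min(c, k) + 1))
    return hits / total if total else 0.0

# e.g. 10 samples, 6 correct, k=4, threshold 0.5:
# chance that at least 2 of the 4 drawn responses are correct.
print(round(g_pass_at_k(10, 6, 4, 0.5), 3))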