Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

Commit 7f31ef7357 ("fix lint"), parent 7938f352d7
@@ -57,7 +57,7 @@
 
 ## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
 
-- **\[2025.03.11\]** `SuperGPQA` LLM knowledge evaluation is now supported, welcome to try it out🔥🔥🔥
+- **\[2025.03.11\]** `SuperGPQA` LLM knowledge evaluation is now supported, welcome to try it out!🔥🔥🔥
 - **\[2025.02.28\]** We have added a tutorial for the `DeepSeek-R1` series of models; see [Evaluating Reasoning Models](docs/en/user_guides/deepseek_r1.md) for more details!🔥🔥🔥
 - **\[2025.02.15\]** We have added two practical evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluation and `MATHEvaluator` for mathematical reasoning evaluation. See the [LLM Judge](docs/zh_cn/advanced_guides/llm_judge.md) and [Math Evaluation](docs/zh_cn/advanced_guides/general_math.md) docs for more details!🔥🔥🔥
 - **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model, which achieves state-of-the-art performance on reasoning and knowledge tasks among models of comparable size. Feel free to try it.
@@ -11,13 +11,13 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 reader_cfg = dict(
     input_columns=[
         'question',
-        "options",
+        'options',
         'discipline',
         'field',
         'subfield',
         'difficulty',
-        "infer_prompt",
-        "prompt_mode",
+        'infer_prompt',
+        'prompt_mode',
     ],
     output_column='answer_letter',
 )
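
The changes in this hunk are purely cosmetic: the project's lint pass normalizes string literals to single quotes, and quote style has no runtime effect in Python. A minimal sketch (illustrative only):

```python
# Both spellings denote the identical string value, so normalizing
# "options" to 'options' cannot change the dataset config's behavior.
assert "options" == 'options'
```
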
@@ -47,7 +47,7 @@ eval_cfg = dict(
 supergpqa_dataset = dict(
     type=SuperGPQADataset,
     abbr='supergpqa',
-    path="m-a-p/SuperGPQA",
+    path='m-a-p/SuperGPQA',
     prompt_mode='zero-shot',
     reader_cfg=reader_cfg,
     infer_cfg=infer_cfg,
@@ -127,6 +127,7 @@ from .strategyqa import *  # noqa: F401, F403
 from .subjective import *  # noqa: F401, F403
 from .summedits import *  # noqa: F401, F403
 from .summscreen import *  # noqa: F401, F403
+from .supergpqa import *  # noqa: F401, F403
 from .svamp import *  # noqa: F401, F403
 from .tabmwp import *  # noqa: F401, F403
 from .taco import *  # noqa: F401, F403
@@ -147,4 +148,3 @@ from .xcopa import *  # noqa: F401, F403
 from .xiezhi import XiezhiDataset, XiezhiRetriever  # noqa: F401, F403
 from .xlsum import *  # noqa: F401, F403
 from .xsum import *  # noqa: F401, F403
-from .supergpqa import *
@@ -1,38 +1,23 @@
-import csv
-import json
-import os.path as osp
-from os import environ
-from datasets import load_dataset
 import os
-from datasets import Dataset, DatasetDict
-from opencompass.datasets.supergpqa.supergpqa_utils import (
-    evaluate_responses,
-    find_file,
-    load_json_or_jsonl,
-    load_json_or_jsonl_with_idx,
-    load_yaml,
-)
+
+from datasets import Dataset, load_dataset
+
+from opencompass.datasets.supergpqa.supergpqa_eval import (
+    extract_option_content, extract_option_labels)
+from opencompass.datasets.supergpqa.supergpqa_utils import load_yaml
 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
-import unittest
 from opencompass.utils import get_data_path
-from opencompass.datasets.supergpqa.supergpqa_eval import (
-    extract_option_labels,
-    extract_option_content,
-)
+
 from ..base import BaseDataset
 
 
 def _parse(item, template, prompt_mode):
     prompt_format = [
-        item['question']
-        + '\n'
-        + '\n'.join(
-            [
-                f'{chr(65+i)}) {option}'
-                for i, option in enumerate(item['options'])
-            ]
-        )
+        item['question'] + '\n' + '\n'.join([
+            f'{chr(65+i)}) {option}'
+            for i, option in enumerate(item['options'])
+        ])
     ]
     item['infer_prompt'] = template['prompt_format'][0].format(*prompt_format)
     item['prompt_mode'] = prompt_mode
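
For a sense of what the reformatted `_parse` builds, here is a standalone sketch of its option-lettering idiom; `chr(65 + i)` maps index 0 to 'A', 1 to 'B', and so on (the sample item is invented, the real records come from m-a-p/SuperGPQA):

```python
item = {
    'question': 'Which planet is known as the Red Planet?',
    'options': ['Venus', 'Mars', 'Jupiter'],
}
# Append one lettered line per option below the question text.
prompt = item['question'] + '\n' + '\n'.join(
    f'{chr(65 + i)}) {option}' for i, option in enumerate(item['options']))
print(prompt)
# Which planet is known as the Red Planet?
# A) Venus
# B) Mars
# C) Jupiter
```
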
@@ -41,6 +26,7 @@ def _parse(item, template, prompt_mode):
 
 @LOAD_DATASET.register_module()
 class SuperGPQADataset(BaseDataset):
+
     @staticmethod
     def load(path: str, prompt_mode: str, **kwargs):
         path = get_data_path(path, local_mode=True)
@@ -80,126 +66,119 @@ class SuperGPQAEvaluator(BaseEvaluator):
         count = 0
         err = 0
         miss = 0
-        acc_difficulty = {"hard": 0, "middle": 0, "easy": 0}
-        count_difficulty = {"hard": 0, "middle": 0, "easy": 0}
+        acc_difficulty = {'hard': 0, 'middle': 0, 'easy': 0}
+        count_difficulty = {'hard': 0, 'middle': 0, 'easy': 0}
         stats = {'discipline': {}, 'field': {}, 'subfield': {}}
         details = []
         for i, sample in enumerate(test_set):
-            sample["pred"] = prediction = predictions[i]
+            sample['pred'] = prediction = predictions[i]
             gold = references[i]
             if mode == 'zero-shot':
                 predict = extract_option_labels(prediction, 'ABCDEFGHIJ')
-                if predict == None:
-                    predict = extract_option_content(
-                        prediction, sample["options"]
-                    )
-                    predict = (
-                        chr(sample["options"].index(predict) + 65)
-                        if predict
-                        else None
-                    )
-                sample["extracted_answer"] = predict
+                if predict is None:
+                    predict = extract_option_content(prediction,
+                                                     sample['options'])
+                    predict = (chr(sample['options'].index(predict) +
+                                   65) if predict else None)
+                sample['extracted_answer'] = predict
             elif mode == 'five-shot':
                 response = prediction.split('Question:')[0]
                 predict = extract_option_labels(response, 'ABCDEFGHIJ')
-                if predict == None:
-                    predict = extract_option_content(
-                        response, sample["options"]
-                    )
-                    predict = (
-                        chr(sample["options"].index(predict) + 65)
-                        if predict
-                        else None
-                    )
-                if predict == None:
+                if predict is None:
+                    predict = extract_option_content(response,
+                                                     sample['options'])
+                    predict = (chr(sample['options'].index(predict) +
+                                   65) if predict else None)
+                if predict is None:
                     predict = extract_option_labels(prediction, 'ABCDEFGHIJ')
-                    if predict == None:
+                    if predict is None:
                         predict = extract_option_content(
-                            prediction, sample["options"]
-                        )
-                        predict = (
-                            chr(sample["options"].index(predict) + 65)
-                            if predict
-                            else None
-                        )
-                sample["extracted_answer"] = predict
+                            prediction, sample['options'])
+                        predict = (chr(sample['options'].index(predict) +
+                                       65) if predict else None)
+                sample['extracted_answer'] = predict
 
-            discipline = sample.get("discipline", "unknown")
-            field = sample.get("field", "unknown")
-            subfield = sample.get("subfield", "unknown")
-            difficulty = sample.get("difficulty", "unknown")
+            discipline = sample.get('discipline', 'unknown')
+            field = sample.get('field', 'unknown')
+            subfield = sample.get('subfield', 'unknown')
+            difficulty = sample.get('difficulty', 'unknown')
 
             for level, key in [
                 ('discipline', discipline),
                 # ('field', f"{discipline}/{field}"),
                 # ('subfield', f"{discipline}/{field}/{subfield}"),
             ]:
                 if key not in stats[level]:
                     stats[level][key] = {
-                        "correct": 0,
-                        "total": 0,
-                        "miss": 0,
-                        "error": 0,
-                        "discipline": discipline,
-                        "field": field,
-                        "subfield": subfield,
-                        "difficulty": {
-                            "easy": {"correct": 0, "total": 0},
-                            "middle": {"correct": 0, "total": 0},
-                            "hard": {"correct": 0, "total": 0},
+                        'correct': 0,
+                        'total': 0,
+                        'miss': 0,
+                        'error': 0,
+                        'discipline': discipline,
+                        'field': field,
+                        'subfield': subfield,
+                        'difficulty': {
+                            'easy': {
+                                'correct': 0,
+                                'total': 0
+                            },
+                            'middle': {
+                                'correct': 0,
+                                'total': 0
+                            },
+                            'hard': {
+                                'correct': 0,
+                                'total': 0
+                            },
                         },
                     }
 
-                stats[level][key]["total"] += 1
-                stats[level][key]["difficulty"][difficulty]["total"] += 1
+                stats[level][key]['total'] += 1
+                stats[level][key]['difficulty'][difficulty]['total'] += 1
 
-                answer_letter = sample["answer_letter"]
+                answer_letter = sample['answer_letter']
                 assert answer_letter == gold
                 if predict and answer_letter == predict:
                     acc += 1
                     acc_difficulty[difficulty] += 1
-                    sample["status"] = "correct"
-                    stats[level][key]["correct"] += 1
-                    stats[level][key]["difficulty"][difficulty]["correct"] += 1
-                elif predict == None or predict == "":
+                    sample['status'] = 'correct'
+                    stats[level][key]['correct'] += 1
+                    stats[level][key]['difficulty'][difficulty]['correct'] += 1
+                elif predict == None or predict == '':
                     miss += 1
-                    sample["status"] = "miss"
-                    stats[level][key]["miss"] += 1
+                    sample['status'] = 'miss'
+                    stats[level][key]['miss'] += 1
                 elif predict == 'error':
                     err += 1
-                    sample["status"] = "error"
-                    stats[level][key]["error"] += 1
+                    sample['status'] = 'error'
+                    stats[level][key]['error'] += 1
                 else:
-                    sample["status"] = "incorrect"
+                    sample['status'] = 'incorrect'
             count += 1
             count_difficulty[difficulty] += 1
-            details.append(
-                {
-                    'pred': sample['pred'],
-                    'answer': sample['answer'],
-                    'parsed_answer': sample['extracted_answer'],
-                    'correct': True if sample['status'] else False,
-                }
-            )
+            details.append({
+                'pred': sample['pred'],
+                'answer': sample['answer'],
+                'parsed_answer': sample['extracted_answer'],
+                'correct': True if sample['status'] else False,
+            })
 
         return {
-            'accuracy': acc / count if count > 0 else 0,
-            'error_rate': err / count if count > 0 else 0,
-            'miss_rate': miss / count if count > 0 else 0,
-            'hard_accuracy': (
-                acc_difficulty["hard"] / count_difficulty["hard"]
-                if count_difficulty["hard"] > 0
-                else 0
-            ),
-            'middle_accuracy': (
-                acc_difficulty["middle"] / count_difficulty["middle"]
-                if count_difficulty["middle"] > 0
-                else 0
-            ),
-            'easy_accuracy': (
-                acc_difficulty["easy"] / count_difficulty["easy"]
-                if count_difficulty["easy"] > 0
-                else 0
-            ),
-            'details': details,
+            'accuracy':
+            acc / count if count > 0 else 0,
+            'error_rate':
+            err / count if count > 0 else 0,
+            'miss_rate':
+            miss / count if count > 0 else 0,
+            'hard_accuracy':
+            (acc_difficulty['hard'] /
+             count_difficulty['hard'] if count_difficulty['hard'] > 0 else 0),
+            'middle_accuracy':
+            (acc_difficulty['middle'] / count_difficulty['middle']
+             if count_difficulty['middle'] > 0 else 0),
+            'easy_accuracy':
+            (acc_difficulty['easy'] /
+             count_difficulty['easy'] if count_difficulty['easy'] > 0 else 0),
+            'details':
+            details,
        }
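
Beyond quote normalization, the substantive fix in this hunk replaces `== None` with `is None` (flake8 E711); note that one occurrence, `elif predict == None or predict == '':`, is left untouched by this pass. A small sketch of why identity comparison is preferred:

```python
# `==` dispatches to __eq__, which a class may override arbitrarily;
# `is` checks object identity and cannot be fooled.
class Always:

    def __eq__(self, other):
        return True


x = Always()
print(x == None)  # True, even though x is not None
print(x is None)  # False, the correct identity check
```
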
@@ -1,7 +1,8 @@
 import yaml
-import uuid
 
+
 class ConfigWrapper:
+
     def __init__(self, config_path):
         self._config = {}
         with open(config_path, 'r') as file:
@@ -15,37 +16,73 @@ class ConfigWrapper:
         else:
             self._config[key] = value
         super().__setattr__(key, value)
 
     def __getattr__(self, key):
         if key in self._config:
             return self._config[key]
-        raise AttributeError(f"'ConfigWrapper' object has no attribute '{key}'")
+        raise AttributeError(
+            f"'ConfigWrapper' object has no attribute '{key}'")
 
     def get_id(self, data):
         if isinstance(self._config.get('id_key'), str):
             return data.get(self._config.get('id_key'), None)
         elif isinstance(self._config.get('id_key'), list):
-            return '_'.join([str(data[key]) for key in self._config.get('id_key') if key in data])
+            return '_'.join([
+                str(data[key]) for key in self._config.get('id_key')
+                if key in data
+            ])
 
     def print_all_keys(self):
-        print("config keys:")
+        print('config keys:')
         for key, value in self._config.items():
-            print(f" - {key}: {value}")
+            print(f' - {key}: {value}')
 
 
 config_wrapper = None
 
 
 def initialize_config(config_path):
     global config_wrapper
     config_wrapper = ConfigWrapper(config_path)
 
 
 def get_config_wrapper():
     global config_wrapper
     if config_wrapper is None:
-        raise RuntimeError("ConfigWrapper not initialized. Call initialize_config first.")
+        raise RuntimeError(
+            'ConfigWrapper not initialized. Call initialize_config first.')
     return config_wrapper
 
 
 if __name__ == '__main__':
     config_path = 'config/config.yaml'
     initialize_config(config_path)
-    data = {'idx': '50', 'step':21, 'question': 'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"\n\nPlease provide the decrypted answer, encapsulated in double square brackets. For example, the format should be: [[decrypted answer]].', 'answer': '[[P]]', 'category': 'Decryption', 'rule_id': '23', 'input': 'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"', 'steps_num': 23, 'description': 'For a number c=228 in the ciphertext:\nCalculate z = c^e mod n. Here ^ means multiplication.\nz is 80.\nBased on the decimal number represented by z, use the ascii code to find the corresponding letter as the plaintext letter p.\nPlease give the letter p in [[...]] format.\n', 'atom': 80}
-    print(config_wrapper.get_id(data))
+    data = {
+        'idx':
+        '50',
+        'step':
+        21,
+        'question':
+        'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"\n\n'
+        'Please provide the decrypted answer, encapsulated in double square'
+        ' brackets. For example, the format should be: [[decrypted answer]].',
+        'answer':
+        '[[P]]',
+        'category':
+        'Decryption',
+        'rule_id':
+        '23',
+        'input':
+        'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"',
+        'steps_num':
+        23,
+        'description':
+        'For a number c=228 in the ciphertext:\n'
+        'Calculate z = c^e mod n. Here ^ means multiplication.\nz is 80.'
+        '\nBased on the decimal number represented by z, use the ascii '
+        'code to find the corresponding letter as the plaintext letter p.'
+        '\nPlease give the letter p in [[...]] format.\n',
+        'atom':
+        80,
+    }
+    print(config_wrapper.get_id(data))
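
The `__main__` block above exercises `ConfigWrapper.get_id`, which joins several record fields when `id_key` is a list. A self-contained sketch of the same composite-key logic (the config values here are hypothetical):

```python
config = {'id_key': ['idx', 'step']}  # hypothetical config contents


def get_id(data, id_key):
    # Single key: plain lookup; list of keys: underscore-joined composite.
    if isinstance(id_key, str):
        return data.get(id_key)
    return '_'.join(str(data[key]) for key in id_key if key in data)


print(get_id({'idx': '50', 'step': 21}, config['id_key']))  # 50_21
```
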
@@ -1,36 +1,31 @@
-import json
+# flake8: noqa: W605
 import re
-import argparse
-import os
-from prettytable import PrettyTable
-import pandas as pd
 
-from tqdm import tqdm
 import timeout_decorator
-import multiprocessing
-import time
-from functools import partial
 
 
 @timeout_decorator.timeout(5)  # 5 seconds timeout
 def safe_regex_search(pattern, text, flags=0):
     try:
         return re.search(pattern, text, flags)
     except timeout_decorator.TimeoutError:
-        print(f"Regex match timeout: pattern={pattern}, text={text[:100]}...")
+        print(f'Regex match timeout: pattern={pattern}, text={text[:100]}...')
         return None
     except Exception as e:
-        print(f"Regex match error: {str(e)}")
+        print(f'Regex match error: {str(e)}')
         return None
 
 
 def extract_option_labels(text, options='ABCDEFGHIJ'):
     if not isinstance(text, str) or not isinstance(options, str):
         return 'error'
 
     text = text.rstrip()
     last_line = text.split('\n')[-1]
 
-    option_str = ''.join([chr(65 + i) for i in range(len(options))]) if options else 'ABCDEFGHIJ'
+    option_str = ''.join([chr(65 + i) for i in range(len(options))
+                          ]) if options else 'ABCDEFGHIJ'
 
     patterns = [
         # e.g. "The final answer to this question is: A."
         # "The best option is $\boxed{B}:"
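
`safe_regex_search` wraps `re.search` in a hard timeout because model output is matched against backtracking-heavy patterns. A sketch of the same guard in isolation; note that `timeout_decorator`'s default signal-based timeouts only work in the main thread:

```python
import re

import timeout_decorator


@timeout_decorator.timeout(1)  # give the search at most one second
def bounded_search(pattern, text):
    return re.search(pattern, text)


try:
    # Classic pathological case: nested quantifiers backtrack
    # exponentially on an input that almost matches.
    bounded_search(r'(a+)+$', 'a' * 30 + 'b')
except timeout_decorator.TimeoutError:
    print('regex timed out')
```
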
@@ -41,7 +36,7 @@ def extract_option_labels(text, options='ABCDEFGHIJ'):
         # "Answer: $\boxed{B}."
         # "ANSWER: (C):"
         f'(?i:Answer)[\*\s]*:\s*(?:[\*\$\\{{(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*([{option_str}])(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
 
         # e.g. "A"
         # "$\boxed{B}$"
         # "(C)."
@@ -53,46 +48,49 @@ def extract_option_labels(text, options='ABCDEFGHIJ'):
         match = safe_regex_search(pattern, last_line, re.IGNORECASE)
         if match:
             return match.group(1)
 
     for pattern in patterns:
         match = safe_regex_search(pattern, text, re.IGNORECASE)
         if match:
             return match.group(1)
 
     return None
 
 
 def extract_option_content(text, options_content=None):
     if not isinstance(text, str) or not isinstance(options_content, list):
         return 'error'
 
-    escaped_options_content = [re.escape(option_content) for option_content in options_content]
+    escaped_options_content = [
+        re.escape(option_content) for option_content in options_content
+    ]
     escaped_options_content_str = '|'.join(escaped_options_content)
 
     text = text.rstrip()
     last_line = text.split('\n')[-1]
 
     patterns = [
         f'[Tt]he\s+(?:\w+\s+)?(?:answer|option)(?:\w+\s+)?\s+is:?\s*(?:[\*\$\\{{\(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*({escaped_options_content_str})(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
 
         f'(?i:Answer)\s*(?:[\*\$\\{{\(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*({escaped_options_content_str})(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
 
         f'^[^\w\r\n]*(?:[\*\$\\{{\(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*({escaped_options_content_str})(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
     ]
 
     for pattern in patterns:
         match = safe_regex_search(pattern, last_line)
         if match:
             if match.group(1) in escaped_options_content:
-                return options_content[escaped_options_content.index(match.group(1))]
+                return options_content[escaped_options_content.index(
+                    match.group(1))]
             else:
                 return match.group(1)
 
     for pattern in patterns:
         match = safe_regex_search(pattern, text)
         if match:
             if match.group(1) in escaped_options_content:
-                return options_content[escaped_options_content.index(match.group(1))]
+                return options_content[escaped_options_content.index(
+                    match.group(1))]
             else:
                 return match.group(1)
 
     return None
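
The escape-then-map-back idiom in `extract_option_content` keeps regex metacharacters inside option texts from being interpreted as syntax. A simplified sketch with invented options:

```python
import re

options = ['3.14 (approx)', '2.72', '1.62']
# re.escape makes '(', ')' and '.' match literally in the alternation.
escaped = [re.escape(option) for option in options]
match = re.search('|'.join(escaped), 'The answer is 3.14 (approx).')
if match:
    print(match.group(0))  # 3.14 (approx)
```
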
@@ -6,6 +6,7 @@ import sympy as sp
 import yaml
 from sympy.parsing.latex import parse_latex
 
+
 def load_yaml(yaml_path):
     """Load a YAML file."""
     if not os.path.exists(yaml_path):
@@ -670,8 +671,7 @@ def evaluate_responses(data, mode, base_path=None):
         answer = record.get('gold', '')
         rule_id = record.get('rule_id', '')
         is_correct = evaluate_response_vs_answer(response, answer,
-                                                 question_type, rule_id,
-                                                 idx)
+                                                 question_type, rule_id, idx)
         result_dict = {
             'idx': idx,
             'response': response,
@@ -681,8 +681,10 @@ def evaluate_responses(data, mode, base_path=None):
         }
         if question_type == 'counterfactual':
             real_life_answer = record.get('real_life_answer', '')
-            is_real_life = evaluate_response_vs_answer(
-                response, real_life_answer, question_type, rule_id, idx)
+            is_real_life = evaluate_response_vs_answer(response,
+                                                       real_life_answer,
+                                                       question_type, rule_id,
+                                                       idx)
             result_dict['real_life_answer'] = real_life_answer
             result_dict['is_real_life'] = is_real_life
         if question_type == 'cipher' and mode == 'subquestions':
@@ -47,9 +47,8 @@ class BaseEvaluator:
         # please see opencompass/opencompass/tasks/openicl_eval.py Line 197-200
         return self._out_dir
 
-    def group(
-        self, n: int, details: List[Dict[str, Any]], test_set: Dataset
-    ) -> Dict[str, Any]:
+    def group(self, n: int, details: List[Dict[str, Any]],
+              test_set: Dataset) -> Dict[str, Any]:
         example2replications = {}
         for detail, example in zip(details, test_set):
             example_abbr = f"{example['subdivision']}_{example['idx']}"
@@ -64,28 +63,23 @@ class BaseEvaluator:
     def reduce(self, details: List[Dict[str, Any]]) -> Dict[str, Any]:
         g_passk_details = OrderedDict()
         all_subdivisions = set(
-            [detail['example_abbr'].split('_')[0] for detail in details]
-        )
+            [detail['example_abbr'].split('_')[0] for detail in details])
         all_metrics = list(details[0].keys())
 
         for subdivision in sorted(list(all_subdivisions)):
             for metric in all_metrics:
                 if metric in ['predictions', 'example_abbr']:
                     continue
-                g_passk_details[f'{subdivision}/{metric}'] = 100 * np.mean(
-                    [
-                        detail[metric]
-                        for detail in details
-                        if detail['example_abbr'].split('_')[0] == subdivision
-                    ]
-                )
+                g_passk_details[f'{subdivision}/{metric}'] = 100 * np.mean([
+                    detail[metric] for detail in details
+                    if detail['example_abbr'].split('_')[0] == subdivision
+                ])
 
         for metric in all_metrics:
             if metric in ['predictions', 'example_abbr']:
                 continue
             g_passk_details[metric] = 100.0 * np.mean(
-                [detail[metric] for detail in details]
-            )
+                [detail[metric] for detail in details])
         return g_passk_details
 
     def evaluate(
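
`reduce` averages every metric within each subdivision (the prefix of `example_abbr`) and then overall. A self-contained sketch with invented values:

```python
from collections import OrderedDict

import numpy as np

details = [
    {'example_abbr': 'math_0', 'G-Pass@4_0.5': 1.0},
    {'example_abbr': 'math_1', 'G-Pass@4_0.5': 0.0},
    {'example_abbr': 'physics_0', 'G-Pass@4_0.5': 1.0},
]
reduced = OrderedDict()
for subdivision in sorted({d['example_abbr'].split('_')[0]
                           for d in details}):
    reduced[f'{subdivision}/G-Pass@4_0.5'] = 100 * np.mean([
        d['G-Pass@4_0.5'] for d in details
        if d['example_abbr'].split('_')[0] == subdivision
    ])
reduced['G-Pass@4_0.5'] = 100.0 * np.mean(
    [d['G-Pass@4_0.5'] for d in details])
print(reduced)  # math: 50.0, physics: 100.0, overall: 66.7 (approx)
```
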
@@ -104,7 +98,7 @@ class BaseEvaluator:
             if isinstance(x, Dataset):
                 return x.select(range(i * real_size, (i + 1) * real_size))
             elif isinstance(x, Iterable):
-                return x[i * real_size : (i + 1) * real_size]
+                return x[i * real_size:(i + 1) * real_size]
             else:
                 return x
 
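
`select_fn` slices the concatenated arguments of `n` replicated runs back into per-run chunks. The idiom in isolation, assuming run outputs are laid out back to back:

```python
predictions = list(range(12))  # e.g. 3 runs x 4 examples, flattened
n = 3
real_size = len(predictions) // n

for i in range(n):
    print(predictions[i * real_size:(i + 1) * real_size])
# [0, 1, 2, 3]
# [4, 5, 6, 7]
# [8, 9, 10, 11]
```
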
@@ -112,8 +106,7 @@ class BaseEvaluator:
                 **{
                     key: select_fn(i, real_size, value)
                     for key, value in score_kwargs.items()
-                }
-            )
+                })
             details = results.pop('details', None)
             if details is not None:
                 if isinstance(details, Dict):
@@ -129,12 +122,10 @@ class BaseEvaluator:
                         eval_results[key].append(single_results[key])
             for key in deepcopy(eval_results):
                 if isinstance(eval_results[key][0], float) or isinstance(
-                    eval_results[key][0], int
-                ):
+                        eval_results[key][0], int):
                     if n > 1:
                         eval_results[key + f' ({n} runs average)'] = np.mean(
-                            eval_results[key]
-                        )
+                            eval_results[key])
                         eval_results.pop(key)
                     else:
                         eval_results[key] = np.mean(eval_results[key])
@@ -163,22 +154,23 @@ class BaseEvaluator:
                 thresholds = [0.0, 0.25, 0.5, 0.75, 1.0]
                 for _k in [k] if isinstance(k, int) else k:
                     for threshold in thresholds:
-                        g_pass = compute_g_pass_at_k(
-                            n=n, c=c, k=_k, t=threshold
-                        )
+                        g_pass = compute_g_pass_at_k(n=n,
+                                                     c=c,
+                                                     k=_k,
+                                                     t=threshold)
                         detail[f'G-Pass@{_k}_{threshold}'] = g_pass
-                    detail[f'mG-Pass@{_k}'] = compute_mg_pass_at_k(
-                        n=n, c=c, k=_k
-                    )
+                    detail[f'mG-Pass@{_k}'] = compute_mg_pass_at_k(n=n,
+                                                                   c=c,
+                                                                   k=_k)
 
                 eval_details.append(detail)
 
             if can_calculate and n > 1 and k > 1:
                 eval_results.update(self.reduce(eval_details))
 
             # Store eval_details in eval_results
             eval_results['details'] = eval_details
 
             # Process details to flatten the predictions
             for detail in eval_details:
                 # Extract all prediction fields and flatten them
@@ -189,16 +181,17 @@ class BaseEvaluator:
                         flattened_predictions[k] = [v]
                     else:
                         flattened_predictions[k].append(v)
 
                 # Replace the predictions list with the flattened dictionary
                 for k, v in flattened_predictions.items():
                     detail[k] = v
 
                 # Remove the original predictions field
                 detail.pop('predictions')
-            import ipdb; ipdb.set_trace()
+            import ipdb
+            ipdb.set_trace()
             return eval_results
 
         # If there are no details, return an empty dictionary
         return {}
 
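
Two things stand out in this last hunk: the per-detail flattening transposes a list of per-run prediction dicts into one dict of per-key lists, and the lint pass reformats, rather than removes, a leftover `ipdb.set_trace()` debugging call. A sketch of the flattening idiom with invented data:

```python
predictions = [
    {'answer': 'A', 'raw': 'The answer is A'},
    {'answer': 'B', 'raw': 'The answer is B'},
]
flattened = {}
for pred in predictions:
    for key, value in pred.items():
        # First occurrence starts the list; later runs append to it.
        flattened.setdefault(key, []).append(value)
print(flattened)
# {'answer': ['A', 'B'], 'raw': ['The answer is A', 'The answer is B']}
```
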