diff --git a/README_zh-CN.md b/README_zh-CN.md
index ca53e46c..3a451020 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -57,7 +57,7 @@ ## 🚀 最新进展
-- **\[2025.03.11\]** 现已支持 `SuperGPQA` LLM知识能力评测,欢迎尝试🔥🔥🔥
+- **\[2025.03.11\]** 现已支持 `SuperGPQA` LLM知识能力评测,欢迎尝试!🔥🔥🔥
 - **\[2025.02.28\]** 我们为 `DeepSeek-R1` 系列模型添加了教程,请查看 [评估推理模型](docs/en/user_guides/deepseek_r1.md) 了解更多详情!🔥🔥🔥
 - **\[2025.02.15\]** 我们新增了两个实用的评测工具:用于LLM作为评判器的`GenericLLMEvaluator`和用于数学推理评估的`MATHEvaluator`。查看[LLM评判器](docs/zh_cn/advanced_guides/llm_judge.md)和[数学能力评测](docs/zh_cn/advanced_guides/general_math.md)文档了解更多详情!🔥🔥🔥
 - **\[2025.01.16\]** 我们现已支持 [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) 模型,该模型在推理、知识类任务上取得同量级最优性能,欢迎尝试。
diff --git a/opencompass/configs/datasets/supergpqa/supergpqa_gen.py b/opencompass/configs/datasets/supergpqa/supergpqa_gen.py
index 64561e78..edf2e57e 100644
--- a/opencompass/configs/datasets/supergpqa/supergpqa_gen.py
+++ b/opencompass/configs/datasets/supergpqa/supergpqa_gen.py
@@ -11,13 +11,13 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 reader_cfg = dict(
     input_columns=[
         'question',
-        "options",
+        'options',
         'discipline',
         'field',
         'subfield',
         'difficulty',
-        "infer_prompt",
-        "prompt_mode",
+        'infer_prompt',
+        'prompt_mode',
     ],
     output_column='answer_letter',
 )
@@ -47,7 +47,7 @@ eval_cfg = dict(
 supergpqa_dataset = dict(
     type=SuperGPQADataset,
     abbr='supergpqa',
-    path="m-a-p/SuperGPQA",
+    path='m-a-p/SuperGPQA',
     prompt_mode='zero-shot',
     reader_cfg=reader_cfg,
     infer_cfg=infer_cfg,
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index 0e56a92d..ffcc217d 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -127,6 +127,7 @@ from .strategyqa import *  # noqa: F401, F403
 from .subjective import *  # noqa: F401, F403
 from .summedits import *  # noqa: F401, F403
 from .summscreen import *  # noqa: F401, F403
+from .supergpqa import *  # noqa: F401, F403
 from .svamp import *  # noqa: F401, F403
 from .tabmwp import *  # noqa: F401, F403
 from .taco import *  # noqa: F401, F403
@@ -147,4 +148,3 @@ from .xcopa import *  # noqa: F401, F403
 from .xiezhi import XiezhiDataset, XiezhiRetriever  # noqa: F401, F403
 from .xlsum import *  # noqa: F401, F403
 from .xsum import *  # noqa: F401, F403
-from .supergpqa import *
\ No newline at end of file
diff --git a/opencompass/datasets/supergpqa/supergpqa.py b/opencompass/datasets/supergpqa/supergpqa.py
index 9c912c1f..eabb2628 100644
--- a/opencompass/datasets/supergpqa/supergpqa.py
+++ b/opencompass/datasets/supergpqa/supergpqa.py
@@ -1,38 +1,24 @@
-import csv
-import json
-import os.path as osp
-from os import environ
-from datasets import load_dataset
 import os
-from datasets import Dataset, DatasetDict
-from opencompass.datasets.supergpqa.supergpqa_utils import (
-    evaluate_responses,
-    find_file,
-    load_json_or_jsonl,
-    load_json_or_jsonl_with_idx,
-    load_yaml,
-)
+
+from datasets import Dataset, load_dataset
+
+from opencompass.datasets.supergpqa.supergpqa_eval import (
+    extract_option_content, extract_option_labels)
+from opencompass.datasets.supergpqa.supergpqa_utils import load_yaml
 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
-import unittest
 from opencompass.utils import get_data_path
-from opencompass.datasets.supergpqa.supergpqa_eval import (
-    extract_option_labels,
-    extract_option_content,
-)
+
 from ..base import BaseDataset
 
 
 def _parse(item, template, prompt_mode):
     prompt_format = [
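+        # question text followed by the options rendered as 'A) ...' lines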
-        item['question']
-        + '\n'
-        + '\n'.join(
-            [
-                f'{chr(65+i)}) {option}'
-                for i, option in enumerate(item['options'])
-            ]
-        )
+        item['question'] + '\n' + '\n'.join([
+            f'{chr(65+i)}) {option}'
+            for i, option in enumerate(item['options'])
+        ])
     ]
     item['infer_prompt'] = template['prompt_format'][0].format(*prompt_format)
     item['prompt_mode'] = prompt_mode
@@ -41,6 +27,7 @@ def _parse(item, template, prompt_mode):
 
 @LOAD_DATASET.register_module()
 class SuperGPQADataset(BaseDataset):
+
     @staticmethod
     def load(path: str, prompt_mode: str, **kwargs):
         path = get_data_path(path, local_mode=True)
@@ -80,126 +67,120 @@ class SuperGPQAEvaluator(BaseEvaluator):
         count = 0
         err = 0
         miss = 0
-        acc_difficulty = {"hard": 0, "middle": 0, "easy": 0}
-        count_difficulty = {"hard": 0, "middle": 0, "easy": 0}
+        acc_difficulty = {'hard': 0, 'middle': 0, 'easy': 0}
+        count_difficulty = {'hard': 0, 'middle': 0, 'easy': 0}
         stats = {'discipline': {}, 'field': {}, 'subfield': {}}
         details = []
         for i, sample in enumerate(test_set):
-            sample["pred"] = prediction = predictions[i]
+            sample['pred'] = prediction = predictions[i]
             gold = references[i]
             if mode == 'zero-shot':
                 predict = extract_option_labels(prediction, 'ABCDEFGHIJ')
-                if predict == None:
-                    predict = extract_option_content(
-                        prediction, sample["options"]
-                    )
-                    predict = (
-                        chr(sample["options"].index(predict) + 65)
-                        if predict
-                        else None
-                    )
-                sample["extracted_answer"] = predict
+                if predict is None:
+                    predict = extract_option_content(prediction,
+                                                     sample['options'])
+                    predict = (chr(sample['options'].index(predict) +
+                                   65) if predict else None)
+                sample['extracted_answer'] = predict
             elif mode == 'five-shot':
                 response = prediction.split('Question:')[0]
                 predict = extract_option_labels(response, 'ABCDEFGHIJ')
-                if predict == None:
-                    predict = extract_option_content(
-                        response, sample["options"]
-                    )
-                    predict = (
-                        chr(sample["options"].index(predict) + 65)
-                        if predict
-                        else None
-                    )
-                if predict == None:
+                if predict is None:
+                    predict = extract_option_content(response,
+                                                     sample['options'])
+                    predict = (chr(sample['options'].index(predict) +
+                                   65) if predict else None)
+                if predict is None:
                     predict = extract_option_labels(prediction, 'ABCDEFGHIJ')
-                    if predict == None:
+                    if predict is None:
                         predict = extract_option_content(
-                            prediction, sample["options"]
-                        )
-                        predict = (
-                            chr(sample["options"].index(predict) + 65)
-                            if predict
-                            else None
-                        )
-                sample["extracted_answer"] = predict
+                            prediction, sample['options'])
+                        predict = (chr(sample['options'].index(predict) +
+                                       65) if predict else None)
+                sample['extracted_answer'] = predict
 
-            discipline = sample.get("discipline", "unknown")
-            field = sample.get("field", "unknown")
-            subfield = sample.get("subfield", "unknown")
-            difficulty = sample.get("difficulty", "unknown")
+            discipline = sample.get('discipline', 'unknown')
+            field = sample.get('field', 'unknown')
+            subfield = sample.get('subfield', 'unknown')
+            difficulty = sample.get('difficulty', 'unknown')
 
             for level, key in [
                 ('discipline', discipline),
-                # ('field', f"{discipline}/{field}"),
-                # ('subfield', f"{discipline}/{field}/{subfield}"),
+                    # ('field', f"{discipline}/{field}"),
+                    # ('subfield', f"{discipline}/{field}/{subfield}"),
             ]:
                 if key not in stats[level]:
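+                    # first time this key is seen: create its counter block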
                     stats[level][key] = {
-                        "correct": 0,
-                        "total": 0,
-                        "miss": 0,
-                        "error": 0,
-                        "discipline": discipline,
-                        "field": field,
-                        "subfield": subfield,
-                        "difficulty": {
-                            "easy": {"correct": 0, "total": 0},
-                            "middle": {"correct": 0, "total": 0},
-                            "hard": {"correct": 0, "total": 0},
+                        'correct': 0,
+                        'total': 0,
+                        'miss': 0,
+                        'error': 0,
+                        'discipline': discipline,
+                        'field': field,
+                        'subfield': subfield,
+                        'difficulty': {
+                            'easy': {
+                                'correct': 0,
+                                'total': 0
+                            },
+                            'middle': {
+                                'correct': 0,
+                                'total': 0
+                            },
+                            'hard': {
+                                'correct': 0,
+                                'total': 0
+                            },
                         },
                     }
-            stats[level][key]["total"] += 1
-            stats[level][key]["difficulty"][difficulty]["total"] += 1
+            stats[level][key]['total'] += 1
+            stats[level][key]['difficulty'][difficulty]['total'] += 1
 
-            answer_letter = sample["answer_letter"]
+            answer_letter = sample['answer_letter']
             assert answer_letter == gold
             if predict and answer_letter == predict:
                 acc += 1
                 acc_difficulty[difficulty] += 1
-                sample["status"] = "correct"
-                stats[level][key]["correct"] += 1
-                stats[level][key]["difficulty"][difficulty]["correct"] += 1
-            elif predict == None or predict == "":
+                sample['status'] = 'correct'
+                stats[level][key]['correct'] += 1
+                stats[level][key]['difficulty'][difficulty]['correct'] += 1
+            elif predict is None or predict == '':
                 miss += 1
-                sample["status"] = "miss"
-                stats[level][key]["miss"] += 1
+                sample['status'] = 'miss'
+                stats[level][key]['miss'] += 1
             elif predict == 'error':
                 err += 1
-                sample["status"] = "error"
-                stats[level][key]["error"] += 1
+                sample['status'] = 'error'
+                stats[level][key]['error'] += 1
             else:
-                sample["status"] = "incorrect"
+                sample['status'] = 'incorrect'
             count += 1
             count_difficulty[difficulty] += 1
 
-            details.append(
-                {
-                    'pred': sample['pred'],
-                    'answer': sample['answer'],
-                    'parsed_answer': sample['extracted_answer'],
-                    'correct': True if sample['status'] else False,
-                }
-            )
+            details.append({
+                'pred': sample['pred'],
+                'answer': sample['answer'],
+                'parsed_answer': sample['extracted_answer'],
+                'correct': sample['status'] == 'correct',
+            })
 
         return {
-            'accuracy': acc / count if count > 0 else 0,
-            'error_rate': err / count if count > 0 else 0,
-            'miss_rate': miss / count if count > 0 else 0,
-            'hard_accuracy': (
-                acc_difficulty["hard"] / count_difficulty["hard"]
-                if count_difficulty["hard"] > 0
-                else 0
-            ),
-            'middle_accuracy': (
-                acc_difficulty["middle"] / count_difficulty["middle"]
-                if count_difficulty["middle"] > 0
-                else 0
-            ),
-            'easy_accuracy': (
-                acc_difficulty["easy"] / count_difficulty["easy"]
-                if count_difficulty["easy"] > 0
-                else 0
-            ),
-            'details': details,
+            'accuracy':
+            acc / count if count > 0 else 0,
+            'error_rate':
+            err / count if count > 0 else 0,
+            'miss_rate':
+            miss / count if count > 0 else 0,
+            'hard_accuracy':
+            (acc_difficulty['hard'] /
+             count_difficulty['hard'] if count_difficulty['hard'] > 0 else 0),
+            'middle_accuracy':
+            (acc_difficulty['middle'] / count_difficulty['middle']
+             if count_difficulty['middle'] > 0 else 0),
+            'easy_accuracy':
+            (acc_difficulty['easy'] /
+             count_difficulty['easy'] if count_difficulty['easy'] > 0 else 0),
+            'details':
+            details,
         }
diff --git a/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_wrapper.py b/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_wrapper.py
index 1cedc917..45c02e5a 100644
--- a/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_wrapper.py
+++ b/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_wrapper.py
@@ -1,7 +1,8 @@
 import yaml
-import uuid
+
 
 class ConfigWrapper:
+
     def __init__(self, config_path):
         self._config = {}
         with open(config_path, 'r') as file:
@@ -15,37 +16,75 @@ class ConfigWrapper:
         else:
             self._config[key] = value
         super().__setattr__(key, value)
-    
+
     def __getattr__(self, key):
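+        # config keys read like attributes; unknown keys raise AttributeError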
         if key in self._config:
             return self._config[key]
-        raise AttributeError(f"'ConfigWrapper' object has no attribute '{key}'")
-    
+        raise AttributeError(
+            f"'ConfigWrapper' object has no attribute '{key}'")
+
     def get_id(self, data):
         if isinstance(self._config.get('id_key'), str):
             return data.get(self._config.get('id_key'), None)
         elif isinstance(self._config.get('id_key'), list):
-            return '_'.join([str(data[key]) for key in self._config.get('id_key') if key in data])
+            return '_'.join([
+                str(data[key]) for key in self._config.get('id_key')
+                if key in data
+            ])
 
     def print_all_keys(self):
-        print("config keys:")
+        print('config keys:')
         for key, value in self._config.items():
-            print(f" - {key}: {value}")
+            print(f' - {key}: {value}')
+
 
 config_wrapper = None
 
+
 def initialize_config(config_path):
     global config_wrapper
     config_wrapper = ConfigWrapper(config_path)
 
+
 def get_config_wrapper():
     global config_wrapper
     if config_wrapper is None:
-        raise RuntimeError("ConfigWrapper not initialized. Call initialize_config first.")
+        raise RuntimeError(
+            'ConfigWrapper not initialized. Call initialize_config first.')
     return config_wrapper
 
+
 if __name__ == '__main__':
     config_path = 'config/config.yaml'
     initialize_config(config_path)
-    data = {'idx': '50', 'step':21, 'question': 'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"\n\nPlease provide the decrypted answer, encapsulated in double square brackets. For example, the format should be: [[decrypted answer]].', 'answer': '[[P]]', 'category': 'Decryption', 'rule_id': '23', 'input': 'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"', 'steps_num': 23, 'description': 'For a number c=228 in the ciphertext:\nCalculate z = c^e mod n. Here ^ means multiplication.\nz is 80.\nBased on the decimal number represented by z, use the ascii code to find the corresponding letter as the plaintext letter p.\nPlease give the letter p in [[...]] format.\n', 'atom': 80}
-    print(config_wrapper.get_id(data))
\ No newline at end of file
+    data = {
+        'idx':
+        '50',
+        'step':
+        21,
+        'question':
+        'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"\n\n'
+        'Please provide the decrypted answer, encapsulated in double square'
+        ' brackets. For example, the format should be: [[decrypted answer]].',
+        'answer':
+        '[[P]]',
+        'category':
+        'Decryption',
+        'rule_id':
+        '23',
+        'input':
+        'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"',
+        'steps_num':
+        23,
+        'description':
+        'For a number c=228 in the ciphertext:\n'
+        'Calculate z = c^e mod n. Here ^ means multiplication.\nz is 80.'
+        '\nBased on the decimal number represented by z, use the ascii '
+        'code to find the corresponding letter as the plaintext letter p.'
+        '\nPlease give the letter p in [[...]] format.\n',
+        'atom':
+        80,
+    }
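+    # minimal smoke test: the composite id is assembled from the configured
+    # id_key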
+    print(config_wrapper.get_id(data))
diff --git a/opencompass/datasets/supergpqa/supergpqa_eval.py b/opencompass/datasets/supergpqa/supergpqa_eval.py
index 1d8ee724..e596a2f8 100644
--- a/opencompass/datasets/supergpqa/supergpqa_eval.py
+++ b/opencompass/datasets/supergpqa/supergpqa_eval.py
@@ -1,36 +1,31 @@
-import json
+# flake8: noqa: W605
 import re
-import argparse
-import os
-from prettytable import PrettyTable
-import pandas as pd
-from tqdm import tqdm
+
 import timeout_decorator
-import multiprocessing
-import time
-from functools import partial
+
 
 @timeout_decorator.timeout(5)  # 5 seconds timeout
 def safe_regex_search(pattern, text, flags=0):
     try:
         return re.search(pattern, text, flags)
     except timeout_decorator.TimeoutError:
-        print(f"Regex match timeout: pattern={pattern}, text={text[:100]}...")
+        print(f'Regex match timeout: pattern={pattern}, text={text[:100]}...')
         return None
     except Exception as e:
-        print(f"Regex match error: {str(e)}")
+        print(f'Regex match error: {str(e)}')
         return None
 
+
 def extract_option_labels(text, options='ABCDEFGHIJ'):
     if not isinstance(text, str) or not isinstance(options, str):
         return 'error'
-    
+
     text = text.rstrip()
     last_line = text.split('\n')[-1]
-    
-    option_str = ''.join([chr(65 + i) for i in range(len(options))]) if options else 'ABCDEFGHIJ'
-    
+
+    option_str = ''.join([chr(65 + i) for i in range(len(options))
+                          ]) if options else 'ABCDEFGHIJ'
+
     patterns = [
         # e.g. "The final answer to this question is: A."
         # "The best option is $\boxed{B}:"
@@ -41,7 +36,7 @@
         # "Answer: $\boxed{B}."
         # "ANSWER: (C):"
         f'(?i:Answer)[\*\s]*:\s*(?:[\*\$\\{{(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*([{option_str}])(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
-        
+
         # e.g. "A"
         # "$\boxed{B}$"
         # "(C)."
@@ -53,46 +48,49 @@
         match = safe_regex_search(pattern, last_line, re.IGNORECASE)
         if match:
             return match.group(1)
-    
+
     for pattern in patterns:
         match = safe_regex_search(pattern, text, re.IGNORECASE)
         if match:
             return match.group(1)
-    
+
     return None
 
+
 def extract_option_content(text, options_content=None):
     if not isinstance(text, str) or not isinstance(options_content, list):
         return 'error'
-    
-    escaped_options_content = [re.escape(option_content) for option_content in options_content]
+
+    escaped_options_content = [
+        re.escape(option_content) for option_content in options_content
+    ]
     escaped_options_content_str = '|'.join(escaped_options_content)
-    
+
     text = text.rstrip()
     last_line = text.split('\n')[-1]
-    
+
     patterns = [
         f'[Tt]he\s+(?:\w+\s+)?(?:answer|option)(?:\w+\s+)?\s+is:?\s*(?:[\*\$\\{{\(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*({escaped_options_content_str})(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
         f'(?i:Answer)\s*(?:[\*\$\\{{\(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*({escaped_options_content_str})(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
         f'^[^\w\r\n]*(?:[\*\$\\{{\(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*({escaped_options_content_str})(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
     ]
-    
+
     for pattern in patterns:
         match = safe_regex_search(pattern, last_line)
         if match:
             if match.group(1) in escaped_options_content:
-                return options_content[escaped_options_content.index(match.group(1))]
+                return options_content[escaped_options_content.index(
+                    match.group(1))]
             else:
                 return match.group(1)
-    
+
     for pattern in patterns:
         match = safe_regex_search(pattern, text)
         if match:
             if match.group(1) in escaped_options_content:
-                return options_content[escaped_options_content.index(match.group(1))]
+                return options_content[escaped_options_content.index(
+                    match.group(1))]
             else:
                 return match.group(1)
-    
+
     return None
diff --git a/opencompass/datasets/supergpqa/supergpqa_utils.py b/opencompass/datasets/supergpqa/supergpqa_utils.py
index 0279e422..c8913a9b 100644
--- a/opencompass/datasets/supergpqa/supergpqa_utils.py
+++ b/opencompass/datasets/supergpqa/supergpqa_utils.py
@@ -6,6 +6,7 @@ import sympy as sp
 import yaml
 from sympy.parsing.latex import parse_latex
 
+
 def load_yaml(yaml_path):
     """Load a YAML file."""
     if not os.path.exists(yaml_path):
@@ -670,8 +671,7 @@
         answer = record.get('gold', '')
         rule_id = record.get('rule_id', '')
         is_correct = evaluate_response_vs_answer(response, answer,
-                                                 question_type, rule_id,
-                                                 idx)
+                                                 question_type, rule_id, idx)
         result_dict = {
             'idx': idx,
             'response': response,
@@ -681,8 +681,10 @@
         }
         if question_type == 'counterfactual':
             real_life_answer = record.get('real_life_answer', '')
-            is_real_life = evaluate_response_vs_answer(
-                response, real_life_answer, question_type, rule_id, idx)
+            is_real_life = evaluate_response_vs_answer(response,
+                                                       real_life_answer,
+                                                       question_type, rule_id,
+                                                       idx)
             result_dict['real_life_answer'] = real_life_answer
             result_dict['is_real_life'] = is_real_life
         if question_type == 'cipher' and mode == 'subquestions':
diff --git a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
index cd225d13..f81437f2 100644
--- a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
+++ b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
@@ -47,9 +47,8 @@ class BaseEvaluator:
         # please see opencompass/opencompass/tasks/openicl_eval.py Line 197-200
         return self._out_dir
 
-    def group(
-        self, n: int, details: List[Dict[str, Any]], test_set: Dataset
-    ) -> Dict[str, Any]:
+    def group(self, n: int, details: List[Dict[str, Any]],
+              test_set: Dataset) -> Dict[str, Any]:
         example2replications = {}
         for detail, example in zip(details, test_set):
             example_abbr = f"{example['subdivision']}_{example['idx']}"
@@ -64,28 +63,23 @@ class BaseEvaluator:
     def reduce(self, details: List[Dict[str, Any]]) -> Dict[str, Any]:
         g_passk_details = OrderedDict()
         all_subdivisions = set(
-            [detail['example_abbr'].split('_')[0] for detail in details]
-        )
+            [detail['example_abbr'].split('_')[0] for detail in details])
         all_metrics = list(details[0].keys())
         for subdivision in sorted(list(all_subdivisions)):
             for metric in all_metrics:
                 if metric in ['predictions', 'example_abbr']:
                     continue
-                g_passk_details[f'{subdivision}/{metric}'] = 100 * np.mean(
-                    [
-                        detail[metric]
-                        for detail in details
-                        if detail['example_abbr'].split('_')[0] == subdivision
-                    ]
-                )
+                g_passk_details[f'{subdivision}/{metric}'] = 100 * np.mean([
+                    detail[metric] for detail in details
+                    if detail['example_abbr'].split('_')[0] == subdivision
+                ])
 
         for metric in all_metrics:
             if metric in ['predictions', 'example_abbr']:
                 continue
             g_passk_details[metric] = 100.0 * np.mean(
-                [detail[metric] for detail in details]
-            )
+                [detail[metric] for detail in details])
         return g_passk_details
 
     def evaluate(
@@ -104,7 +98,7 @@ class BaseEvaluator:
             if isinstance(x, Dataset):
                 return x.select(range(i * real_size, (i + 1) * real_size))
             elif isinstance(x, Iterable):
-                return x[i * real_size : (i + 1) * real_size]
+                return x[i * real_size:(i + 1) * real_size]
             else:
                 return x
 
@@ -112,8 +106,7 @@ class BaseEvaluator:
                 **{
                     key: select_fn(i, real_size, value)
                     for key, value in score_kwargs.items()
-                }
-            )
+                })
             details = results.pop('details', None)
             if details is not None:
                 if isinstance(details, Dict):
@@ -129,12 +122,10 @@ class BaseEvaluator:
                     eval_results[key].append(single_results[key])
         for key in deepcopy(eval_results):
             if isinstance(eval_results[key][0], float) or isinstance(
-                eval_results[key][0], int
-            ):
+                    eval_results[key][0], int):
                 if n > 1:
                     eval_results[key + f' ({n} runs average)'] = np.mean(
-                        eval_results[key]
-                    )
+                        eval_results[key])
                     eval_results.pop(key)
                 else:
                     eval_results[key] = np.mean(eval_results[key])
@@ -163,22 +154,23 @@
                     thresholds = [0.0, 0.25, 0.5, 0.75, 1.0]
                     for _k in [k] if isinstance(k, int) else k:
                         for threshold in thresholds:
-                            g_pass = compute_g_pass_at_k(
-                                n=n, c=c, k=_k, t=threshold
-                            )
+                            g_pass = compute_g_pass_at_k(n=n,
+                                                         c=c,
+                                                         k=_k,
+                                                         t=threshold)
                             detail[f'G-Pass@{_k}_{threshold}'] = g_pass
-                        detail[f'mG-Pass@{_k}'] = compute_mg_pass_at_k(
-                            n=n, c=c, k=_k
-                        )
+                        detail[f'mG-Pass@{_k}'] = compute_mg_pass_at_k(n=n,
+                                                                       c=c,
+                                                                       k=_k)
                     eval_details.append(detail)
 
         if can_calculate and n > 1 and k > 1:
             eval_results.update(self.reduce(eval_details))
-        
+
         # Store eval_details in eval_results
         eval_results['details'] = eval_details
-        
+
         # Process details to flatten the predictions
         for detail in eval_details:
             # Extract all prediction fields and flatten them
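+            # e.g. n replicated runs of {'pred': ...} collapse into a single
+            # {'pred': [pred_run_1, ..., pred_run_n]} entry on this detail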
             flattened_predictions = {}
             for single_detail in detail['predictions']:
                 for k, v in single_detail.items():
                     if k not in flattened_predictions:
                         flattened_predictions[k] = [v]
                     else:
                         flattened_predictions[k].append(v)
-            
+
             # Replace the predictions list with the flattened dictionary
             for k, v in flattened_predictions.items():
                 detail[k] = v
-            
+
             # Remove the original predictions field
             detail.pop('predictions')
-            import ipdb; ipdb.set_trace()
             return eval_results
-    
+
         # If there are no details, return an empty dictionary
         return {}
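
Usage sketch (assuming the standard OpenCompass CLI; the model config name is
only an illustrative placeholder for whichever model config you normally
evaluate with):

    python run.py --datasets supergpqa_gen --models hf_internlm2_5_7b_chat --debug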