diff --git a/opencompass/datasets/needlebench/atc.py b/opencompass/datasets/needlebench/atc.py index b4e36128..2143bc57 100644 --- a/opencompass/datasets/needlebench/atc.py +++ b/opencompass/datasets/needlebench/atc.py @@ -4,20 +4,24 @@ import os import random import re from enum import Enum + from datasets import Dataset from opencompass.datasets.base import BaseDataset -from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET, TEXT_POSTPROCESSORS -from opencompass.datasets.needlebench.atc_elder_only import clean_atc_answer, needlebench_atc_postprocess_v2, NeedleBenchATCEvaluator +from opencompass.datasets.needlebench.atc_elder_only import ( + NeedleBenchATCEvaluator, clean_atc_answer, needlebench_atc_postprocess_v2) +from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET, + TEXT_POSTPROCESSORS) from opencompass.utils import get_data_path # 定义问题类型枚举 class QuestionType(Enum): - ELDEST_ANCESTOR = 0 # 最年长祖先 - NTH_ANCESTOR = 1 # N级祖先 - NTH_DESCENDANT = 2 # N级子节点 - RELATIONSHIP_DISTANCE = 3 # 关系距离 + ELDEST_ANCESTOR = 0 # 最年长祖先 + NTH_ANCESTOR = 1 # N级祖先 + NTH_DESCENDANT = 2 # N级子节点 + RELATIONSHIP_DISTANCE = 3 # 关系距离 + # 定义关系术语的代数映射(一代关系还是两代关系) relationship_generation_map_zh = { @@ -92,7 +96,7 @@ relationship_templates_en = [ "but also {B}'s guardian."), ('For {B}, {A} is not just a {relationship}, ' 'but also a friend.'), - "For {B}, {A} is more than just a {relationship}; {A} is a lifelong mentor of {B}.", + 'For {B}, {A} is more than just a {relationship}; {A} is a lifelong mentor of {B}.', ] # Eldest ancestor problem template @@ -249,23 +253,24 @@ Now, the scrambled family relationships are provided below: Given the scrambled family relationships described above, what is the relationship distance between '{person_a}' and '{person_b}'? """ + @LOAD_DATASET.register_module() class NeedleBenchATCDataset(BaseDataset): @staticmethod def load( - path, - file_name: str, - num_needles: int, - language: str, - repeats: int, - # This parameter cannot be passed through mmengine because it is blocked as lazy - question_types: list[QuestionType] = [ - QuestionType.ELDEST_ANCESTOR, - QuestionType.NTH_ANCESTOR, - QuestionType.NTH_DESCENDANT, - QuestionType.RELATIONSHIP_DISTANCE, - ], # Support specifying a list of question types + path, + file_name: str, + num_needles: int, + language: str, + repeats: int, + # This parameter cannot be passed through mmengine because it is blocked as lazy + question_types: list[QuestionType] = [ + QuestionType.ELDEST_ANCESTOR, + QuestionType.NTH_ANCESTOR, + QuestionType.NTH_DESCENDANT, + QuestionType.RELATIONSHIP_DISTANCE, + ], # Support specifying a list of question types ): data = {'prompt': [], 'answer': [], 'question_type': []} path = get_data_path(path) @@ -282,7 +287,7 @@ class NeedleBenchATCDataset(BaseDataset): # Ensure question_types is not empty if not question_types: raise ValueError('question_types cannot be empty') - + for question_type in question_types: # Generate the specified number of examples for each question type for i in range(repeats): @@ -290,11 +295,11 @@ class NeedleBenchATCDataset(BaseDataset): # Use the enum value of the question type multiplied by 10000 as the base to ensure non-overlapping seed ranges seed = (i + 1) + (10000 * question_type.value) random.seed(seed) - + # Randomly select the specified number of names from all names # The number of names is num_needles + 1 - names = random.sample(all_names, num_needles+1) - + names = random.sample(all_names, num_needles + 1) + # Select the corresponding relationship terms and templates according to the language if language == 'Chinese': relationship_terms = relationship_terms_zh_CN @@ -305,10 +310,13 @@ class NeedleBenchATCDataset(BaseDataset): relationship_templates = relationship_templates_en relationship_map = relationship_generation_map_en else: - raise ValueError('Unsupported language specified. ' - 'Please choose either "Chinese" or "English".') + raise ValueError( + 'Unsupported language specified. ' + 'Please choose either "Chinese" or "English".') - def generate_chain_family_story(names, templates, relationship_terms, relationship_map): + def generate_chain_family_story(names, templates, + relationship_terms, + relationship_map): story = '' relationships = [] total_generations = 0 # Track the total generational difference @@ -317,25 +325,30 @@ class NeedleBenchATCDataset(BaseDataset): template = random.choice(templates) relation_term = random.choice(relationship_terms) relation = template.format(A=names[i], - B=names[i + 1], - relationship=relation_term) + B=names[i + 1], + relationship=relation_term) story += f'{relation}*' # Get the generation difference for this relationship - gen_diff = relationship_map.get(relation_term, 1) # Default to 1 generation + gen_diff = relationship_map.get( + relation_term, 1) # Default to 1 generation total_generations += gen_diff # Record relationship information for later use - relationships.append((names[i], names[i + 1], relation_term, gen_diff)) + relationships.append( + (names[i], names[i + 1], relation_term, gen_diff)) return story, relationships, total_generations chain_story, relationships, total_generations = generate_chain_family_story( - names, relationship_templates, relationship_terms, relationship_map) + names, relationship_templates, relationship_terms, + relationship_map) # Split the chain_story into a list of fragments family_story_fragments = chain_story.split('*') - family_story_fragments = [f for f in family_story_fragments if f] + family_story_fragments = [ + f for f in family_story_fragments if f + ] # Shuffle the list of fragments random.shuffle(family_story_fragments) @@ -348,15 +361,19 @@ class NeedleBenchATCDataset(BaseDataset): last_person = names[-1] if language == 'Chinese': prompt = shuffled_story_with_prompt_zh_CN.format( - shuffled_story=shuffled_story, last_person=last_person) + shuffled_story=shuffled_story, + last_person=last_person) else: prompt = shuffled_story_with_prompt_en.format( - shuffled_story=shuffled_story, last_person=last_person) - answer = names[0] # The first person is the eldest ancestor + shuffled_story=shuffled_story, + last_person=last_person) + answer = names[ + 0] # The first person is the eldest ancestor elif question_type == QuestionType.NTH_ANCESTOR: # Nth ancestor question - trace from the youngest person to the oldest - person = names[-1] # The youngest person (end of the chain) + person = names[ + -1] # The youngest person (end of the chain) n = total_generations # Use the calculated total generational difference if language == 'Chinese': prompt = nth_ancestor_prompt_zh_CN.format( @@ -364,7 +381,8 @@ class NeedleBenchATCDataset(BaseDataset): else: prompt = nth_ancestor_prompt_en.format( shuffled_story=shuffled_story, person=person, n=n) - answer = names[0] # The oldest person (start of the chain) is the nth ancestor + answer = names[ + 0] # The oldest person (start of the chain) is the nth ancestor elif question_type == QuestionType.NTH_DESCENDANT: # Nth descendant question - trace from the oldest person to the youngest @@ -376,7 +394,8 @@ class NeedleBenchATCDataset(BaseDataset): else: prompt = nth_descendant_prompt_en.format( shuffled_story=shuffled_story, person=person, n=n) - answer = names[-1] # The youngest person (end of the chain) is the nth descendant + answer = names[ + -1] # The youngest person (end of the chain) is the nth descendant elif question_type == QuestionType.RELATIONSHIP_DISTANCE: # Relationship distance question - calculate the relationship distance between the two ends of the chain @@ -384,10 +403,14 @@ class NeedleBenchATCDataset(BaseDataset): person_b = names[-1] # The youngest person if language == 'Chinese': prompt = relationship_distance_prompt_zh_CN.format( - shuffled_story=shuffled_story, person_a=person_a, person_b=person_b) + shuffled_story=shuffled_story, + person_a=person_a, + person_b=person_b) else: prompt = relationship_distance_prompt_en.format( - shuffled_story=shuffled_story, person_a=person_a, person_b=person_b) + shuffled_story=shuffled_story, + person_a=person_a, + person_b=person_b) # Use the calculated total generations as the relationship distance answer = str(total_generations) @@ -396,11 +419,14 @@ class NeedleBenchATCDataset(BaseDataset): last_person = names[-1] if language == 'Chinese': prompt = shuffled_story_with_prompt_zh_CN.format( - shuffled_story=shuffled_story, last_person=last_person) + shuffled_story=shuffled_story, + last_person=last_person) else: prompt = shuffled_story_with_prompt_en.format( - shuffled_story=shuffled_story, last_person=last_person) - answer = names[0] # The first person is the eldest ancestor + shuffled_story=shuffled_story, + last_person=last_person) + answer = names[ + 0] # The first person is the eldest ancestor data['prompt'].append(prompt) data['answer'].append(answer) @@ -411,4 +437,4 @@ class NeedleBenchATCDataset(BaseDataset): 'answer': data['answer'], 'question_type': data['question_type'], }) - return dataset \ No newline at end of file + return dataset diff --git a/opencompass/datasets/needlebench/atc_choice.py b/opencompass/datasets/needlebench/atc_choice.py index aa4dbd63..df3991fa 100644 --- a/opencompass/datasets/needlebench/atc_choice.py +++ b/opencompass/datasets/needlebench/atc_choice.py @@ -4,14 +4,16 @@ import json import os import random -from datasets import Dataset import numpy as np +from datasets import Dataset from opencompass.registry import LOAD_DATASET from opencompass.utils import get_data_path from ..base import BaseDataset -from .atc import relationship_terms_zh_CN, relationship_templates_zh_CN, relationship_terms_en, relationship_templates_en +from .atc import (relationship_templates_en, relationship_templates_zh_CN, + relationship_terms_en, relationship_terms_zh_CN) + def get_number(options): result_string = '' @@ -173,10 +175,13 @@ Example 3: If Xiao Ming is Zhang Hong's great-granddaughter, Zhang Hong's grandm ) names.extend(additional_names) - num_samples = 3 + num_samples = 3 if len(names) > 1: - indices = np.linspace(1, len(names) - 1, num_samples, dtype=int) # Generate evenly spaced indices - sampled_names = [names[i] for i in indices] # Select corresponding elements + indices = np.linspace( + 1, len(names) - 1, num_samples, + dtype=int) # Generate evenly spaced indices + sampled_names = [names[i] for i in indices + ] # Select corresponding elements entry['options'] = names[:1] + sampled_names else: entry['options'] = names # Return directly if only one element diff --git a/opencompass/datasets/needlebench/atc_elder_only.py b/opencompass/datasets/needlebench/atc_elder_only.py index 458ce337..b8e0e5ba 100644 --- a/opencompass/datasets/needlebench/atc_elder_only.py +++ b/opencompass/datasets/needlebench/atc_elder_only.py @@ -3,14 +3,15 @@ import json import os import random import re + from datasets import Dataset from opencompass.datasets.base import BaseDataset -from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET, TEXT_POSTPROCESSORS -from opencompass.openicl.icl_evaluator import BaseEvaluator - -from opencompass.utils import get_data_path from opencompass.datasets.math import extract_boxed_answer +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET, + TEXT_POSTPROCESSORS) +from opencompass.utils import get_data_path relationship_templates_zh_CN = [ '{A}是{B}的{relationship}。', @@ -57,7 +58,7 @@ relationship_templates_en = [ '{B} is the child of {A}.', ('For {B}, {A} is not just a {relationship}, ' 'but also a friend.'), - "For {B}, {A} is more than just a {relationship}; {A} is a lifelong mentor of {B}.", + 'For {B}, {A} is more than just a {relationship}; {A} is a lifelong mentor of {B}.', ] shuffled_story_with_prompt_zh_CN = """下面是对你的多步推理能力的测试,这个测试叫做祖先追溯测试,我们会模拟不同人的家庭亲属关系,你的任务是在其中不断推理,直到找到最年长的祖先。 @@ -125,7 +126,7 @@ class NeedleBenchATCDataset(BaseDataset): # 使用固定种子来保持样本稳定性 seed = i random.seed(seed) - + names = random.sample(all_names, num_needles) if language == 'Chinese': relationship_terms = relationship_terms_zh_CN @@ -163,9 +164,11 @@ class NeedleBenchATCDataset(BaseDataset): # Generating the prompt based on the language if language == 'Chinese': - shuffled_story_with_prompt = shuffled_story_with_prompt_zh_CN.format(shuffled_story=shuffled_story, last_person=last_person) + shuffled_story_with_prompt = shuffled_story_with_prompt_zh_CN.format( + shuffled_story=shuffled_story, last_person=last_person) elif language == 'English': - shuffled_story_with_prompt = shuffled_story_with_prompt_en.format(shuffled_story=shuffled_story, last_person=last_person) + shuffled_story_with_prompt = shuffled_story_with_prompt_en.format( + shuffled_story=shuffled_story, last_person=last_person) else: prompt = 'Language not supported.' raise Exception('Unsupported language specified. ' @@ -182,46 +185,47 @@ class NeedleBenchATCDataset(BaseDataset): def clean_atc_answer(text: str) -> str: - """Clean answer format specifically for QwQ-32B-Preview model - + """Clean answer format specifically for QwQ-32B-Preview model. + Args: text: Raw prediction text - + Returns: Standardized name format after cleaning """ - if not text or text == "None": - return "None" - + if not text or text == 'None': + return 'None' + # Remove LaTeX commands but keep content text = re.sub(r'\\text\{([^}]+)\}', r'\1', text) text = re.sub(r'\\boxed\{([^}]+)\}', r'\1', text) text = re.sub(r'\\[\[\]]', '', text) - + # Remove extra backslashes text = text.replace('\\\\', '').replace('\\', '') - + # Handle extra spaces text = re.sub(r'\s+', ' ', text).strip() - + # Remove quotes text = text.replace('"', '').replace("'", '') # Remove tildes (波浪符号) text = text.replace('~', ' ') - + return text + @TEXT_POSTPROCESSORS.register_module('needlebench_atc_postprocess_v2') def needlebench_atc_postprocess_v2(text: str) -> str: cand_ans = extract_boxed_answer(text, strip_double_curly_brace=True) - + if cand_ans: return clean_atc_answer(cand_ans) - return "None" + return 'None' -@ICL_EVALUATORS.register_module("needlebench_atc_evaluator") +@ICL_EVALUATORS.register_module('needlebench_atc_evaluator') class NeedleBenchATCEvaluator(BaseEvaluator): def score(self, predictions, gold): @@ -230,12 +234,12 @@ class NeedleBenchATCEvaluator(BaseEvaluator): correct_count = 0 details = [] - + for prediction, reference in zip(predictions, gold): reference_name = reference if prediction.strip() == reference_name.strip(): correct_count += 1 - + detail = { 'pred': prediction, 'answer': reference_name, @@ -243,6 +247,7 @@ class NeedleBenchATCEvaluator(BaseEvaluator): } details.append(detail) - accuracy = (correct_count / len(predictions)) * 100 if predictions else 0 + accuracy = (correct_count / + len(predictions)) * 100 if predictions else 0 result = {'score': accuracy, 'details': details} - return result \ No newline at end of file + return result diff --git a/opencompass/datasets/needlebench/multi.py b/opencompass/datasets/needlebench/multi.py index f915a78a..3dd9e011 100644 --- a/opencompass/datasets/needlebench/multi.py +++ b/opencompass/datasets/needlebench/multi.py @@ -7,8 +7,12 @@ from datasets import Dataset from huggingface_hub import hf_hub_download from opencompass.datasets.base import BaseDataset +from opencompass.datasets.needlebench.atc import (relationship_templates_en, + relationship_templates_zh_CN, + relationship_terms_en, + relationship_terms_zh_CN) from opencompass.registry import LOAD_DATASET -from opencompass.datasets.needlebench.atc import relationship_templates_zh_CN, relationship_terms_zh_CN, relationship_templates_en, relationship_terms_en + def get_random_needles(counter, file_path, num_needles, language): with open(file_path, 'r', encoding='utf-8') as file: @@ -33,18 +37,23 @@ def get_random_needles(counter, file_path, num_needles, language): for i in range(len(names) - 1): template = random.choice(templates) relation_term = random.choice(relationship_terms) - relation = template.format(A=names[i], B=names[i + 1], relationship=relation_term) + relation = template.format(A=names[i], + B=names[i + 1], + relationship=relation_term) story += f'{relation}*' return story - chain_story = generate_chain_family_story(names, relationship_templates, relationship_terms) - + chain_story = generate_chain_family_story(names, relationship_templates, + relationship_terms) + # Splitting the chain_story into a list of fragments family_story_fragments = chain_story.split('*') # Removing the empty string from the list - family_story_fragments = [fragment for fragment in family_story_fragments if fragment] - + family_story_fragments = [ + fragment for fragment in family_story_fragments if fragment + ] + # Shuffling the list of fragments random.shuffle(family_story_fragments) @@ -55,7 +64,7 @@ def get_random_needles(counter, file_path, num_needles, language): retrieval_question = f"在上面提供的文本中,'{last_person}'的能够向上追溯到的最年长的亲人是谁?" elif language == 'English': retrieval_question = f"Given the context described above, who is the eldest relative that '{last_person}' can trace back to in the context?" - + # Returning the story, answer, and retrieval question return { 'needles': family_story_fragments, @@ -65,7 +74,6 @@ def get_random_needles(counter, file_path, num_needles, language): } - @LOAD_DATASET.register_module() class NeedleBenchMultiDataset(BaseDataset): @@ -216,8 +224,9 @@ The content of the long document is as follows ''' else: - raise ValueError(f'Unsupported quesiton_position {quesiton_position}. ' - 'Position must be "End" or "Start".') + raise ValueError( + f'Unsupported quesiton_position {quesiton_position}. ' + 'Position must be "End" or "Start".') else: raise ValueError(f"Language '{language}' is not supported.") @@ -225,7 +234,7 @@ The content of the long document is as follows repo_id = 'opencompass/NeedleBench' file_names = [ - 'PaulGrahamEssays.jsonl','names.json', 'zh_finance.jsonl', + 'PaulGrahamEssays.jsonl', 'names.json', 'zh_finance.jsonl', 'zh_game.jsonl', 'zh_general.jsonl', 'zh_government.jsonl', 'zh_movie.jsonl', 'zh_tech.jsonl' ] @@ -250,7 +259,7 @@ The content of the long document is as follows random.seed(counter) random.shuffle(lines) random_needle_data = get_random_needles( - counter, needle_file_path, num_needles+1, language) + counter, needle_file_path, num_needles + 1, language) last_person = random_needle_data['last_person'] needles = [ '\n' + needle + '\n' @@ -278,7 +287,8 @@ The content of the long document is as follows needles) processed_prompt = _generate_prompt(processed_text, - retrieval_question, last_person) + retrieval_question, + last_person) data['prompt'].append(processed_prompt) data['answer'].append(keyword) @@ -287,4 +297,4 @@ The content of the long document is as follows 'prompt': data['prompt'], 'answer': data['answer'], }) - return dataset \ No newline at end of file + return dataset diff --git a/opencompass/datasets/needlebench/origin.py b/opencompass/datasets/needlebench/origin.py index e6f281eb..4d13fba0 100644 --- a/opencompass/datasets/needlebench/origin.py +++ b/opencompass/datasets/needlebench/origin.py @@ -114,8 +114,9 @@ The content of the long document is as follows ''' else: - raise ValueError(f'Unsupported quesiton_position {quesiton_position}. ' - 'Position must be "End" or "Start".') + raise ValueError( + f'Unsupported quesiton_position {quesiton_position}. ' + 'Position must be "End" or "Start".') else: raise ValueError(f"Language '{language}' is not supported.") @@ -201,11 +202,7 @@ class NeedleBenchOriginEvaluator(BaseEvaluator): else: score = 0 - detail = { - 'pred': prediction, - 'answer': reference, - 'score': score - } + detail = {'pred': prediction, 'answer': reference, 'score': score} total_score += score details.append(detail) diff --git a/opencompass/datasets/needlebench/parallel.py b/opencompass/datasets/needlebench/parallel.py index e4db7aba..97d7ed20 100644 --- a/opencompass/datasets/needlebench/parallel.py +++ b/opencompass/datasets/needlebench/parallel.py @@ -158,8 +158,9 @@ class NeedleBenchParallelDataset(BaseDataset): ''' else: - raise ValueError(f'Unsupported quesiton_position {quesiton_position}. ' - 'Position must be "End" or "Start".') + raise ValueError( + f'Unsupported quesiton_position {quesiton_position}. ' + 'Position must be "End" or "Start".') elif language == 'English': if quesiton_position == 'End': prompt = f'''This is a test of long-text capability. You need to first read the long document below, and then answer the final questions one by one based on the information in the document. @@ -183,8 +184,9 @@ The content of the long document is as follows ''' else: - raise ValueError(f'Unsupported quesiton_position {quesiton_position}. ' - 'Position must be "End" or "Start".') + raise ValueError( + f'Unsupported quesiton_position {quesiton_position}. ' + 'Position must be "End" or "Start".') else: raise ValueError(f"Language '{language}' is not supported.") @@ -269,6 +271,7 @@ The content of the long document is as follows class NeedleBenchParallelEvaluator(BaseEvaluator): + def score(self, predictions, gold): if len(predictions) != len(gold): return {'error': 'predictions and gold have different lengths'}