diff --git a/opencompass/cli/main.py b/opencompass/cli/main.py index 7ce2b551..494c39fd 100644 --- a/opencompass/cli/main.py +++ b/opencompass/cli/main.py @@ -12,8 +12,8 @@ from mmengine.config import Config, DictAction from opencompass.registry import PARTITIONERS, RUNNERS, build_from_cfg from opencompass.runners import SlurmRunner from opencompass.summarizers import DefaultSummarizer -from opencompass.utils import (LarkReporter, get_logger, read_from_station, - save_to_station) +from opencompass.utils import (LarkReporter, get_logger, pretty_print_config, + read_from_station, save_to_station) from opencompass.utils.run import (fill_eval_cfg, fill_infer_cfg, get_config_from_arg) @@ -94,6 +94,11 @@ def parse_args(): help='Use the custom config directory instead of config/ to ' 'search the configs for datasets, models and summarizers', type=str) + parser.add_argument( + '--config-verbose', + default=False, + action='store_true', + help='Whether to print the config in verbose mode.') parser.add_argument('-l', '--lark', help='Report the running status to lark bot', @@ -131,7 +136,7 @@ def parse_args(): 'correctness of each sample, bpb, etc.', action='store_true', ) - + # for results persistence parser.add_argument('-sp', '--station-path', help='Path to your results station.', @@ -150,7 +155,12 @@ def parse_args(): 'data station.', action='store_true', ) - + # for evaluation with multiple runs parser.add_argument('--dataset-num-runs', help='Number of runs for each dataset', type=int, default=1, ) # set srun args slurm_parser = parser.add_argument_group('slurm_args') @@ -299,7 +309,10 @@ def main(): content = f'{getpass.getuser()}\'s task has been launched!' LarkReporter(cfg['lark_bot_url']).post(content) - logger.info(f'The full config is \n{cfg.pretty_text}') + + # print the config if --config-verbose is specified + if args.config_verbose: + pretty_print_config(cfg) # infer if args.mode in ['all', 'infer']: diff --git a/opencompass/configs/datasets/OlymMATH/olymmath_cascade_eval_gen_97b203.py b/opencompass/configs/datasets/OlymMATH/olymmath_cascade_eval_gen_97b203.py index 5c180831..855c9b2a 100644 --- a/opencompass/configs/datasets/OlymMATH/olymmath_cascade_eval_gen_97b203.py +++ b/opencompass/configs/datasets/OlymMATH/olymmath_cascade_eval_gen_97b203.py @@ -98,12 +98,12 @@ for sub_set in sub_sets: olymmath_datasets.append( dict( type=OlymMATHDataset, - abbr=f'olymmath_llmjudge_{sub_set}', + abbr=f'olymmath_{sub_set}', path='RUC-AIBOX/OlymMATH', reader_cfg=math_reader_cfg, infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg, subset=sub_set, - n=4 + n=1 ) ) diff --git a/opencompass/configs/datasets/OlympiadBench/OlympiadBench_0shot_cascade_eval_gen_be8b13.py b/opencompass/configs/datasets/OlympiadBench/OlympiadBench_0shot_cascade_eval_gen_be8b13.py index 4ebc4ce1..caedfbaa 100644 --- a/opencompass/configs/datasets/OlympiadBench/OlympiadBench_0shot_cascade_eval_gen_be8b13.py +++ b/opencompass/configs/datasets/OlympiadBench/OlympiadBench_0shot_cascade_eval_gen_be8b13.py @@ -109,8 +109,6 @@ for _name in categories: reader_cfg=olympiadbench_reader_cfg, infer_cfg=olympiadbench_infer_cfg, eval_cfg=olympiadbench_eval_cfg, - n=4, + n=1, ) ) - -del _name diff --git a/opencompass/configs/datasets/aime2024/aime2024_cascade_eval_gen_5e9f4f.py b/opencompass/configs/datasets/aime2024/aime2024_cascade_eval_gen_5e9f4f.py index 3a823d7a..64fb3565 100644 --- a/opencompass/configs/datasets/aime2024/aime2024_cascade_eval_gen_5e9f4f.py +++ 
b/opencompass/configs/datasets/aime2024/aime2024_cascade_eval_gen_5e9f4f.py @@ -6,7 +6,7 @@ Setting: - CascadeEvaluator - MATHVerifyEvaluator - GenericLLMEvaluator - Repeat: 32 + Repeat: 1 Avaliable Models: - Instruct/Chat Models """ @@ -113,6 +113,6 @@ aime2024_datasets = [ reader_cfg=aime2024_reader_cfg, infer_cfg=aime2024_infer_cfg, eval_cfg=aime2024_eval_cfg, - n=32,# Evaluate the dataset with 2 times + n=1,# Evaluate the dataset with n times ) ] diff --git a/opencompass/configs/datasets/aime2025/aime2025_cascade_eval_gen_5e9f4f.py b/opencompass/configs/datasets/aime2025/aime2025_cascade_eval_gen_5e9f4f.py index 53e151e7..9fe5f557 100644 --- a/opencompass/configs/datasets/aime2025/aime2025_cascade_eval_gen_5e9f4f.py +++ b/opencompass/configs/datasets/aime2025/aime2025_cascade_eval_gen_5e9f4f.py @@ -6,7 +6,7 @@ Setting: - CascadeEvaluator - MATHVerifyEvaluator - GenericLLMEvaluator - Repeat: 32 + Repeat: 1 Avaliable Models: - Instruct/Chat Models """ @@ -66,7 +66,7 @@ GRADER_TEMPLATE = """ Judging the correctness of candidates' answers: """.strip() -aime2025_eval_cfg = dict( +cascade_evaluator = dict( type=CascadeEvaluator, rule_evaluator=dict( type=MATHVerifyEvaluator, @@ -98,6 +98,9 @@ aime2025_eval_cfg = dict( ), parallel=False, ) +aime2025_eval_cfg = dict( + evaluator=cascade_evaluator, +) aime2025_datasets = [ dict( @@ -107,5 +110,6 @@ aime2025_datasets = [ reader_cfg=aime2025_reader_cfg, infer_cfg=aime2025_infer_cfg, eval_cfg=aime2025_eval_cfg, + n=1, ) ] diff --git a/opencompass/configs/datasets/gpqa/gpqa_cascade_eval_gen_772ea0.py b/opencompass/configs/datasets/gpqa/gpqa_cascade_eval_gen_772ea0.py new file mode 100644 index 00000000..1a5d5735 --- /dev/null +++ b/opencompass/configs/datasets/gpqa/gpqa_cascade_eval_gen_772ea0.py @@ -0,0 +1,118 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import GPQADataset, GPQA_Simple_Eval_postprocess +from opencompass.evaluator import GenericLLMEvaluator, CascadeEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.utils.text_postprocessors import match_answer_pattern + +# openai_simple_eval prompt +align_prompt = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. + +{question} + +A) {A} +B) {B} +C) {C} +D) {D} +""".strip() + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. 
Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + + +gpqa_reader_cfg = dict( + input_columns=['question', 'A', 'B', 'C', 'D'], + output_column='answer') + +gpqa_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=align_prompt), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + + + +gpqa_datasets = [] +gpqa_subsets = { + # 'extended': 'gpqa_extended.csv', + # 'main': 'gpqa_main.csv', + 'diamond': 'gpqa_diamond.csv' +} + +for split in list(gpqa_subsets.keys()): + gpqa_eval_cfg = dict( + evaluator=dict( + type=CascadeEvaluator, + rule_evaluator=dict( + type=AccEvaluator, + pred_postprocessor=dict(type=match_answer_pattern, answer_pattern=r'(?i)ANSWER\s*:\s*([A-D])'), + ), + llm_evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=GPQADataset, + path='./data/gpqa/', + name=gpqa_subsets[split], + reader_cfg=gpqa_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + parallel=False, + ), + ) + gpqa_datasets.append( + dict( + abbr='GPQA_' + split, + type=GPQADataset, + path='./data/gpqa/', + name=gpqa_subsets[split], + reader_cfg=gpqa_reader_cfg, + infer_cfg=gpqa_infer_cfg, + eval_cfg=gpqa_eval_cfg, + mode='singlescore', + ) + ) diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_cascade_eval_gen_4bce59.py b/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_cascade_eval_gen_4bce59.py index fca6ace9..c903b4d6 100644 --- a/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_cascade_eval_gen_4bce59.py +++ 
b/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_cascade_eval_gen_4bce59.py @@ -114,6 +114,6 @@ livemathbench_datasets = [ ), ), ), - n=32, # repeat 32 times + n=1, # repeat n times ) for split in splits ] diff --git a/opencompass/configs/datasets/math/math_500_cascade_eval_gen_6ff468.py b/opencompass/configs/datasets/math/math_500_cascade_eval_gen_6ff468.py index e2fcf167..8c18b47b 100644 --- a/opencompass/configs/datasets/math/math_500_cascade_eval_gen_6ff468.py +++ b/opencompass/configs/datasets/math/math_500_cascade_eval_gen_6ff468.py @@ -112,6 +112,6 @@ math_datasets = [ eval_cfg=dict( evaluator=cascade_evaluator, ), - n=4, + n=1, ) ] diff --git a/opencompass/configs/datasets/mmlu/mmlu_stem_0shot_cascade_eval_gen_216503.py b/opencompass/configs/datasets/mmlu/mmlu_stem_0shot_cascade_eval_gen_216503.py new file mode 100644 index 00000000..76e40e1a --- /dev/null +++ b/opencompass/configs/datasets/mmlu/mmlu_stem_0shot_cascade_eval_gen_216503.py @@ -0,0 +1,126 @@ +""" +Setting: 0-shot No-CoT +Evaluator: GenericLLMEvaluator +""" +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import match_answer_pattern +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.evaluator import ( + CascadeEvaluator, + GenericLLMEvaluator, +) + +with read_base(): + # from .....configs.datasets.mmlu.mmlu_all_sets import mmlu_all_sets + from .mmlu_stem_sets import mmlu_all_sets +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. + +{input} + +A) {A} +B) {B} +C) {C} +D) {D} +""".strip() + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. 
Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : {input}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n\n\n + : \n{target}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_datasets = [] +for name in mmlu_all_sets: + mmlu_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_eval_cfg = dict( + evaluator=dict( + type=CascadeEvaluator, + rule_evaluator=dict( + type=AccEvaluator, + pred_postprocessor=dict(type=match_answer_pattern, answer_pattern=r'(?i)ANSWER\s*:\s*([A-D])'), + ), + llm_evaluator = dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + abbr=f'lukaemon_mmlu_{name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=name, + reader_cfg=mmlu_reader_cfg, + ), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + judge_cfg=dict(), + ), + ), + ) + + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + mode='singlescore', + )) diff --git a/opencompass/datasets/base.py b/opencompass/datasets/base.py index 1ccbe9fd..75ac3164 100644 --- a/opencompass/datasets/base.py +++ b/opencompass/datasets/base.py @@ -3,6 +3,9 @@ from typing import Dict, List, Optional, Union from datasets import Dataset, DatasetDict, concatenate_datasets from opencompass.openicl import DatasetReader +from opencompass.utils import get_logger + +logger = get_logger() class BaseDataset: diff --git a/opencompass/datasets/teval/evaluators/review_evaluator.py b/opencompass/datasets/teval/evaluators/review_evaluator.py index 68a14668..b68a76dc 100644 --- a/opencompass/datasets/teval/evaluators/review_evaluator.py +++ b/opencompass/datasets/teval/evaluators/review_evaluator.py @@ -76,7 +76,6 @@ class ReviewEvaluator: pred_data = data_sample.pred if pred_data is not None: - # import pdb; pdb.set_trace() metrics_result['review_quality'] = 1.0 if pred_data == \ data_sample.gt else 0.0 metrics_result['parse_rate'] = 1.0 diff --git a/opencompass/evaluator/cascade_evaluator.py b/opencompass/evaluator/cascade_evaluator.py index fc68e24f..8d86fe1b 100644 --- a/opencompass/evaluator/cascade_evaluator.py +++ b/opencompass/evaluator/cascade_evaluator.py @@ 
-239,6 +239,9 @@ class CascadeEvaluator(BaseEvaluator): # Update the details for samples that were evaluated by LLM for i, llm_detail in enumerate(llm_details.values()): + # Add dataset replica index to LLM evaluation result + llm_detail['dataset_replica_idx'] = self.dataset_replica_idx + original_index = failed_indices[i] # Store original rule-based evaluation result rule_result = details[original_index].copy() diff --git a/opencompass/evaluator/generic_llm_evaluator.py b/opencompass/evaluator/generic_llm_evaluator.py index 65db2061..4c101b34 100644 --- a/opencompass/evaluator/generic_llm_evaluator.py +++ b/opencompass/evaluator/generic_llm_evaluator.py @@ -99,7 +99,6 @@ class GenericLLMEvaluator(BaseEvaluator): assert len(predictions) == len( references), 'predictions and references must have the same length' - # import pdb;pdb.set_trace() # -------------- Build Inferencer ---------------- self.build_inferencer() # ---------------- Process Predictions ------------------ diff --git a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py index 52eb7df7..1f605d4e 100644 --- a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py @@ -8,6 +8,8 @@ import numpy as np from datasets import Dataset from scipy.stats import hypergeom +from opencompass.registry import TEXT_POSTPROCESSORS + def compute_pass_at_k(n, c, k): if n - c < k: @@ -39,8 +41,8 @@ def compute_mg_pass_at_k(n, c, k): class BaseEvaluator: - def __init__(self) -> None: - pass + def __init__(self, pred_postprocessor=None) -> None: + self.pred_postprocessor = pred_postprocessor @property def output_dir(self): @@ -86,6 +88,14 @@ class BaseEvaluator: [detail[metric] for detail in details]) return g_passk_details + def pred_postprocess(self, predictions: List) -> List: + if self.pred_postprocessor is None: + return predictions + else: + kwargs = dict(self.pred_postprocessor) + proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type')) + return [proc(pred, **kwargs) for pred in predictions] + def evaluate( self, k: Union[int, List[int]], diff --git a/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py b/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py index 27270b95..f93b5bcf 100644 --- a/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py @@ -1,10 +1,11 @@ import os import random -from typing import List +from typing import List, Optional import evaluate import numpy as np from datasets import Dataset +from mmengine.config import ConfigDict from opencompass.registry import ICL_EVALUATORS @@ -19,12 +20,16 @@ class HuggingfaceEvaluator(BaseEvaluator): seed (int): There exists some randomness during the calculation of some metrics, thus we set a fixed random seed for reproducing. Defaults to 0. + pred_postprocessor (optional): Function or config used to post-process predictions before scoring. """ - def __init__(self, metric: str, seed: int = 0) -> None: + def __init__(self, + metric: str, + seed: int = 0, + pred_postprocessor=None) -> None: self.metric = metric self.seed = seed - super().__init__() + super().__init__(pred_postprocessor=pred_postprocessor) def _preprocess(self, predictions: List, references: List) -> dict: """Preprocess the final predictions and references to needed format. @@ -37,7 +42,7 @@ class HuggingfaceEvaluator(BaseEvaluator): dict: preprocessed results. 
""" return { - 'predictions': predictions, + 'predictions': self.pred_postprocess(predictions), 'references': references, } @@ -92,8 +97,10 @@ class HuggingfaceEvaluator(BaseEvaluator): class AccEvaluator(HuggingfaceEvaluator): """Accuracy evaluator.""" - def __init__(self) -> None: - super().__init__(metric='accuracy') + def __init__(self, + pred_postprocessor: Optional[ConfigDict] = None) -> None: + super().__init__(metric='accuracy', + pred_postprocessor=pred_postprocessor) def _preprocess(self, predictions: List, references: List) -> dict: """Preprocess the final predictions and references to needed format. @@ -187,8 +194,9 @@ class RougeEvaluator(HuggingfaceEvaluator): Note: this evaluator is not suitable for chinese datasets. """ - def __init__(self) -> None: - super().__init__(metric='rouge') + def __init__(self, + pred_postprocessor: Optional[ConfigDict] = None) -> None: + super().__init__(metric='rouge', pred_postprocessor=pred_postprocessor) def _postprocess(self, scores: dict) -> dict: """Postprocess for final scores. @@ -206,8 +214,10 @@ class RougeEvaluator(HuggingfaceEvaluator): class BleuEvaluator(HuggingfaceEvaluator): """Bleu evaluator.""" - def __init__(self) -> None: - super().__init__(metric='sacrebleu') + def __init__(self, + pred_postprocessor: Optional[ConfigDict] = None) -> None: + super().__init__(metric='sacrebleu', + pred_postprocessor=pred_postprocessor) class BleuFloresEvaluator(HuggingfaceEvaluator): diff --git a/opencompass/partitioners/num_worker.py b/opencompass/partitioners/num_worker.py index e916a17d..bcae7279 100644 --- a/opencompass/partitioners/num_worker.py +++ b/opencompass/partitioners/num_worker.py @@ -26,6 +26,7 @@ class NumWorkerPartitioner(BasePartitioner): dataset_size_path (str): The path to the dataset size cache file. keep_keys (list[str]): The keys to be kept from the experiment config to the task config. + force_rebuild (bool): Whether to force rebuild dataset to get size. """ def __init__(self, @@ -35,7 +36,8 @@ class NumWorkerPartitioner(BasePartitioner): min_task_size: int = 16, strategy: str = 'heuristic', dataset_size_path: str = '.cache/dataset_size.json', - keep_keys: Optional[List[str]] = None): + keep_keys: Optional[List[str]] = None, + force_rebuild: bool = False): super().__init__(out_dir=out_dir, keep_keys=keep_keys) if strategy == 'split' and num_worker is not None: self.logger.warning('num_worker is ignored with split.') @@ -44,6 +46,7 @@ class NumWorkerPartitioner(BasePartitioner): self.num_split = num_split or num_worker self.min_task_size = min_task_size self.dataset_size_path = dataset_size_path + self.force_rebuild = force_rebuild assert strategy in ('heuristic', 'split'), \ f'Unsupported partition strategy: {strategy}. '\ 'Supported strategies are: `heuristic`, `split` .' 
@@ -106,7 +109,7 @@ class NumWorkerPartitioner(BasePartitioner): @property def dataset_size(self): if not hasattr(self, '_dataset_size'): - if osp.exists(self.dataset_size_path): + if not self.force_rebuild and osp.exists(self.dataset_size_path): self._dataset_size = mmengine.load(self.dataset_size_path) else: self._dataset_size = {} @@ -130,22 +133,25 @@ class NumWorkerPartitioner(BasePartitioner): def get_size(self, dataset: ConfigDict) -> int: dataset_abbr = dataset_abbr_from_cfg(dataset) test_range = dataset.reader_cfg.get('test_range', '') - if dataset_abbr in self.dataset_size: + # use the cached size unless a rebuild is forced + if not self.force_rebuild and dataset_abbr in self.dataset_size: actual_size = eval('len(range(self.dataset_size[dataset_abbr])' f'{test_range})') return actual_size + # otherwise rebuild the dataset to get its size dataset = build_dataset_from_cfg(dataset) self.dataset_size[dataset_abbr] = len(dataset.test) - mmengine.mkdir_or_exist('.cache/') - mmengine.dump(self.dataset_size, - self.dataset_size_path, - indent=4, - ensure_ascii=False) + # save the size to the cache file + if self.dataset_size_path: + mmengine.mkdir_or_exist('.cache/') + mmengine.dump(self.dataset_size, + self.dataset_size_path, + indent=4, + ensure_ascii=False) actual_size = eval('len(range(self.dataset_size[dataset_abbr])' f'{test_range})') diff --git a/opencompass/utils/logging.py b/opencompass/utils/logging.py index 5250c918..b0dbdcd4 100644 --- a/opencompass/utils/logging.py +++ b/opencompass/utils/logging.py @@ -2,6 +2,8 @@ import logging import os from mmengine.logging import MMLogger +from rich.console import Console +from rich.syntax import Syntax _nameToLevel = { 'CRITICAL': logging.CRITICAL, @@ -79,3 +81,14 @@ class FilterDuplicateMessage(logging.Filter): self.seen.add(record.msg) return True return False + + +def pretty_print_config(cfg): + """Pretty-print a config using the rich library.""" + console = Console() + config_str = cfg.pretty_text + syntax = Syntax(config_str, + 'python', + theme='solarized-dark', + line_numbers=True) + console.print(syntax) diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py index a84d4469..31cabacd 100644 --- a/opencompass/utils/run.py +++ b/opencompass/utils/run.py @@ -150,6 +150,12 @@ def get_config_from_arg(args) -> Config: dataset['meta_path'] = args.custom_dataset_meta_path dataset = make_custom_dataset_config(dataset) datasets.append(dataset) + # apply the requested number of dataset runs + if len(datasets) > 0 and args.dataset_num_runs > 1: + logger.warning(f'--dataset-num-runs is set; every dataset will be evaluated with {args.dataset_num_runs} runs.') + for _dataset in datasets: + logger.warning(f"Changing the number of runs of {_dataset['abbr']} from the default {_dataset.get('n', 1)} to {args.dataset_num_runs}") + _dataset['n'] = args.dataset_num_runs # parse model args if not args.models and not args.hf_path:
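Usage note (not part of the patch): with BaseEvaluator now accepting a pred_postprocessor, any rule-based evaluator can normalize predictions before scoring. A minimal Python sketch of how a dataset config could wire this up, mirroring the GPQA/MMLU cascade configs above (the regex is the one used in this patch):

from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.utils.text_postprocessors import match_answer_pattern

# The config dict is resolved through the TEXT_POSTPROCESSORS registry and
# applied to every prediction in BaseEvaluator.pred_postprocess before
# accuracy is computed.
rule_evaluator = dict(
    type=AccEvaluator,
    pred_postprocessor=dict(
        type=match_answer_pattern,
        answer_pattern=r'(?i)ANSWER\s*:\s*([A-D])',
    ),
)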
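Similarly, a rough sketch of the new config-printing path, assuming pretty_print_config is re-exported from opencompass.utils as the main.py import above implies (the sample config here is hypothetical):

from mmengine.config import Config
from opencompass.utils import pretty_print_config

# cfg.pretty_text is rendered by rich with Python syntax highlighting and
# line numbers, instead of being dumped verbatim into the run log.
cfg = Config(dict(models=[], datasets=[]))
pretty_print_config(cfg)

On the command line this corresponds to passing --config-verbose; --dataset-num-runs N likewise overrides each dataset's default n when the config is resolved.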