From e49fcfd3a35b28df709d43bf598bf9ebded5a269 Mon Sep 17 00:00:00 2001 From: liushz Date: Mon, 25 Nov 2024 15:14:55 +0800 Subject: [PATCH 01/19] [Update] Update MATH dataset with model judge (#1711) * Update math with llm judge * Update math with llm judge * Update math with llm judge * Update math with llm judge * Update math with llm judge --- configs/eval_math_llm_judge_internal.py | 47 +++++++ .../math_0shot_llm_judge_v2_gen_31d777.py | 51 ++++++++ opencompass/datasets/compassbench_obj.py | 29 +++++ opencompass/datasets/gaokao_math.py | 118 +++++++++++++++--- opencompass/utils/model_postprocessors.py | 2 +- 5 files changed, 226 insertions(+), 21 deletions(-) create mode 100644 configs/eval_math_llm_judge_internal.py create mode 100644 opencompass/configs/datasets/math/math_0shot_llm_judge_v2_gen_31d777.py diff --git a/configs/eval_math_llm_judge_internal.py b/configs/eval_math_llm_judge_internal.py new file mode 100644 index 00000000..325eb3a1 --- /dev/null +++ b/configs/eval_math_llm_judge_internal.py @@ -0,0 +1,47 @@ +from mmengine.config import read_base + +with read_base(): + from opencompass.configs.datasets.math.math_0shot_llm_judge_v2_gen_31d777 import math_datasets + + # 选择一个感兴趣的模型 + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import models as qwen2_5_72b_instruct_model + +eval_model_name = 'eval_model_name' +postprocessor_model_name = 'postprocessor_model_name' +eval_model_urls = ['http://0.0.0.0:23333/v1'] +postprocessor_model_urls = ['http://0.0.0.0:23333/v1'] + +datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + + +for dataset in datasets: + dataset['eval_cfg']['evaluator']['model_name'] = eval_model_name + dataset['eval_cfg']['evaluator']['url'] = eval_model_urls + dataset['eval_cfg']['evaluator']['post_url'] = postprocessor_model_urls + dataset['eval_cfg']['evaluator']['post_model_name'] = postprocessor_model_name + + +# -------------Inferen Stage ---------------------------------------- + +from opencompass.runners import LocalRunner +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask + +infer = dict( + partitioner=dict(type=NumWorkerPartitioner, num_worker=8), + runner=dict( + type=LocalRunner, + max_num_workers=8, + task=dict(type=OpenICLInferTask) + ), +) + +eval = dict( + partitioner=dict(type=NaivePartitioner, n=10), + runner=dict( + type=LocalRunner, + max_num_workers=256, + task=dict(type=OpenICLEvalTask) + ), +) diff --git a/opencompass/configs/datasets/math/math_0shot_llm_judge_v2_gen_31d777.py b/opencompass/configs/datasets/math/math_0shot_llm_judge_v2_gen_31d777.py new file mode 100644 index 00000000..b56db5db --- /dev/null +++ b/opencompass/configs/datasets/math/math_0shot_llm_judge_v2_gen_31d777.py @@ -0,0 +1,51 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, GaoKaoMATHEvaluator + +# ----------------------------- Model Eval Parameters ----------------------------- + +naive_model_name = 'dlc_model' # replace with your model name +naive_model_url = ['http://0.0.0.0:23333/v1'] # Multi-apis for accerlation + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], 
output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048), +) + +evaluator = dict( + type=GaoKaoMATHEvaluator, + model_name=naive_model_name, + url=naive_model_url, + language='en', + with_postprocess=True, + post_url=naive_model_url, + post_model_name=naive_model_name, +) + +math_eval_cfg = dict( + evaluator=evaluator, +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/opencompass/datasets/compassbench_obj.py b/opencompass/datasets/compassbench_obj.py index 044b20d9..f2ac4f17 100644 --- a/opencompass/datasets/compassbench_obj.py +++ b/opencompass/datasets/compassbench_obj.py @@ -63,6 +63,35 @@ class CompassBenchObjectiveV1_3(BaseDataset): return dataset +@LOAD_DATASET.register_module() +class CompassBenchObjectiveMath(BaseDataset): + + @staticmethod + def load(path: str): + with open(path, 'r') as infile: + data = [json.loads(line) for line in infile] + for idx in range(len(data)): + item = data[idx] + prefix = '' + if item.get('question_type', + None) and item['question_type'] in [ + 'multiple-answer', '多选题' + ]: + if '_en_' in path: + prefix = 'This question may has multiple answers, \ +please select all correct answers. like this: A, B, C as your final answer\n' + + else: + prefix = '这道题可能有多个正确答案,请选择所有正确的答案,\ +例如:A, B, C 作为你的最终答案\n' + + if item.get('options', None) and len(item['options']) != 0: + item['question'] = prefix + item[ + 'question'] + '\n' + get_number(item['options']) + dataset = Dataset.from_list(data) + return dataset + + @TEXT_POSTPROCESSORS.register_module() def compassbench_objective_v1_3_postprocess(text: str, name) -> str: split = False diff --git a/opencompass/datasets/gaokao_math.py b/opencompass/datasets/gaokao_math.py index 87840b71..a9f79db4 100644 --- a/opencompass/datasets/gaokao_math.py +++ b/opencompass/datasets/gaokao_math.py @@ -12,7 +12,6 @@ from .base import BaseDataset # from opencompass.utils import get_data_path - EVAL_PROMPT = """ 请你作为一个数学高考阅卷专家,判断下面的答案是否与标准答案一致,即考生是否回答正确。下面是一些评判标准: 1. 有些答案可能包含多项内容,可能有单选题,多选题,填空题等,只要答案与标准答案一致即可, 对于多选题和多个空的填空题,需要考生对应的选项或空都回答正确才算正确。 @@ -27,6 +26,42 @@ EVAL_PROMPT = """ 分析: """ # noqa E501 +POST_PROMPT_CN=""" +你是一个乐于助人的助手,任务是从给定的回答句子中提取精确的关键答案。你必须只提供提取的关键答案,不包括任何额外的文字。 +— +我将为你提供一个问题、回答句子和问题类型。回答句子是对所提供问题的回应。利用提供的信息,你必须准确而精确地确定并从回答句子中提取预期的关键答案。请不要对问题发表主观看法。 + +对于单选题,答案应该是选项字母,例如 "A"; +对于多选题,答案应该是一个选项字母的列表,例如 ["A"] 或 ["A", "B", "C"]; +对于填空题,答案应该是一个填入空白处的答案列表,列表的数量应该与问题中的空白数量相同,同一空白的答案可能有多个,请在同一个 string 中用逗号隔开表示,如 ['sqrt(x) 且 x > 10', '1/2, 1/3', '1/4'] 代表问题包含三小问,第一小问包含取值范围信息,第二小问有两个答案,第三小问有一个答案。 +对于解答题,类似填空题,答案应该是一个答案列表,每小问的答案间用逗号隔开,同样需要注意某些小问答案多个的情况。 + +如果回答句子提供了多个不同的答案,请仔细判断后面提供的答案是否是对前面答案的修正或修改。如果是这样,提取这个修正或修改后的答案作为最终答案。相反,如果回答句子在多个答案之间波动而没有明确的最终答案,你应该输出 [No valid answer]。 +— +问题类型: {question_type} +原始问题: {question} +回答: {response} +提取的关键答案: +""" # noqa E501 + +POST_PROMPT_EN=""" +You are a helpful assistant whose task is to extract precise key answers from given response sentences. You must only provide the extracted key answers without any additional text. +— +I will provide you with a question, a response sentence, and the question type. 
The response sentence is a reply to the provided question. Using the provided information, you must accurately and precisely identify and extract the expected key answers from the response sentence. Please do not provide subjective opinions about the question. + +For multiple-choice questions, the answer should be the letter of the option, such as "A". +For multiple-answer questions, the answer should be a list of option letters, such as ["A"] or ["A", "B", "C"]. +For fill-in-the-blank questions, the answer should be a list of answers to fill in the blanks. The number of items in the list should match the number of blanks in the question. If there are multiple answers for the same blank, separate them with a comma within the same string, like ['sqrt(x) and x > 10', '1/2, 1/3', '1/4'], which represents three sub-questions where the first sub-question includes a range, the second sub-question has two answers, and the third sub-question has one answer. +For problem-solving questions, similar to fill-in-the-blank questions, the answer should be a list of answers. Separate answers for different sub-questions with commas, and note that some sub-questions may have multiple answers. + +If the response sentence provides multiple different answers, carefully determine whether a later provided answer is a correction or modification of an earlier answer. If so, extract this corrected or modified answer as the final answer. Conversely, if the response sentence fluctuates between multiple answers without a clear final answer, you should output [No valid answer]. +— +Question type: {question_type} +Question: {question} +Output sentences: {response} +Key extracted answer: +""" # noqa E501 + def extract_boxed_answer(text): match = re.findall(r'\\boxed{(.+?)}', text) @@ -57,7 +92,15 @@ api_meta_template = dict(round=[ @ICL_EVALUATORS.register_module() class GaoKaoMATHEvaluator(BaseEvaluator): - def __init__(self, model_name, url, **kwargs): + def __init__(self, + model_name, + url, + question_type=None, + language='en', + with_postprocess=False, + post_url=[], + post_model_name='', + **kwargs): if isinstance(url, str): url = [url] @@ -68,31 +111,60 @@ class GaoKaoMATHEvaluator(BaseEvaluator): path=model_name, openai_api_base=url, key='EMPTY', - query_per_second=1, + query_per_second=2, meta_template=api_meta_template, - temperature=kwargs.get('temperature', 0.01), + temperature=kwargs.get('temperature', 1e-6), max_seq_len=kwargs.get('max_tokens', 8192), )) for url in url ] + self.question_type = question_type + self.language = language + self.with_postprocess = with_postprocess + self.post_url = post_url + self.post_model_name = post_model_name - def batch_response(self, inputs): - batch_num = len(self.model) + def batch_response(self, models, inputs): + batch_num = len(models) batch_size = (len(inputs) + batch_num - 1) // batch_num result_responses = [] with concurrent.futures.ThreadPoolExecutor( max_workers=batch_num) as executor: futures = [ - executor.submit(self.model[i].generate, + executor.submit(models[i].generate, inputs[i * batch_size:(i + 1) * batch_size]) for i in range(batch_num) ] for response in executor.map(lambda f: f.result(), futures): result_responses.extend(response) - return result_responses - def score(self, predictions, references, origin_prompt): + def postprocess(self, questions, predictions, question_type='None'): + self.post_model = [ + MODELS.build( + dict( + type=OpenAISDK, + path=self.post_model_name, + openai_api_base=url, + key='EMPTY', + query_per_second=2, + 
meta_template=api_meta_template, + temperature=1e-6, + max_seq_len=1024, + )) for url in self.post_url + ] + input_prompts = [] + prompt = POST_PROMPT_EN if self.language == 'en' else POST_PROMPT_CN + for question, response, question_type in zip(questions, predictions, + question_type): + input_prompts.append( + prompt.format(question=question, + response=response, + question_type=question_type)) + result_responses = self.batch_response(self.post_model, input_prompts) + return result_responses + + def score(self, predictions, references, origin_prompt, test_set): if len(predictions) != len(references): return {'error': 'preds and refrs have different length'} questions = [item[0]['prompt'] for item in origin_prompt] @@ -100,13 +172,29 @@ class GaoKaoMATHEvaluator(BaseEvaluator): correct = 0 details = [] results = [] + + if self.with_postprocess: + if self.question_type: + self.question_type = [self.question_type] * len(questions) + # test_set type is huggingface Dataset + elif 'question_type' in test_set.column_names: + self.question_type = test_set['question_type'] + else: + self.question_type = ['问答题'] * len( + questions) if self.language == 'cn' else [ + 'problem-solving' + ] * len(questions) + + predictions = self.postprocess(questions, predictions, + self.question_type) + inputs = [] for pred, ref, ques in zip(predictions, references, questions): inputs.append( EVAL_PROMPT.format(answer=pred, gold_answer=ref, question=ques)) + result_responses = self.batch_response(self.model, inputs) - result_responses = self.batch_response(inputs) results = [ extract_boxed_answer(result) == 'yes' for result in result_responses @@ -132,13 +220,3 @@ class GaoKaoMATHEvaluator(BaseEvaluator): } return detailed_result - - -if __name__ == '__main__': - evaluator = GaoKaoMATHEvaluator('http://0.0.0.0:23333/v1', - temperature=0.01, - max_tokens=2048, - procs=8) - predictions = ['1', '2', '3'] - references = ['1', '2', '3'] - evaluator.score(predictions, references) diff --git a/opencompass/utils/model_postprocessors.py b/opencompass/utils/model_postprocessors.py index 13690ad0..fa0336ee 100644 --- a/opencompass/utils/model_postprocessors.py +++ b/opencompass/utils/model_postprocessors.py @@ -24,7 +24,7 @@ def gen_output_naive(ori_data, extractor): @TEXT_POSTPROCESSORS.register_module('naive') -def navie_model_postprocess(preds: list, +def naive_model_postprocess(preds: list, model_name: str, custom_instruction: str, api_url: Union[str, list], From 5c1916ea4caccf8dad875682ac10c1e6d8efc5c8 Mon Sep 17 00:00:00 2001 From: Chang Lan Date: Mon, 25 Nov 2024 03:35:27 -0800 Subject: [PATCH 02/19] [Update] Add RULER 64k config (#1709) --- configs/datasets/ruler/ruler_64k_gen.py | 28 +++++++++++++++++++ configs/datasets/ruler/ruler_combined_gen.py | 1 + configs/summarizers/groups/ruler.py | 2 +- configs/summarizers/ruler.py | 8 +++++- .../configs/datasets/ruler/ruler_64k_gen.py | 28 +++++++++++++++++++ .../datasets/ruler/ruler_combined_gen.py | 1 + .../configs/summarizers/groups/ruler.py | 2 +- opencompass/configs/summarizers/ruler.py | 8 +++++- 8 files changed, 74 insertions(+), 4 deletions(-) create mode 100644 configs/datasets/ruler/ruler_64k_gen.py create mode 100644 opencompass/configs/datasets/ruler/ruler_64k_gen.py diff --git a/configs/datasets/ruler/ruler_64k_gen.py b/configs/datasets/ruler/ruler_64k_gen.py new file mode 100644 index 00000000..709260d6 --- /dev/null +++ b/configs/datasets/ruler/ruler_64k_gen.py @@ -0,0 +1,28 @@ +from mmengine.config import read_base + +with read_base(): + from .ruler_cwe_gen 
import cwe_datasets as cwe # CWE + from .ruler_fwe_gen import fwe_datasets as fwe # FWE + from .ruler_niah_gen import niah_datasets as niah # Niah + from .ruler_qa_gen import qa_datasets as qa # QA + from .ruler_vt_gen import vt_datasets as vt # VT + + +import_ds = sum((cwe, fwe, niah, qa, vt), []) + +# Evaluation config +NUM_SAMPLES = 100 # Change to the number of samples you need +# Change the context lengths to be tested +max_seq_lens = [1024 * 64] +abbr_suffixs: list[str] = ['64k'] + +ruler_datasets = [] + +# Different seq length +for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs): + for dataset in import_ds: + tmp_dataset = dataset.deepcopy() + tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix + tmp_dataset['num_samples'] = NUM_SAMPLES + tmp_dataset['max_seq_length'] = max_seq_len + ruler_datasets.append(tmp_dataset) diff --git a/configs/datasets/ruler/ruler_combined_gen.py b/configs/datasets/ruler/ruler_combined_gen.py index 0b9ebe19..077c4f10 100644 --- a/configs/datasets/ruler/ruler_combined_gen.py +++ b/configs/datasets/ruler/ruler_combined_gen.py @@ -6,6 +6,7 @@ with read_base(): from .ruler_8k_gen import ruler_datasets as ruler_8k_ds from .ruler_16k_gen import ruler_datasets as ruler_16k_ds from .ruler_32k_gen import ruler_datasets as ruler_32k_ds + from .ruler_64k_gen import ruler_datasets as ruler_64k_ds from .ruler_128k_gen import ruler_datasets as ruler_128k_ds ruler_combined_datasets = sum((v for k, v in locals().items() if k.endswith('_ds')), []) diff --git a/configs/summarizers/groups/ruler.py b/configs/summarizers/groups/ruler.py index 3bac0414..49a76567 100644 --- a/configs/summarizers/groups/ruler.py +++ b/configs/summarizers/groups/ruler.py @@ -13,7 +13,7 @@ default_ruler_tasks = [ 'ruler_qa_squad', 'ruler_qa_hotpotqa', ] -context_window_sizes = ['4k', '8k', '16k', '32k', '128k', '1m'] +context_window_sizes = ['4k', '8k', '16k', '32k', '64k', '128k', '1m'] ruler_summary_groups = [] for context_window_size in context_window_sizes: diff --git a/configs/summarizers/ruler.py b/configs/summarizers/ruler.py index 90da3e4c..cb35ac2e 100644 --- a/configs/summarizers/ruler.py +++ b/configs/summarizers/ruler.py @@ -35,7 +35,12 @@ ruler_32k_summarizer = dict( [v for k, v in locals().items() if k.endswith('_summary_groups')], [] ), ) - +ruler_64k_summarizer = dict( + dataset_abbrs=['ruler_64k'], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], [] + ), +) ruler_128k_summarizer = dict( dataset_abbrs=['ruler_128k'], summary_groups=sum( @@ -56,6 +61,7 @@ ruler_combined_summarizer = dict( 'ruler_8k', 'ruler_16k', 'ruler_32k', + 'ruler_64k', 'ruler_128k', 'ruler_1m', ], diff --git a/opencompass/configs/datasets/ruler/ruler_64k_gen.py b/opencompass/configs/datasets/ruler/ruler_64k_gen.py new file mode 100644 index 00000000..709260d6 --- /dev/null +++ b/opencompass/configs/datasets/ruler/ruler_64k_gen.py @@ -0,0 +1,28 @@ +from mmengine.config import read_base + +with read_base(): + from .ruler_cwe_gen import cwe_datasets as cwe # CWE + from .ruler_fwe_gen import fwe_datasets as fwe # FWE + from .ruler_niah_gen import niah_datasets as niah # Niah + from .ruler_qa_gen import qa_datasets as qa # QA + from .ruler_vt_gen import vt_datasets as vt # VT + + +import_ds = sum((cwe, fwe, niah, qa, vt), []) + +# Evaluation config +NUM_SAMPLES = 100 # Change to the number of samples you need +# Change the context lengths to be tested +max_seq_lens = [1024 * 64] +abbr_suffixs: list[str] = ['64k'] + +ruler_datasets = [] + +# Different 
seq length +for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs): + for dataset in import_ds: + tmp_dataset = dataset.deepcopy() + tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix + tmp_dataset['num_samples'] = NUM_SAMPLES + tmp_dataset['max_seq_length'] = max_seq_len + ruler_datasets.append(tmp_dataset) diff --git a/opencompass/configs/datasets/ruler/ruler_combined_gen.py b/opencompass/configs/datasets/ruler/ruler_combined_gen.py index 0b9ebe19..077c4f10 100644 --- a/opencompass/configs/datasets/ruler/ruler_combined_gen.py +++ b/opencompass/configs/datasets/ruler/ruler_combined_gen.py @@ -6,6 +6,7 @@ with read_base(): from .ruler_8k_gen import ruler_datasets as ruler_8k_ds from .ruler_16k_gen import ruler_datasets as ruler_16k_ds from .ruler_32k_gen import ruler_datasets as ruler_32k_ds + from .ruler_64k_gen import ruler_datasets as ruler_64k_ds from .ruler_128k_gen import ruler_datasets as ruler_128k_ds ruler_combined_datasets = sum((v for k, v in locals().items() if k.endswith('_ds')), []) diff --git a/opencompass/configs/summarizers/groups/ruler.py b/opencompass/configs/summarizers/groups/ruler.py index 3bac0414..49a76567 100644 --- a/opencompass/configs/summarizers/groups/ruler.py +++ b/opencompass/configs/summarizers/groups/ruler.py @@ -13,7 +13,7 @@ default_ruler_tasks = [ 'ruler_qa_squad', 'ruler_qa_hotpotqa', ] -context_window_sizes = ['4k', '8k', '16k', '32k', '128k', '1m'] +context_window_sizes = ['4k', '8k', '16k', '32k', '64k', '128k', '1m'] ruler_summary_groups = [] for context_window_size in context_window_sizes: diff --git a/opencompass/configs/summarizers/ruler.py b/opencompass/configs/summarizers/ruler.py index 90da3e4c..cb35ac2e 100644 --- a/opencompass/configs/summarizers/ruler.py +++ b/opencompass/configs/summarizers/ruler.py @@ -35,7 +35,12 @@ ruler_32k_summarizer = dict( [v for k, v in locals().items() if k.endswith('_summary_groups')], [] ), ) - +ruler_64k_summarizer = dict( + dataset_abbrs=['ruler_64k'], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], [] + ), +) ruler_128k_summarizer = dict( dataset_abbrs=['ruler_128k'], summary_groups=sum( @@ -56,6 +61,7 @@ ruler_combined_summarizer = dict( 'ruler_8k', 'ruler_16k', 'ruler_32k', + 'ruler_64k', 'ruler_128k', 'ruler_1m', ], From 300adc31e86bd70f57356b069badc7246996f8a1 Mon Sep 17 00:00:00 2001 From: Yufeng Zhao <115388472+epsilondylan@users.noreply.github.com> Date: Mon, 25 Nov 2024 20:11:27 +0800 Subject: [PATCH 03/19] [Feature] Add Korbench dataset (#1713) * first version for korbench * first stage for korbench * korbench_1 * korbench_1 * korbench_1 * korbench_1 * korbench_1_revised * korbench_combined_1 * korbench_combined_1 * kor_combined * kor_combined * update --------- Co-authored-by: MaiziXiao --- configs/eval_korbench.py | 9 + .../korbench/korbench_mixed_gen_d00bdd.py | 59 ++ .../korbench/korbench_single_0_shot_gen.py | 60 ++ .../korbench/korbench_single_3_shot_gen.py | 61 ++ .../configs/summarizers/groups/korbench.py | 5 + opencompass/datasets/__init__.py | 1 + opencompass/datasets/korbench/korbench.py | 215 ++++++ .../korbench_dataset_config/config.yaml | 15 + .../korbench_dataset_config/config_wrapper.py | 90 +++ .../prompt/0_shot.yaml | 94 +++ .../prompt/3_shot.yaml | 184 +++++ .../korbench_dataset_config/prompt/mixed.yaml | 22 + .../prompt/self-correction.yaml | 3 + .../korbench_dataset_config/prompt/trick.yaml | 20 + .../datasets/korbench/korbench_utils.py | 699 ++++++++++++++++++ .../icl_evaluator/icl_korbench_evaluator.py | 267 
+++++++ opencompass/utils/datasets_info.py | 10 + 17 files changed, 1814 insertions(+) create mode 100644 configs/eval_korbench.py create mode 100644 opencompass/configs/datasets/korbench/korbench_mixed_gen_d00bdd.py create mode 100644 opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py create mode 100644 opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py create mode 100644 opencompass/configs/summarizers/groups/korbench.py create mode 100644 opencompass/datasets/korbench/korbench.py create mode 100644 opencompass/datasets/korbench/korbench_dataset_config/config.yaml create mode 100644 opencompass/datasets/korbench/korbench_dataset_config/config_wrapper.py create mode 100644 opencompass/datasets/korbench/korbench_dataset_config/prompt/0_shot.yaml create mode 100644 opencompass/datasets/korbench/korbench_dataset_config/prompt/3_shot.yaml create mode 100644 opencompass/datasets/korbench/korbench_dataset_config/prompt/mixed.yaml create mode 100644 opencompass/datasets/korbench/korbench_dataset_config/prompt/self-correction.yaml create mode 100644 opencompass/datasets/korbench/korbench_dataset_config/prompt/trick.yaml create mode 100644 opencompass/datasets/korbench/korbench_utils.py create mode 100644 opencompass/openicl/icl_evaluator/icl_korbench_evaluator.py diff --git a/configs/eval_korbench.py b/configs/eval_korbench.py new file mode 100644 index 00000000..91851c12 --- /dev/null +++ b/configs/eval_korbench.py @@ -0,0 +1,9 @@ +from mmengine import read_base + +with read_base(): + from opencompass.configs.datasets.korbench.korbench_single_0_shot_gen import korbench_0shot_single_datasets as zero_shot_datasets + from opencompass.configs.datasets.korbench.korbench_single_3_shot_gen import korbench_3shot_single_datasets as three_shot_datasets + from opencompass.configs.datasets.korbench.korbench_mixed_gen_d00bdd import korbench_mixed_datasets as mixed_datasets + from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import models as hf_internlm2_5_7b +datasets = zero_shot_datasets + three_shot_datasets + mixed_datasets +models = hf_internlm2_5_7b diff --git a/opencompass/configs/datasets/korbench/korbench_mixed_gen_d00bdd.py b/opencompass/configs/datasets/korbench/korbench_mixed_gen_d00bdd.py new file mode 100644 index 00000000..6447dfe3 --- /dev/null +++ b/opencompass/configs/datasets/korbench/korbench_mixed_gen_d00bdd.py @@ -0,0 +1,59 @@ +from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +korbench_mixed_datasets = [] + +categories = ["Multi-Q", "Multi-R", "Multi-RQ"] # Define available modes for mixed mode + +for category in categories: + # Prompt template + prompt_template = dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role="HUMAN", + prompt="" + ) + ], + round=[ + dict( + role="HUMAN", + prompt="{prompt}" # f-string + ) + ] + ) + ) + + # Reader configuration + reader_cfg = dict( + input_columns=["prompt"], + output_column="answer", + ) + + # Inference configuration + infer_cfg = dict( + prompt_template=prompt_template, + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), + ) + + # Evaluation configuration + eval_cfg = dict( + evaluator=dict(type=korbenchEvaluator), + pred_role="BOT", + ) + + korbench_dataset = dict( + type=korbenchDataset, + 
abbr=f"korbench_mixed_{category}", + path="opencompass/korbench", + category=category, + mode='mixed', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, + ) + + korbench_mixed_datasets.append(korbench_dataset) \ No newline at end of file diff --git a/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py b/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py new file mode 100644 index 00000000..d04c9f60 --- /dev/null +++ b/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py @@ -0,0 +1,60 @@ +from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever + +categories = ["cipher", "counterfactual", "logic", "operation", "puzzle"] + +korbench_0shot_single_datasets = [] + +for category in categories: + # Prompt template + prompt_template = dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role="HUMAN", + prompt="" + ) + ], + round=[ + dict( + role="HUMAN", + prompt="{prompt}" # f-string + ) + ] + ) + ) + + # Reader configuration + reader_cfg = dict( + input_columns=["prompt"], + output_column="answer", + ) + + # Inference configuration + infer_cfg = dict( + prompt_template=prompt_template, + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), + ) + + # Evaluation configuration + eval_cfg = dict( + evaluator=dict(type=korbenchEvaluator), + pred_role="BOT", + ) + + korbench_dataset = dict( + type=korbenchDataset, + abbr=f"korbench_{category}_0shot", + path="opencompass/korbench", + mode='0_shot', + category=category, + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, + ) + + korbench_0shot_single_datasets.append(korbench_dataset) diff --git a/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py b/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py new file mode 100644 index 00000000..0d70f5f8 --- /dev/null +++ b/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py @@ -0,0 +1,61 @@ +from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator + +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever + +categories = ["cipher", "counterfactual", "logic", "operation", "puzzle"] + +korbench_3shot_single_datasets = [] + +for category in categories: + # Prompt template + prompt_template = dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role="HUMAN", + prompt="" + ) + ], + round=[ + dict( + role="HUMAN", + prompt="{prompt}" # f-string + ) + ] + ) + ) + + # Reader configuration + reader_cfg = dict( + input_columns=["prompt"], + output_column="answer", + ) + + # Inference configuration + infer_cfg = dict( + prompt_template=prompt_template, + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), + ) + + # Evaluation configuration + eval_cfg = dict( + evaluator=dict(type=korbenchEvaluator), + pred_role="BOT", + ) + + korbench_dataset = dict( + type=korbenchDataset, + abbr=f"korbench_{category}_3shot", + path="opencompass/korbench", + mode='3_shot', + category=category, + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, + ) + + korbench_3shot_single_datasets.append(korbench_dataset) 
diff --git a/opencompass/configs/summarizers/groups/korbench.py b/opencompass/configs/summarizers/groups/korbench.py new file mode 100644 index 00000000..101fd65d --- /dev/null +++ b/opencompass/configs/summarizers/groups/korbench.py @@ -0,0 +1,5 @@ +korbench_summary_groups = [] +categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle'] +mixed_categories = ['Multi-Q', 'Multi-R', 'Multi-RQ'] +korbench_summary_groups.append({'name': 'korbench_single', 'subsets': [f'korbench_{c}' for c in categories]}) +korbench_summary_groups.append({'name': 'korbench_mixed', 'subsets': [f'korbench_{c}' for c in mixed_categories]}) diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index e96ffc28..ddb70b12 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -65,6 +65,7 @@ from .iwslt2017 import * # noqa: F401, F403 from .jigsawmultilingual import * # noqa: F401, F403 from .jsonl import JsonlDataset # noqa: F401, F403 from .kaoshi import KaoshiDataset, KaoshiEvaluator # noqa: F401, F403 +from .korbench import * # noqa: F401, F403 from .lambada import * # noqa: F401, F403 from .lawbench import * # noqa: F401, F403 from .LCBench import * # noqa: F401, F403 diff --git a/opencompass/datasets/korbench/korbench.py b/opencompass/datasets/korbench/korbench.py new file mode 100644 index 00000000..b0f649e1 --- /dev/null +++ b/opencompass/datasets/korbench/korbench.py @@ -0,0 +1,215 @@ +import os + +from datasets import Dataset + +from opencompass.datasets.korbench.korbench_utils import ( + evaluate_responses, find_file, load_json_or_jsonl, + load_json_or_jsonl_with_idx, load_yaml) +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path + +from ..base import BaseDataset + + +@LOAD_DATASET.register_module() +class korbenchDataset(BaseDataset): + """Dataset loader for the task in KOR-Bench.""" + + @staticmethod + def load(path, mode, category): + """Load the dataset using shared .""" + base_path = get_data_path(path) + rule_file = None + sample_file = None + mixed_file = None + mixed_data = None + if '0_shot' in mode or '3_shot' in mode: + rule_file = find_file(base_path, os.path.join(category, 'rule')) + sample_file = find_file(base_path, + os.path.join(category, 'sample')) + elif mode == 'mixed': + mixed_file = find_file(base_path, os.path.join('mixed', category)) + mixed_data = load_json_or_jsonl(mixed_file) or [] + else: + raise ValueError(f'Unsupported mode: {mode}') + three_shot_file = None + if mode == '3_shot': + ts_path = os.path.join(category, 'three-shot') + three_shot_file = find_file(base_path, ts_path) + # Load data + if mode in ['0_shot', '3_shot']: + rules = load_json_or_jsonl(rule_file) or [] + samples = load_json_or_jsonl(sample_file) or [] + template_path = None + if mode == '0_shot': + template_path = os.path.join( + os.path.dirname(__file__), + 'korbench_dataset_config/prompt/0_shot.yaml') + elif mode == '3_shot': + template_path = os.path.join( + os.path.dirname(__file__), + 'korbench_dataset_config/prompt/3_shot.yaml') + elif mode == 'mixed': + template_path = os.path.join( + os.path.dirname(__file__), + 'korbench_dataset_config/prompt/mixed.yaml') + try: + template = load_yaml(template_path) + except FileNotFoundError: + print(f'[ERROR] Missing prompt template: {template_path}') + return Dataset.from_list([]) + + # Process data + data = [] + if mode == '0_shot': + for sample in samples: + rule_id = 
sample['rule_id'] + rule = next((r for r in rules if r['idx'] == rule_id), None) + if not rule: + print(f"[WARNING] Rule ID {sample['rule_id']} not found." + 'Skipping...') + continue + prompt_key = f'{category}_prompt_format' + prompt = template[prompt_key][0].format( + rule['rule_content'], sample['question']) + + # Add processed item + data.append({ + 'rule_content': rule['rule_content'], + 'question': sample['question'], + 'answer': sample['answer'], + 'prompt': prompt, + 'rule_id': rule['idx'], + 'mode': '0_shot', + 'category': category, + }) + + return Dataset.from_list(data) + + if mode == '3_shot': + data = [] + three_shot = load_json_or_jsonl(three_shot_file) or [] + for sample in samples: + rule_id = sample['rule_id'] + rule = next((r for r in rules if r['idx'] == rule_id), None) + three_shot_qa = [ + item for fs in three_shot if fs['rule_id'] == rule_id + for item in [fs['question'], fs['answer']] + ] + if not rule: + print(f"[WARNING] Rule ID {sample['rule_id']} not found." + 'Skipping...') + continue + prompt_key = f'{category}_prompt_format' + prompt = template[prompt_key][0].format( + rule['rule_content'], *three_shot_qa, sample['question']) + # Add processed item + data.append({ + 'rule_content': rule['rule_content'], + 'question': sample['question'], + 'answer': sample['answer'], + 'prompt': prompt, + 'rule_id': rule['idx'], + 'mode': '3_shot', + 'category': category, + }) + + return Dataset.from_list(data) + + if mode == 'mixed': + # Process data + data = [] + for item in mixed_data: + rule_list = item['rule_list'] + question_list = item['question_list'] + rule_content_list = [] + question_content_list = [] + + # Fetch rules and questions + for rule in rule_list: + category, rule_idx = rule.rsplit('_', 1) + rule_content = load_json_or_jsonl_with_idx(base_path, + os.path.join( + category, + 'rule'), + idx=rule_idx) + rule_content_list.append(rule_content['rule_content']) + + for question in question_list: + category, question_idx = question.rsplit('_', 1) + question_content = load_json_or_jsonl_with_idx( + base_path, + os.path.join(category, 'sample'), + idx=question_idx) + question_content_list.append(question_content['question']) + + # Prepare prompt + rules_str = '\n'.join( + f'Rule {i+1}: {content}' + for i, content in enumerate(rule_content_list)) + questions_str = '\n'.join( + f'Question {i+1}: {content}' + for i, content in enumerate(question_content_list)) + prompt_format = [rules_str, questions_str] + prompt = template['prompt_format'][0].format(*prompt_format) + + # Add processed item + data.append({ + 'rule_list': rule_list, + 'question_list': question_list, + 'prompt': prompt, + 'mode': 'mixed', + 'answer': '', + 'base_path': base_path, + }) + + return Dataset.from_list(data) + + +@ICL_EVALUATORS.register_module() +class korbenchEvaluator(BaseEvaluator): + + def __init__(self): + super().__init__() + + def score(self, predictions, references, test_set): + """Evaluate predictions for a single mode in KOR-Bench.""" + if not test_set: + raise ValueError('Test set is empty.') + + mode = test_set[0]['mode'] # Determine the mode from the first entry + data = {} + + # Organize data for the given mode + for i in range(len(predictions)): + entry = { + 'prediction': predictions[i], + 'gold': references[i], + 'rule_id': test_set[i].get('rule_id', None), + 'category': test_set[i].get('category', None), + 'rule_list': test_set[i].get('rule_list', None), + 'question_list': test_set[i].get('question_list', None), + 'base_path': test_set[i].get('base_path', None), + } + 
data[i] = entry + + if not data: + raise ValueError(f"No data found for mode '{mode}'") + + # Evaluate based on the mode + if mode == '0_shot': + evaluation_results = evaluate_responses(data, '0_shot') + elif mode == '3_shot': + evaluation_results = evaluate_responses(data, '3_shot') + elif mode in ['Multi-Q', 'Multi-R', 'Multi-RQ', 'mixed']: + evaluation_results = evaluate_responses(data, 'mixed', + test_set[0]['base_path']) + else: + raise ValueError(f'Unsupported mode: {mode}') + # Calculate accuracy + correct_count = sum(res['is_correct'] for res in evaluation_results) + accuracy = (correct_count / len(evaluation_results)) * 100 + + # Return scores + return {'accuracy': accuracy} diff --git a/opencompass/datasets/korbench/korbench_dataset_config/config.yaml b/opencompass/datasets/korbench/korbench_dataset_config/config.yaml new file mode 100644 index 00000000..c9e8bef0 --- /dev/null +++ b/opencompass/datasets/korbench/korbench_dataset_config/config.yaml @@ -0,0 +1,15 @@ +# Necessary +response_key: 'response' +error_key: 'error' +id_key: + - 'idx' + - 'step' +prompt_key: 'prompt' + +# Optional +history_key: 'history' +status_key: 'status' + +save_prompt: True +max_tokens: 2000 +max_rounds: 5 diff --git a/opencompass/datasets/korbench/korbench_dataset_config/config_wrapper.py b/opencompass/datasets/korbench/korbench_dataset_config/config_wrapper.py new file mode 100644 index 00000000..13c2caa7 --- /dev/null +++ b/opencompass/datasets/korbench/korbench_dataset_config/config_wrapper.py @@ -0,0 +1,90 @@ +import yaml + + +class ConfigWrapper: + + def __init__(self, config_path): + self._config = {} + with open(config_path, 'r') as file: + self._config = yaml.safe_load(file) + for key, value in self._config.items(): + setattr(self, key, value) + + def __setattr__(self, key, value): + if key.startswith('_'): + super().__setattr__(key, value) + else: + self._config[key] = value + super().__setattr__(key, value) + + def __getattr__(self, key): + if key in self._config: + return self._config[key] + raise AttributeError( + f"'ConfigWrapper' object has no attribute '{key}'") + + def get_id(self, data): + if isinstance(self._config.get('id_key'), str): + return data.get(self._config.get('id_key'), None) + elif isinstance(self._config.get('id_key'), list): + return '_'.join([ + str(data[key]) for key in self._config.get('id_key') + if key in data + ]) + + def print_all_keys(self): + print('config keys:') + for key, value in self._config.items(): + print(f' - {key}: {value}') + + +config_wrapper = None + + +def initialize_config(config_path): + global config_wrapper + config_wrapper = ConfigWrapper(config_path) + + +def get_config_wrapper(): + global config_wrapper + if config_wrapper is None: + raise RuntimeError( + 'ConfigWrapper not initialized. Call initialize_config first.') + return config_wrapper + + +if __name__ == '__main__': + config_path = 'config/config.yaml' + initialize_config(config_path) + data = { + 'idx': + '50', + 'step': + 21, + 'question': + ('Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"\n\n' + 'Please provide the decrypted answer, ' + 'encapsulated in double square brackets. ' + 'For example, the format should be: [[decrypted answer]].'), + 'answer': + '[[P]]', + 'category': + 'Decryption', + 'rule_id': + '23', + 'input': + 'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"', + 'steps_num': + 23, + 'description': + ('For a number c=228 in the ciphertext:\nCalculate z = c^e mod n.' 
+ ' Here ^ means multiplication.\nz is 80.\n' + 'Based on the decimal number represented by z, ' + 'use the ascii code to find the corresponding' + ' letter as the plaintext letter p.\n' + 'Please give the letter p in [[...]] format.\n'), + 'atom': + 80 + } + print(config_wrapper.get_id(data)) diff --git a/opencompass/datasets/korbench/korbench_dataset_config/prompt/0_shot.yaml b/opencompass/datasets/korbench/korbench_dataset_config/prompt/0_shot.yaml new file mode 100644 index 00000000..1caa4144 --- /dev/null +++ b/opencompass/datasets/korbench/korbench_dataset_config/prompt/0_shot.yaml @@ -0,0 +1,94 @@ +cipher_prompt_format: + - | + You are an intelligent assistant that specializes in encryption and decryption tasks. Below are the rules for a specific cipher. When responding, please ensure that your output adheres to the specified encryption and decryption rules and format. + + ### Instructions: + + 1. Identify the relevant properties and objects specified in the rule, including the plaintext, keyword, and ciphertext. + 2. Follow the specified encryption or decryption operations precisely as described in the rules. + 3. Ensure your output is formatted according to the specified notation and symbols. + + ### Cipher Rule: + + {} + + ### Question: + {} + + ### Answer: + +counterfactual_prompt_format: + - | + You are an advanced assistant with expertise in storytelling and rule-based reasoning. Your task is to carefully analyze the provided story, which includes specific rules and details, and use this information to accurately answer related questions. + + ### Instructions: + + 1. Thoroughly review the story to identify and understand the relevant details and rules. + 2. Use the context provided by the story to offer precise and insightful answers. + 3. Ensure your responses align with the rules and information given in the story. + + ### Story Rule: + + {} + + ### Question: + {} + + ### Answer: + +logic_prompt_format: + - | + You are an intelligent assistant that helps with various logical reasoning tasks. Below is a custom-defined rule for a specific type of logic. When responding, please ensure that your output adheres to the specified logical rules and format. + + ### Instructions: + + 1. Identify the relevant properties and objects as specified in the rule. + 2. Apply the given logical operations or reasoning patterns. + 3. Ensure your output is formatted according to the specified notation and symbols. + + ### Logic Rule: + + {} + + ### Question: + {} + + ### Answer: + +operation_prompt_format: + - | + You are an intelligent assistant specializing in evaluating custom operations. Below is a specific rule defined for a custom operation. Your task is to apply this rule accurately to the provided question. + + ### Instructions: + + 1. Carefully read and understand the definitions of the new operations in the rule. + 2. If the question does not specifically ask for it, your answer should be a number or a group of numbers. + 3. Double-check your final answer to ensure it follows the rule accurately. + + ### Operation Rule: + + {} + + ### Question: + {} + + ### Answer: + +puzzle_prompt_format: + - | + You are an intelligent assistant specializing in solving custom puzzle problems. Below is a specific rule defined for a custom puzzle. Your task is to apply this rule accurately to the provided question. + + ### Instructions: + + 1. Thoroughly understand the rule provided. If needed, break down the rule into simpler components or steps. + 2. 
Apply the rule carefully to address the question presented. + 3. Verify your answer to ensure it aligns with the rule and the context of the puzzle. + + ### Puzzle Rule: + + {} + + ### Question: + {} + + ### Answer: diff --git a/opencompass/datasets/korbench/korbench_dataset_config/prompt/3_shot.yaml b/opencompass/datasets/korbench/korbench_dataset_config/prompt/3_shot.yaml new file mode 100644 index 00000000..5de1e6b5 --- /dev/null +++ b/opencompass/datasets/korbench/korbench_dataset_config/prompt/3_shot.yaml @@ -0,0 +1,184 @@ +cipher_prompt_format: + - | + You are an intelligent assistant that specializes in encryption and decryption tasks. Below are the rules for a specific cipher. When responding, please ensure that your output adheres to the specified encryption and decryption rules and format. + + ### Instructions: + + 1. Identify the relevant properties and objects specified in the rule, including the plaintext, keyword, and ciphertext. + 2. Follow the specified encryption or decryption operations precisely as described in the rules. + 3. Ensure your output is formatted according to the specified notation and symbols. + + ### Cipher Rule: + + {} + + ### Question: + {} + + ### Answer: + {} + + ### Question: + {} + + ### Answer: + {} + + ### Question: + {} + + ### Answer: + {} + + ### Question: + {} + + ### Answer: + +counterfactual_prompt_format: + - | + You are an advanced assistant with expertise in storytelling and rule-based reasoning. Your task is to carefully analyze the provided story, which includes specific rules and details, and use this information to accurately answer related questions. + + ### Instructions: + + 1. Thoroughly review the story to identify and understand the relevant details and rules. + 2. Use the context provided by the story to offer precise and insightful answers. + 3. Ensure your responses align with the rules and information given in the story. + + ### Story Rule: + + {} + + ### Question: + {} + + ### Answer: + {} + + ### Question: + {} + + ### Answer: + {} + + ### Question: + {} + + ### Answer: + {} + + ### Question: + {} + + ### Answer: + +logic_prompt_format: + - | + You are an intelligent assistant that helps with various logical reasoning tasks. Below is a custom-defined rule for a specific type of logic. When responding, please ensure that your output adheres to the specified logical rules and format. + + ### Instructions: + + 1. Identify the relevant properties and objects as specified in the rule. + 2. Apply the given logical operations or reasoning patterns. + 3. Ensure your output is formatted according to the specified notation and symbols. + + ### Logic Rule: + + {} + + ### Question: + {} + + ### Answer: + {} + + ### Question: + {} + + ### Answer: + {} + + ### Question: + {} + + ### Answer: + {} + + ### Question: + {} + + ### Answer: + +operation_prompt_format: + - | + You are an intelligent assistant specializing in evaluating custom operations. Below is a specific rule defined for a custom operation. Your task is to apply this rule accurately to the provided question. + + ### Instructions: + + 1. Carefully read and understand the definitions of the new operations in the rule. + 2. If the question does not specifically ask for it, your answer should be a number or a group of numbers. + 3. Double-check your final answer to ensure it follows the rule accurately. 
+ + ### Operation Rule: + + {} + + ### Question: + {} + + ### Answer: + {} + + ### Question: + {} + + ### Answer: + {} + + ### Question: + {} + + ### Answer: + {} + + ### Question: + {} + + ### Answer: + +puzzle_prompt_format: + - | + You are an intelligent assistant specializing in solving custom puzzle problems. Below is a specific rule defined for a custom puzzle. Your task is to apply this rule accurately to the provided question. + + ### Instructions: + + 1. Thoroughly understand the rule provided. If needed, break down the rule into simpler components or steps. + 2. Apply the rule carefully to address the question presented. + 3. Verify your answer to ensure it aligns with the rule and the context of the puzzle. + + ### Puzzle Rule: + + {} + + ### Question: + {} + + ### Answer: + {} + + ### Question: + {} + + ### Answer: + {} + + ### Question: + {} + + ### Answer: + {} + + ### Question: + {} + + ### Answer: diff --git a/opencompass/datasets/korbench/korbench_dataset_config/prompt/mixed.yaml b/opencompass/datasets/korbench/korbench_dataset_config/prompt/mixed.yaml new file mode 100644 index 00000000..d13cbb5b --- /dev/null +++ b/opencompass/datasets/korbench/korbench_dataset_config/prompt/mixed.yaml @@ -0,0 +1,22 @@ +prompt_format: + - | + You are an intelligent assistant capable of handling all types of reasoning and problem-solving tasks. Below is the text of a set of rules. Your task is to apply the appropriate rules to solve a series of problems. + + ### Instructions: + 1. Read each question carefully and rules to find something relevant to that question. + 2. Use the relevant rules to answer each question accurately. + 3. Provide the final answers to all questions in JSON format. + {{ + "question1": "your answer", + "question2": "your answer", + "question3": "your answer", + }} + + ### Rules: + + {} + + ### Questions: + {} + + ### Answers: diff --git a/opencompass/datasets/korbench/korbench_dataset_config/prompt/self-correction.yaml b/opencompass/datasets/korbench/korbench_dataset_config/prompt/self-correction.yaml new file mode 100644 index 00000000..36a7b0a9 --- /dev/null +++ b/opencompass/datasets/korbench/korbench_dataset_config/prompt/self-correction.yaml @@ -0,0 +1,3 @@ +prompt_format: + - | + Your answer is incorrect, please check your answer and provide a correct one. diff --git a/opencompass/datasets/korbench/korbench_dataset_config/prompt/trick.yaml b/opencompass/datasets/korbench/korbench_dataset_config/prompt/trick.yaml new file mode 100644 index 00000000..a415c916 --- /dev/null +++ b/opencompass/datasets/korbench/korbench_dataset_config/prompt/trick.yaml @@ -0,0 +1,20 @@ +prompt_format: + - | + You are an intelligent assistant specializing in solving custom puzzle problems. Below is a specific rule defined for a custom puzzle. Your task is to apply this rule accurately to the provided question. + + ### Instructions: + + 1. Thoroughly understand the rule provided. If needed, break down the rule into simpler components or steps. + 2. Apply the rule carefully to address the question presented. + 3. Verify your answer to ensure it aligns with the rule and the context of the puzzle. 
+ + ### Puzzle Rule: + + {} + + ### Question: + {} + + {} + + ### Answer: diff --git a/opencompass/datasets/korbench/korbench_utils.py b/opencompass/datasets/korbench/korbench_utils.py new file mode 100644 index 00000000..8b59766a --- /dev/null +++ b/opencompass/datasets/korbench/korbench_utils.py @@ -0,0 +1,699 @@ +import json +import os +import re + +import sympy as sp +import yaml +from sympy.parsing.latex import parse_latex + + +def load_yaml(yaml_path): + """Load a YAML file.""" + if not os.path.exists(yaml_path): + raise FileNotFoundError(f'YAML file not found: {yaml_path}') + with open(yaml_path, 'r', encoding='utf-8') as file: + return yaml.safe_load(file) + + +def load_json_or_jsonl(file_path): + """Load data from a JSON or JSONL file.""" + if not os.path.exists(file_path): + return None + with open(file_path, 'r', encoding='utf-8') as file: + if file_path.endswith('.json'): + return json.load(file) + elif file_path.endswith('.jsonl'): + return [json.loads(line) for line in file] + return None + + +def find_file(base_path, sub_path, extensions=('json', 'jsonl')): + """Find the first available file with given extensions.""" + for ext in extensions: + file_path = os.path.join(base_path, f'{sub_path}.{ext}') + if os.path.exists(file_path): + return file_path + return None + + +def load_json_or_jsonl_with_idx(data_path, split='', idx=None): + base_path = os.path.join(data_path, split) + if os.path.exists(f'{base_path}.json'): + file_path = f'{base_path}.json' + elif os.path.exists(f'{base_path}.jsonl'): + file_path = f'{base_path}.jsonl' + elif base_path.endswith('.json') or base_path.endswith('.jsonl'): + file_path = base_path + else: + raise FileNotFoundError('No JSON or JSONL file found.') + + with open(file_path, 'r', encoding='utf-8') as file: + if file_path.endswith('.json'): + data = json.load(file) + elif file_path.endswith('.jsonl'): + data = [json.loads(line) for line in file] + + if idx is not None: + try: + return next(item for item in data if item.get('idx') == idx) + except StopIteration: + raise ValueError(f'No entry found for idx {idx}') + else: + return data + + +def load_split_data(base_path, split_name): + """Load the rule and sample data for a specific split.""" + split_path = os.path.join(base_path, split_name) + rule_path = find_file(split_path, 'rule') + sample_path = find_file(split_path, 'sample') + + rules = load_json_or_jsonl(rule_path) if rule_path else [] + samples = load_json_or_jsonl(sample_path) if sample_path else [] + + return {'rules': rules, 'samples': samples} + + +def process_mixed_data(base_path, mode): + """Load and process data for the 'mixed' split and specific mode.""" + mixed_path = os.path.join(base_path, 'mixed') + file_path = find_file(mixed_path, mode) + if not file_path: + print(f'[WARNING] Missing file for mixed mode: {mode}') + return [] + + data = load_json_or_jsonl(file_path) + template_path = os.path.join(base_path, 'config/prompt/mixed.yaml') + template = load_yaml(template_path) + + processed = [] + for item in data: + rules = '\n'.join(item.get('rule_list', [])) + questions = '\n'.join(item.get('question_list', [])) + item['prompt'] = template['prompt_format'][0].format(rules, questions) + processed.append(item) + + return processed + + +class ConfigWrapper: + + def __init__(self, config_path): + self._config = {} + with open(config_path, 'r') as file: + self._config = yaml.safe_load(file) + for key, value in self._config.items(): + setattr(self, key, value) + + def __setattr__(self, key, value): + if key.startswith('_'): + 
super().__setattr__(key, value) + else: + self._config[key] = value + super().__setattr__(key, value) + + def __getattr__(self, key): + if key in self._config: + return self._config[key] + raise AttributeError( + f"'ConfigWrapper' object has no attribute '{key}'") + + def get_id(self, data): + if isinstance(self._config.get('id_key'), str): + return data.get(self._config.get('id_key'), None) + elif isinstance(self._config.get('id_key'), list): + return '_'.join([ + str(data[key]) for key in self._config.get('id_key') + if key in data + ]) + + def print_all_keys(self): + print('config keys:') + for key, value in self._config.items(): + print(f' - {key}: {value}') + + +config_wrapper = None + + +def initialize_config(config_path): + global config_wrapper + config_wrapper = ConfigWrapper(config_path) + + +def get_config_wrapper(): + global config_wrapper + if config_wrapper is None: + raise RuntimeError( + 'ConfigWrapper not initialized. Call initialize_config first.') + return config_wrapper + + +if __name__ == '__main__': + config_path = 'config/config.yaml' + initialize_config(config_path) + data = { + 'idx': + '50', + 'step': + 21, + 'question': + ('Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"\n\n' + 'Please provide the decrypted answer, encapsulated in double ' + 'square brackets. ' + 'For example, the format should be: [[decrypted answer]].'), + 'answer': + '[[P]]', + 'category': + 'Decryption', + 'rule_id': + '23', + 'input': + 'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"', + 'steps_num': + 23, + 'description': + ('For a number c=228 in the ciphertext:\n' + 'Calculate z = c^e mod n. Here ^ means multiplication.\n' + 'z is 80.\nBased on the decimal number represented by z, ' + 'use the ascii code to find the corresponding letter ' + 'as the plaintext letter p.\n' + 'Please give the letter p in [[...]] format.\n'), + 'atom': + 80 + } + print(config_wrapper.get_id(data)) + + +def read_yaml(config='default'): + if os.path.exists(f'config/prompt/{config}.yaml'): + yaml_file = f'config/prompt/{config}.yaml' + else: + yaml_file = config + with open(yaml_file, 'r') as yaml_file: + return yaml.safe_load(yaml_file) + + +def write_jsonl_lines(file, data): + config_wrapper = get_config_wrapper() + if config_wrapper.save_prompt: + json.dump(data, file, ensure_ascii=False) + else: + data.pop(config_wrapper.prompt_key) + json.dump(data, file, ensure_ascii=False) + file.write('\n') + file.flush() + + +def print_info(info): + print('-' * 100) + print('[INFO] model_name:', info['model_name']) + print('[INFO] splits:', info['splits']) + print('[INFO] modes:', info['modes']) + print('[INFO] output_dir:', info['output_dir']) + print('[INFO] Infer Limit:', + 'No limit' if info['infer_limit'] is None else info['infer_limit']) + print('[INFO] Number of Workers:', info['num_workers']) + print('[INFO] Batch Size:', info['batch_size']) + print('[INFO] Use Accel:', info['use_accel']) + print('-' * 100) + + +def read_json_or_jsonl(data_path, split='', mapping_key=None): + base_path = os.path.join(data_path, split) + if os.path.exists(f'{base_path}.json'): + file_path = f'{base_path}.json' + elif os.path.exists(f'{base_path}.jsonl'): + file_path = f'{base_path}.jsonl' + elif base_path.endswith('.json') or base_path.endswith('.jsonl'): + file_path = base_path + else: + raise FileNotFoundError('No JSON or JSONL file found.') + + with open(file_path, 'r') as file: + if file_path.endswith('.json'): + data = json.load(file) + elif file_path.endswith('.jsonl'): + data = [json.loads(line) for line in file] + + 
if mapping_key: + return { + item[mapping_key]: item + for item in data if mapping_key in item + } + else: + return data + + +def read_json_or_jsonl_with_idx(data_path, split='', idx=None): + base_path = os.path.join(data_path, split) + if os.path.exists(f'{base_path}.json'): + file_path = f'{base_path}.json' + elif os.path.exists(f'{base_path}.jsonl'): + file_path = f'{base_path}.jsonl' + elif base_path.endswith('.json') or base_path.endswith('.jsonl'): + file_path = base_path + else: + raise FileNotFoundError('No JSON or JSONL file found.') + + with open(file_path, 'r', encoding='utf-8') as file: + if file_path.endswith('.json'): + data = json.load(file) + elif file_path.endswith('.jsonl'): + data = [json.loads(line) for line in file] + + if idx is not None: + try: + return next(item for item in data if item.get('idx') == idx) + except StopIteration: + raise ValueError(f'No entry found for idx {idx}') + else: + return data + + +idx_ranges = [ + [18], + [73, 74, 77], + [94], + [115, 116, 117], + [121, 122, 123, 125], + [131, 132, 134, 135, 136], + [141, 143, 149], + list(range(145, 148)), + list(range(151, 157)), + [160, 161, 162], + [164, 165, 166], + [170], + [206, 209], + list(range(211, 216)), + [217, 218], +] + + +def clean_json_string(json_str): + json_str = re.sub(r'[\x00-\x1F\x7F]', '', json_str) + return json_str + + +def is_in_idx_ranges(idx, idx_ranges): + for range_list in idx_ranges: + if int(idx) in range_list: + return True + return False + + +def extract_json(text): + matches = re.findall(r'{.*}', text, re.DOTALL) + if matches: + json_str = matches[-1] + json_str = clean_json_string(json_str) + try: + data = json.loads(json_str) + return data + except json.JSONDecodeError as e: + print(f'Error decoding JSON: {e}') + return 'NULL' + return 'NULL' + + +def extract_all_responses_from_json(response_json): + results = [] + for key, value in response_json.items(): + results.append(str(value)) + return results + + +def clean_latex(latex_expr): + if '=' in latex_expr: + latex_expr = latex_expr.rsplit('=', 1)[1] + latex_expr = re.sub(r'\\[()\[\]]', '', latex_expr) + latex_expr = re.sub(r'\\text\{.*?\}', '', latex_expr) + latex_expr = re.sub(r'\\(left|right|displaystyle)', '', latex_expr) + latex_expr = latex_expr.replace('\\\\', '\\') + return latex_expr + + +def extract_text_from_brackets(text, clean_level='basic'): + matches = re.findall(r'\[\[\s*(.*?)\s*\]\]', text, re.DOTALL) + if not matches: + matches = re.findall(r'\$\\boxed\{(.*?)\}\$', text, re.DOTALL) + if not matches: + matches = re.findall(r'\[\s*(.*?)\s*\]', text, re.DOTALL) + if matches: + match_str = matches[0].strip() + if clean_level == 'clean': + match_str = match_str.replace('"', '').replace('\n', '').replace( + ' ', '').replace('[', '').replace(']', '') + elif clean_level == 'logic': + match_str = match_str.replace('"', '').replace('\n', '').replace( + ' ', '').replace('.', '') + elif clean_level == 'math': + match_str = match_str.replace('"', '').replace('\n', '').replace( + '[', '').replace(']', '').replace('$', '') + return f'{clean_latex(match_str)}' + return f'[[{match_str}]]' + return 'NULL' + + +def extract_inner_text_from_brackets(text): + if not isinstance(text, str): + print(f'text type: {type(text)}, text value: {text}') + return 'NULL' + match = re.search(r'\[\[(.*?)\]\]', text, re.DOTALL) + return match.group(1) if match else 'NULL' + + +def extract_numbers(str): + numbers = re.findall(r'\d+', str) + numbers = list(map(int, numbers)) + return numbers + + +def 
extract_and_sort_inequalities(latex_expr): + pattern = r'(≥|≤)\s*([-]?\d+\.?\d*)' + matches = re.findall(pattern, latex_expr) + extracted_inequalities = [''.join(match) for match in matches] + sorted_inequalities = sorted(extracted_inequalities) + return sorted_inequalities + + +def rule5_normalize_content(content): + parts = [part for part in content.split(';')] + sorted_parts = sorted(parts) + return sorted_parts + + +def normalize_string(s): + s = re.sub(r'[^0-9]', '', s) + pairs = s.split(',') + pairs.sort() + return pairs + + +def remove_commas_and_spaces(s): + return re.sub(r'[,\s\[\]]+', '', s) + + +def remove_non_alphanumeric(s): + return re.sub(r'\W+', '', s) + + +def contains_or(answer): + return 'or' in answer + + +def compare_multi_results(response, answer): + try: + response_text = extract_text_from_brackets(response, 'clean') + response_text = re.sub(r'\\text\{or\}', 'or', response_text) + if response_text == 'NULL': + return False + answer = extract_text_from_brackets(answer, 'clean') + response_split = response_text.strip('[[]]').split('or') + answer_split = answer.strip('[[]]').split('or') + response_sorted = sorted([x.strip() for x in response_split]) + answer_sorted = sorted([x.strip() for x in answer_split]) + return response_sorted == answer_sorted + except Exception as e: + print(f'Error during comparison: {e}') + return False + + +def split_or_expression(expression): + return [part.strip() for part in expression.split('or')] + + +def compare_math_expressions(response, answer): + response_text = extract_text_from_brackets(response, 'math') + answer_text = extract_text_from_brackets(answer, 'math') + if response_text == 'NULL': + return False + if contains_or(answer_text): + response_parts = split_or_expression(response_text) + answer_parts = split_or_expression(answer_text) + try: + response_exprs = { + sp.simplify(parse_latex(part)) + for part in response_parts + } + answer_exprs = { + sp.simplify(parse_latex(part)) + for part in answer_parts + } + return response_exprs == answer_exprs + except Exception as e: + print(f'Error during simplification or parsing: {e}') + return response_text == answer_text + else: + try: + response_expr = sp.simplify(parse_latex(response_text)) + answer_expr = sp.simplify(parse_latex(answer_text)) + return response_expr == answer_expr + except Exception as e: + print(f'Error during simplification or parsing: {e}') + return response_text == answer_text + + +def method_equal(response_text, answer): + return response_text == answer + + +def method_1(response_text, answer): + cleaned_string = re.sub(r'[^A-Za-z]', '', response_text) + cleaned_string = cleaned_string.lower() + answer = re.sub(r'[^A-Za-z]', '', answer) + answer = answer.lower() + return cleaned_string == answer + + +def method_2(response_text, answer): + cleaned_string = re.sub(r'[^A-Za-z]', '', response_text) + cleaned_string = cleaned_string.lower() + answer = answer.split(',') + return cleaned_string in answer + + +def method_3(response_text, answer): + response_text = response_text.lower() + pairs1 = re.split(r'\W+', response_text) + pairs2 = answer.split(' ') + pairs1 = [word for word in pairs1 if word] + pairs1.sort() + pairs2.sort() + return pairs1 == pairs2 + + +def method_4(response_text, answer): + cleaned_string = re.sub(r'[^A-Za-z]', '', response_text) + cleaned_string = cleaned_string.lower() + return cleaned_string in answer + + +def method_5(response_text, answer): + response_text = re.sub(r'\s+', '', response_text) + response_text = response_text.split(',') + 
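+    # Illustrative check (hypothetical values): a response of '3, 1,2' is
+    # stripped of whitespace and split above, an answer of '1,2,3' is split
+    # below; after sorting both become ['1', '2', '3'], so the order of the
+    # listed items does not affect the match.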
answer = answer.split(',') + response_text.sort() + answer.sort() + return response_text == answer + + +def method_9(response_text, answer): + response_text = response_text.replace('×', '*').replace('−', '-') + answer = answer.replace('×', '*').replace('−', '-') + + def extract_operators(s): + return re.findall(r'[+\-*/]', s) + + response_ops = extract_operators(response_text.split('=')[0]) + answer_ops = extract_operators(answer.split('=')[0]) + if response_ops != answer_ops: + return False + match = re.search(r'=\s*(-?\d+)', answer) + expected_result = int(match.group(1)) + try: + left_side = response_text.split('=')[0] + result = eval(left_side) + except Exception as e: + print(f'Error during evaluation: {e}') + return False + return result == expected_result + + +def method_10(response_text, answer): + response_text = response_text.replace('×', '*').replace('−', '-') + response_text = response_text.split('=')[0] + answer = answer.split('\n')[0].split('=')[0] + response_ops = sorted(remove_non_alphanumeric(response_text)) + answer_ops = sorted(remove_non_alphanumeric(answer)) + if response_ops != answer_ops: + return False + try: + result = eval(response_text) + except Exception as e: + print(f'Error during evaluation: {e}') + return False + return result == 24 + + +def method_18(response_text, answer): + cleaned_s1 = remove_commas_and_spaces(response_text) + cleaned_s2 = remove_commas_and_spaces(answer) + return cleaned_s1 == cleaned_s2 + + +def method_general(response_text, answer): + cleaned_s1 = remove_non_alphanumeric(response_text) + cleaned_s2 = remove_non_alphanumeric(answer) + return cleaned_s1 == cleaned_s2 + + +question_methods = { + '1': method_1, + '2': method_2, + '3': method_3, + '4': method_4, + '5': method_5, + '9': method_9, + '10': method_10, + '18': method_18, +} + + +def evaluate_response_vs_answer(response, answer, question_type, rule_id, idx): + if question_type == 'logic' and rule_id == '5': + response_text = extract_text_from_brackets(response, 'logic') + answer_text = extract_text_from_brackets(answer, 'logic') + if response_text is None: + return False + normalized_response = rule5_normalize_content(response_text) + normalized_answer = rule5_normalize_content(answer) + return normalized_response == normalized_answer + elif question_type == 'logic': + response_text = extract_text_from_brackets(response, 'logic') + answer_text = extract_text_from_brackets(answer, 'logic') + return response_text == answer_text + elif question_type == 'operation' and (idx == '178' or idx == '179'): + response_text = extract_text_from_brackets(response, 'clean') + response_text = extract_and_sort_inequalities(response_text) + answer_text = extract_and_sort_inequalities(answer) + # print(response_text, answer_text) + return response_text == answer_text + elif question_type == 'operation' and rule_id == '18': + response_text = extract_text_from_brackets(response, 'clean') + answer = extract_inner_text_from_brackets(answer) + response_text = ''.join(sorted(re.sub(r'\W+', '', response_text))) + answer = ''.join(sorted(re.sub(r'\W+', '', answer))) + return response_text == answer + elif question_type == 'operation' and rule_id in {'23', '24', '25'}: + response_text = extract_text_from_brackets(response, 'clean') + if response_text is None: + return False + response_text = extract_numbers(response_text) + answer_text = extract_numbers(answer) + return response_text == answer_text + elif question_type == 'operation' and is_in_idx_ranges(idx, idx_ranges): + return 
compare_math_expressions(response, answer) + elif question_type == 'operation' and contains_or(answer): + return compare_multi_results(response, answer) + elif question_type == 'puzzle': + response_text = extract_inner_text_from_brackets(response) + answer = extract_inner_text_from_brackets(answer) + method = question_methods.get(rule_id) + if method: + return method(response_text, answer) + return method_general(response_text, answer) + else: + response_text = extract_text_from_brackets(response, 'clean') + return response_text == answer + + +def compute_one_mixed_question_pass_rate(idx, + question_list, + response_json, + base_path=None): + if response_json == 'NULL': + result_dict = { + 'idx': idx, + 'response': response_json, + 'details': None, + 'pass_rate': 0, + 'is_correct': False + } + return result_dict + response_list = extract_all_responses_from_json(response_json) + correct_num = 0 + results = [] + for q_idx, question in enumerate(question_list): + category, question_idx = question.rsplit('_', 1) + question_content = load_json_or_jsonl_with_idx(base_path, + os.path.join( + category, 'sample'), + idx=question_idx) + answer = question_content['answer'] + if q_idx >= len(response_list): + break + response = response_list[q_idx] + response_text = extract_text_from_brackets(response) + rule_id = question_content['rule_id'] + is_correct = evaluate_response_vs_answer(response, answer, category, + rule_id, q_idx) + if is_correct: + correct_num += 1 + results.append({ + 'question': question, + 'response_text': response_text, + 'answer': answer, + 'is_correct': is_correct + }) + + pass_rate = correct_num / len(question_list) + question_correct = pass_rate == 1.0 + result_dict = { + 'idx': idx, + 'response': response_json, + 'details': results, + 'pass_rate': pass_rate, + 'is_correct': question_correct + } + return result_dict + + +def evaluate_responses(data, mode, base_path=None): + results = [] + + # Iterate over the values of the dictionary (numerical keys) + for key, record in data.items(): + idx = key # Use the dictionary key as the "idx" + response = record.get('prediction', '') + question_type = record.get('category', '') + if mode == 'mixed': + question_list = record.get('question_list') + response_json = extract_json(response) + result_dict = compute_one_mixed_question_pass_rate( + idx, question_list, response_json, base_path) + results.append(result_dict) + else: + response_text = extract_text_from_brackets(response) + answer = record.get('gold', '') + rule_id = record.get('rule_id', '') + is_correct = evaluate_response_vs_answer(response, answer, + question_type, rule_id, + idx) + result_dict = { + 'idx': idx, + 'response': response, + 'response_text': response_text, + 'answer': answer, + 'is_correct': is_correct + } + if question_type == 'counterfactual': + real_life_answer = record.get('real_life_answer', '') + is_real_life = evaluate_response_vs_answer( + response, real_life_answer, question_type, rule_id, idx) + result_dict['real_life_answer'] = real_life_answer + result_dict['is_real_life'] = is_real_life + if question_type == 'cipher' and mode == 'subquestions': + result_dict['type'] = record.get('type', '') + results.append(result_dict) + return results diff --git a/opencompass/openicl/icl_evaluator/icl_korbench_evaluator.py b/opencompass/openicl/icl_evaluator/icl_korbench_evaluator.py new file mode 100644 index 00000000..f51ca40f --- /dev/null +++ b/opencompass/openicl/icl_evaluator/icl_korbench_evaluator.py @@ -0,0 +1,267 @@ +# flake8: noqa +"""KOR-Bench Evaluator.""" 
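+# Expected input shapes for korbenchEvaluator.score (illustrative, hypothetical
+# values):
+#   predictions: raw model outputs, e.g. ['...reasoning...\n[[A]]']
+#   references:  per-sample dicts,   e.g. [{'answer': '[[A]]', 'rule_id': '1'}]
+# score() pairs the two lists by position and returns one result dict per
+# sample.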
+ +import json +import os +import re + +from .icl_base_evaluator import BaseEvaluator + + +def read_json_or_jsonl(data_path, split='', mapping_key=None): + base_path = os.path.join(data_path, split) + if os.path.exists(f'{base_path}.json'): + file_path = f'{base_path}.json' + elif os.path.exists(f'{base_path}.jsonl'): + file_path = f'{base_path}.jsonl' + elif base_path.endswith('.json') or base_path.endswith('.jsonl'): + file_path = base_path + else: + raise FileNotFoundError('No JSON or JSONL file found.') + + with open(file_path, 'r') as file: + if file_path.endswith('.json'): + data = json.load(file) + elif file_path.endswith('.jsonl'): + data = [json.loads(line) for line in file] + + if mapping_key: + return { + item[mapping_key]: item + for item in data if mapping_key in item + } + else: + return data + + +def read_json_or_jsonl_with_idx(data_path, split='', idx=None): + base_path = os.path.join(data_path, split) + if os.path.exists(f'{base_path}.json'): + file_path = f'{base_path}.json' + elif os.path.exists(f'{base_path}.jsonl'): + file_path = f'{base_path}.jsonl' + elif base_path.endswith('.json') or base_path.endswith('.jsonl'): + file_path = base_path + else: + raise FileNotFoundError('No JSON or JSONL file found.') + + with open(file_path, 'r', encoding='utf-8') as file: + if file_path.endswith('.json'): + data = json.load(file) + elif file_path.endswith('.jsonl'): + data = [json.loads(line) for line in file] + + if idx is not None: + try: + return next(item for item in data if item.get('idx') == idx) + except StopIteration: + raise ValueError(f'No entry found for idx {idx}') + else: + return data + + +class korbenchEvaluator(BaseEvaluator): + """Evaluator class for KOR-Bench tasks, inheriting from BaseEvaluator. + + This class implements the `score` method to evaluate the model's + predictions against the reference answers, using the evaluation logic + specific to KOR-Bench. + """ + + def __init__(self, question_type, mode): + """Initialize the evaluator with question type and mode. + + Args: + question_type (str): Type of questions (e.g., 'logic', 'operation', 'puzzle'). + mode (str): Evaluation mode (e.g., 'zero-shot', 'self-correction'). + """ + super().__init__() + self.question_type = question_type + self.mode = mode + + # Predefined index ranges for special evaluation cases + self.idx_ranges = [ + [18], + [73, 74, 77], + [94], + [115, 116, 117], + [121, 122, 123, 125], + [131, 132, 134, 135, 136], + [141, 143, 149], + list(range(145, 148)), + list(range(151, 157)), + [160, 161, 162], + [164, 165, 166], + [170], + [206, 209], + list(range(211, 216)), + [217, 218], + ] + + def score(self, predictions, references): + """Evaluates the model's predictions against the references. + + Args: + predictions (list): List of model predictions. + references (list): List of reference answers (each reference is a dict). + + Returns: + list: Evaluation results for each prediction. 
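+                Each entry is a dict of the form (illustrative values)
+                {'idx': '0', 'response': '...', 'response_text': 'A',
+                 'answer': '[[A]]', 'is_correct': True}.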
+ """ + if len(predictions) != len(references): + return { + 'error': 'Predictions and references have different lengths' + } + + data = [] + for idx, (prediction, + reference) in enumerate(zip(predictions, references)): + record = { + 'idx': str(idx), + 'response': prediction, + 'answer': reference.get('answer'), + 'rule_id': reference.get('rule_id'), + 'question_type': self.question_type, + # Include other necessary fields from reference if needed + } + data.append(record) + + results = self.evaluate_responses(data, self.question_type, self.mode) + return results + + def evaluate_responses(self, data, question_type, mode): + """Evaluates a list of responses. + + Args: + data (list): List of records containing responses and answers. + question_type (str): Type of questions. + mode (str): Evaluation mode. + + Returns: + list: List of evaluation results. + """ + results = [] + for record in data: + idx = record.get('idx') + response = record.get('response') + answer = record.get('answer') + rule_id = record.get('rule_id') + + response_text = self.extract_text_from_brackets(response) + is_correct = self.evaluate_response_vs_answer( + response, answer, question_type, rule_id, idx) + + result_dict = { + 'idx': idx, + 'response': response, + 'response_text': response_text, + 'answer': answer, + 'is_correct': is_correct + } + results.append(result_dict) + return results + + # Helper methods + + def extract_text_from_brackets(self, text, clean_level='basic'): + """Extracts text enclosed in double brackets [[ ]]. + + Args: + text (str): The text to extract from. + clean_level (str): The level of cleaning to perform. + + Returns: + str: The extracted text or "NULL" if not found. + """ + matches = re.findall(r'\[\[\s*(.*?)\s*\]\]', text, re.DOTALL) + if not matches: + matches = re.findall(r'\$\\boxed\{(.*?)\}\$', text, re.DOTALL) + if not matches: + matches = re.findall(r'\[\s*(.*?)\s*\]', text, re.DOTALL) + if matches: + match_str = matches[0].strip() + if clean_level == 'clean': + match_str = match_str.replace('"', '').replace( + '\n', '').replace(' ', '').replace('[', + '').replace(']', '') + elif clean_level == 'logic': + match_str = match_str.replace('"', + '').replace('\n', '').replace( + ' ', '').replace('.', '') + elif clean_level == 'math': + match_str = match_str.replace('"', '').replace( + '\n', '').replace('[', '').replace(']', + '').replace('$', '') + return f'{self.clean_latex(match_str)}' + return f'[[{match_str}]]' + return 'NULL' + + def clean_latex(self, latex_expr): + """Cleans LaTeX expressions for parsing. + + Args: + latex_expr (str): The LaTeX expression to clean. + + Returns: + str: The cleaned expression. + """ + if '=' in latex_expr: + latex_expr = latex_expr.rsplit('=', 1)[1] + latex_expr = re.sub(r'\\[()\[\]]', '', latex_expr) + latex_expr = re.sub(r'\\text\{.*?\}', '', latex_expr) + latex_expr = re.sub(r'\\(left|right|displaystyle)', '', latex_expr) + latex_expr = latex_expr.replace('\\\\', '\\') + return latex_expr + + def evaluate_response_vs_answer(self, response, answer, question_type, + rule_id, idx): + """Evaluates a single response against the answer. + + Args: + response (str): The model's response. + answer (str): The reference answer. + question_type (str): The question type. + rule_id (str): The rule ID. + idx (str): The index of the question. + + Returns: + bool: True if the response is correct, False otherwise. 
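+                For example (illustrative): a 'logic' question whose response
+                and answer both contain '[[A]]' extracts 'A' from each side
+                and therefore returns True.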
+ """ + if question_type == 'logic' and rule_id == '5': + response_text = self.extract_text_from_brackets(response, 'logic') + answer_text = self.extract_text_from_brackets(answer, 'logic') + if response_text is None: + return False + normalized_response = self.rule5_normalize_content(response_text) + normalized_answer = self.rule5_normalize_content(answer) + return normalized_response == normalized_answer + elif question_type == 'logic': + response_text = self.extract_text_from_brackets(response, 'logic') + answer_text = self.extract_text_from_brackets(answer, 'logic') + return response_text == answer_text + else: + response_text = self.extract_text_from_brackets(response, 'clean') + return response_text == answer + + def rule5_normalize_content(self, content): + """Normalizes content for rule 5. + + Args: + content (str): The content to normalize. + + Returns: + list: Sorted list of content parts. + """ + parts = [part.strip() for part in content.split(';')] + sorted_parts = sorted(parts) + return sorted_parts + + # Additional helper methods can be defined here + # For example: methods to handle mathematical expressions, logic comparisons, etc. + + # Implement other helper functions as per your evaluation logic + + +# Example usage: +# evaluator = korbenchEvaluator(question_type='logic', mode='zero-shot') +# results = evaluator.score(predictions, references) diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 7d694ff1..8fe89971 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -151,6 +151,12 @@ DATASETS_MAPPING = { "hf_id": "opencompass/humaneval", "local": "./data/humaneval_cn/human-eval-cn-v2-20210705.jsonl", }, + #KORBENCH + "opencompass/korbench": { + "ms_id": "", + "hf_id": "", + "local": "./data/korbench", + }, # Lambada "opencompass/lambada": { "ms_id": "opencompass/lambada", @@ -544,4 +550,8 @@ DATASETS_URL = { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/babilong.zip", "md5": "e400864c31bc58d29eaa3e199751f99b", }, + "/korbench": { + "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/korbench.zip", + "md5": "9107597d137e7362eaf7d218ddef7a6d", + }, } From f97c4eae42dca6e56aedf9b7b663ce8299c0f6b7 Mon Sep 17 00:00:00 2001 From: Songyang Zhang Date: Tue, 26 Nov 2024 14:26:55 +0800 Subject: [PATCH 04/19] [Update] Update Fullbench (#1712) * Update JuderBench * Support O1-style Prompts * Update Code --- .../subjective/judgerbench/judgerbench.py | 5 - .../aime2024_0shot_nocot_gen_2b9dc2.py | 39 +++++ .../bbh/bbh_0shot_nocot_gen_9c32f6.py | 96 ++++++++++ .../bbh/bbh_0shot_nocot_gen_ea7952.py | 96 ++++++++++ .../cmo_fib_0shot_notcot_gen_4c6c29.py | 39 +++++ .../gpqa/gpqa_0shot_nocot_gen_772ea0.py | 52 ++++++ .../gsm8k/gsm8k_0shot_nocot_gen_6cbf22.py | 37 ++++ ...val_plus_openai_simple_evals_gen_159614.py | 38 ++++ .../humanevalx_0shot_nocot_gen_3e4bbd.py | 40 +++++ .../livecodebench_o1_gen_f0ed6c.py | 165 ++++++++++++++++++ .../livecodebench_split_v4_o1_gen_f0ed6c.py | 165 ++++++++++++++++++ .../livecodebench_v1_o1_gen_f0ed6c.py | 164 +++++++++++++++++ ...math_prm800k_500_0shot_nocot_gen_b27274.py | 36 ++++ ...00k_500_0shot_nocot_llmjudge_gen_63a000.py | 85 +++++++++ ...zed_mbpp_mdblock_0shot_nocot_gen_a2e416.py | 40 +++++ .../subjective/judgerbench/judgerbench.py | 5 - .../datasets/livecodebench/__init__.py | 1 + .../datasets/livecodebench/evaluator.py | 8 +- .../datasets/livecodebench/livecodebench.py | 36 +++- opencompass/models/base.py | 1 + 
opencompass/runners/volc.py | 9 +- opencompass/utils/datasets_info.py | 4 + 22 files changed, 1147 insertions(+), 14 deletions(-) create mode 100644 opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_gen_2b9dc2.py create mode 100644 opencompass/configs/datasets/bbh/bbh_0shot_nocot_gen_9c32f6.py create mode 100644 opencompass/configs/datasets/bbh/bbh_0shot_nocot_gen_ea7952.py create mode 100644 opencompass/configs/datasets/cmo_fib/cmo_fib_0shot_notcot_gen_4c6c29.py create mode 100644 opencompass/configs/datasets/gpqa/gpqa_0shot_nocot_gen_772ea0.py create mode 100644 opencompass/configs/datasets/gsm8k/gsm8k_0shot_nocot_gen_6cbf22.py create mode 100644 opencompass/configs/datasets/humaneval_plus/humaneval_plus_openai_simple_evals_gen_159614.py create mode 100644 opencompass/configs/datasets/humanevalx/humanevalx_0shot_nocot_gen_3e4bbd.py create mode 100644 opencompass/configs/datasets/livecodebench/livecodebench_o1_gen_f0ed6c.py create mode 100644 opencompass/configs/datasets/livecodebench/livecodebench_split_v4_o1_gen_f0ed6c.py create mode 100644 opencompass/configs/datasets/livecodebench/livecodebench_v1_o1_gen_f0ed6c.py create mode 100644 opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_gen_b27274.py create mode 100644 opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_llmjudge_gen_63a000.py create mode 100644 opencompass/configs/datasets/mbpp/sanitized_mbpp_mdblock_0shot_nocot_gen_a2e416.py diff --git a/configs/datasets/subjective/judgerbench/judgerbench.py b/configs/datasets/subjective/judgerbench/judgerbench.py index 3c436585..e6aafb12 100644 --- a/configs/datasets/subjective/judgerbench/judgerbench.py +++ b/configs/datasets/subjective/judgerbench/judgerbench.py @@ -47,8 +47,3 @@ for _name in subjective_all_sets: infer_cfg=subjective_infer_cfg, eval_cfg=subjective_eval_cfg, )) -# ds1000_eval_cfg = dict( -# evaluator=dict(type=DS1000Evaluator), -# pred_role='BOT', -# pred_postprocessor=dict(type=ds1000_postprocess), -# ) diff --git a/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_gen_2b9dc2.py b/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_gen_2b9dc2.py new file mode 100644 index 00000000..0a1790e0 --- /dev/null +++ b/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_gen_2b9dc2.py @@ -0,0 +1,39 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2 + + +aime2024_reader_cfg = dict( + input_columns=['question'], + output_column='answer' +) + + +aime2024_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{question}\nRemember to put your final answer within \\boxed{}.'), + ], + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048) +) + +aime2024_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2) +) + +aime2024_datasets = [ + dict( + abbr='aime2024', + type=Aime2024Dataset, + path='opencompass/aime2024', + reader_cfg=aime2024_reader_cfg, + infer_cfg=aime2024_infer_cfg, + eval_cfg=aime2024_eval_cfg + ) +] \ No newline at end of file diff --git a/opencompass/configs/datasets/bbh/bbh_0shot_nocot_gen_9c32f6.py b/opencompass/configs/datasets/bbh/bbh_0shot_nocot_gen_9c32f6.py new file mode 100644 index 00000000..586d0107 --- 
/dev/null +++ b/opencompass/configs/datasets/bbh/bbh_0shot_nocot_gen_9c32f6.py @@ -0,0 +1,96 @@ +import os +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq + +bbh_reader_cfg = dict(input_columns=['input'], output_column='target') + +bbh_multiple_choice_sets = [ + 'temporal_sequences', + 'disambiguation_qa', + 'date_understanding', + 'tracking_shuffled_objects_three_objects', + 'penguins_in_a_table', + 'geometric_shapes', + 'snarks', + 'ruin_names', + 'tracking_shuffled_objects_seven_objects', + 'tracking_shuffled_objects_five_objects', + 'logical_deduction_three_objects', + 'hyperbaton', + 'logical_deduction_five_objects', + 'logical_deduction_seven_objects', + 'movie_recommendation', + 'salient_translation_error_detection', + 'reasoning_about_colored_objects', +] +bbh_free_form_sets = [ + 'multistep_arithmetic_two', + 'navigate', + 'dyck_languages', + 'word_sorting', + 'sports_understanding', + 'boolean_expressions', + 'object_counting', + 'formal_fallacies', + 'causal_judgement', + 'web_of_lies', +] + +bbh_datasets = [] +for _name in bbh_multiple_choice_sets: + bbh_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f"Follow the given examples and answer the question.\n\nQuestion: {{input}}\n You must give your final answer by starting with 'So the answer is' " + ) + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + bbh_eval_cfg = dict( + evaluator=dict(type=BBHEvaluator_mcq), + pred_role='BOT', + pred_postprocessor=dict(type=bbh_mcq_postprocess), + dataset_postprocessor=dict(type=bbh_mcq_postprocess)) + + bbh_datasets.append( + dict( + type=BBHDataset, + path='opencompass/bbh', + name=_name, + abbr='bbh-' + _name, + reader_cfg=bbh_reader_cfg, + infer_cfg=bbh_infer_cfg.copy(), + eval_cfg=bbh_eval_cfg.copy())) + +for _name in bbh_free_form_sets: + + bbh_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f"Follow the given examples and answer the question.\n\nQuestion: {{input}}\n You must give your final answer by starting with 'So the answer is' " + ) + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT') + + bbh_datasets.append( + dict( + type=BBHDataset, + path='opencompass/bbh', + name=_name, + abbr='bbh-' + _name, + reader_cfg=bbh_reader_cfg, + infer_cfg=bbh_infer_cfg.copy(), + eval_cfg=bbh_eval_cfg.copy())) diff --git a/opencompass/configs/datasets/bbh/bbh_0shot_nocot_gen_ea7952.py b/opencompass/configs/datasets/bbh/bbh_0shot_nocot_gen_ea7952.py new file mode 100644 index 00000000..61ea50dd --- /dev/null +++ b/opencompass/configs/datasets/bbh/bbh_0shot_nocot_gen_ea7952.py @@ -0,0 +1,96 @@ +import os +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq + +bbh_reader_cfg = 
dict(input_columns=['input'], output_column='target') + +bbh_multiple_choice_sets = [ + 'temporal_sequences', + 'disambiguation_qa', + 'date_understanding', + 'tracking_shuffled_objects_three_objects', + 'penguins_in_a_table', + 'geometric_shapes', + 'snarks', + 'ruin_names', + 'tracking_shuffled_objects_seven_objects', + 'tracking_shuffled_objects_five_objects', + 'logical_deduction_three_objects', + 'hyperbaton', + 'logical_deduction_five_objects', + 'logical_deduction_seven_objects', + 'movie_recommendation', + 'salient_translation_error_detection', + 'reasoning_about_colored_objects', +] +bbh_free_form_sets = [ + 'multistep_arithmetic_two', + 'navigate', + 'dyck_languages', + 'word_sorting', + 'sports_understanding', + 'boolean_expressions', + 'object_counting', + 'formal_fallacies', + 'causal_judgement', + 'web_of_lies', +] + +bbh_datasets = [] +for _name in bbh_multiple_choice_sets: + bbh_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' " + ) + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + bbh_eval_cfg = dict( + evaluator=dict(type=BBHEvaluator_mcq), + pred_role='BOT', + pred_postprocessor=dict(type=bbh_mcq_postprocess), + dataset_postprocessor=dict(type=bbh_mcq_postprocess)) + + bbh_datasets.append( + dict( + type=BBHDataset, + path='opencompass/bbh', + name=_name, + abbr='bbh-' + _name, + reader_cfg=bbh_reader_cfg, + infer_cfg=bbh_infer_cfg.copy(), + eval_cfg=bbh_eval_cfg.copy())) + +for _name in bbh_free_form_sets: + + bbh_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f"Follow the given examples and answer the question.\n\nQuestion: {{input}}\n You must give your final answer by starting with 'So the answer is' " + ) + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT') + + bbh_datasets.append( + dict( + type=BBHDataset, + path='opencompass/bbh', + name=_name, + abbr='bbh-' + _name, + reader_cfg=bbh_reader_cfg, + infer_cfg=bbh_infer_cfg.copy(), + eval_cfg=bbh_eval_cfg.copy())) diff --git a/opencompass/configs/datasets/cmo_fib/cmo_fib_0shot_notcot_gen_4c6c29.py b/opencompass/configs/datasets/cmo_fib/cmo_fib_0shot_notcot_gen_4c6c29.py new file mode 100644 index 00000000..39b08adf --- /dev/null +++ b/opencompass/configs/datasets/cmo_fib/cmo_fib_0shot_notcot_gen_4c6c29.py @@ -0,0 +1,39 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import CMOFibDataset, MATHEvaluator, math_postprocess_v2 + + +cmo_fib_reader_cfg = dict( + input_columns=['question'], + output_column='answer' +) + + +cmo_fib_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{question}\n你需要讲最终答案写入\\boxed{}.'), + ], + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048) +) + +cmo_fib_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2) +) + +cmo_fib_datasets = [ + dict( + abbr='cmo_fib', + type=CMOFibDataset, + path='opencompass/cmo_fib', + 
reader_cfg=cmo_fib_reader_cfg, + infer_cfg=cmo_fib_infer_cfg, + eval_cfg=cmo_fib_eval_cfg + ) +] \ No newline at end of file diff --git a/opencompass/configs/datasets/gpqa/gpqa_0shot_nocot_gen_772ea0.py b/opencompass/configs/datasets/gpqa/gpqa_0shot_nocot_gen_772ea0.py new file mode 100644 index 00000000..4783dae4 --- /dev/null +++ b/opencompass/configs/datasets/gpqa/gpqa_0shot_nocot_gen_772ea0.py @@ -0,0 +1,52 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import GPQADataset, GPQA_Simple_Eval_postprocess, GPQAEvaluator + +# openai_simple_eval prompt +align_prompt = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. + +{question} + +A) {A} +B) {B} +C) {C} +D) {D} +""".strip() + +gpqa_reader_cfg = dict( + input_columns=['question', 'A', 'B', 'C', 'D'], + output_column='answer') + +gpqa_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=align_prompt), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +gpqa_eval_cfg = dict(evaluator=dict(type=GPQAEvaluator), + pred_postprocessor=dict(type=GPQA_Simple_Eval_postprocess)) + +gpqa_datasets = [] +gpqa_subsets = { + # 'extended': 'gpqa_extended.csv', + # 'main': 'gpqa_main.csv', + 'diamond': 'gpqa_diamond.csv' +} + +for split in list(gpqa_subsets.keys()): + gpqa_datasets.append( + dict( + abbr='GPQA_' + split, + type=GPQADataset, + path='./data/gpqa/', + name=gpqa_subsets[split], + reader_cfg=gpqa_reader_cfg, + infer_cfg=gpqa_infer_cfg, + eval_cfg=gpqa_eval_cfg) + ) diff --git a/opencompass/configs/datasets/gsm8k/gsm8k_0shot_nocot_gen_6cbf22.py b/opencompass/configs/datasets/gsm8k/gsm8k_0shot_nocot_gen_6cbf22.py new file mode 100644 index 00000000..f1b7fe30 --- /dev/null +++ b/opencompass/configs/datasets/gsm8k/gsm8k_0shot_nocot_gen_6cbf22.py @@ -0,0 +1,37 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator +from opencompass.datasets import MATHEvaluator, math_postprocess_v2 + +gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') + +gsm8k_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{question}\nPlease put your final answer within \\boxed{}.'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +gsm8k_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2), + dataset_postprocessor=dict(type=gsm8k_dataset_postprocess), +) + +gsm8k_datasets = [ + dict( + abbr='gsm8k', + type=GSM8KDataset, + path='opencompass/gsm8k', + reader_cfg=gsm8k_reader_cfg, + infer_cfg=gsm8k_infer_cfg, + eval_cfg=gsm8k_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/humaneval_plus/humaneval_plus_openai_simple_evals_gen_159614.py b/opencompass/configs/datasets/humaneval_plus/humaneval_plus_openai_simple_evals_gen_159614.py new file mode 100644 index 00000000..8e91abdc --- /dev/null +++ 
b/opencompass/configs/datasets/humaneval_plus/humaneval_plus_openai_simple_evals_gen_159614.py @@ -0,0 +1,38 @@ +# THIS SHALL ALSO BE DEPRECATED +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2 + +humaneval_plus_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_plus_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}' + ), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_plus_eval_cfg = dict( + evaluator=dict(type=HumanEvalPlusEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_plus_datasets = [ + dict( + abbr='humaneval_plus', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_plus_reader_cfg, + infer_cfg=humaneval_plus_infer_cfg, + eval_cfg=humaneval_plus_eval_cfg) +] diff --git a/opencompass/configs/datasets/humanevalx/humanevalx_0shot_nocot_gen_3e4bbd.py b/opencompass/configs/datasets/humanevalx/humanevalx_0shot_nocot_gen_3e4bbd.py new file mode 100644 index 00000000..b4d85fad --- /dev/null +++ b/opencompass/configs/datasets/humanevalx/humanevalx_0shot_nocot_gen_3e4bbd.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator + +humanevalx_reader_cfg = dict( + input_columns=['prompt'], output_column='declaration', train_split='test') + +humanevalx_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + prompt='Read the following function signature and docstring, and fully implement the function described. 
Your response should only contain the code for this function.\n{prompt}'), retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024)) + +humanevalx_eval_cfg_dict = { + lang : dict( + evaluator=dict( + type=HumanevalXEvaluator, + language=lang, + ip_address= + 'localhost', # replace to your code_eval_server ip_address, port + port=5001), # refer to https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html to launch a server + pred_role='BOT') + for lang in ['python', 'cpp', 'go', 'java', 'js'] # do not support rust now +} + +# Please download the needed `xx.jsonl.gz` from +# https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx +# and move them into `data/humanevalx/` folder +humanevalx_datasets = [ + dict( + type=HumanevalXDataset, + abbr=f'humanevalx-{lang}', + language=lang, + path='./data/humanevalx', + reader_cfg=humanevalx_reader_cfg, + infer_cfg=humanevalx_infer_cfg, + eval_cfg=humanevalx_eval_cfg_dict[lang]) + for lang in ['python', 'cpp', 'go', 'java', 'js'] +] diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_o1_gen_f0ed6c.py b/opencompass/configs/datasets/livecodebench/livecodebench_o1_gen_f0ed6c.py new file mode 100644 index 00000000..b74e2c76 --- /dev/null +++ b/opencompass/configs/datasets/livecodebench/livecodebench_o1_gen_f0ed6c.py @@ -0,0 +1,165 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import ( + LCBCodeGenerationDataset, + LCBCodeExecutionDataset, + LCBTestOutputPredictionDataset, + LCBCodeGenerationEvaluator, + LCBCodeExecutionEvaluator, + LCBTestOutputEvaluator +) +from opencompass.datasets.livecodebench import TestOutputPromptConstants + + +lcb_code_generation_reader_cfg = dict( + input_columns=[ + 'question_content', + 'format_prompt', + ], + # output_column='evaluation_sample', + output_column='question_id', +) + +SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' 
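+# Rendered prompt shape (illustrative): the HUMAN turn defined below expands to
+#   ### Question:
+#   {question_content}
+#
+#   {format_prompt}### Answer: (use the provided format with backticks)
+# with both placeholders filled from each code-generation sample.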
+ +prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \ + '### Answer: (use the provided format with backticks)\n\n' + + +# Code Generation Tasks +lcb_code_generation_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=prompt_template + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_generation_eval_cfg = dict( + evaluator=dict( + type=LCBCodeGenerationEvaluator, + num_process_evaluate=4, + timeout=6, + release_version='release_v4', + ), + pred_role='BOT', +) + +LCBCodeGeneration_dataset = dict( + type=LCBCodeGenerationDataset, + abbr='lcb_code_generation_v4', + path='opencompass/code_generation_lite', + reader_cfg=lcb_code_generation_reader_cfg, + infer_cfg=lcb_code_generation_infer_cfg, + eval_cfg=lcb_code_generation_eval_cfg, + release_version='release_v4', +) + +# Code Execution Dataset +lcb_code_execution_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +lcb_code_execution_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.' + ), + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_execution_eval_cfg = dict( + evaluator=dict( + type=LCBCodeExecutionEvaluator, + ), + pred_role='BOT', +) + +LCBCodeExecution_dataset = dict( + type=LCBCodeExecutionDataset, + abbr='lcb_code_execution', + path='opencompass/execution-v2', + reader_cfg=lcb_code_execution_reader_cfg, + infer_cfg=lcb_code_execution_infer_cfg, + eval_cfg=lcb_code_execution_eval_cfg, +) + +# TestOuputput Dataset +lcb_test_output_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' 
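+# Note: system_prompt (like SYSTEM_MESSAGE_GENERIC above) appears to be kept
+# for reference only; the SYSTEM turn in the template below is commented out,
+# so only the HUMAN round is sent for test-output prediction.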
+ +lcb_test_output_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + # begin=[ + # dict( + # role='SYSTEM', + # prompt=system_prompt + # ), + # ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_test_output_eval_cfg = dict( + evaluator=dict( + type=LCBTestOutputEvaluator, + ), + pred_role='BOT', +) + +LCBTestOutput_dataset = dict( + type=LCBTestOutputPredictionDataset, + abbr='lcb_test_output', + path='opencompass/test_generation', + reader_cfg=lcb_test_output_reader_cfg, + infer_cfg=lcb_test_output_infer_cfg, + eval_cfg=lcb_test_output_eval_cfg, +) + +LCB_datasets = [ + LCBCodeGeneration_dataset, + # LCBCodeExecution_dataset, + # LCBTestOutput_dataset, +] diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_split_v4_o1_gen_f0ed6c.py b/opencompass/configs/datasets/livecodebench/livecodebench_split_v4_o1_gen_f0ed6c.py new file mode 100644 index 00000000..cd65c3ae --- /dev/null +++ b/opencompass/configs/datasets/livecodebench/livecodebench_split_v4_o1_gen_f0ed6c.py @@ -0,0 +1,165 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import ( + LCBCodeGenerationDataset, + LCBCodeExecutionDataset, + LCBTestOutputPredictionDataset, + LCBCodeGenerationEvaluator, + LCBCodeExecutionEvaluator, + LCBTestOutputEvaluator +) +from opencompass.datasets.livecodebench import TestOutputPromptConstants + + +lcb_code_generation_reader_cfg = dict( + input_columns=[ + 'question_content', + 'format_prompt', + ], + # output_column='evaluation_sample', + output_column='question_id', +) + +SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \ + '### Answer: (use the provided format with backticks)\n\n' + + +# Code Generation Tasks +lcb_code_generation_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=prompt_template + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_generation_eval_cfg = dict( + evaluator=dict( + type=LCBCodeGenerationEvaluator, + num_process_evaluate=4, + timeout=6, + release_version='release_split_v4', + ), + pred_role='BOT', +) + +LCBCodeGeneration_dataset = dict( + type=LCBCodeGenerationDataset, + abbr='lcb_code_generation_split_v4', + path='opencompass/code_generation_lite', + reader_cfg=lcb_code_generation_reader_cfg, + infer_cfg=lcb_code_generation_infer_cfg, + eval_cfg=lcb_code_generation_eval_cfg, + release_version='release_split_v4', +) + +# Code Execution Dataset +lcb_code_execution_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +lcb_code_execution_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.' 
+ ), + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_execution_eval_cfg = dict( + evaluator=dict( + type=LCBCodeExecutionEvaluator, + ), + pred_role='BOT', +) + +LCBCodeExecution_dataset = dict( + type=LCBCodeExecutionDataset, + abbr='lcb_code_execution', + path='opencompass/execution-v2', + reader_cfg=lcb_code_execution_reader_cfg, + infer_cfg=lcb_code_execution_infer_cfg, + eval_cfg=lcb_code_execution_eval_cfg, +) + +# TestOuputput Dataset +lcb_test_output_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +lcb_test_output_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + # begin=[ + # dict( + # role='SYSTEM', + # prompt=system_prompt + # ), + # ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_test_output_eval_cfg = dict( + evaluator=dict( + type=LCBTestOutputEvaluator, + ), + pred_role='BOT', +) + +LCBTestOutput_dataset = dict( + type=LCBTestOutputPredictionDataset, + abbr='lcb_test_output', + path='opencompass/test_generation', + reader_cfg=lcb_test_output_reader_cfg, + infer_cfg=lcb_test_output_infer_cfg, + eval_cfg=lcb_test_output_eval_cfg, +) + +LCB_datasets = [ + LCBCodeGeneration_dataset, + # LCBCodeExecution_dataset, + # LCBTestOutput_dataset, +] diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_v1_o1_gen_f0ed6c.py b/opencompass/configs/datasets/livecodebench/livecodebench_v1_o1_gen_f0ed6c.py new file mode 100644 index 00000000..f2a0d77e --- /dev/null +++ b/opencompass/configs/datasets/livecodebench/livecodebench_v1_o1_gen_f0ed6c.py @@ -0,0 +1,164 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import ( + LCBCodeGenerationDataset, + LCBCodeExecutionDataset, + LCBTestOutputPredictionDataset, + LCBCodeGenerationEvaluator, + LCBCodeExecutionEvaluator, + LCBTestOutputEvaluator +) +from opencompass.datasets.livecodebench import TestOutputPromptConstants + + +lcb_code_generation_reader_cfg = dict( + input_columns=[ + 'question_content', + 'format_prompt', + ], + # output_column='evaluation_sample', + output_column='question_id', +) + +SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' 
+ +prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \ + '### Answer: (use the provided format with backticks)\n\n' + + +# Code Generation Tasks +lcb_code_generation_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=prompt_template + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_generation_eval_cfg = dict( + evaluator=dict( + type=LCBCodeGenerationEvaluator, + num_process_evaluate=4, + timeout=6, + ), + pred_role='BOT', +) + +LCBCodeGeneration_dataset = dict( + type=LCBCodeGenerationDataset, + abbr='lcb_code_generation_v1', + path='opencompass/code_generation_lite', + reader_cfg=lcb_code_generation_reader_cfg, + infer_cfg=lcb_code_generation_infer_cfg, + eval_cfg=lcb_code_generation_eval_cfg, + release_version='release_v1', +) + +# Code Execution Dataset +lcb_code_execution_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +lcb_code_execution_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.' + ), + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_execution_eval_cfg = dict( + evaluator=dict( + type=LCBCodeExecutionEvaluator, + ), + pred_role='BOT', +) + +LCBCodeExecution_dataset = dict( + type=LCBCodeExecutionDataset, + abbr='lcb_code_execution', + path='opencompass/execution-v2', + reader_cfg=lcb_code_execution_reader_cfg, + infer_cfg=lcb_code_execution_infer_cfg, + eval_cfg=lcb_code_execution_eval_cfg, +) + +# TestOuputput Dataset +lcb_test_output_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' 
+ +lcb_test_output_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + # begin=[ + # dict( + # role='SYSTEM', + # prompt=system_prompt + # ), + # ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_test_output_eval_cfg = dict( + evaluator=dict( + type=LCBTestOutputEvaluator, + ), + pred_role='BOT', +) + +LCBTestOutput_dataset = dict( + type=LCBTestOutputPredictionDataset, + abbr='lcb_test_output', + path='opencompass/test_generation', + reader_cfg=lcb_test_output_reader_cfg, + infer_cfg=lcb_test_output_infer_cfg, + eval_cfg=lcb_test_output_eval_cfg, +) + +LCB_datasets = [ + LCBCodeGeneration_dataset, + # LCBCodeExecution_dataset, + # LCBTestOutput_dataset, +] diff --git a/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_gen_b27274.py b/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_gen_b27274.py new file mode 100644 index 00000000..c1d20a80 --- /dev/null +++ b/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_gen_b27274.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2), +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500', + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_llmjudge_gen_63a000.py b/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_llmjudge_gen_63a000.py new file mode 100644 index 00000000..eabc6c68 --- /dev/null +++ b/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_llmjudge_gen_63a000.py @@ -0,0 +1,85 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, GaoKaoMATHEvaluator +# from opencompass.utils.model_postprocessors import naive_model_postprocess, xfinder_postprocess +from opencompass.utils.postprocessors.naive import MATH_NAVIE_PROMPT_TEMPLATE + +# ----------------------------- Eval Parameters ----------------------------- +## Postprocess function +post_func = 're' # 're', 'xfinder_model', 'naive_model' + +## Evalute function +eval_func = 'naive_model' # 're', 'naive_model' + + +## Model api url +# xfinder_url = 'http://0.0.0.0:23333/v1' # for 'xFinder-qwen1505' if post_func is 'xfinder_model' +# naive_model_name = 
'Qwen/Qwen2.5-72B-Instruct' # replace with your model name +naive_model_name = 'dlc_model' +# naive_model_url = [ +# 'http://172.30.56.38:23001/v1', +# ] # Multi-apis for accerlation +naive_model_url = [ + "http://172.30.56.38:23001/v1", + "http://172.30.8.4:23003/v1", + "http://172.30.8.14:23002/v1", + "http://172.30.48.80:23004/v1", + "http://172.30.56.132:23005/v1", + "http://172.30.16.115:23006/v1", + "http://172.30.48.82:23007/v1", + "http://172.30.24.53:23008/v1", + "http://172.30.56.141:23009/v1", + "http://172.30.8.35:23010/v1", + "http://172.30.48.85:23011/v1", + "http://172.30.16.116:23012/v1" +] +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=8192), +) + + +if post_func == 're': + pred_postprocessor = dict(type=math_postprocess_v2) + + +if eval_func == 're': + evaluator = dict(type=MATHEvaluator, version='v2') +elif eval_func == 'naive_model': + evaluator = dict( + type=GaoKaoMATHEvaluator, + judge_model_name=naive_model_name, + url=naive_model_url, + ) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=evaluator, + pred_postprocessor=pred_postprocessor, +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500-llmjudge', + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/mbpp/sanitized_mbpp_mdblock_0shot_nocot_gen_a2e416.py b/opencompass/configs/datasets/mbpp/sanitized_mbpp_mdblock_0shot_nocot_gen_a2e416.py new file mode 100644 index 00000000..17005593 --- /dev/null +++ b/opencompass/configs/datasets/mbpp/sanitized_mbpp_mdblock_0shot_nocot_gen_a2e416.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + # dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the similar elements from the given two tuple lists.\nYour code should pass these tests:\n\nassert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)\nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)\n',), + # dict(role='BOT', prompt='```python\ndef similar_elements(test_tup1, test_tup2):\n res = tuple(set(test_tup1) & set(test_tup2))\n return (res)```',), + + # dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a python function to identify non-prime numbers.\nYour code should pass these tests:\n\nassert is_not_prime(2) == False\nassert is_not_prime(10) == True\nassert is_not_prime(35) == True\n',), + # dict(role='BOT', prompt='```python\nimport math\ndef is_not_prime(n):\n result = False\n for i in range(2,int(math.sqrt(n)) + 1):\n if n 
%% i == 0:\n result = True\n return result```',), + + # dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the largest integers from a given list of numbers using heap queue algorithm.\nYour code should pass these tests:\n\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]\n',), + # dict(role='BOT', prompt='```python\nimport heapq as hq\ndef heap_queue_largest(nums,n):\n largest_nums = hq.nlargest(n, nums)\n return largest_nums```',), + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\n{text}\nYour code should pass these tests:\n\n{test_list}\n You should submit your final solution in the following format: ```python\n\n```',), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr='sanitized_mbpp', + path='opencompass/sanitized_mbpp', + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/subjective/judgerbench/judgerbench.py b/opencompass/configs/datasets/subjective/judgerbench/judgerbench.py index 3c436585..e6aafb12 100644 --- a/opencompass/configs/datasets/subjective/judgerbench/judgerbench.py +++ b/opencompass/configs/datasets/subjective/judgerbench/judgerbench.py @@ -47,8 +47,3 @@ for _name in subjective_all_sets: infer_cfg=subjective_infer_cfg, eval_cfg=subjective_eval_cfg, )) -# ds1000_eval_cfg = dict( -# evaluator=dict(type=DS1000Evaluator), -# pred_role='BOT', -# pred_postprocessor=dict(type=ds1000_postprocess), -# ) diff --git a/opencompass/datasets/livecodebench/__init__.py b/opencompass/datasets/livecodebench/__init__.py index 5c7a048d..4870e556 100644 --- a/opencompass/datasets/livecodebench/__init__.py +++ b/opencompass/datasets/livecodebench/__init__.py @@ -1,6 +1,7 @@ from .evaluator import LCBCodeExecutionEvaluator # noqa: F401, F403 from .evaluator import LCBCodeGenerationEvaluator # noqa: F401, F403 from .evaluator import LCBTestOutputEvaluator # noqa: F401, F403 +from .livecodebench import CompassBenchCodeExecutionDataset # noqa: F401, F403 from .livecodebench import LCBCodeExecutionDataset # noqa: F401, F403 from .livecodebench import LCBCodeGenerationDataset # noqa: F401, F403 from .livecodebench import LCBTestOutputPredictionDataset # noqa: F401, F403 diff --git a/opencompass/datasets/livecodebench/evaluator.py b/opencompass/datasets/livecodebench/evaluator.py index 9ae936a8..cb6d2360 100644 --- a/opencompass/datasets/livecodebench/evaluator.py +++ b/opencompass/datasets/livecodebench/evaluator.py @@ -228,11 +228,15 @@ def codegen_metrics( @ICL_EVALUATORS.register_module() class LCBCodeGenerationEvaluator(BaseEvaluator): - def __init__(self, num_process_evaluate, timeout=6): + def __init__(self, + num_process_evaluate, + timeout=6, + release_version='release_v1'): super().__init__() self.num_process_evaluate = num_process_evaluate self.timeout = timeout - self.dataset = LCBCodeGenerationDataset.load()['test'] + self.dataset = LCBCodeGenerationDataset.load( + release_version=release_version)['test'] def score(self, predictions, references): predictions = 
[[extract_code_generation(item)] for item in predictions] diff --git a/opencompass/datasets/livecodebench/livecodebench.py b/opencompass/datasets/livecodebench/livecodebench.py index aa7e82ef..dbd76d71 100644 --- a/opencompass/datasets/livecodebench/livecodebench.py +++ b/opencompass/datasets/livecodebench/livecodebench.py @@ -8,7 +8,7 @@ import zlib from dataclasses import dataclass from enum import Enum -from datasets import DatasetDict, load_dataset +from datasets import DatasetDict, load_dataset, load_from_disk from opencompass.utils import get_data_path # noqa: F401, F403 @@ -215,3 +215,37 @@ class LCBSelfRepairDataset(BaseDataset): dataset = dataset.map(transform) return DatasetDict({'test': dataset, 'train': dataset}) + + +class CompassBenchCodeExecutionDataset(BaseDataset): + + @staticmethod + def load( + path: str = 'opencompass/execution-v2', + local_mode: bool = False, + cot: bool = False, + # release_version: str = "release_v1" + ): + # path = get_data_path(path, local_mode=local_mode) + + def transform(item): + code, input = item['code'], item['input'] + prompt = make_code_execution_prompt(code, input, cot=cot) + + item['prompt'] = prompt + + evaluation_sample = json.dumps({ + 'code': item['code'], + 'input': item['input'], + 'output': item['output'] + }) + item['evaluation_sample'] = evaluation_sample + + return item + + path = get_data_path(path, local_mode=local_mode) + dataset = load_from_disk(path) # 'livecodebench/execution-v2' + dataset = dataset['test'] + dataset = dataset.map(transform) + + return DatasetDict({'test': dataset, 'train': dataset}) diff --git a/opencompass/models/base.py b/opencompass/models/base.py index 9e983f39..eb8e298d 100644 --- a/opencompass/models/base.py +++ b/opencompass/models/base.py @@ -390,6 +390,7 @@ class LMTemplateParser: elif item.get('prompt', ''): # it's a dict prompt += last_sep + item.get('prompt', '') last_sep = '\n' + return prompt def _split_rounds( diff --git a/opencompass/runners/volc.py b/opencompass/runners/volc.py index 81d0c869..d48f7a43 100644 --- a/opencompass/runners/volc.py +++ b/opencompass/runners/volc.py @@ -176,7 +176,7 @@ class VOLCRunner(BaseRunner): cmd = get_cmd() logger = get_logger() - logger.debug(f'Running command: {cmd}') + logger.info(f'Running command: {cmd}') out_path = task.get_log_path(file_extension='txt') mmengine.mkdir_or_exist(osp.split(out_path)[0]) @@ -205,10 +205,17 @@ class VOLCRunner(BaseRunner): return task_name, returncode def _run_task(self, cmd, log_path, poll_interval): + logger = get_logger() result = subprocess.run(cmd, shell=True, text=True, capture_output=True) + + logger.info(f'Command output: {result.stdout}') + if result.stderr: + logger.error(f'Command error: {result.stderr}') + logger.info(f'Return code: {result.returncode}') + pattern = r'(?<=task_id=).*(?=\n\n)' match = re.search(pattern, result.stdout) if match: diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 8fe89971..21e640c8 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -554,4 +554,8 @@ DATASETS_URL = { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/korbench.zip", "md5": "9107597d137e7362eaf7d218ddef7a6d", }, + "subjective/judgerbench": { + "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/judgerbench.zip", + "md5": "60d605883aa8cac9755819140ab42c6b" + } } From ef695e28e56ac4ffa439d5b49169425953dcdab0 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Tue, 26 Nov 2024 17:13:28 +0800 Subject: 
[PATCH 05/19] [Bug] Fix Korbench dataset module (#1717) --- .../deepseek/lmdeploy_deepseek_v2_lite.py | 20 +++++++++++++++++++ opencompass/datasets/korbench/__init__.py | 0 2 files changed, 20 insertions(+) create mode 100644 opencompass/configs/models/deepseek/lmdeploy_deepseek_v2_lite.py create mode 100644 opencompass/datasets/korbench/__init__.py diff --git a/opencompass/configs/models/deepseek/lmdeploy_deepseek_v2_lite.py b/opencompass/configs/models/deepseek/lmdeploy_deepseek_v2_lite.py new file mode 100644 index 00000000..bd67b684 --- /dev/null +++ b/opencompass/configs/models/deepseek/lmdeploy_deepseek_v2_lite.py @@ -0,0 +1,20 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-v2_lite-turbomind', + path='deepseek-ai/DeepSeek-V2-Lite-Chat', + engine_config=dict( + session_len=7168, + max_batch_size=4, + tp=2, + cache_max_entry_count=0.7, + ), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9), + max_seq_len=7168, + max_out_len=2048, + batch_size=4, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/opencompass/datasets/korbench/__init__.py b/opencompass/datasets/korbench/__init__.py new file mode 100644 index 00000000..e69de29b From bcb707dbfc5fee3d0cd38bc1afd5d4c570f81275 Mon Sep 17 00:00:00 2001 From: Yi Ding Date: Tue, 26 Nov 2024 19:24:47 +0800 Subject: [PATCH 06/19] [Fix] Fix BailingAPI model (#1707) * [fix] sequence under the multiple samples * resolve the lint problems * change the parameter name * add another error code for retry * output the log for invalid response * format correction * update * update * update * update * add two model python files * update the default parameter * use random for delay * update the api example of bailing * remove the unnecessary parameter --- configs/api_examples/eval_api_bailing.py | 14 +++- ...iling-pro-0920.py => bailing-lite-1116.py} | 8 +- .../models/bailing_api/bailing-pro-1120.py | 8 +- .../models/bailing_api/bailing-lite-1116.py | 8 +- ...iling-lite-0830.py => bailing-pro-1120.py} | 8 +- opencompass/models/bailing_api_oc.py | 76 +++++++++---------- 6 files changed, 60 insertions(+), 62 deletions(-) rename configs/models/bailing_api/{bailing-pro-0920.py => bailing-lite-1116.py} (80%) rename opencompass/configs/models/bailing_api/bailing-pro-0920.py => configs/models/bailing_api/bailing-pro-1120.py (80%) rename configs/models/bailing_api/bailing-lite-0830.py => opencompass/configs/models/bailing_api/bailing-lite-1116.py (80%) rename opencompass/configs/models/bailing_api/{bailing-lite-0830.py => bailing-pro-1120.py} (80%) diff --git a/configs/api_examples/eval_api_bailing.py b/configs/api_examples/eval_api_bailing.py index 00640fb4..030d1502 100644 --- a/configs/api_examples/eval_api_bailing.py +++ b/configs/api_examples/eval_api_bailing.py @@ -15,13 +15,19 @@ datasets = [ models = [ dict( - path='Bailing-Lite-0830', + path='Bailing-Lite-1116', token='xxxxxx', # set your key here or in environment variable BAILING_API_KEY url='https://bailingchat.alipay.com/chat/completions', type=BailingAPI, - generation_kwargs={}, - query_per_second=1, - max_seq_len=4096, + max_out_len=11264, + batch_size=1, + generation_kwargs={ + 'temperature': 0.01, + 'top_p': 1.0, + 'top_k': -1, + 'n': 1, + 'logprobs': 1, + }, ), ] diff --git a/configs/models/bailing_api/bailing-pro-0920.py b/configs/models/bailing_api/bailing-lite-1116.py similarity index 80% rename from configs/models/bailing_api/bailing-pro-0920.py rename to 
configs/models/bailing_api/bailing-lite-1116.py index db69b263..106c3b73 100644 --- a/configs/models/bailing_api/bailing-pro-0920.py +++ b/configs/models/bailing_api/bailing-lite-1116.py @@ -10,21 +10,19 @@ api_meta_template = dict( models = [ dict( - path='Bailing-Pro-0920', + path='Bailing-Lite-1116', token='', # set your key here or in environment variable BAILING_API_KEY url='https://bailingchat.alipay.com/chat/completions', type=BailingAPI, meta_template=api_meta_template, - query_per_second=1, - max_seq_len=4096, + max_out_len=11264, batch_size=1, generation_kwargs={ - 'temperature': 0.4, + 'temperature': 0.01, 'top_p': 1.0, 'top_k': -1, 'n': 1, 'logprobs': 1, - 'use_beam_search': False, }, ), ] diff --git a/opencompass/configs/models/bailing_api/bailing-pro-0920.py b/configs/models/bailing_api/bailing-pro-1120.py similarity index 80% rename from opencompass/configs/models/bailing_api/bailing-pro-0920.py rename to configs/models/bailing_api/bailing-pro-1120.py index db69b263..cee6c3df 100644 --- a/opencompass/configs/models/bailing_api/bailing-pro-0920.py +++ b/configs/models/bailing_api/bailing-pro-1120.py @@ -10,21 +10,19 @@ api_meta_template = dict( models = [ dict( - path='Bailing-Pro-0920', + path='Bailing-Pro-1120', token='', # set your key here or in environment variable BAILING_API_KEY url='https://bailingchat.alipay.com/chat/completions', type=BailingAPI, meta_template=api_meta_template, - query_per_second=1, - max_seq_len=4096, + max_out_len=11264, batch_size=1, generation_kwargs={ - 'temperature': 0.4, + 'temperature': 0.01, 'top_p': 1.0, 'top_k': -1, 'n': 1, 'logprobs': 1, - 'use_beam_search': False, }, ), ] diff --git a/configs/models/bailing_api/bailing-lite-0830.py b/opencompass/configs/models/bailing_api/bailing-lite-1116.py similarity index 80% rename from configs/models/bailing_api/bailing-lite-0830.py rename to opencompass/configs/models/bailing_api/bailing-lite-1116.py index 88053ce9..106c3b73 100644 --- a/configs/models/bailing_api/bailing-lite-0830.py +++ b/opencompass/configs/models/bailing_api/bailing-lite-1116.py @@ -10,21 +10,19 @@ api_meta_template = dict( models = [ dict( - path='Bailing-Lite-0830', + path='Bailing-Lite-1116', token='', # set your key here or in environment variable BAILING_API_KEY url='https://bailingchat.alipay.com/chat/completions', type=BailingAPI, meta_template=api_meta_template, - query_per_second=1, - max_seq_len=4096, + max_out_len=11264, batch_size=1, generation_kwargs={ - 'temperature': 0.4, + 'temperature': 0.01, 'top_p': 1.0, 'top_k': -1, 'n': 1, 'logprobs': 1, - 'use_beam_search': False, }, ), ] diff --git a/opencompass/configs/models/bailing_api/bailing-lite-0830.py b/opencompass/configs/models/bailing_api/bailing-pro-1120.py similarity index 80% rename from opencompass/configs/models/bailing_api/bailing-lite-0830.py rename to opencompass/configs/models/bailing_api/bailing-pro-1120.py index 88053ce9..cee6c3df 100644 --- a/opencompass/configs/models/bailing_api/bailing-lite-0830.py +++ b/opencompass/configs/models/bailing_api/bailing-pro-1120.py @@ -10,21 +10,19 @@ api_meta_template = dict( models = [ dict( - path='Bailing-Lite-0830', + path='Bailing-Pro-1120', token='', # set your key here or in environment variable BAILING_API_KEY url='https://bailingchat.alipay.com/chat/completions', type=BailingAPI, meta_template=api_meta_template, - query_per_second=1, - max_seq_len=4096, + max_out_len=11264, batch_size=1, generation_kwargs={ - 'temperature': 0.4, + 'temperature': 0.01, 'top_p': 1.0, 'top_k': -1, 'n': 1, 'logprobs': 1, - 
'use_beam_search': False, }, ), ] diff --git a/opencompass/models/bailing_api_oc.py b/opencompass/models/bailing_api_oc.py index 316f529b..9d8255be 100644 --- a/opencompass/models/bailing_api_oc.py +++ b/opencompass/models/bailing_api_oc.py @@ -1,13 +1,14 @@ import concurrent import concurrent.futures import os +import random import socket import time -import traceback from typing import Dict, List, Optional, Union import requests from requests.adapters import HTTPAdapter +from requests.exceptions import ConnectionError from urllib3.connection import HTTPConnection try: @@ -21,8 +22,6 @@ from .base_api import BaseAPIModel PromptType = Union[PromptList, str] -BAILING_RETRY_DELAY: int = 30 - class HTTPAdapterWithSocketOptions(HTTPAdapter): @@ -104,7 +103,7 @@ class BailingAPI(BaseAPIModel): def generate( self, inputs: Union[List[str], PromptList], - max_out_len: int = 4096, + max_out_len: int = 11264, ) -> List[str]: """Generate results given a list of inputs. @@ -128,7 +127,7 @@ class BailingAPI(BaseAPIModel): ): i for i, input in enumerate(inputs) } - results = [] + results = [''] * len(inputs) for future in concurrent.futures.as_completed(future_to_m): m = future_to_m[future] # noqa F841 resp = future.result() @@ -136,16 +135,25 @@ class BailingAPI(BaseAPIModel): try: result = resp.json() except Exception as e: # noqa F841 - results.append('') + self.logger.error(f'Fail to inference; ' + f'model_name={self.path}; ' + f'error={e}, ' + f'request={inputs[m]}') else: if (result.get('choices') and result['choices'][0].get('message') and result['choices'][0]['message'].get('content') is not None): - results.append( - result['choices'][0]['message']['content']) + results[m] = \ + result['choices'][0]['message']['content'] + else: + self.logger.error(f'Receive invalid result. ' + f'result={result}; ' + f'request={inputs[m]}') else: - results.append('') + self.logger.error(f'Receive invalid response. ' + f'response={resp}; ' + f'request={inputs[m]}') self.flush() return results @@ -184,39 +192,31 @@ class BailingAPI(BaseAPIModel): message['role'] = item['role'] messages.append(message) request = { - 'model': - self._model, - 'messages': - messages, - 'max_seq_len': - max( - max_out_len if max_out_len else 4096, - self.max_seq_len if self.max_seq_len else 4096, - ), + 'model': self._model, + 'messages': messages, + 'max_tokens': max_out_len, } request.update(self.generation_kwargs) - try: - retry_num = 0 - while retry_num < self.retry: + retry_num = 0 + while retry_num < self.retry: + try: response = self._infer_result(request, sess) - if response.status_code == 200: - break # success - elif response.status_code == 426: - retry_num += 1 # retry - elif response.status_code in [429, 500, 504]: - time.sleep(BAILING_RETRY_DELAY) - retry_num += 1 # retry - else: - raise ValueError(f'Status code = {response.status_code}') + except ConnectionError: + time.sleep(random.randint(10, 30)) + retry_num += 1 # retry + continue + if response.status_code == 200: + break # success + elif response.status_code == 426: + retry_num += 1 # retry + elif response.status_code in [302, 429, 500, 504]: + time.sleep(random.randint(10, 30)) + retry_num += 1 # retry else: - raise ValueError( - f'Exceed the maximal retry times. 
Last status code ' - f'= {response.status_code}') - except Exception as e: - self.logger.error(f'Fail to inference request={request}; ' - f'model_name={self.path}; error={e}, ' - f'stack:{traceback.format_exc()}') - raise e + raise ValueError(f'Status code = {response.status_code}') + else: + # Exceed the maximal retry times. + return '' return response # @retry(stop_max_attempt_number=3, wait_fixed=16000) # ms From f7dbe6bb7d361bd82f08b53410271435001ebaac Mon Sep 17 00:00:00 2001 From: Junnan Liu Date: Wed, 27 Nov 2024 15:44:41 +0800 Subject: [PATCH 07/19] [Feature] Add Arc Prize Public Evaluation (#1690) * support arc prize * update arc-prize dataset info & update arc-prize evaluation performance --- .../ARC_Prize_Public_Evaluation/README.md | 47 ++++ .../arc_prize_public_evaluation_gen.py | 4 + .../arc_prize_public_evaluation_gen_872059.py | 56 +++++ .../datasets/arc_prize_public_evaluation.py | 213 ++++++++++++++++++ opencompass/utils/datasets_info.py | 9 + 5 files changed, 329 insertions(+) create mode 100644 opencompass/configs/datasets/ARC_Prize_Public_Evaluation/README.md create mode 100644 opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen.py create mode 100644 opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen_872059.py create mode 100644 opencompass/datasets/arc_prize_public_evaluation.py diff --git a/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/README.md b/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/README.md new file mode 100644 index 00000000..3692e542 --- /dev/null +++ b/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/README.md @@ -0,0 +1,47 @@ +# ARC Prize Public Evaluation + +#### Overview +The spirit of ARC Prize is to open source progress towards AGI. To win prize money, you will be required to publish reproducible code/methods into public domain. + +ARC Prize measures AGI progress using the [ARC-AGI private evaluation set](https://arcprize.org/guide#private), [the leaderboard is here](https://arcprize.org/leaderboard), and the Grand Prize is unlocked once the first team reaches [at least 85%](https://arcprize.org/guide#grand-prize-goal). + +Note: the private evaluation set imposes limitations on solutions (eg. no internet access, so no GPT-4/Claude/etc). There is a [secondary leaderboard](https://arcprize.org/leaderboard) called ARC-AGI-Pub, it measures the [public evaluation set](https://arcprize.org/guide#public-tasks) and imposes no limits but it is not part of ARC Prize 2024 at this time. + + +#### Tasks +ARC-AGI tasks are a series of three to five input and output tasks followed by a final task with only the input listed. Each task tests the utilization of a specific learned skill based on a minimal number of cognitive priors. + +![alt text](https://arcprize.org/media/images/arc-task-grids.jpg) + +Tasks are represented as JSON lists of integers. These JSON objects can also be represented visually as a grid of colors using an ARC-AGI task viewer. + +A successful submission is a pixel-perfect description (color and position) of the final task's output. + +#### Format + +As mentioned above, tasks are stored in JSON format. Each JSON file consists of two key-value pairs. + +`train`: a list of two to ten input/output pairs (typically three.) These are used for your algorithm to infer a rule. + +`test`: a list of one to three input/output pairs (typically one.) Your model should apply the inferred rule from the train set and construct an output solution. 
You will have access to the output test solution on the public data. The output solution on the private evaluation set will not be revealed. + +Here is an example of a simple ARC-AGI task that has three training pairs along with a single test pair. Each pair is shown as a 2x2 grid. There are four colors represented by the integers 1, 4, 6, and 8. Which actual color (red/green/blue/black) is applied to each integer is arbitrary and up to you. + +```json +{ + "train": [ + {"input": [[1, 0], [0, 0]], "output": [[1, 1], [1, 1]]}, + {"input": [[0, 0], [4, 0]], "output": [[4, 4], [4, 4]]}, + {"input": [[0, 0], [6, 0]], "output": [[6, 6], [6, 6]]} + ], + "test": [ + {"input": [[0, 0], [0, 8]], "output": [[8, 8], [8, 8]]} + ] +} +``` + +#### Performance + +| Qwen2.5-72B-Instruct | LLaMA3.1-70B-Instruct | gemma-2-27b-it | +| ----- | ----- | ----- | +| 0.09 | 0.06 | 0.05 | \ No newline at end of file diff --git a/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen.py b/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen.py new file mode 100644 index 00000000..416fd1cb --- /dev/null +++ b/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .arc_prize_public_evaluation_gen_872059 import arc_prize_public_evaluation_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen_872059.py b/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen_872059.py new file mode 100644 index 00000000..000e6312 --- /dev/null +++ b/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen_872059.py @@ -0,0 +1,56 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.arc_prize_public_evaluation import ARCPrizeDataset, ARCPrizeEvaluator + + +# The system_prompt defines the initial instructions for the model, +# setting the context for solving ARC tasks. +system_prompt = '''You are a puzzle solving wizard. You are given a puzzle from the abstraction and reasoning corpus developed by Francois Chollet.''' + +# User message template is a template for creating user prompts. It includes placeholders for training data and test input data, +# guiding the model to learn the rule and apply it to solve the given puzzle. +user_message_template = '''Here are the example input and output pairs from which you should learn the underlying rule to later predict the output for the given test input: +---------------------------------------- +{training_data} +---------------------------------------- +Now, solve the following puzzle based on its input grid by applying the rules you have learned from the training data.: +---------------------------------------- +[{{'input': {input_test_data}, 'output': [[]]}}] +---------------------------------------- +What is the output grid? Only provide the output grid in the form as in the example input and output pairs. 
Do not provide any additional information:''' + + +arc_prize_public_evaluation_reader_cfg = dict( + input_columns=['training_data', 'input_test_data'], + output_column='output_test_data' +) + +arc_prize_public_evaluation_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='SYSTEM', prompt=system_prompt), + dict(role='HUMAN', prompt=user_message_template), + ], + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048) +) + +arc_prize_public_evaluation_eval_cfg = dict( + evaluator=dict(type=ARCPrizeEvaluator) +) + +arc_prize_public_evaluation_datasets = [ + dict( + abbr='ARC_Prize_Public_Evaluation', + type=ARCPrizeDataset, + path='opencompass/arc_prize_public_evaluation', + reader_cfg=arc_prize_public_evaluation_reader_cfg, + infer_cfg=arc_prize_public_evaluation_infer_cfg, + eval_cfg=arc_prize_public_evaluation_eval_cfg + ) +] \ No newline at end of file diff --git a/opencompass/datasets/arc_prize_public_evaluation.py b/opencompass/datasets/arc_prize_public_evaluation.py new file mode 100644 index 00000000..6c176926 --- /dev/null +++ b/opencompass/datasets/arc_prize_public_evaluation.py @@ -0,0 +1,213 @@ +import ast +import json +import os +from typing import Dict, List + +import numpy as np +from datasets import Dataset + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class ARCPrizeDataset(BaseDataset): + task_file_names = [ + '2072aba6.json', 'bb52a14b.json', '136b0064.json', 'ea9794b1.json', + '40f6cd08.json', 'f5aa3634.json', '7039b2d7.json', '712bf12e.json', + '9b365c51.json', 'ccd554ac.json', 'f9d67f8b.json', '03560426.json', + 'e2092e0c.json', '8fbca751.json', '42918530.json', 'c64f1187.json', + '00576224.json', '705a3229.json', 'af24b4cc.json', '81c0276b.json', + 'f21745ec.json', '8dae5dfc.json', '4e469f39.json', '695367ec.json', + 'dc2aa30b.json', 'b9630600.json', '770cc55f.json', '3391f8c0.json', + 'c1990cce.json', '1da012fc.json', '50a16a69.json', '212895b5.json', + 'e69241bd.json', '692cd3b6.json', '0bb8deee.json', '9772c176.json', + '22a4bbc2.json', 'ca8de6ea.json', 'dc2e9a9d.json', '4aab4007.json', + 'cfb2ce5a.json', '9f27f097.json', '2c737e39.json', '84db8fc4.json', + 'e1baa8a4.json', 'ea959feb.json', '4f537728.json', '47996f11.json', + 'bf32578f.json', 'aee291af.json', '5d2a5c43.json', '2546ccf6.json', + 'e57337a4.json', 'd4b1c2b1.json', '20981f0e.json', '05a7bcf2.json', + 'fc754716.json', '6ad5bdfd.json', 'e88171ec.json', '1acc24af.json', + '34b99a2b.json', 'e78887d1.json', '4acc7107.json', '137f0df0.json', + '62b74c02.json', '50aad11f.json', '642d658d.json', '64a7c07e.json', + 'bd14c3bf.json', '73c3b0d8.json', 'e0fb7511.json', 'c7d4e6ad.json', + '85b81ff1.json', 'e760a62e.json', 'ca8f78db.json', 'd931c21c.json', + 'aab50785.json', 'ac605cbb.json', '3194b014.json', '68b67ca3.json', + 'e7b06bea.json', 'e5790162.json', 'da2b0fe3.json', '0becf7df.json', + 'fe9372f3.json', 'd56f2372.json', 'e66aafb8.json', 'b7999b51.json', + '2697da3f.json', '516b51b7.json', '9a4bb226.json', '195ba7dc.json', + '310f3251.json', '639f5a19.json', '0d87d2a6.json', 'c663677b.json', + 'e74e1818.json', '69889d6e.json', 'f45f5ca7.json', '8597cfd7.json', + '0c9aba6e.json', 'e9b4f6fc.json', 'e7639916.json', '5207a7b5.json', + 'e4075551.json', '90347967.json', '9ddd00f0.json', '4b6b68e5.json', + 'e9c9d9a1.json', '2f0c5170.json', 
'58e15b12.json', 'd37a1ef5.json', + '62ab2642.json', 'b457fec5.json', 'c97c0139.json', 'ac0c5833.json', + '7d419a02.json', '4ff4c9da.json', '4cd1b7b2.json', '27a77e38.json', + '66f2d22f.json', '2a5f8217.json', 'c074846d.json', 'c6e1b8da.json', + '319f2597.json', '94be5b80.json', '55783887.json', '60c09cac.json', + 'f823c43c.json', 'd492a647.json', 'e681b708.json', '15663ba9.json', + 'a3f84088.json', '103eff5b.json', '5a5a2103.json', '1e97544e.json', + '009d5c81.json', 'ed74f2f2.json', 'ce039d91.json', 'baf41dbf.json', + '3490cc26.json', 'ce8d95cc.json', '3f23242b.json', '1d0a4b61.json', + '8719f442.json', 'd94c3b52.json', '4c177718.json', '59341089.json', + '3ee1011a.json', 'f5c89df1.json', '5833af48.json', 'd4c90558.json', + '88207623.json', '833dafe3.json', '070dd51e.json', '3ed85e70.json', + '21f83797.json', '7c8af763.json', '5783df64.json', 'a57f2f04.json', + 'e9ac8c9e.json', 'aa18de87.json', '505fff84.json', '5ffb2104.json', + '42a15761.json', '1a2e2828.json', '0607ce86.json', '84f2aca1.json', + '456873bc.json', '903d1b4a.json', '0f63c0b9.json', '54db823b.json', + 'ad7e01d0.json', '8e2edd66.json', '79fb03f4.json', '4364c1c4.json', + 'e7a25a18.json', 'e133d23d.json', 'e21a174a.json', '55059096.json', + 'e95e3d8e.json', '94414823.json', '9356391f.json', '15113be4.json', + 'ba9d41b8.json', '52fd389e.json', 'de493100.json', '9c56f360.json', + 'c92b942c.json', '97239e3d.json', 'b0f4d537.json', '19bb5feb.json', + '506d28a5.json', '5b692c0f.json', 'ef26cbf6.json', 'e345f17b.json', + '7d1f7ee8.json', 'ac3e2b04.json', '551d5bf1.json', 'fb791726.json', + '2037f2c7.json', 'e6de6e8f.json', '3d31c5b3.json', 'd19f7514.json', + '1d398264.json', '358ba94e.json', '696d4842.json', '08573cc6.json', + '7e02026e.json', '7953d61e.json', 'c3202e5a.json', '351d6448.json', + 'fea12743.json', '12422b43.json', 'b942fd60.json', 'bcb3040b.json', + 'e41c6fd3.json', 'a59b95c0.json', '3a301edc.json', '0b17323b.json', + 'da515329.json', '96a8c0cd.json', '6f473927.json', '9def23fe.json', + 'c35c1b4c.json', 'be03b35f.json', '604001fa.json', 'd304284e.json', + 'cb227835.json', 'e9bb6954.json', 'ac2e8ecf.json', '1e81d6f9.json', + '72207abc.json', '37d3e8b2.json', 'c8b7cc0f.json', 'a096bf4d.json', + '1c02dbbe.json', 'fd096ab6.json', '9bebae7a.json', '25094a63.json', + 'b7fb29bc.json', 'aa4ec2a5.json', '50f325b5.json', '423a55dc.json', + 'b0722778.json', 'e7dd8335.json', 'f3cdc58f.json', 'cad67732.json', + '256b0a75.json', 'd282b262.json', '58743b76.json', '6df30ad6.json', + '9110e3c5.json', '48f8583b.json', 'a680ac02.json', '642248e4.json', + '2685904e.json', '48131b3c.json', 'b7cb93ac.json', '73182012.json', + 'df8cc377.json', '3b4c2228.json', '93c31fbe.json', '8ee62060.json', + '9b2a60aa.json', 'f0df5ff0.json', '917bccba.json', 'ed98d772.json', + 'bf89d739.json', 'f3e62deb.json', '11e1fe23.json', 'bbb1b8b6.json', + 'f4081712.json', '817e6c09.json', '45bbe264.json', 'f3b10344.json', + 'fafd9572.json', 'b7f8a4d8.json', '2c0b0aff.json', '8cb8642d.json', + '67c52801.json', 'd47aa2ff.json', '0934a4d8.json', '60a26a3e.json', + 'cf133acc.json', '5289ad53.json', '16b78196.json', '09c534e7.json', + 'f83cb3f6.json', 'd017b73f.json', 'b20f7c8b.json', '5af49b42.json', + '18419cfa.json', '929ab4e9.json', '6a11f6da.json', '17cae0c1.json', + 'e99362f0.json', '1c56ad9f.json', '8a371977.json', 'e633a9e5.json', + 'c658a4bd.json', 'bc4146bd.json', '67636eac.json', '4e45f183.json', + '17b80ad2.json', '94133066.json', 'e1d2900e.json', 'a934301b.json', + '0a2355a6.json', '45737921.json', '332efdb3.json', '7bb29440.json', + 'f9a67cb5.json', 
'a8610ef7.json', '32e9702f.json', '0c786b71.json', + '626c0bcc.json', 'aa300dc3.json', 'c62e2108.json', '0692e18c.json', + 'af22c60d.json', '992798f6.json', 'c48954c1.json', '5b526a93.json', + 'ae58858e.json', 'ff72ca3e.json', '2b01abd0.json', '7d18a6fb.json', + '963f59bc.json', '759f3fd3.json', '7c9b52a0.json', '4852f2fa.json', + '14754a24.json', 'c87289bb.json', '845d6e51.json', '281123b4.json', + '79369cc6.json', '0a1d4ef5.json', '477d2879.json', '72a961c9.json', + '67b4a34d.json', 'e5c44e8f.json', 'bf699163.json', '13713586.json', + '27f8ce4f.json', '95a58926.json', '15696249.json', 'd2acf2cb.json', + '140c817e.json', '1990f7a8.json', '782b5218.json', '8b28cd80.json', + '92e50de0.json', 'e619ca6e.json', '5b6cbef5.json', '575b1a71.json', + '66e6c45b.json', '31adaf00.json', '6ea4a07e.json', 'f0afb749.json', + '00dbd492.json', 'b1fc8b8e.json', 'fd4b2b02.json', 'b15fca0b.json', + 'a04b2602.json', '20818e16.json', '762cd429.json', '29700607.json', + 'd5c634a2.json', 'a406ac07.json', '8ba14f53.json', '184a9768.json', + '12997ef3.json', 'dd2401ed.json', 'f8be4b64.json', '12eac192.json', + '31d5ba1a.json', 'b4a43f3b.json', '7ee1c6ea.json', '9b4c17c4.json', + '981571dc.json', '93b4f4b3.json', '9caba7c3.json', '891232d6.json', + '85fa5666.json', '0e671a1a.json', '73ccf9c2.json', '414297c0.json', + 'e872b94a.json', '99306f82.json', '3979b1a8.json', '2753e76c.json', + '1c0d0a4b.json', '292dd178.json', 'cd3c21df.json', '33b52de3.json', + 'ecaa0ec1.json', '896d5239.json', '1a6449f1.json', '9c1e755f.json' + ] + + @staticmethod + def load(path: str): + task_file_dir = get_data_path(path) + + dataset = [] + + task_file_name_list = os.listdir(task_file_dir) + for task_file_name in task_file_name_list: + if task_file_name not in ARCPrizeDataset.task_file_names: + continue + with open(os.path.join(task_file_dir, task_file_name), + 'r') as file: + task = json.load(file) + task = { + 'training_data': task['train'], + 'input_test_data': task['test'][0]['input'], + 'output_test_data': task['test'][0]['output'] + } + dataset.append(task) + + return Dataset.from_list(dataset) + + +class ARCPrizeEvaluator(BaseEvaluator): + + def score(self, predictions: List[str], + references: List[List[int]]) -> Dict: + accuracy = [] + details = [] + for pred, refer in zip(map(extract_solution, predictions), references): + is_correct, correct_percentage = compare_solutions_with_padding( + pred, refer, pad_value=-1) + details.append({ + 'solved': True if is_correct else False, + 'correct_percentage': correct_percentage, + 'generated_solution': pred + }) + accuracy.append(1 if is_correct else 0) + + return {'accuracy': np.mean(accuracy), 'details': details} + + +def extract_solution(text): + try: + # Find the part of the text that looks like a nested list + start = text.index('[[') + end = text.index(']]', start) + 2 + array_str = text[start:end] + + # Use ast.literal_eval to safely evaluate the + # string as a Python expression + array = ast.literal_eval(array_str) + # Check if the result is a list of lists + if all(isinstance(i, list) for i in array): + if all(all(isinstance(i, int) for i in j) for j in array): + return array + else: + return [[0]] + else: + return [[0]] + except (ValueError, SyntaxError): + return [[0]] + + +def pad_array_with_value(array, target_shape, pad_value): + padded_array = np.full(target_shape, pad_value, dtype=int) + for i in range(len(array)): + padded_array[i, :len(array[i])] = array[i] + return padded_array + + +def compare_solutions_with_padding(generated_output: List[int], + correct_output: 
List[int], + pad_value=-1): + max_rows = max(len(generated_output), len(correct_output)) + max_cols = max(max(map(len, generated_output)), + max(map(len, correct_output))) + target_shape = (max_rows, max_cols) + + padded_generated = pad_array_with_value(generated_output, target_shape, + pad_value) + padded_correct = pad_array_with_value(correct_output, target_shape, + pad_value) + + total_pixels = max_rows * max_cols + correct_pixels = np.sum((padded_generated == padded_correct) + & (padded_generated != pad_value) + & (padded_correct != pad_value)) + correct_percentage = (correct_pixels / total_pixels) * 100 + + is_correct = (correct_pixels == total_pixels) + + return is_correct, correct_percentage diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 21e640c8..b0d1b93f 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -343,6 +343,11 @@ DATASETS_MAPPING = { "hf_id": "", "local": "./data/babilong/data/", }, + "opencompass/arc_prize_public_evaluation": { + "ms_id": "", + "hf_id": "", + "local": "./data/arc_prize_public_evaluation", + } } DATASETS_URL = { @@ -557,5 +562,9 @@ DATASETS_URL = { "subjective/judgerbench": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/judgerbench.zip", "md5": "60d605883aa8cac9755819140ab42c6b" + }, + "/arc_prize_public_evaluation": { + "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/arc_prize_public_evaluation.zip", + "md5": "367a33977651496efddba7670009807e" } } From 90efcf22164d2a78519a2ac979a9649a80f7c9e8 Mon Sep 17 00:00:00 2001 From: wanyu2018umac <42405907+wanyu2018umac@users.noreply.github.com> Date: Wed, 27 Nov 2024 21:26:18 +0800 Subject: [PATCH 08/19] [Feature] Add P-MMEval (#1714) * Update with PMMEval * Update * Update __init__.py * Fix Bugs * Delete .pre-commit-config.yaml * Pull merge --------- Co-authored-by: liushz --- configs/eval_PMMEval.py | 32 +++ .../configs/datasets/PMMEval/flores_gen.py | 4 + .../datasets/PMMEval/flores_gen_2697d7.py | 65 +++++ .../datasets/PMMEval/humanevalxl_gen.py | 4 + .../PMMEval/humanevalxl_gen_bdec92.py | 49 ++++ .../configs/datasets/PMMEval/mgsm_gen.py | 4 + .../datasets/PMMEval/mgsm_gen_679720.py | 62 +++++ .../datasets/PMMEval/mhellaswag_gen.py | 4 + .../datasets/PMMEval/mhellaswag_gen_1a6b73.py | 54 +++++ .../configs/datasets/PMMEval/mifeval_gen.py | 4 + .../datasets/PMMEval/mifeval_gen_79f8fb.py | 51 ++++ .../configs/datasets/PMMEval/mlogiqa_gen.py | 4 + .../datasets/PMMEval/mlogiqa_gen_36c4f9.py | 50 ++++ .../configs/datasets/PMMEval/mmmlu_gen.py | 4 + .../datasets/PMMEval/mmmlu_gen_d5017d.py | 52 ++++ .../configs/datasets/PMMEval/pmmeval_gen.py | 14 ++ .../configs/datasets/PMMEval/xnli_gen.py | 4 + .../datasets/PMMEval/xnli_gen_973734.py | 60 +++++ opencompass/configs/summarizers/PMMEval.py | 22 ++ .../configs/summarizers/groups/PMMEval.py | 41 ++++ opencompass/datasets/PMMEval/__init__.py | 8 + opencompass/datasets/PMMEval/flores.py | 162 +++++++++++++ opencompass/datasets/PMMEval/humanevalxl.py | 226 ++++++++++++++++++ opencompass/datasets/PMMEval/mgsm.py | 79 ++++++ opencompass/datasets/PMMEval/mhellaswag.py | 151 ++++++++++++ opencompass/datasets/PMMEval/mifeval.py | 147 ++++++++++++ .../PMMEval/mifeval_utils/__init__.py | 17 ++ .../mifeval_utils/combination_checker.py | 32 +++ .../detectable_content_checker.py | 30 +++ .../detectable_format_checker.py | 122 ++++++++++ .../PMMEval/mifeval_utils/keywords_checker.py | 12 + .../length_constraints_checker.py | 93 +++++++ 
.../mifeval_utils/punctuation_checker.py | 30 +++ .../PMMEval/mifeval_utils/startend_checker.py | 38 +++ opencompass/datasets/PMMEval/mlogiqa.py | 152 ++++++++++++ opencompass/datasets/PMMEval/mmmlu.py | 157 ++++++++++++ opencompass/datasets/PMMEval/xnli.py | 150 ++++++++++++ opencompass/utils/datasets_info.py | 11 +- 38 files changed, 2200 insertions(+), 1 deletion(-) create mode 100755 configs/eval_PMMEval.py create mode 100755 opencompass/configs/datasets/PMMEval/flores_gen.py create mode 100755 opencompass/configs/datasets/PMMEval/flores_gen_2697d7.py create mode 100755 opencompass/configs/datasets/PMMEval/humanevalxl_gen.py create mode 100755 opencompass/configs/datasets/PMMEval/humanevalxl_gen_bdec92.py create mode 100755 opencompass/configs/datasets/PMMEval/mgsm_gen.py create mode 100755 opencompass/configs/datasets/PMMEval/mgsm_gen_679720.py create mode 100755 opencompass/configs/datasets/PMMEval/mhellaswag_gen.py create mode 100755 opencompass/configs/datasets/PMMEval/mhellaswag_gen_1a6b73.py create mode 100755 opencompass/configs/datasets/PMMEval/mifeval_gen.py create mode 100755 opencompass/configs/datasets/PMMEval/mifeval_gen_79f8fb.py create mode 100755 opencompass/configs/datasets/PMMEval/mlogiqa_gen.py create mode 100755 opencompass/configs/datasets/PMMEval/mlogiqa_gen_36c4f9.py create mode 100755 opencompass/configs/datasets/PMMEval/mmmlu_gen.py create mode 100755 opencompass/configs/datasets/PMMEval/mmmlu_gen_d5017d.py create mode 100755 opencompass/configs/datasets/PMMEval/pmmeval_gen.py create mode 100755 opencompass/configs/datasets/PMMEval/xnli_gen.py create mode 100755 opencompass/configs/datasets/PMMEval/xnli_gen_973734.py create mode 100644 opencompass/configs/summarizers/PMMEval.py create mode 100644 opencompass/configs/summarizers/groups/PMMEval.py create mode 100755 opencompass/datasets/PMMEval/__init__.py create mode 100755 opencompass/datasets/PMMEval/flores.py create mode 100755 opencompass/datasets/PMMEval/humanevalxl.py create mode 100755 opencompass/datasets/PMMEval/mgsm.py create mode 100755 opencompass/datasets/PMMEval/mhellaswag.py create mode 100755 opencompass/datasets/PMMEval/mifeval.py create mode 100755 opencompass/datasets/PMMEval/mifeval_utils/__init__.py create mode 100755 opencompass/datasets/PMMEval/mifeval_utils/combination_checker.py create mode 100755 opencompass/datasets/PMMEval/mifeval_utils/detectable_content_checker.py create mode 100755 opencompass/datasets/PMMEval/mifeval_utils/detectable_format_checker.py create mode 100755 opencompass/datasets/PMMEval/mifeval_utils/keywords_checker.py create mode 100755 opencompass/datasets/PMMEval/mifeval_utils/length_constraints_checker.py create mode 100755 opencompass/datasets/PMMEval/mifeval_utils/punctuation_checker.py create mode 100755 opencompass/datasets/PMMEval/mifeval_utils/startend_checker.py create mode 100755 opencompass/datasets/PMMEval/mlogiqa.py create mode 100755 opencompass/datasets/PMMEval/mmmlu.py create mode 100755 opencompass/datasets/PMMEval/xnli.py diff --git a/configs/eval_PMMEval.py b/configs/eval_PMMEval.py new file mode 100755 index 00000000..0e6adf58 --- /dev/null +++ b/configs/eval_PMMEval.py @@ -0,0 +1,32 @@ +from mmengine.config import read_base + +from opencompass.models import HuggingFacewithChatTemplate + + +with read_base(): + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models + + # from opencompass.configs.datasets.PMMEval.flores_gen import PMMEval_flores_datasets + # from opencompass.configs.datasets.PMMEval.humanevalxl_gen 
import PMMEval_HumanEvalXL_datasets + # from opencompass.configs.datasets.PMMEval.mgsm_gen import PMMEval_MGSM_datasets + # from opencompass.configs.datasets.PMMEval.mhellaswag_gen import PMMEval_MHellaswag_datasets + # from opencompass.configs.datasets.PMMEval.mifeval_gen import PMMEval_MIFEval_datasets + # from opencompass.configs.datasets.PMMEval.mlogiqa_gen import PMMEval_MLogiQA_datasets + # from opencompass.configs.datasets.PMMEval.mmmlu_gen import PMMEval_MMMLU_datasets + # from opencompass.configs.datasets.PMMEval.xnli import PMMEval_XNLI_datasets + + from opencompass.configs.datasets.PMMEval.pmmeval_gen import PMMEval_datasets + + from opencompass.configs.summarizers.PMMEval import summarizer + + +# datasets = PMMEval_flores_datasets +# datasets = PMMEval_HumanEvalXL_datasets +# datasets = PMMEval_MGSM_datasets +# datasets = PMMEval_MHellaswag_datasets +# datasets = PMMEval_MIFEval_datasets +# datasets = PMMEval_MLogiQA_datasets +# datasets = PMMEval_MMMLU_datasets +# datasets = PMMEval_XNLI_datasets + +datasets = PMMEval_datasets diff --git a/opencompass/configs/datasets/PMMEval/flores_gen.py b/opencompass/configs/datasets/PMMEval/flores_gen.py new file mode 100755 index 00000000..b076c62f --- /dev/null +++ b/opencompass/configs/datasets/PMMEval/flores_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .flores_gen_2697d7 import PMMEval_flores_datasets diff --git a/opencompass/configs/datasets/PMMEval/flores_gen_2697d7.py b/opencompass/configs/datasets/PMMEval/flores_gen_2697d7.py new file mode 100755 index 00000000..eb8ffd28 --- /dev/null +++ b/opencompass/configs/datasets/PMMEval/flores_gen_2697d7.py @@ -0,0 +1,65 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.PMMEval import PMMEvalFloresDataset, PMMEvalFloresEvaluator, pmmeval_flores_postprocess + +NATURAL_LANGUAGE_FULLNAMES_FLORES = ['Chinese', 'Arabic', 'Spanish', 'French', 'Japanese', 'Korean', 'Portuguese', 'Thai', 'Vietnamese'] + +PROMPT = { + "Chinese": "将这个句子从英语翻译成中文。\n\n{src}", + "Arabic": "ترجم هذه الجملة من الإنجليزية إلى العربية.\n\n{src}", + "Spanish": "Traduce esta oración del inglés al español.\n\n{src}", + "Japanese": "この文を英語から日本語に翻訳してください。\n\n{src}", + "Korean": "이 문장을 영어에서 한국어로 번역하세요.\n\n{src}", + "Thai": "แปลประโยคนี้จากภาษาอังกฤษเป็นภาษาไทย.\n\n{src}", + "French": "Traduisez cette phrase de l'anglais en français.\n\n{src}", + "Portuguese": "Traduza esta frase do inglês para o português.\n\n{src}", + "Vietnamese": "Dịch câu này từ tiếng Anh sang tiếng Việt.\n\n{src}" +} + +PMMEval_flores_datasets = list() + +# Add flores_200 + +PMMEval_flores_reader_cfg = dict( + input_columns=['src'], + output_column='tgt', + test_split='test' +) + + +PMMEval_flores_datasets = list() + +for lang_fullname in NATURAL_LANGUAGE_FULLNAMES_FLORES: + PMMEval_flores_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=PROMPT[lang_fullname] + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + PMMEval_flores_eval_cfg = dict( + evaluator=dict(type=PMMEvalFloresEvaluator), + pred_role='BOT', + pred_postprocessor=dict(type=pmmeval_flores_postprocess, lang_fullname=lang_fullname) + ) + + PMMEval_flores_datasets.append( + dict( + abbr=f'flores-{lang_fullname}', + type=PMMEvalFloresDataset, + path='P-MMEval', + 
lang_fullname=lang_fullname, + reader_cfg=PMMEval_flores_reader_cfg, + infer_cfg=PMMEval_flores_infer_cfg, + eval_cfg=PMMEval_flores_eval_cfg) + ) diff --git a/opencompass/configs/datasets/PMMEval/humanevalxl_gen.py b/opencompass/configs/datasets/PMMEval/humanevalxl_gen.py new file mode 100755 index 00000000..f6f3fcb8 --- /dev/null +++ b/opencompass/configs/datasets/PMMEval/humanevalxl_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .humanevalxl_gen_4dfef4 import PMMEval_HumanEvalXL_datasets diff --git a/opencompass/configs/datasets/PMMEval/humanevalxl_gen_bdec92.py b/opencompass/configs/datasets/PMMEval/humanevalxl_gen_bdec92.py new file mode 100755 index 00000000..ceb53570 --- /dev/null +++ b/opencompass/configs/datasets/PMMEval/humanevalxl_gen_bdec92.py @@ -0,0 +1,49 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.PMMEval import PMMEvalHumanEvalXLDataset, PMMEvalHumanEvalXLEvaluator + +NATURAL_LANGUAGE_FULLNAMES = ['English', 'Chinese', 'Arabic', 'Spanish', 'French', 'Japanese', 'Korean', 'Portuguese', 'Thai', 'Vietnamese'] + +PMMEval_HumanEvalXL_datasets = list() + +PMMEval_HumanEvalXL_reader_cfg = dict( + input_columns=['task_id', 'prompt', 'entry_point', 'test', 'language', 'description', 'natural_language'], + output_column='declaration', + test_split='test' +) + +PMMEval_HumanEvalXL_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='{prompt}'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +PMMEval_HumanEvalXL_datasets = list() + +for lang_fullname in NATURAL_LANGUAGE_FULLNAMES: + for program_lang in ['python', 'java', 'javascript']: + + PMMEval_HumanEvalXL_eval_cfg = dict( + evaluator=dict( + type=PMMEvalHumanEvalXLEvaluator, + language=program_lang, + text_language=lang_fullname, + ip_address='localhost', + port=5001), + pred_role='BOT') + + PMMEval_HumanEvalXL_datasets.append( + dict( + abbr=f'humanevalxl-{program_lang}-{lang_fullname}', + type=PMMEvalHumanEvalXLDataset, + path='P-MMEval', + lang=lang_fullname, + program_lang=program_lang, + reader_cfg=PMMEval_HumanEvalXL_reader_cfg, + infer_cfg=PMMEval_HumanEvalXL_infer_cfg, + eval_cfg=PMMEval_HumanEvalXL_eval_cfg) + ) diff --git a/opencompass/configs/datasets/PMMEval/mgsm_gen.py b/opencompass/configs/datasets/PMMEval/mgsm_gen.py new file mode 100755 index 00000000..bbfafff1 --- /dev/null +++ b/opencompass/configs/datasets/PMMEval/mgsm_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mgsm_gen_679720 import PMMEval_MGSM_datasets diff --git a/opencompass/configs/datasets/PMMEval/mgsm_gen_679720.py b/opencompass/configs/datasets/PMMEval/mgsm_gen_679720.py new file mode 100755 index 00000000..f74ab8e0 --- /dev/null +++ b/opencompass/configs/datasets/PMMEval/mgsm_gen_679720.py @@ -0,0 +1,62 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.PMMEval import PMMEvalMGSMDataset, PMMEvalMGSMEvaluator + +NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi'] + +LANG_TO_INSTRUCTIONS = { + "en": "Solve this math problem. 
Give the reasoning steps before giving the final answer on the last line by itself in the format of \"The answer is \". Do not add anything other than the integer answer after \"The answer is \".\n\n{question}", + "es": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"La respuesta es \". Do not add anything other than the integer answer after \"La respuesta es \".\n\n{question}", + "fr": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"La réponse est \". Do not add anything other than the integer answer after \"La réponse est \".\n\n{question}", + "zh": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"答案是 \". Do not add anything other than the integer answer after \"答案是 \".\n\n{question}", + "ja": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"答えは \". Do not add anything other than the integer answer after \"答えは \".\n\n{question}", + "th": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"คำตอบคือ \". Do not add anything other than the integer answer after \"คำตอบคือ \".\n\n{question}", + "ko": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"답변은 \". Do not add anything other than the integer answer after \"답변은 \".\n\n{question}", + "pt": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"A resposta é \". Do not add anything other than the integer answer after \"A resposta é \".\n\n{question}", + "vi": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"Câu trả lời là \". Do not add anything other than the integer answer after \"Câu trả lời là \".\n\n{question}", + "ar": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"الجواب هو \". 
Do not add anything other than the integer answer after \"الجواب هو \".\n\n{question}" +} + +PMMEval_MGSM_datasets = list() + +# Add flores_200 + +PMMEval_MGSM_reader_cfg = dict( + input_columns=['question'], + output_column='answer', + test_split='test' +) + +PMMEval_MGSM_eval_cfg = dict( + evaluator=dict(type=PMMEvalMGSMEvaluator), + pred_role='BOT') + + +for lang_code in NATURAL_LANGUAGE_CODES: + PMMEval_MGSM_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=LANG_TO_INSTRUCTIONS[lang_code] + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + PMMEval_MGSM_datasets.append( + dict( + abbr=f'mgsm-{lang_code}', + type=PMMEvalMGSMDataset, + path='P-MMEval', + lang=lang_code, + reader_cfg=PMMEval_MGSM_reader_cfg, + infer_cfg=PMMEval_MGSM_infer_cfg, + eval_cfg=PMMEval_MGSM_eval_cfg) + ) diff --git a/opencompass/configs/datasets/PMMEval/mhellaswag_gen.py b/opencompass/configs/datasets/PMMEval/mhellaswag_gen.py new file mode 100755 index 00000000..7a217bf8 --- /dev/null +++ b/opencompass/configs/datasets/PMMEval/mhellaswag_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mhellaswag_gen_1a6b73 import PMMEval_MHellaswag_datasets diff --git a/opencompass/configs/datasets/PMMEval/mhellaswag_gen_1a6b73.py b/opencompass/configs/datasets/PMMEval/mhellaswag_gen_1a6b73.py new file mode 100755 index 00000000..2c40e57a --- /dev/null +++ b/opencompass/configs/datasets/PMMEval/mhellaswag_gen_1a6b73.py @@ -0,0 +1,54 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.PMMEval import PMMEvalMHellaswagDataset, PMMEvalMHellaswagEvaluator, pmmeval_mhellaswag_postprocess + +NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi'] + +PMMEVAL_MHELLASWAG_TEMPLATE = "Input: {ctx}\nOptions: \nA. {option_1}\nB. {option_2}\nC. {option_3}\nD. {option_4}\nPick the correct ending for the sentence from A, B, C, and D, and return it in the following JSON format:\n{\"answer\": \"[choice]\"}\nwhere [choice] must be one of A, B, C or D." 
+ +PMMEval_MHellaswag_datasets = list() + +PMMEval_MHellaswag_reader_cfg = dict( + input_columns=['ctx', 'option_1', 'option_2', 'option_3', 'option_4'], + output_column='label', + test_split='test' +) + +PMMEval_MHellaswag_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=PMMEVAL_MHELLASWAG_TEMPLATE + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +PMMEval_MHellaswag_datasets = list() + + +for lang_code in NATURAL_LANGUAGE_CODES: + PMMEval_MHellaswag_eval_cfg = dict( + evaluator=dict(type=PMMEvalMHellaswagEvaluator), + pred_role='BOT', + pred_postprocessor=dict(type=pmmeval_mhellaswag_postprocess, lang_code=lang_code) + ) + + PMMEval_MHellaswag_datasets.append( + dict( + abbr=f'mhellaswag-{lang_code}', + type=PMMEvalMHellaswagDataset, + path='P-MMEval', + lang=lang_code, + reader_cfg=PMMEval_MHellaswag_reader_cfg, + infer_cfg=PMMEval_MHellaswag_infer_cfg, + eval_cfg=PMMEval_MHellaswag_eval_cfg) + ) diff --git a/opencompass/configs/datasets/PMMEval/mifeval_gen.py b/opencompass/configs/datasets/PMMEval/mifeval_gen.py new file mode 100755 index 00000000..66155d34 --- /dev/null +++ b/opencompass/configs/datasets/PMMEval/mifeval_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mifeval_gen_79f8fb import PMMEval_MIFEval_datasets diff --git a/opencompass/configs/datasets/PMMEval/mifeval_gen_79f8fb.py b/opencompass/configs/datasets/PMMEval/mifeval_gen_79f8fb.py new file mode 100755 index 00000000..1c4655b4 --- /dev/null +++ b/opencompass/configs/datasets/PMMEval/mifeval_gen_79f8fb.py @@ -0,0 +1,51 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.PMMEval import PMMEvalMIFEvalDataset, PMMEvalMIFEvalEvaluator, pmmeval_mifeval_postprocess + +NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi'] + +PMMEVAL_MIFEVAL_TEMPLATE = "{prompt}" + +PMMEval_MIFEval_datasets = list() + +PMMEval_MIFEval_reader_cfg = dict( + input_columns=['prompt', 'instruction_id_list', 'kwargs'], + output_column=None, + test_split='test' +) + + +PMMEval_MIFEval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=PMMEVAL_MIFEVAL_TEMPLATE + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +for lang_code in NATURAL_LANGUAGE_CODES: + PMMEval_MIFEval_eval_cfg = dict( + evaluator=dict(type=PMMEvalMIFEvalEvaluator), + pred_role='BOT', + pred_postprocessor=dict(type=pmmeval_mifeval_postprocess, lang_code=lang_code) + ) + + PMMEval_MIFEval_datasets.append( + dict( + abbr=f'mifeval-{lang_code}', + type=PMMEvalMIFEvalDataset, + path='P-MMEval', + lang=lang_code, + reader_cfg=PMMEval_MIFEval_reader_cfg, + infer_cfg=PMMEval_MIFEval_infer_cfg, + eval_cfg=PMMEval_MIFEval_eval_cfg) + ) diff --git a/opencompass/configs/datasets/PMMEval/mlogiqa_gen.py b/opencompass/configs/datasets/PMMEval/mlogiqa_gen.py new file mode 100755 index 00000000..60c3751a --- /dev/null +++ b/opencompass/configs/datasets/PMMEval/mlogiqa_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mlogiqa_gen_36c4f9 import PMMEval_MLogiQA_datasets diff --git a/opencompass/configs/datasets/PMMEval/mlogiqa_gen_36c4f9.py 
b/opencompass/configs/datasets/PMMEval/mlogiqa_gen_36c4f9.py new file mode 100755 index 00000000..e755f802 --- /dev/null +++ b/opencompass/configs/datasets/PMMEval/mlogiqa_gen_36c4f9.py @@ -0,0 +1,50 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.PMMEval import PMMEvalMLogiQADataset, PMMEvalMLogiQAEvaluator, pmmeval_mlogiqa_postprocess + +NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi'] + +PMMEVAL_MLOGIQA_TEMPLATE = "Passage: {context}\nQuestion: {question}\nChoices:\nA.{option_1}\nB.{option_2}\nC.{option_3}\nD.{option_4}\nPlease choose the most suitable one among A, B, C and D as the answer to this question, and return it in the following JSON format:\n{'answer': '[choice]'}\nwhere [choice] must be one of A, B, C and D." + +PMMEval_MLogiQA_datasets = [] + + +PMMEval_MLogiQA_reader_cfg = dict( + input_columns=['context', 'question', 'option_1', 'option_2', 'option_3', 'option_4'], + output_column='answer', + train_split='test') + +PMMEval_MLogiQA_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=PMMEVAL_MLOGIQA_TEMPLATE + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +for lang_code in NATURAL_LANGUAGE_CODES: + PMMEval_MLogiQA_eval_cfg = dict( + evaluator=dict(type=PMMEvalMLogiQAEvaluator), + pred_role='BOT', + pred_postprocessor=dict(type=pmmeval_mlogiqa_postprocess, lang_code=lang_code)) + + PMMEval_MLogiQA_datasets.append( + dict( + abbr=f'mlogiqa-{lang_code}', + type=PMMEvalMLogiQADataset, + path='P-MMEval', + lang=lang_code, + reader_cfg=PMMEval_MLogiQA_reader_cfg, + infer_cfg=PMMEval_MLogiQA_infer_cfg, + eval_cfg=PMMEval_MLogiQA_eval_cfg) + ) diff --git a/opencompass/configs/datasets/PMMEval/mmmlu_gen.py b/opencompass/configs/datasets/PMMEval/mmmlu_gen.py new file mode 100755 index 00000000..60d82ca2 --- /dev/null +++ b/opencompass/configs/datasets/PMMEval/mmmlu_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mmmlu_gen_d5017d import PMMEval_MMMLU_datasets diff --git a/opencompass/configs/datasets/PMMEval/mmmlu_gen_d5017d.py b/opencompass/configs/datasets/PMMEval/mmmlu_gen_d5017d.py new file mode 100755 index 00000000..61a9a96f --- /dev/null +++ b/opencompass/configs/datasets/PMMEval/mmmlu_gen_d5017d.py @@ -0,0 +1,52 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.PMMEval import PMMEvalMMMLUDataset, PMMEvalMMMLUEvaluator, pmmeval_mmmlu_postprocess + +NATURAL_LANGUAGE_CODES_MMMLU = ['EN-US', 'ZH-CN', 'AR-XY', 'ES-LA', 'FR-FR', 'JA-JP', 'KO-KR', 'PT-BR', 'TH-TL', 'VI-VT'] + +PMMEVAL_MMMLU_TEMPLATE = "The following is a multiple-choice question. Please choose the most suitable one among A, B, C and D as the answer to this question, and return it in the following JSON format:\n{\"answer\": \"[choice]\"}\nwhere [choice] must be one of A, B, C and D.\n\n{Question}\nA. {A}\nB. {B}\nC. {C}\nD. 
{D}" + +PMMEval_MMMLU_datasets = [] + + +PMMEval_MMMLU_reader_cfg = dict( + input_columns=['Question', 'A', 'B', 'C', 'D'], + output_column='Answer', + train_split='test') + + +PMMEval_MMMLU_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=PMMEVAL_MMMLU_TEMPLATE + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +for lang_code in NATURAL_LANGUAGE_CODES_MMMLU: + PMMEval_MMMLU_eval_cfg = dict( + evaluator=dict(type=PMMEvalMMMLUEvaluator), + pred_role='BOT', + pred_postprocessor=dict(type=pmmeval_mmmlu_postprocess, lang_code=lang_code)) + + PMMEval_MMMLU_datasets.append( + dict( + abbr=f'mmmlu-{lang_code}', + type=PMMEvalMMMLUDataset, + path='P-MMEval', + lang=lang_code, + difficulty='all', + reader_cfg=PMMEval_MMMLU_reader_cfg, + infer_cfg=PMMEval_MMMLU_infer_cfg, + eval_cfg=PMMEval_MMMLU_eval_cfg) + ) diff --git a/opencompass/configs/datasets/PMMEval/pmmeval_gen.py b/opencompass/configs/datasets/PMMEval/pmmeval_gen.py new file mode 100755 index 00000000..9fce1773 --- /dev/null +++ b/opencompass/configs/datasets/PMMEval/pmmeval_gen.py @@ -0,0 +1,14 @@ +from mmengine.config import read_base + +with read_base(): + from .flores_gen_2697d7 import PMMEval_flores_datasets + from .humanevalxl_gen_4dfef4 import PMMEval_HumanEvalXL_datasets + from .mgsm_gen_679720 import PMMEval_MGSM_datasets + from .mhellaswag_gen_1a6b73 import PMMEval_MHellaswag_datasets + from .mifeval_gen_79f8fb import PMMEval_MIFEval_datasets + from .mlogiqa_gen_36c4f9 import PMMEval_MLogiQA_datasets + from .mmmlu_gen_d5017d import PMMEval_MMMLU_datasets + from .xnli_gen_973734 import PMMEval_XNLI_datasets + + +PMMEval_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/opencompass/configs/datasets/PMMEval/xnli_gen.py b/opencompass/configs/datasets/PMMEval/xnli_gen.py new file mode 100755 index 00000000..021bfa73 --- /dev/null +++ b/opencompass/configs/datasets/PMMEval/xnli_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .xnli_gen_973734 import PMMEval_XNLI_datasets diff --git a/opencompass/configs/datasets/PMMEval/xnli_gen_973734.py b/opencompass/configs/datasets/PMMEval/xnli_gen_973734.py new file mode 100755 index 00000000..a9d67627 --- /dev/null +++ b/opencompass/configs/datasets/PMMEval/xnli_gen_973734.py @@ -0,0 +1,60 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.PMMEval import PMMEvalXNLIDataset, PMMEvalXNLIEvaluator, pmmeval_xnli_postprocess + +NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi'] + +PMMEVAL_XNLI_TEMPLATE = """Take the following as truth: {premise} +Then the following statement: \"{statement}\" is +Options: +A. true +B. inconclusive +C. 
false +Select the correct option from A, B, and C, and return it in the following JSON format: +{"answer": "[choice]"} +where [choice] must be one of A, B, and C.""" + +PMMEval_XNLI_datasets = list() + +# Add flores_200 + +PMMEval_XNLI_reader_cfg = dict( + input_columns=['premise', 'statement'], + output_column='answer', + test_split='test' +) + + +PMMEval_XNLI_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=PMMEVAL_XNLI_TEMPLATE + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + +for lang_code in NATURAL_LANGUAGE_CODES: + PMMEval_XNLI_eval_cfg = dict( + evaluator=dict(type=PMMEvalXNLIEvaluator), + pred_role='BOT', + pred_postprocessor=dict(type=pmmeval_xnli_postprocess, lang_code=lang_code)) + + PMMEval_XNLI_datasets.append( + dict( + abbr=f'xnli-{lang_code}', + type=PMMEvalXNLIDataset, + path='P-MMEval', + lang=lang_code, + reader_cfg=PMMEval_XNLI_reader_cfg, + infer_cfg=PMMEval_XNLI_infer_cfg, + eval_cfg=PMMEval_XNLI_eval_cfg) + ) diff --git a/opencompass/configs/summarizers/PMMEval.py b/opencompass/configs/summarizers/PMMEval.py new file mode 100644 index 00000000..6aaaebb7 --- /dev/null +++ b/opencompass/configs/summarizers/PMMEval.py @@ -0,0 +1,22 @@ +from mmengine.config import read_base + +with read_base(): + from .groups.PMMEval import PMMEval_summary_groups + + +summarizer = dict( + dataset_abbrs=[ + 'flores', + 'humanevalxl', + 'mgsm', + 'mhellaswag', + 'mifeval', + 'mlogiqa', + 'mmmlu', + 'xnli' + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], [] + ), +) + diff --git a/opencompass/configs/summarizers/groups/PMMEval.py b/opencompass/configs/summarizers/groups/PMMEval.py new file mode 100644 index 00000000..6f5976d0 --- /dev/null +++ b/opencompass/configs/summarizers/groups/PMMEval.py @@ -0,0 +1,41 @@ +NATURAL_LANGUAGE_FULLNAMES = ['English', 'Chinese', 'Arabic', 'Spanish', 'French', 'Japanese', 'Korean', 'Portuguese', 'Thai', 'Vietnamese'] +NATURAL_LANGUAGE_FULLNAMES_FLORES = ['Chinese', 'Arabic', 'Spanish', 'French', 'Japanese', 'Korean', 'Portuguese', 'Thai', 'Vietnamese'] +NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi'] +NATURAL_LANGUAGE_CODES_MMMLU = ['EN-US', 'ZH-CN', 'AR-XY', 'ES-LA', 'FR-FR', 'JA-JP', 'KO-KR', 'PT-BR', 'TH-TL', 'VI-VT'] + +PMMEval_summary_groups = [ + { + 'name': 'flores', + 'subsets': [f'flores-{lang_fullname}' for lang_fullname in NATURAL_LANGUAGE_FULLNAMES_FLORES] + }, + { + 'name': 'humanevalxl', + 'subsets': [f'humanevalxl-python-{lang_fullname}' for lang_fullname in NATURAL_LANGUAGE_FULLNAMES] + \ + [f'humanevalxl-java-{lang_fullname}' for lang_fullname in NATURAL_LANGUAGE_FULLNAMES] + \ + [f'humanevalxl-javascript-{lang_fullname}' for lang_fullname in NATURAL_LANGUAGE_FULLNAMES] + }, + { + 'name': 'mgsm', + 'subsets': [f'mgsm-{lang_code}' for lang_code in NATURAL_LANGUAGE_CODES] + }, + { + 'name': 'mhellaswag', + 'subsets': [f'mhellaswag-{lang_code}' for lang_code in NATURAL_LANGUAGE_CODES] + }, + { + 'name': 'mifeval', + 'subsets': [f'mifeval-{lang_code}' for lang_code in NATURAL_LANGUAGE_CODES] + }, + { + 'name': 'mlogiqa', + 'subsets': [f'mlogiqa-{lang_code}' for lang_code in NATURAL_LANGUAGE_CODES] + }, + { + 'name': 'mmmlu', + 'subsets': [f'mmmlu-{lang_code}' for lang_code in NATURAL_LANGUAGE_CODES_MMMLU] + }, + { + 'name': 'xnli', + 'subsets': [f'xnli-{lang_code}' for lang_code in NATURAL_LANGUAGE_CODES] + } +] diff --git 
a/opencompass/datasets/PMMEval/__init__.py b/opencompass/datasets/PMMEval/__init__.py new file mode 100755 index 00000000..b50af839 --- /dev/null +++ b/opencompass/datasets/PMMEval/__init__.py @@ -0,0 +1,8 @@ +from .flores import * # noqa: F401, F403 +from .humanevalxl import * # noqa: F401, F403 +from .mgsm import * # noqa: F401, F403 +from .mhellaswag import * # noqa: F401, F403 +from .mifeval import * # noqa: F401, F403 +from .mlogiqa import * # noqa: F401, F403 +from .mmmlu import * # noqa: F401, F403 +from .xnli import * # noqa: F401, F403 diff --git a/opencompass/datasets/PMMEval/flores.py b/opencompass/datasets/PMMEval/flores.py new file mode 100755 index 00000000..d649a116 --- /dev/null +++ b/opencompass/datasets/PMMEval/flores.py @@ -0,0 +1,162 @@ +import json +import os +import re +from typing import Tuple + +import numpy as np +from datasets import Dataset +from sacrebleu.metrics import BLEU +from sacrebleu.tokenizers.tokenizer_13a import Tokenizer13a +from sacrebleu.tokenizers.tokenizer_zh import TokenizerZh + +from opencompass.datasets.base import BaseDataset +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path + + +def wmt_postprocess(text: str, lang: str) -> str: + text = text.strip() + texts = list(x.strip() for x in text.split('\n')) + texts = list(x for x in texts if x != '') + text = '\n'.join(texts) + text = tokenize(text, lang) + return text + + +def compute_maximum_bleu_value(gen: str, ref: str, lang: str): + gens = list(x.strip() for x in gen.split('\n')) + gens = list(x for x in gens if x != '') + + gens_tokens = list(wmt_postprocess(x, lang) for x in gens) + ref_tokens = wmt_postprocess(ref, lang) + + scorer = BLEU(tokenize='13a', effective_order=True) + + maximum_bleu_value = -100.0 + maximum_bleu_object = None + + for i in range(0, len(gens_tokens)): + for j in range(i, len(gens_tokens)): + gens_tokens_region = ' '.join(gens_tokens[i:j + 1]) + sentence_bleu = scorer.sentence_score(gens_tokens_region, + [ref_tokens]) + + if sentence_bleu.score > maximum_bleu_value: + maximum_bleu_value = sentence_bleu.score + maximum_bleu_object = sentence_bleu + + if maximum_bleu_object is None: + sentence_bleu = scorer.sentence_score('', [ref_tokens]) + return sentence_bleu + else: + return maximum_bleu_object + + +def trim_multiple_space(tokes): + return ''.join(tokes).strip().split() + + +class SpaceTokenizer(object): + + def __call__(self, sent): + if type(sent) == list: + print(sent) + raise ValueError() + return ' '.join(sent.strip().split()) + + +class NonASCIITokenizer(object): + + def __init__(self): + self.is_cjk = re.compile('([\u2e80-\u9fff]|' # 中日韩 + '[\ua960-\ua97f]|' # 谚文字母扩展A + '[\uac00-\ud7ff]|' # 谚文音节+谚文字母扩展B + '[\u0E00-\u0E7F]' # 泰文 + ')') + + def __call__(self, sent): + sent = sent.strip() + chs = list(sent) + line_chtok = [] + for ch in chs: + if self.is_cjk.match(ch): + line_chtok.append(' ') + line_chtok.append(ch) + line_chtok.append(' ') + else: + line_chtok.append(ch) + line_chtok = trim_multiple_space(line_chtok) + return ' '.join(line_chtok) + + +def build_tokenizer(lang: str): + if lang == 'Chinese': + return TokenizerZh() + elif lang in {'Japanese', 'Korean', 'Thai'}: + return NonASCIITokenizer() + else: + return SpaceTokenizer() + + +def tokenize(sent, lang): + tokenizer = build_tokenizer(lang) + final_tokenizer = Tokenizer13a() + return final_tokenizer(tokenizer(sent)) + + +@TEXT_POSTPROCESSORS.register_module('pmmeval_flores') 
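# Editor's note, comments only and not part of this patch: the postprocessor
# registered above deliberately returns the raw prediction together with the
# target language, because PMMEvalFloresEvaluator further down needs the
# language name to choose a BLEU tokenizer (TokenizerZh for Chinese, the
# CJK-aware NonASCIITokenizer for Japanese, Korean and Thai, and a plain
# whitespace tokenizer otherwise). Illustrative call, assuming a French target:
#   pmmeval_flores_postprocess('Bonjour le monde', 'French')
#   returns ('Bonjour le monde', 'French')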
+def pmmeval_flores_postprocess(text: str, lang_fullname: str) -> Tuple[str]: + return text, lang_fullname + + +@LOAD_DATASET.register_module() +class PMMEvalFloresDataset(BaseDataset): + + @staticmethod + def load(path: str, lang_fullname: str): + data_path = get_data_path(path) + + if os.environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(dataset_name=data_path, + subset_name='flores', + split=f'test/{lang_fullname}') + else: + dataset = list() + filename = os.path.join(data_path, + f'flores/test/{lang_fullname}.jsonl') + with open(filename, mode='r', encoding='utf-8') as f: + for line in f: + line = json.loads(line.strip()) + dataset.append(line) + dataset = Dataset.from_list(dataset) + + return dataset + + +class PMMEvalFloresEvaluator(BaseEvaluator): + + def score(self, predictions, references): + maximum_bleu_results = list() + for (pred, tgt_lang), ref in zip(predictions, references): + maximum_bleu_results.append( + compute_maximum_bleu_value(pred, ref, tgt_lang)) + + maximum_corpus_bleu_counts = sum( + np.array(x.counts) for x in maximum_bleu_results).tolist() + maximum_corpus_bleu_totals = sum( + np.array(x.totals) for x in maximum_bleu_results).tolist() + maximum_corpus_bleu_sys_len = sum(x.sys_len + for x in maximum_bleu_results) + maximum_corpus_bleu_ref_len = sum(x.ref_len + for x in maximum_bleu_results) + + maximum_bleu_result = BLEU.compute_bleu( + correct=maximum_corpus_bleu_counts, + total=maximum_corpus_bleu_totals, + sys_len=maximum_corpus_bleu_sys_len, + ref_len=maximum_corpus_bleu_ref_len) + + result = {'BLEU': round(maximum_bleu_result.score, 2)} + return result diff --git a/opencompass/datasets/PMMEval/humanevalxl.py b/opencompass/datasets/PMMEval/humanevalxl.py new file mode 100755 index 00000000..7edbbabb --- /dev/null +++ b/opencompass/datasets/PMMEval/humanevalxl.py @@ -0,0 +1,226 @@ +import json +import os +import os.path as osp +import re +import subprocess +import tempfile +import time +from shutil import copyfile + +from datasets import Dataset + +from opencompass.datasets.base import BaseDataset +from opencompass.datasets.humaneval import humaneval_postprocess_v2 +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path + +_LANGUAGE_NAME_DICT = { + 'java': 'Java', + 'javascript': 'JavaScript', + 'js': 'JavaScript', + 'python': 'Python', +} + + +@LOAD_DATASET.register_module() +class PMMEvalHumanEvalXLDataset(BaseDataset): + + @staticmethod + def load(path: str, lang: str, program_lang: str): + data_path = get_data_path(path) + + if os.environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(dataset_name=data_path, + subset_name='humaneval-xl', + split=f'test/{program_lang}/{lang}') + else: + dataset = list() + filename = os.path.join( + data_path, f'humaneval-xl/test/{program_lang}/{lang}.jsonl') + with open(filename, mode='r', encoding='utf-8') as f: + for line in f: + line = json.loads(line.strip()) + dataset.append(line) + dataset = Dataset.from_list(dataset) + + return dataset + + +class PMMEvalHumanEvalXLEvaluator(BaseEvaluator): + + def __init__(self, + language, + ip_address='localhost', + text_language='', + port='', + retry=2, + timeout=600) -> None: + assert language in _LANGUAGE_NAME_DICT.keys(), ( + f'language must be in {list(_LANGUAGE_NAME_DICT.keys())}') + if language == 'rust': + timeout *= 10 # rust need more time + self.language = 
language + self.text_language = text_language + self.ip_address = ip_address + self.port = port + self.retry = retry + self.timeout = timeout + super().__init__() + + def score(self, predictions, references): + predictions = [{ + 'task_id': + f'{_LANGUAGE_NAME_DICT[self.language]}/{i}', + 'generation': + _clean_up_code(pred, self.language, refer), + } for i, (pred, refer) in enumerate(zip(predictions, references))] + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_out_path = osp.join( + tmp_dir, + f'humanevalx_{self.language}_{self.text_language}.json') + with open(tmp_out_path, 'w') as f: + for pred in predictions: + f.write(json.dumps(pred) + '\n') + + num_retry = 0 + while num_retry < self.retry: + succeed, output = self._code_eval_service( + file_path=tmp_out_path) + if not succeed and '(56) Recv failure' in output: + # only retry when connection failed + num_retry += 1 + # wait a min in case the service load is too high + time.sleep(60) + else: + break + + if succeed: + if isinstance(output, str): + return json.loads(output) + elif isinstance(output, dict): + return output + + ref_url = 'https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html' # noqa + if hasattr(self, '_out_dir'): + result_file_path = re.sub('results', 'mid_results', + self._out_dir) + '.json' # noqa + if not osp.exists(osp.dirname(result_file_path)): + os.makedirs(osp.dirname(result_file_path)) + else: + result_file_path = os.path.join( + 'outputs', f'humanevalx_{self.language}.json') + copyfile(tmp_out_path, result_file_path) + raise Exception( + f'Call CodeEvalService Error in `HumanevalXEvaluator`, The ' + f"results have been saved in path '{result_file_path}', You " + 'need to check that your code evaluate service is launched and' + f' the network to service is connected, you can also get ' + f'results directly by using `curl` command refer to {ref_url}.' 
+ f'\nError Information: {output}') + + def _code_eval_service(self, file_path): + if self.port: + eval_server_url = f'{self.ip_address}:{self.port}/evaluate' + else: + eval_server_url = f'{self.ip_address}/evaluate' + exec_result = subprocess.run([ + 'curl', '-X', 'POST', '-F', f'file=@{file_path}', '-F', + f'dataset=humanevalx/{self.language}', f'{eval_server_url}' + ], + timeout=self.timeout, + capture_output=True) + if exec_result.returncode == 0 and re.match( + "\"{.*:.*}\"", exec_result.stdout.decode('utf-8')): + return True, json.loads(exec_result.stdout.decode('utf-8')) + else: + if exec_result.stderr: + try: + err = exec_result.stderr.decode() + except Exception: + err = exec_result.stderr + else: + try: + err = exec_result.stdout.decode() + except Exception: + err = exec_result.stdout + return False, err + + +def _clean_up_code(text: str, language_type: str, reference) -> str: + """Cleans up the generated code.""" + try: + # for chatGLM related text + eval_text = eval(text) + except Exception: + pass + else: + if isinstance(eval_text, str): + text = eval_text + # extract code from code block + text = text.lstrip('\n') + if '```' in text: + blocks = re.findall(r'```(.*?)```', text, re.DOTALL) + if len(blocks) == 0: + text = text.split('```')[1] # fall back to default strategy + else: + text = blocks[0] # fetch the first code block + if not text.startswith('\n'): # in case starting with ```xxx + text = text[max(text.find('\n') + 1, 0):] + if language_type.lower() == 'python': + text = humaneval_postprocess_v2(text) + # we need to take care of the first line + # append extra space for first line for correct indentation + text = ' ' + text.lstrip() + + text_splits = text.split('\n') + is_empty_line = False + ind_empty_line = None + for i, line in enumerate(text_splits): + if len(line.strip()) > 0 and line[0] != ' ' and line[0] != '\t': + is_empty_line = True + ind_empty_line = i + break + if is_empty_line: + text = '\n'.join(text_splits[:ind_empty_line]) + else: + end_words = [ + '\ndef', '\nclass', '\n#', '\nassert', '\n"""', '\nprint', + '\nif', '\n\n\n' + ] + for w in end_words: + if w in text: + text = text[:text.rfind(w)] + # strip function head for all other language + func_name = reference.strip().split('\n')[-1] + if func_name: + func_name = func_name.strip().strip('{') + if func_name in text: + text = '\n'.join(text[text.find(func_name):].split('\n')[1:]) + if language_type.lower() == 'java': + main_pos = text.find('public static void main') + if main_pos != -1: + text = text[:main_pos] + '}' + if '}' in text: + text = text[:text.rfind('}')] + '}' + if text.count('{') + 1 == text.count('}'): + text += '\n}' + elif language_type.lower() == 'go': + if '\nfunc main(' in text: + text = text[:text.rfind('func main(')] + if '}' in text: + text = text[:text.rfind('}')] + '}' + elif language_type.lower() == 'cpp': + if '\nint main()' in text: + text = text[:text.rfind('int main()')] + if '}' in text: + text = text[:text.rfind('}')] + '}' + elif language_type.lower() == 'js': + if '}' in text: + text = text[:text.rfind('}')] + '}' + elif language_type.lower() == 'rust': + if '}' in text: + text = text[:text.rfind('}')] + '}' + + return text diff --git a/opencompass/datasets/PMMEval/mgsm.py b/opencompass/datasets/PMMEval/mgsm.py new file mode 100755 index 00000000..da177ba5 --- /dev/null +++ b/opencompass/datasets/PMMEval/mgsm.py @@ -0,0 +1,79 @@ +import json +import os +import re + +from datasets import Dataset + +from opencompass.datasets.base import BaseDataset +from 
opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path + + +def _get_last_digit(s): + _PAT_LAST_DIGIT = re.compile( + r'([+-])?(?=([0-9]|\.[0-9]))(0|([1-9](\d{0,2}(,\d{3})*)|\d*))?(\.\d*)?(?=\D|$)' # noqa E501 + ) + match = list(_PAT_LAST_DIGIT.finditer(s)) + if match: + last_digit = match[-1].group().replace(',', '').replace( + '+', '').strip().strip('.') + # print(f"The last digit in {s} is {last_digit}") + else: + last_digit = None + # logger.warning(f"No digits found in {s!r}") + return last_digit + + +@LOAD_DATASET.register_module() +class PMMEvalMGSMDataset(BaseDataset): + + @staticmethod + def load(path: str, lang: str): + data_path = get_data_path(path) + + if os.environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(dataset_name=data_path, + subset_name='mgsm', + split=f'test/{lang}') + else: + dataset = list() + filename = os.path.join(data_path, f'mgsm/test/{lang}.jsonl') + with open(filename, mode='r', encoding='utf-8') as f: + for line in f: + line = json.loads(line.strip()) + dataset.append(line) + dataset = Dataset.from_list(dataset) + + return dataset + + +class PMMEvalMGSMEvaluator(BaseEvaluator): + + def score(self, predictions, references): + assert len(predictions) == len(references) + + num_correct, total = 0, 0 + details = {} + for index, (references_answer, predictions_answer) in enumerate( + zip(references, predictions)): + extracted_answer = _get_last_digit(predictions_answer) + references_answer = references_answer.replace(',', '') + if references_answer == extracted_answer: + is_correct = True + else: + is_correct = False + + num_correct += is_correct + total += 1 + details[str(index)] = { + 'references': references_answer, + 'predictions': predictions_answer, + 'extracted': extracted_answer, + 'correct': is_correct, + } + + accuracy = round(num_correct / total * 100, 2) + final_result = {'accuracy': accuracy, 'details': details} + return final_result diff --git a/opencompass/datasets/PMMEval/mhellaswag.py b/opencompass/datasets/PMMEval/mhellaswag.py new file mode 100755 index 00000000..75e8a5bb --- /dev/null +++ b/opencompass/datasets/PMMEval/mhellaswag.py @@ -0,0 +1,151 @@ +import json +import os +import re +from typing import Tuple + +from datasets import Dataset + +from opencompass.datasets.base import BaseDataset +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path + +langs_dict = { + 'fr': ['La réponse est', 'la réponse est'], + 'en': ['the answer is', 'The answer is'], + 'vi': ['Câu trả lời là', 'câu trả lời là'], + 'ar': ['الجواب هو'], + 'th': ['คำตอบคือ'], + 'zh': ['答案是'], + 'ko': ['답변은'], + 'pt': ['A resposta é'], + 'ja': ['答えは'], + 'es': ['La respuesta es'] +} + + +def extract_choice(gen, lang): + r""" + { + "answer": "A|B|C|D" + } + """ + patterns = [ + r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}", + r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}", + r"\"answer\"\s*:\s*\"?(A|B|C|D)\"?", + r"[\'\"]answer[\'\"]\s*:\s*[\'\"](A|B|C|D)[\'\"]" + ] + for pattern in patterns: + res = re.findall(pattern, gen, flags=re.DOTALL) + if len(res) >= 1: + return res[-1] + + else: + res = None + pattern = langs_dict[lang] + for p in pattern: + if p in gen and p != gen: + res = gen.split(p) + if len(res) > 1 and len(res[-1].strip()) > 0: + res = res[-1].strip()[0] + else: + res = 
None + break + + temp = ['A', 'B', 'C', 'D', 'a', 'b', 'c', 'd'] + if res in temp: + return res + else: + return None + + +def extract_choice_fuzzy(gen, lang): + options = ['A', 'B', 'C', 'D'] # 定义选项 + for option in options: + if option in gen: # 检查选项是否在文本中 + return option # 返回第一个出现的选项 + return None + + +@TEXT_POSTPROCESSORS.register_module('pmmeval_mhellaswag') +def pmmeval_mhellaswag_postprocess(text: str, lang_code: str) -> Tuple[str]: + return text, lang_code + + +@LOAD_DATASET.register_module() +class PMMEvalMHellaswagDataset(BaseDataset): + + @staticmethod + def load(path: str, lang: str): + data_path = get_data_path(path) + + if os.environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(dataset_name=data_path, + subset_name='mhellaswag', + split=f'test/{lang}') + else: + dataset = list() + filename = os.path.join(data_path, f'mhellaswag/test/{lang}.jsonl') + with open(filename, mode='r', encoding='utf-8') as f: + for line in f: + line = json.loads(line.strip()) + dataset.append(line) + dataset = Dataset.from_list(dataset) + + return dataset + + +class PMMEvalMHellaswagEvaluator(BaseEvaluator): + + def score(self, predictions, references): + assert len(predictions) == len(references) + + all_results = list() + + for (pred, lang), ref in zip(predictions, references): + answer = chr(int(ref) + 65) + choice = extract_choice(pred, lang) + acc = 0 + failed_strict = 0 + failed = 1 + if choice is not None: + failed = 0 + if answer.lower() == choice.lower(): + acc = 1 + else: + acc = 0 + else: + choice = extract_choice_fuzzy(pred, lang) + if choice is None: + acc = 0 + failed_strict = 1 + else: + failed_strict = 0 + if answer.lower() == choice.lower(): + acc = 1 + else: + acc = 0 + + all_results.append({ + 'acc': + float(acc), + 'failed': + float(failed), + 'failed_strict': + float(failed_strict), + 'extracted_answer': + pred if pred else 'no answer', + }) + + final_result = { + 'accuracy': + round( + sum(x['acc'] for x in all_results) / len(all_results) * 100, + 2), + 'details': + all_results + } + + return final_result diff --git a/opencompass/datasets/PMMEval/mifeval.py b/opencompass/datasets/PMMEval/mifeval.py new file mode 100755 index 00000000..a43b3a8b --- /dev/null +++ b/opencompass/datasets/PMMEval/mifeval.py @@ -0,0 +1,147 @@ +import json +import os +from typing import Tuple + +from datasets import Dataset + +from opencompass.datasets.base import BaseDataset +from opencompass.datasets.PMMEval.mifeval_utils import mifeval_class_map +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path + + +def test_instruction_following_strict(inp, response, lang_code): + """Tests response to see if instrutions are followed.""" + instruction_list = inp['instruction_id_list'] + is_following_list = [] + + for index, instruction_id in enumerate(instruction_list): + instruction_id_0, instruction_id_1 = tuple(instruction_id.split(':')) + instruction_fuction_info = mifeval_class_map[instruction_id_0][ + instruction_id_1] + + instruction_function = instruction_fuction_info['function'] + instruction_function_args = dict() + + if instruction_fuction_info['required_lang_code']: + instruction_function_args['lang_code'] = lang_code + for kwarg_dict in inp['kwargs']: + for k, v in kwarg_dict.items(): + if v is None: + continue + instruction_function_args[k] = v + instruction_function_args['input_string'] = response + + if response.strip() and 
instruction_function( + **instruction_function_args): + is_following_list.append(True) + else: + is_following_list.append(False) + + return 1.0 if all(is_following_list) else 0.0 + + +def test_instruction_following_loose(inp, response, lang_code): + """Tests response for an upper bound for following instructions.""" + r = response.split('\n') + response_remove_first = '\n'.join(r[1:]).strip() + response_remove_last = '\n'.join(r[:-1]).strip() + response_remove_both = '\n'.join(r[1:-1]).strip() + revised_response = response.replace('*', '') + revised_response_remove_first = response_remove_first.replace('*', '') + revised_response_remove_last = response_remove_last.replace('*', '') + revised_response_remove_both = response_remove_both.replace('*', '') + all_responses = [ + response, + revised_response, + response_remove_first, + response_remove_last, + response_remove_both, + revised_response_remove_first, + revised_response_remove_last, + revised_response_remove_both, + ] + instruction_list = inp['instruction_id_list'] + is_following_list = [] + + for index, instruction_id in enumerate(instruction_list): + instruction_id_0, instruction_id_1 = tuple(instruction_id.split(':')) + instruction_fuction_info = mifeval_class_map[instruction_id_0][ + instruction_id_1] + + instruction_function = instruction_fuction_info['function'] + instruction_function_args = dict() + + if instruction_fuction_info['required_lang_code']: + instruction_function_args['lang_code'] = lang_code + for kwarg_dict in inp['kwargs']: + for k, v in kwarg_dict.items(): + instruction_function_args[k] = v + instruction_function_args['input_string'] = response + + is_following = False + for r in all_responses: + if r.strip() and instruction_function(**instruction_function_args): + is_following = True + break + + is_following_list.append(is_following) + + return 1.0 if all(is_following_list) else 0.0 + + +@TEXT_POSTPROCESSORS.register_module('pmmeval_mifeval') +def pmmeval_mifeval_postprocess(text: str, lang_code: str) -> Tuple[str]: + return text, lang_code + + +@LOAD_DATASET.register_module() +class PMMEvalMIFEvalDataset(BaseDataset): + + @staticmethod + def load(path: str, lang: str): + data_path = get_data_path(path) + + if os.environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(dataset_name=data_path, + subset_name='mifeval', + split=f'test/{lang}') + else: + dataset = list() + filename = os.path.join(data_path, f'mifeval/test/{lang}.jsonl') + with open(filename, mode='r', encoding='utf-8') as f: + for line in f: + line = json.loads(line.strip()) + dataset.append(line) + dataset = Dataset.from_list(dataset) + + return dataset + + +class PMMEvalMIFEvalEvaluator(BaseEvaluator): + + def score(self, predictions, references, test_set): + all_results = list() + for (pred, lang), example in zip(predictions, test_set): + temp_result = { + 'strict_acc': + test_instruction_following_strict(example, pred, lang), + 'loose_acc': + test_instruction_following_loose(example, pred, lang) + } + + all_results.append(temp_result) + + result = { + 'strict_acc': + round( + sum(x['strict_acc'] + for x in all_results) / len(all_results) * 100, 2), + 'loose_acc': + round( + sum(x['loose_acc'] + for x in all_results) / len(all_results) * 100, 2) + } + return result diff --git a/opencompass/datasets/PMMEval/mifeval_utils/__init__.py b/opencompass/datasets/PMMEval/mifeval_utils/__init__.py new file mode 100755 index 00000000..97865f16 --- /dev/null +++ 
b/opencompass/datasets/PMMEval/mifeval_utils/__init__.py @@ -0,0 +1,17 @@ +from .combination_checker import combination_checker +from .detectable_content_checker import detectable_content_checker +from .detectable_format_checker import detectable_format_checker +from .keywords_checker import keywords_checker +from .length_constraints_checker import length_constraints_checker +from .punctuation_checker import punctuation_checker +from .startend_checker import startend_checker + +mifeval_class_map = { + 'combination': combination_checker, + 'detectable_content': detectable_content_checker, + 'detectable_format': detectable_format_checker, + 'keywords': keywords_checker, + 'length_constraints': length_constraints_checker, + 'punctuation': punctuation_checker, + 'startend': startend_checker +} diff --git a/opencompass/datasets/PMMEval/mifeval_utils/combination_checker.py b/opencompass/datasets/PMMEval/mifeval_utils/combination_checker.py new file mode 100755 index 00000000..2dc48083 --- /dev/null +++ b/opencompass/datasets/PMMEval/mifeval_utils/combination_checker.py @@ -0,0 +1,32 @@ +def repeat_prompt_checker(input_string: str, prompt_to_repeat: str, **kwargs): + if input_string.strip().lower().startswith( + prompt_to_repeat.strip().lower()): + return True + return False + + +def two_responses_checker(input_string: str, **kwargs): + valid_responses = list() + responses = input_string.split('******') + for index, response in enumerate(responses): + if not response.strip(): + if index != 0 and index != len(responses) - 1: + return False + else: + valid_responses.append(response) + return (len(valid_responses) == 2 + and valid_responses[0].strip() != valid_responses[1].strip()) + + +combination_checker = { + 'repeat_prompt': { + 'function': repeat_prompt_checker, + 'required_lang_code': False, + 'num_of_params': 2 + }, + 'two_responses': { + 'function': two_responses_checker, + 'required_lang_code': False, + 'num_of_params': 1 + } +} diff --git a/opencompass/datasets/PMMEval/mifeval_utils/detectable_content_checker.py b/opencompass/datasets/PMMEval/mifeval_utils/detectable_content_checker.py new file mode 100755 index 00000000..1a141d83 --- /dev/null +++ b/opencompass/datasets/PMMEval/mifeval_utils/detectable_content_checker.py @@ -0,0 +1,30 @@ +import re + + +def number_placeholders_checker(input_string: str, num_placeholders: int, + **kwargs): + placeholders = re.findall(r'\[.*?\]', input_string) + return len(placeholders) >= num_placeholders + + +def postscript_checker(input_string: str, postscript_marker: str, **kwargs): + input_string = input_string.lower() + postscript_pattern = r'\s*' + postscript_marker.lower() + r'.*$' + postscript = re.findall(postscript_pattern, + input_string, + flags=re.MULTILINE) + return True if postscript else False + + +detectable_content_checker = { + 'number_placeholders': { + 'function': number_placeholders_checker, + 'required_lang_code': False, + 'num_of_params': 2 + }, + 'postscript': { + 'function': postscript_checker, + 'required_lang_code': False, + 'num_of_params': 2 + } +} diff --git a/opencompass/datasets/PMMEval/mifeval_utils/detectable_format_checker.py b/opencompass/datasets/PMMEval/mifeval_utils/detectable_format_checker.py new file mode 100755 index 00000000..7b540a07 --- /dev/null +++ b/opencompass/datasets/PMMEval/mifeval_utils/detectable_format_checker.py @@ -0,0 +1,122 @@ +import json +import re + + +def removeprefix(s, prefix): + if s.startswith(prefix): + return s[len(prefix):] + else: + return s + + +def removesuffix(s, suffix): + if 
s.endswith(suffix): + return s[:-len(suffix)] + else: + return s + + +constrained_response = { + 'ar': ['إجابتي هي نعم.', 'إجابتي هي لا.', 'إجابتي هي ربما.'], + 'es': + ['Mi respuesta es sí.', 'Mi respuesta es no.', 'Mi respuesta es tal vez.'], + 'fr': [ + 'Ma réponse est oui.', 'Ma réponse est non.', + 'Ma réponse est peut-être.' + ], + 'ja': ['私の答えははいです。', '私の答えはいいえです。', '私の答えはたぶんです。'], + 'ko': ['제 대답은 예입니다.', '제 대답은 아닙니다.', '제 대답은 아마도입니다.'], + 'pt': [ + 'Minha resposta é sim.', 'Minha resposta é não.', + 'Minha resposta é talvez.' + ], + 'th': ['คำตอบของฉันคือใช่', 'คำตอบของฉันคือไม่', 'คำตอบของฉันคืออาจจะ'], + 'vi': [ + 'Câu trả lời của tôi là có.', 'Câu trả lời của tôi là không.', + 'Câu trả lời của tôi là có thể.' + ], + 'en': ['My answer is yes.', 'My answer is no.', 'My answer is maybe.'], + 'zh': ['我的答案是是。', '我的答案是否。', '我的答案是不确定。'] +} + + +def constrained_response_checker(input_string: str, lang_code: str, **kwargs): + allowable_responses = constrained_response[lang_code] + return any(response in input_string for response in allowable_responses) + + +def number_bullet_lists_checker(input_string: str, num_bullets: int, **kwargs): + bullet_lists = re.findall(r'^\s*\*[^\*].*$', + input_string, + flags=re.MULTILINE) + bullet_lists_2 = re.findall(r'^\s*-.*$', input_string, flags=re.MULTILINE) + num_bullet_lists = len(bullet_lists) + len(bullet_lists_2) + return num_bullet_lists == num_bullets + + +def number_highlighted_sections_checker(input_string: str, num_highlights: int, + **kwargs): + temp_num_highlights = 0 + highlights = re.findall(r'\*[^\n\*]*\*', input_string) + double_highlights = re.findall(r'\*\*[^\n\*]*\*\*', input_string) + for highlight in highlights: + if highlight.strip('*').strip(): + temp_num_highlights += 1 + for highlight in double_highlights: + if removesuffix(removeprefix(highlight, '**'), '**').strip(): + temp_num_highlights += 1 + + return temp_num_highlights >= num_highlights + + +def title_checker(input_string: str, **kwargs): + pattern = r'<<[^\n]+>>' + re_pattern = re.compile(pattern) + titles = re.findall(re_pattern, input_string) + + for title in titles: + if title.lstrip('<').rstrip('>').strip(): + return True + return False + + +def json_format_checker(input_string: str, **kwargs): + value = (removesuffix( + removeprefix( + removeprefix( + removeprefix(removeprefix(input_string.strip(), '```json'), + '```Json'), '```JSON'), '```'), '```').strip()) + try: + json.loads(value) + except ValueError as e: # noqa F841 + return False + return True + + +detectable_format_checker = { + 'constrained_response': { + 'function': constrained_response_checker, + 'required_lang_code': True, + 'num_of_params': 2 + }, + 'json_format': { + 'function': json_format_checker, + 'required_lang_code': False, + 'num_of_params': 1 + }, + 'number_bullet_lists': { + 'function': number_bullet_lists_checker, + 'required_lang_code': False, + 'num_of_parmas': 2 + }, + 'number_highlighted_sections': { + 'function': number_highlighted_sections_checker, + 'required_lang_code': False, + 'num_of_params': 2 + }, + 'title': { + 'function': title_checker, + 'required_lang_code': False, + 'num_of_params': 1 + } +} diff --git a/opencompass/datasets/PMMEval/mifeval_utils/keywords_checker.py b/opencompass/datasets/PMMEval/mifeval_utils/keywords_checker.py new file mode 100755 index 00000000..ba17e66a --- /dev/null +++ b/opencompass/datasets/PMMEval/mifeval_utils/keywords_checker.py @@ -0,0 +1,12 @@ +def forbidden_words_checker(input_string: str, forbidden_words: list, + **kwargs): + return not 
any(word in input_string for word in forbidden_words) + + +keywords_checker = { + 'forbidden_words': { + 'function': forbidden_words_checker, + 'required_lang_code': False, + 'num_of_params': 2 + }, +} diff --git a/opencompass/datasets/PMMEval/mifeval_utils/length_constraints_checker.py b/opencompass/datasets/PMMEval/mifeval_utils/length_constraints_checker.py new file mode 100755 index 00000000..2b26aac4 --- /dev/null +++ b/opencompass/datasets/PMMEval/mifeval_utils/length_constraints_checker.py @@ -0,0 +1,93 @@ +import re + + +def nth_paragraph_first_word_checker(input_string: str, num_paragraphs: int, + nth_paragraph: int, first_word: str, + lang_code: str, **kwargs): + paragraphs = re.split(r'\n\n', input_string) + paragraphs = list(paragraph.strip() for paragraph in paragraphs + if paragraph.strip() != '') + + if len(paragraphs) < num_paragraphs: + return False + + if len(paragraphs) < nth_paragraph: + return False + + paragraph = paragraphs[nth_paragraph - 1].strip() + + first_word = '' + + if paragraph.lower().startswith(first_word.lower()): + return True + else: + return False + + +def number_paragraphs_checker(input_string: str, num_paragraphs: int, + **kwargs): + paragraphs = re.split(r'\s?\*\*\*\s?', input_string) + paragraphs = list(paragraph.strip() for paragraph in paragraphs + if paragraph.strip() != '') + return len(paragraphs) == num_paragraphs + + +def number_sentences_checker(input_string: str, relation: str, + num_sentences: int, lang_code: str, **kwargs): + sentences = list(x.strip() for x in input_string.strip().split('\n')) + sentences = list(x for x in sentences if x != '') + + if relation == 'less than': + if len(sentences) <= num_sentences: + return True + else: + return False + elif relation == 'at least': + if len(sentences) >= num_sentences: + return True + else: + return False + + +def number_words_checker(input_string: str, relation: str, num_words: int, + lang_code: str, **kwargs): + if lang_code in ['en', 'es', 'fr', 'in', 'pt', 'ru', 'vi']: + words = input_string.split() + words = list(x for x in words if x != '') + else: + words = ''.join(input_string.split()) + + if relation == 'less than': + if len(words) <= num_words: + return True + else: + return False + elif relation == 'at least': + if len(words) >= num_words: + return True + else: + return False + + +length_constraints_checker = { + 'nth_paragraph_first_word': { + 'function': nth_paragraph_first_word_checker, + 'required_lang_code': True, + 'num_of_params': 5 + }, + 'number_paragraphs': { + 'function': number_paragraphs_checker, + 'required_lang_code': False, + 'num_of_params': 2 + }, + 'number_sentences': { + 'function': number_sentences_checker, + 'required_lang_code': True, + 'num_of_params': 3 + }, + 'number_words': { + 'function': number_words_checker, + 'required_lang_code': True, + 'num_of_params': 4 + } +} diff --git a/opencompass/datasets/PMMEval/mifeval_utils/punctuation_checker.py b/opencompass/datasets/PMMEval/mifeval_utils/punctuation_checker.py new file mode 100755 index 00000000..53ce42f5 --- /dev/null +++ b/opencompass/datasets/PMMEval/mifeval_utils/punctuation_checker.py @@ -0,0 +1,30 @@ +import re + +comma_unicode = { + 'ar': re.compile(r'[\u060C]'), + 'es': re.compile(r'[,\uFF0C]'), + 'fr': re.compile(r'[,\u2026]'), + 'ja': re.compile(r'[,\u3001]'), + 'ko': re.compile(r'[,]'), + 'pt': re.compile(r'[,\uFF0C]'), + 'th': re.compile(r'[\u0E25]'), + 'vi': re.compile(r'[,\uFF0C]'), + 'en': re.compile(r'[,]'), + 'zh': re.compile(r'[,,]') +} + + +def no_comma_checker(input_string: str, 
lang_code: str, **kwargs): + if len(comma_unicode[lang_code].findall(input_string)) > 0: + return False + else: + return True + + +punctuation_checker = { + 'no_comma': { + 'function': no_comma_checker, + 'required_lang_code': True, + 'num_of_params': 2 + } +} diff --git a/opencompass/datasets/PMMEval/mifeval_utils/startend_checker.py b/opencompass/datasets/PMMEval/mifeval_utils/startend_checker.py new file mode 100755 index 00000000..c3e2b2cd --- /dev/null +++ b/opencompass/datasets/PMMEval/mifeval_utils/startend_checker.py @@ -0,0 +1,38 @@ +def end_checker_checker(input_string: str, end_phrase: str, **kwargs): + if input_string.strip().endswith(end_phrase): + return True + else: + return False + + +def quotation_checker(input_string: str, lang_code: str, **kwargs): + input_string = input_string.strip() + if input_string.startswith('"') and input_string.endswith('"'): + return True + elif lang_code in [ + 'ar', 'es', 'fr', 'pt', 'ru' + ] and input_string.startswith('«') and input_string.endswith('»'): + return True + elif lang_code in [ + 'ar', 'es', 'fr', 'ko', 'pt', 'th', 'vi', 'zh' + ] and input_string.startswith('“') and input_string.endswith('”'): + return True + elif lang_code == 'ja' and input_string.startswith( + '『') and input_string.endswith('』'): + return True + else: + return False + + +startend_checker = { + 'end_checker': { + 'function': end_checker_checker, + 'required_lang_code': False, + 'num_of_params': 2 + }, + 'quotation': { + 'function': quotation_checker, + 'required_lang_code': True, + 'num_of_params': 2 + } +} diff --git a/opencompass/datasets/PMMEval/mlogiqa.py b/opencompass/datasets/PMMEval/mlogiqa.py new file mode 100755 index 00000000..089759c6 --- /dev/null +++ b/opencompass/datasets/PMMEval/mlogiqa.py @@ -0,0 +1,152 @@ +import json +import os +import re +from typing import Tuple + +from datasets import Dataset + +from opencompass.datasets.base import BaseDataset +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path + +langs_dict = { + 'fr': ['La réponse est', 'la réponse est'], + 'en': ['the answer is', 'The answer is'], + 'vi': ['Câu trả lời là', 'câu trả lời là'], + 'ar': ['الجواب هو'], + 'th': ['คำตอบคือ'], + 'zh': ['答案是'], + 'ko': ['답변은'], + 'pt': ['A resposta é'], + 'ja': ['答えは'], + 'es': ['La respuesta es'] +} + + +def extract_choice(gen, lang): + r""" + { + "answer": "A|B|C|D" + } + """ + patterns = [ + r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}", + r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}", + r"\"answer\"\s*:\s*\"?(A|B|C|D)\"?", + r"[\'\"]answer[\'\"]\s*:\s*[\'\"](A|B|C|D)[\'\"]" + ] + for pattern in patterns: + res = re.findall(pattern, gen, flags=re.DOTALL) + if len(res) >= 1: + return res[-1] + + else: + res = None + pattern = langs_dict[lang] + for p in pattern: + if p in gen and p != gen: + res = gen.split(p) + if len(res) > 1 and len(res[-1].strip()) > 0: + res = res[-1].strip()[0] + else: + res = None + + break + + temp = ['A', 'B', 'C', 'D', 'a', 'b', 'c', 'd'] + if res in temp: + return res + else: + return None + + +def extract_choice_fuzzy(gen): + options = ['A', 'B', 'C', 'D'] + for option in options: + if option in gen: + return option + return None + + +@TEXT_POSTPROCESSORS.register_module('pmmeval_mlogiqa') +def pmmeval_mlogiqa_postprocess(text: str, lang_code: str) -> Tuple[str]: + return text, lang_code + + +@LOAD_DATASET.register_module() +class PMMEvalMLogiQADataset(BaseDataset): 
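    # Editor's note, not part of this patch: like the other PMMEval loaders in
    # this series, load() below reads from ModelScope when the DATASET_SOURCE
    # environment variable is set to 'ModelScope' and otherwise falls back to
    # the local JSONL file under <data_path>/mlogiqa/test/<lang>.jsonl.
    # Illustrative use, mirroring the mlogiqa_gen config where path='P-MMEval':
    #   ds = PMMEvalMLogiQADataset.load(path='P-MMEval', lang='en')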
+ + @staticmethod + def load(path: str, lang: str): + data_path = get_data_path(path) + + if os.environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(dataset_name=data_path, + subset_name='mlogiqa', + split=f'test/{lang}') + else: + dataset = list() + filename = os.path.join(data_path, f'mlogiqa/test/{lang}.jsonl') + with open(filename, mode='r', encoding='utf-8') as f: + for line in f: + line = json.loads(line.strip()) + dataset.append(line) + dataset = Dataset.from_list(dataset) + + return dataset + + +class PMMEvalMLogiQAEvaluator(BaseEvaluator): + + def score(self, predictions, references): + assert len(predictions) == len(references) + + all_results = list() + + for (pred, lang), ref in zip(predictions, references): + answer = chr(int(ref) + 65) + pred = extract_choice(pred, lang) + acc = 0 + failed_strict = 0 + failed = 1 + if pred is not None: + failed = 0 + if answer.lower() == pred.lower(): + acc = 1 + else: + acc = 0 + else: + pred_fuzzy = extract_choice_fuzzy(pred) + if pred_fuzzy is None: + acc = 0 + failed_strict = 1 + else: + failed_strict = 0 + if answer.lower() == pred_fuzzy.lower(): + acc = 1 + else: + acc = 0 + + all_results.append({ + 'acc': + float(acc), + 'failed': + float(failed), + 'failed_strict': + float(failed_strict), + 'extracted_answer': + pred if pred else 'no answer', + }) + + final_result = { + 'accuracy': + round( + sum(x['acc'] for x in all_results) / len(all_results) * 100, + 2), + 'details': + all_results + } + + return final_result diff --git a/opencompass/datasets/PMMEval/mmmlu.py b/opencompass/datasets/PMMEval/mmmlu.py new file mode 100755 index 00000000..a71ab5c5 --- /dev/null +++ b/opencompass/datasets/PMMEval/mmmlu.py @@ -0,0 +1,157 @@ +import json +import os +import re +from typing import Tuple + +from datasets import Dataset + +from opencompass.datasets.base import BaseDataset +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path + +langs_dict = { + 'FR-FR': ['La réponse est', 'la réponse est'], + 'EN-US': ['the answer is', 'The answer is'], + 'VI-VT': ['Câu trả lời là', 'câu trả lời là'], + 'AR-XY': ['الجواب هو'], + 'TH-TL': ['คำตอบคือ'], + 'ZH-CN': ['答案是'], + 'KO-KR': ['답변은'], + 'PT-BR': ['A resposta é'], + 'JA-JP': ['答えは'], + 'ES-LA': ['La respuesta es'] +} + + +def extract_choice(gen, lang): + r""" + { + "answer": "A|B|C|D" + } + """ + patterns = [ + r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}", + r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}", + r"\"answer\"\s*:\s*\"?(A|B|C|D)\"?", + r"[\'\"]answer[\'\"]\s*:\s*[\'\"](A|B|C|D)[\'\"]" + ] + for pattern in patterns: + res = re.findall(pattern, gen, flags=re.DOTALL) + if len(res) >= 1: + return res[-1] + + else: + res = None + pattern = langs_dict[lang] + for p in pattern: + if p in gen and p != gen: + res = gen.split(p) + if len(res) > 1 and len(res[-1].strip()) > 0: + res = res[-1].strip()[0] + else: + res = None + break + + temp = ['A', 'B', 'C', 'D', 'a', 'b', 'c', 'd'] + if res in temp: + return res + else: + return None + + +def extract_choice_fuzzy(gen): + options = ['A', 'B', 'C', 'D'] + for option in options: + if option in gen: + return option + return None + + +@TEXT_POSTPROCESSORS.register_module('pmmeval_mmmlu') +def pmmeval_mmmlu_postprocess(text: str, lang_code: str) -> Tuple[str]: + return text, lang_code + + +@LOAD_DATASET.register_module() +class 
PMMEvalMMMLUDataset(BaseDataset): + + @staticmethod + def load(path: str, lang: str, difficulty: str): + assert difficulty in [ + 'easy', 'hard', 'all' + ], '`difficulty` should be one choice among "easy", "hard", and "all"!' + data_path = get_data_path(path) + + if os.environ.get('DATASET_SOURCE') == 'ModelScope': + dataset_list = list() + from modelscope import MsDataset + if difficulty == 'easy' or difficulty == 'all': + dataset_list.append( + MsDataset.load(dataset_name=data_path, + subset_name='mmmlu', + split=f'easy/test/mmlu_{lang}')) + if difficulty == 'hard' or difficulty == 'all': + dataset_list.append( + MsDataset.load(dataset_name=data_path, + subset_name='mmmlu', + split=f'hard/test/mmlu_{lang}')) + # TODO: conbine two datasets + dataset = dataset_list[0] + dataset_list[1] if len( + dataset_list) == 2 else dataset_list[0] + else: + dataset = list() + if difficulty == 'easy' or difficulty == 'all': + filename = os.path.join(data_path, + f'mmmlu/easy/test/mmlu_{lang}.jsonl') + with open(filename, mode='r', encoding='utf-8') as f: + for line in f: + line = json.loads(line.strip()) + dataset.append(line) + if difficulty == 'hard' or difficulty == 'all': + filename = os.path.join(data_path, + f'mmmlu/hard/test/mmlu_{lang}.jsonl') + with open(filename, mode='r', encoding='utf-8') as f: + for line in f: + line = json.loads(line.strip()) + dataset.append(line) + + dataset = Dataset.from_list(dataset) + + return dataset + + +class PMMEvalMMMLUEvaluator(BaseEvaluator): + + def score(self, predictions, references): + all_results = list() + for (pred, lang), ref in zip(predictions, references): + answer = extract_choice(pred, lang) + if answer is None: + answer = extract_choice_fuzzy(pred) + if answer is None: + acc = 0.0 + failed = 1.0 + else: + acc = 1.0 if ref.lower() == answer.lower() else 0.0 + failed = 0.0 + + all_results.append({ + 'acc': + acc, + 'failed': + failed, + 'extracted_answer': + pred if pred else 'no answer' + }) + + final_result = { + 'accuracy': + round( + sum(x['acc'] for x in all_results) / len(all_results) * 100, + 2), + 'details': + all_results + } + + return final_result diff --git a/opencompass/datasets/PMMEval/xnli.py b/opencompass/datasets/PMMEval/xnli.py new file mode 100755 index 00000000..33afa411 --- /dev/null +++ b/opencompass/datasets/PMMEval/xnli.py @@ -0,0 +1,150 @@ +import json +import os +import re +from typing import Tuple + +from datasets import Dataset + +from opencompass.datasets.base import BaseDataset +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path + +langs_dict = { + 'fr': ['La réponse est', 'la réponse est'], + 'en': ['the answer is', 'The answer is'], + 'vi': ['Câu trả lời là', 'câu trả lời là'], + 'ar': ['الجواب هو'], + 'th': ['คำตอบคือ'], + 'zh': ['答案是'], + 'ko': ['답변은'], + 'pt': ['A resposta é'], + 'ja': ['答えは'], + 'id': ['Jawaban adalah', 'jawaban adalah'], + 'es': ['La respuesta es'] +} + + +def extract_choice(gen, lang): + r""" + { + "answer": "A|B|C|D" + } + """ + patterns = [ + r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}", + r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}", + r"\"answer\"\s*:\s*\"?(A|B|C|D)\"?", + r"[\'\"]answer[\'\"]\s*:\s*[\'\"](A|B|C|D)[\'\"]" + ] + for pattern in patterns: + res = re.findall(pattern, gen, flags=re.DOTALL) + if len(res) >= 1: + return res[-1] + + else: + res = None + pattern = langs_dict[lang] + for p in pattern: + if p in gen and p != gen: + res 
= gen.split(p) + if len(res) > 1 and len(res[-1].strip()) > 0: + res = res[-1].strip()[0] + else: + res = None + break + + temp = ['A', 'B', 'C', 'D', 'a', 'b', 'c', 'd'] + if res in temp: + return res + else: + return None + + +def extract_choice_fuzzy(gen, lang): + options = ['A', 'B', 'C', 'D'] # 定义选项 + for option in options: + if option in gen: # 检查选项是否在文本中 + return option # 返回第一个出现的选项 + return None + + +@TEXT_POSTPROCESSORS.register_module('pmmeval_xnli') +def pmmeval_xnli_postprocess(text: str, lang_code: str) -> Tuple[str]: + return text, lang_code + + +@LOAD_DATASET.register_module() +class PMMEvalXNLIDataset(BaseDataset): + + @staticmethod + def load(path: str, lang: str): + data_path = get_data_path(path) + if os.environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(dataset_name=data_path, + subset_name='xnli', + split=f'test/{lang}') + else: + dataset = list() + filename = os.path.join(data_path, f'xnli/test/{lang}.jsonl') + with open(filename, mode='r', encoding='utf-8') as f: + for line in f: + line = json.loads(line.strip()) + dataset.append(line) + dataset = Dataset.from_list(dataset) + + return dataset + + +class PMMEvalXNLIEvaluator(BaseEvaluator): + + def score(self, predictions, references): + assert len(predictions) == len(references) + + all_results = list() + + for (pred, lang), ref in zip(predictions, references): + choice = extract_choice(pred, lang) + acc = 0 + failed_strict = 0 + failed = 1 + if choice is not None: + failed = 0 + if ref.lower() == choice.lower(): + acc = 1 + else: + acc = 0 + else: + choice = extract_choice_fuzzy(pred, lang) + if choice is None: + acc = 0 + failed_strict = 1 + else: + failed_strict = 0 + if ref.lower() == choice.lower(): + acc = 1 + else: + acc = 0 + + all_results.append({ + 'acc': + float(acc), + 'failed': + float(failed), + 'failed_strict': + float(failed_strict), + 'extracted_answer': + choice if choice else 'no answer', + }) + + final_result = { + 'accuracy': + round( + sum(x['acc'] for x in all_results) / len(all_results) * 100, + 2), + 'details': + all_results + } + + return final_result diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index b0d1b93f..d4241937 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -343,6 +343,11 @@ DATASETS_MAPPING = { "hf_id": "", "local": "./data/babilong/data/", }, + "P-MMEval": { + "ms_id": "", + "hf_id": "", + "local": "./data/P-MMEval/", + }, "opencompass/arc_prize_public_evaluation": { "ms_id": "", "hf_id": "", @@ -530,7 +535,7 @@ DATASETS_URL = { "/cmo": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmo.zip", "md5": "fad52c81290506a8ca74f46b5400d8fc", - }, + }, "/nq-open": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/nq-open.zip", "md5": "a340521e5c9ec591227dcb367f718b25", @@ -566,5 +571,9 @@ DATASETS_URL = { "/arc_prize_public_evaluation": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/arc_prize_public_evaluation.zip", "md5": "367a33977651496efddba7670009807e" + }, + "P-MMEval": { + "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/pmmeval.zip", + "md5": "589c8be1551a609d94231f1410cf22eb", } } From 06ab27861ec753106a7ec4508bc37da9723b045d Mon Sep 17 00:00:00 2001 From: liushz Date: Thu, 28 Nov 2024 11:53:36 +0800 Subject: [PATCH 09/19] [Fix] Fix pmmeval_gen config (#1719) * Update with PMMEval * Update * Update __init__.py * Fix Bugs * Delete 
.pre-commit-config.yaml
* Pull merge
* Fix pmmeval_gen config

---------

Co-authored-by: wanyu
Co-authored-by: wanyu2018umac <42405907+wanyu2018umac@users.noreply.github.com>
---
 opencompass/configs/datasets/PMMEval/pmmeval_gen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opencompass/configs/datasets/PMMEval/pmmeval_gen.py b/opencompass/configs/datasets/PMMEval/pmmeval_gen.py
index 9fce1773..46225a32 100755
--- a/opencompass/configs/datasets/PMMEval/pmmeval_gen.py
+++ b/opencompass/configs/datasets/PMMEval/pmmeval_gen.py
@@ -2,7 +2,7 @@ from mmengine.config import read_base
 
 with read_base():
     from .flores_gen_2697d7 import PMMEval_flores_datasets
-    from .humanevalxl_gen_4dfef4 import PMMEval_HumanEvalXL_datasets
+    from .humanevalxl_gen_bdec92 import PMMEval_HumanEvalXL_datasets
     from .mgsm_gen_679720 import PMMEval_MGSM_datasets
     from .mhellaswag_gen_1a6b73 import PMMEval_MHellaswag_datasets
     from .mifeval_gen_79f8fb import PMMEval_MIFEval_datasets

From c437135fad6ea021d996d0b32506601dc7f8d69d Mon Sep 17 00:00:00 2001
From: liushz
Date: Thu, 28 Nov 2024 19:16:07 +0800
Subject: [PATCH 10/19] [Feature] Add Openai Simpleqa dataset (#1720)

* Add Openai SimpleQA dataset
* Add Openai SimpleQA dataset
* Add Openai SimpleQA dataset
* Update eval_simpleqa.py

---------

Co-authored-by: Linchen Xiao
---
 configs/eval_simpleqa.py | 45 +++++
 .../configs/datasets/SimpleQA/README.md | 10 +
 .../configs/datasets/SimpleQA/simpleqa_gen.py | 4 +
 .../datasets/SimpleQA/simpleqa_gen_0283c3.py | 133 +++++++++++++
 opencompass/configs/summarizers/simpleqa.py | 7 +
 opencompass/datasets/__init__.py | 1 +
 opencompass/datasets/simpleqa.py | 188 ++++++++++++++++++
 opencompass/utils/datasets_info.py | 9 +
 8 files changed, 397 insertions(+)
 create mode 100644 configs/eval_simpleqa.py
 create mode 100644 opencompass/configs/datasets/SimpleQA/README.md
 create mode 100644 opencompass/configs/datasets/SimpleQA/simpleqa_gen.py
 create mode 100644 opencompass/configs/datasets/SimpleQA/simpleqa_gen_0283c3.py
 create mode 100644 opencompass/configs/summarizers/simpleqa.py
 create mode 100644 opencompass/datasets/simpleqa.py

diff --git a/configs/eval_simpleqa.py b/configs/eval_simpleqa.py
new file mode 100644
index 00000000..85cd0fcd
--- /dev/null
+++ b/configs/eval_simpleqa.py
@@ -0,0 +1,45 @@
+# Most of the code in this file is copied from https://github.com/openai/simple-evals/blob/main/math_eval.py
+from mmengine.config import read_base
+from opencompass.partitioners import NaivePartitioner
+from opencompass.runners import LocalRunner
+from opencompass.tasks import OpenICLInferTask
+from opencompass.summarizers import DefaultSubjectiveSummarizer
+
+
+with read_base():
+    from opencompass.configs.datasets.SimpleQA.simpleqa_gen import simpleqa_datasets
+    from opencompass.configs.models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model
+
+models = gpt_4o_2024_05_13_model # model for generation
+judge_models = gpt_4o_2024_05_13_model # model for evaluation
+
+datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
+summarizer = dict(type=DefaultSubjectiveSummarizer)
+
+# -------------Inference Stage ----------------------------------------
+
+from opencompass.runners import LocalRunner
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
+from opencompass.tasks.subjective_eval import SubjectiveEvalTask
+from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+
+infer = dict(
+    
partitioner=dict(type=NumWorkerPartitioner, num_worker=8), + runner=dict( + type=LocalRunner, + max_num_workers=8, + task=dict(type=OpenICLInferTask) + ), +) + +eval = dict( + partitioner=dict( + type=SubjectiveNaivePartitioner, + models=[gpt_4o_2024_05_13_model], + judge_models=[gpt_4o_2024_05_13_model], + ), + runner=dict(type=LocalRunner, + max_num_workers=256, + task=dict(type=SubjectiveEvalTask)), +) diff --git a/opencompass/configs/datasets/SimpleQA/README.md b/opencompass/configs/datasets/SimpleQA/README.md new file mode 100644 index 00000000..498817ce --- /dev/null +++ b/opencompass/configs/datasets/SimpleQA/README.md @@ -0,0 +1,10 @@ +# OpenCompass SimpleQA dataset config for evaluation + +## 1. Introduction + +SimpleQA is a benchmark that evaluates the ability of language models to answer short, fact-seeking questions by OpenAI. +The original site is https://github.com/openai/simple-evals. + +## 2. How to use + +Please refer to the demo evaluation script `/opencompass/configs/mine/simpleqa_eval.py`. diff --git a/opencompass/configs/datasets/SimpleQA/simpleqa_gen.py b/opencompass/configs/datasets/SimpleQA/simpleqa_gen.py new file mode 100644 index 00000000..4ff277ed --- /dev/null +++ b/opencompass/configs/datasets/SimpleQA/simpleqa_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .simpleqa_gen_0283c3 import simpleqa_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/SimpleQA/simpleqa_gen_0283c3.py b/opencompass/configs/datasets/SimpleQA/simpleqa_gen_0283c3.py new file mode 100644 index 00000000..3f4883f8 --- /dev/null +++ b/opencompass/configs/datasets/SimpleQA/simpleqa_gen_0283c3.py @@ -0,0 +1,133 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import LMEvaluator +from opencompass.datasets import SimpleQADataset, simpleqa_postprocess + +GRADER_TEMPLATE = """ +Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"]. +First, I will give examples of each grade, and then you will grade a new example. + + +The following are examples of CORRECT predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia Obama and Sasha Obama +Predicted answer 1: sasha and malia obama +Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check +Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001. +``` +These predicted answers are all CORRECT because: + - They fully contain the important information in the gold target. + - They do not contain any information that contradicts the gold target. + - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter. + - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions. + + +The following are examples of INCORRECT predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia and Sasha +Predicted answer 1: Malia. +Predicted answer 2: Malia, Sasha, and Susan. 
+Predicted answer 3: Barack Obama does not have any children. +Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia. +Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children. +Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer? +Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information. +``` +These predicted answers are all INCORRECT because: + - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect. + + +The following are examples of NOT_ATTEMPTED predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia and Sasha +Predicted answer 1: I don't know. +Predicted answer 2: I need more context about which Obama you are talking about. +Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children. +Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one. +``` +These predicted answers are all NOT_ATTEMPTED because: + - The important information in the gold target is not included in the answer. + - No statements in the answer contradict the gold target. + + +Also note the following things: +- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k". + - Predicted answers "120k", "124k", and 115k" are all CORRECT. + - Predicted answers "100k" and "113k" are INCORRECT. + - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target. +- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question. + - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer. +- Do not punish predicted answers if they omit information that would be clearly inferred from the question. + - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California". + - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question. + - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". 
The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question. + - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed. +- Do not punish for typos in people's name if it's clearly the same name. + - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung". + +Grade the predicted answer of this new question as one of: +A: CORRECT +B: INCORRECT +C: NOT_ATTEMPTED +Just return the letters "A", "B", or "C", with no text around it. + +Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. +``` +Question: {problem} +Gold target: {answer} +Predicted answer: {prediction} +``` +""".strip() + +simpleqa_reader_cfg = dict(input_columns=['problem'], output_column='answer') + +simpleqa_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt="Question: {problem}\nLet's think step by step:"), + ], + )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048)) + +simpleqa_eval_cfg = dict( + evaluator=dict( + type=LMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dict_postprocessor=dict(type=simpleqa_postprocess), + ), + pred_role='BOT', +) + +simpleqa_datasets = [ + dict( + abbr='simpleqa', + type=SimpleQADataset, + path='opencompass/simpleqa', + reader_cfg=simpleqa_reader_cfg, + infer_cfg=simpleqa_infer_cfg, + eval_cfg=simpleqa_eval_cfg, + mode='singlescore', + ) +] diff --git a/opencompass/configs/summarizers/simpleqa.py b/opencompass/configs/summarizers/simpleqa.py new file mode 100644 index 00000000..b22d0121 --- /dev/null +++ b/opencompass/configs/summarizers/simpleqa.py @@ -0,0 +1,7 @@ +summarizer = dict( + dataset_abbrs=[ + ['simpleqa', 'accuracy_given_attempted'], + ['simpleqa', 'f1'], + ], + summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index ddb70b12..20e6fecb 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -108,6 +108,7 @@ from .ruler import * # noqa: F401, F403 from .safety import * # noqa: F401, F403 from .scibench import ScibenchDataset, scibench_postprocess # noqa: F401, F403 from .scicode import * # noqa: F401, F403 +from .simpleqa import * # noqa: F401, F403 from .siqa import * # noqa: F401, F403 from .squad20 import SQuAD20Dataset, SQuAD20Evaluator # noqa: F401, F403 from .storycloze import * # noqa: F401, F403 diff --git a/opencompass/datasets/simpleqa.py b/opencompass/datasets/simpleqa.py new file mode 100644 index 00000000..bf355f50 --- /dev/null +++ b/opencompass/datasets/simpleqa.py @@ -0,0 +1,188 @@ +# Edited from the official SimpleQA config: https://github.com/openai/simple-evals/blob/main/simpleqa_eval.py # noqa E501 +import random +import re + +import pandas +from datasets import Dataset, DatasetDict + +from 
opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class SimpleQADataset(BaseDataset): + + @staticmethod + def load(path: str, + num_examples: int | None = None, + n_repeats: int = 1, + **kwargs): + path = get_data_path(path) + dataset = DatasetDict() + df = pandas.read_csv(path) + examples = [row.to_dict() for _, row in df.iterrows()] + if num_examples: + assert n_repeats == 1, \ + 'n_repeats only supported when max_examples = None' + rng = random.Random(0) + examples = rng.sample(examples, num_examples) + examples = examples * n_repeats + dataset['train'] = Dataset.from_list(examples) + dataset['test'] = Dataset.from_list(examples) + return dataset + + +GRADER_TEMPLATE = """ +Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"]. +First, I will give examples of each grade, and then you will grade a new example. + + +The following are examples of CORRECT predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia Obama and Sasha Obama +Predicted answer 1: sasha and malia obama +Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check +Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001. +``` +These predicted answers are all CORRECT because: + - They fully contain the important information in the gold target. + - They do not contain any information that contradicts the gold target. + - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter. + - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions. + + +The following are examples of INCORRECT predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia and Sasha +Predicted answer 1: Malia. +Predicted answer 2: Malia, Sasha, and Susan. +Predicted answer 3: Barack Obama does not have any children. +Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia. +Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children. +Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer? +Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information. +``` +These predicted answers are all INCORRECT because: + - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect. + + +The following are examples of NOT_ATTEMPTED predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia and Sasha +Predicted answer 1: I don't know. +Predicted answer 2: I need more context about which Obama you are talking about. 
+Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children. +Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one. +``` +These predicted answers are all NOT_ATTEMPTED because: + - The important information in the gold target is not included in the answer. + - No statements in the answer contradict the gold target. + + +Also note the following things: +- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k". + - Predicted answers "120k", "124k", and 115k" are all CORRECT. + - Predicted answers "100k" and "113k" are INCORRECT. + - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target. +- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question. + - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer. +- Do not punish predicted answers if they omit information that would be clearly inferred from the question. + - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California". + - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question. + - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question. + - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed. +- Do not punish for typos in people's name if it's clearly the same name. + - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung". + +Grade the predicted answer of this new question as one of: +A: CORRECT +B: INCORRECT +C: NOT_ATTEMPTED +Just return the letters "A", "B", or "C", with no text around it. + +Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+``` +Question: {question} +Gold target: {gold_answer} +Predicted answer: {answer} +``` +""".strip() # noqa E501 + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +]) + + +def get_final_results(judged_answers, references, origial_responses): + count = 0 + is_correct_count = 0 + is_incorrect_count = 0 + is_not_attempted_count = 0 + details = [] + for i, j, k in zip(judged_answers, references, origial_responses): + match = re.search(r'(A|B|C)', i) + grade_letter = match.group( + 0) if match else 'C' # Default to "NOT_ATTEMPTED" if no match + detail = { + 'pred': k, + 'ref': j, + 'origin_grade_response': i, + 'grade_letter': grade_letter, + 'correct': False + } + count += 1 + if grade_letter == 'A': + is_correct_count += 1 + detail['correct'] = True + elif grade_letter == 'B': + is_incorrect_count += 1 + else: + is_not_attempted_count += 1 + details.append(detail) + + is_correct = is_correct_count / count + is_incorrect = is_incorrect_count / count + # is_not_attempted = is_not_attempted_count / count + is_given_attempted = is_correct + is_incorrect + accuracy_given_attempted = is_correct / is_given_attempted \ + if is_given_attempted > 0 else 0 + f1 = 2 * accuracy_given_attempted * is_correct / ( + accuracy_given_attempted + is_correct) if (accuracy_given_attempted + + is_correct) > 0 else 0 + result = { + 'accuracy_given_attempted': accuracy_given_attempted, + 'f1': f1, + 'details': details + } + return result + + +def _single_simpleqa_postprocess(judgement: str): + match = re.search(r'(A|B|C)', judgement) + grade_letter = match.group( + 0) if match else 'C' # Default to "NOT_ATTEMPTED" if no match + return grade_letter + + +def simpleqa_postprocess( + output: dict, + output_path: str, +) -> dict: + judged_answers = [] + origial_responses = [] + references = [] + for k, v in output.items(): + origial_responses.append(v['prediction']) + processed_judge = _single_simpleqa_postprocess(v['prediction']) + if processed_judge is not None: + judged_answers.append(processed_judge) + references.append(v['gold']) + results = get_final_results(judged_answers, references, origial_responses) + results['details'] = output + return results diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index d4241937..9ecf1176 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -352,6 +352,11 @@ DATASETS_MAPPING = { "ms_id": "", "hf_id": "", "local": "./data/arc_prize_public_evaluation", + }, + "opencompass/simpleqa": { + "ms_id": "", + "hf_id": "", + "local": "./data/simpleqa/simple_qa_test_set.csv", } } @@ -368,6 +373,10 @@ DATASETS_URL = { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmmlu_lite.zip", "md5": "a776af1220e1826fd0608eda1bc4425e", }, + "/simpleqa": { + "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/simpleqa.zip", + "md5": "1d83fc2e15798d39cb265c9a3cb5195a", + }, "/gpqa/": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/gpqa.zip", "md5": "2e9657959030a765916f1f2aca29140d", From b063779034ea22c15aa38f4ee3f31cff41bf3987 Mon Sep 17 00:00:00 2001 From: liushz Date: Thu, 28 Nov 2024 20:55:46 +0800 Subject: [PATCH 11/19] [Fix] Update P-MMEVAL OSS data (#1722) * Update with PMMEval * Update * Update __init__.py * Fix Bugs * Delete .pre-commit-config.yaml * Pull merge * Fix pmmeval_gen config * Update P-MMEVAL data --------- Co-authored-by: wanyu Co-authored-by: wanyu2018umac 
<42405907+wanyu2018umac@users.noreply.github.com>
---
 opencompass/utils/datasets_info.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py
index 9ecf1176..3011a20e 100644
--- a/opencompass/utils/datasets_info.py
+++ b/opencompass/utils/datasets_info.py
@@ -583,6 +583,6 @@ DATASETS_URL = {
     },
     "P-MMEval": {
         "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/pmmeval.zip",
-        "md5": "589c8be1551a609d94231f1410cf22eb",
+        "md5": "09e401e6229a50647b9e13c429e634d1",
     }
 }

From fe6d76fb13030680fa023940139ee11fc825e622 Mon Sep 17 00:00:00 2001
From: Junnan Liu
Date: Sat, 30 Nov 2024 00:07:19 +0800
Subject: [PATCH 12/19] [Feature] Support LiveMathBench (#1727)

---
 .../configs/datasets/livemathbench/README.md | 74 ++++
 .../livemathbench/livemathbench_gen.py | 4 +
 .../livemathbench/livemathbench_gen_caed8f.py | 49 +++
 .../datasets/livemathbench/__init__.py | 2 +
 .../datasets/livemathbench/livemathbench.py | 324 ++++++++++++++++++
 opencompass/datasets/livemathbench/prompts.py | 70 ++++
 6 files changed, 523 insertions(+)
 create mode 100644 opencompass/configs/datasets/livemathbench/README.md
 create mode 100644 opencompass/configs/datasets/livemathbench/livemathbench_gen.py
 create mode 100644 opencompass/configs/datasets/livemathbench/livemathbench_gen_caed8f.py
 create mode 100644 opencompass/datasets/livemathbench/__init__.py
 create mode 100644 opencompass/datasets/livemathbench/livemathbench.py
 create mode 100644 opencompass/datasets/livemathbench/prompts.py

diff --git a/opencompass/configs/datasets/livemathbench/README.md b/opencompass/configs/datasets/livemathbench/README.md
new file mode 100644
index 00000000..fd506c0f
--- /dev/null
+++ b/opencompass/configs/datasets/livemathbench/README.md
@@ -0,0 +1,74 @@
+# LiveMathBench
+
+## Details of Datasets
+
+| dataset | language | #single-choice | #multiple-choice | #fill-in-the-blank | #problem-solving |
+| -- | -- | -- | -- | -- | -- |
+| AIMC | cn | 46 | 0 | 0 | 0 |
+| AIMC | en | 46 | 0 | 0 | 0 |
+| CEE | cn | 28 | 9 | 13 | 3 |
+| CEE | en | 28 | 9 | 13 | 3 |
+| CMO | cn | 0 | 0 | 0 | 18 |
+| CMO | en | 0 | 0 | 0 | 18 |
+
+
+## How to use
+
+
+```python
+from mmengine.config import read_base
+
+with read_base():
+    from opencompass.datasets.livemathbench import livemathbench_datasets
+
+livemathbench_datasets[0].update(
+    {
+        'path': '/path/to/data/dir',
+        'k': 'k@pass', # the max value of k in k@pass
+        'n': 'number of runs', # number of runs
+    }
+)
+livemathbench_datasets[0]['eval_cfg']['evaluator'].update(
+    {
+        'model_name': 'Qwen/Qwen2.5-72B-Instruct',
+        'url': [
+            'http://0.0.0.0:23333/v1',
+            '...'
+        ] # set url of evaluation models
+    }
+)
+
+```
+
+> ❗️ At present, `extract_from_boxed` is used to extract answers from model responses. You can also use an LLM for extraction through the following parameters, but this part of the code has not been tested.
+
+```python
+livemathbench_datasets[0]['eval_cfg']['evaluator'].update(
+    {
+        'model_name': 'Qwen/Qwen2.5-72B-Instruct',
+        'url': [
+            'http://0.0.0.0:23333/v1',
+            '...'
+        ], # set url of evaluation models
+
+        # for LLM-based extraction
+        'use_extract_model': True,
+        'post_model_name': 'oc-extractor',
+        'post_url': [
+            'http://0.0.0.0:21006/v1',
+            '...'
+ ] + } +) +``` + +## Output Samples + +| dataset | version | metric | mode | Qwen2.5-72B-Instruct | +|----- | ----- | ----- | ----- | -----| +| LiveMathBench | caed8f | 1@pass | gen | 26.07 | +| LiveMathBench | caed8f | 1@pass/std | gen | xx.xx | +| LiveMathBench | caed8f | 2@pass | gen | xx.xx | +| LiveMathBench | caed8f | 2@pass/std | gen | xx.xx | +| LiveMathBench | caed8f | pass-rate | gen | xx.xx | + diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_gen.py b/opencompass/configs/datasets/livemathbench/livemathbench_gen.py new file mode 100644 index 00000000..c0bd6477 --- /dev/null +++ b/opencompass/configs/datasets/livemathbench/livemathbench_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .livemathbench_gen_caed8f import livemathbench_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_gen_caed8f.py b/opencompass/configs/datasets/livemathbench/livemathbench_gen_caed8f.py new file mode 100644 index 00000000..d0f73023 --- /dev/null +++ b/opencompass/configs/datasets/livemathbench/livemathbench_gen_caed8f.py @@ -0,0 +1,49 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator + + +livemathbench_reader_cfg = dict( + input_columns=['prompt'], + output_column='answer' +) + +livemathbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict( + type=GenInferencer, + max_out_len=2048, + temperature=1.0 + ) +) + +livemathbench_eval_cfg = dict( + evaluator=dict( + type=LiveMathBenchEvaluator, + model_name='Qwen/Qwen2.5-72B-Instruct', + url=[] + ) +) + +livemathbench_datasets = [ + dict( + type=LiveMathBenchDataset, + abbr='LiveMathBench', + path='', + k=32, + n=5, + reader_cfg=livemathbench_reader_cfg, + infer_cfg=livemathbench_infer_cfg, + eval_cfg=livemathbench_eval_cfg + ) +] \ No newline at end of file diff --git a/opencompass/datasets/livemathbench/__init__.py b/opencompass/datasets/livemathbench/__init__.py new file mode 100644 index 00000000..2da0531f --- /dev/null +++ b/opencompass/datasets/livemathbench/__init__.py @@ -0,0 +1,2 @@ +from .livemathbench import LiveMathBenchDataset # noqa: F401, F403 +from .livemathbench import LiveMathBenchEvaluator # noqa: F401, F403 diff --git a/opencompass/datasets/livemathbench/livemathbench.py b/opencompass/datasets/livemathbench/livemathbench.py new file mode 100644 index 00000000..0a4cf4ce --- /dev/null +++ b/opencompass/datasets/livemathbench/livemathbench.py @@ -0,0 +1,324 @@ +import concurrent.futures +import os +import re +from copy import deepcopy +from itertools import product +from typing import Any, Dict, List + +import jsonlines +import numpy as np +from datasets import Dataset + +from opencompass.models import OpenAISDK +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET, MODELS + +from ..base import BaseDataset +from .prompts import (EXTRACT_PROMPT_CN, EXTRACT_PROMPT_EN, JUDGE_PROMPT_CN, + JUDGE_PROMPT_EN, PROMPT_CN, PROMPT_EN) + + +@LOAD_DATASET.register_module() +class LiveMathBenchDataset(BaseDataset): + dataset_splits = ['AIMC', 'CEE', 'CMO'] + 
dataset_languages = ['cn', 'en'] + + @staticmethod + def load( + path: str, + k: int, + n: int, + ) -> List[Dict[str, Any]]: + dataset = [] + dataset_info = {} + for split, language in product(LiveMathBenchDataset.dataset_splits, + LiveMathBenchDataset.dataset_languages): + file_path = os.path.join(path, f'{split}_{language}.jsonl') + dataset_info[f'{split}_{language}'] = { + 'single-choice': 0, + 'multiple-choice': 0, + 'fill-in-the-blank': 0, + 'problem-solving': 0 + } + question_type_mapping = { + '单选': 'single-choice', + '多选': 'multiple-choice', + '填空': 'fill-in-the-blank', + '问答': 'problem-solving' + } + with jsonlines.open(file_path, 'r') as file: + for example_idx, example in enumerate(file): + dataset_info[f'{split}_{language}'][ + example['question_type'] if language == 'en' else + question_type_mapping[example['question_type']]] += 1 + + prompt = PROMPT_EN if language == 'en' else PROMPT_CN + example.update({ + 'dataset_key': + f'{split}_{language}_{example_idx}', + 'prompt': + prompt.format(question_type=example['question_type'], + question=example['question'] + + ('' if 'options' not in example else + ' '.join(example['options']))), + 'k': + k, + 'n': + n + }) + for idx in range(k * n): + duplicated_example = deepcopy(example) + duplicated_example.update({'duplicated_idx': idx}) + dataset.append(duplicated_example) + + return Dataset.from_list(dataset) + + +@ICL_EVALUATORS.register_module() +class LiveMathBenchEvaluator(BaseEvaluator): + api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ]) + + def __init__(self, + model_name, + url, + with_postprocess=True, + use_extract_model=False, + post_url=[], + post_model_name='', + **kwargs): + if isinstance(url, str): + url = [url] + + self.model = [ + MODELS.build( + dict( + type=OpenAISDK, + path=model_name, + openai_api_base=url, + key='EMPTY', + query_per_second=2, + meta_template=self.api_meta_template, + temperature=kwargs.get('temperature', 0.01), + max_seq_len=kwargs.get('max_tokens', 2048), + )) for url in url + ] + self.with_postprocess = with_postprocess + self.use_extract_model = use_extract_model + self.post_url = post_url + self.post_model_name = post_model_name + + def batch_response(self, models: List[OpenAISDK], + inputs: List[str]) -> List[str]: + batch_num = len(models) + batch_size = (len(inputs) + batch_num - 1) // batch_num + result_responses = [] + + with concurrent.futures.ThreadPoolExecutor( + max_workers=batch_num) as executor: + futures = [ + executor.submit(models[i].generate, + inputs[i * batch_size:(i + 1) * batch_size]) + for i in range(batch_num) + ] + for response in executor.map(lambda f: f.result(), futures): + result_responses.extend(response) + + return result_responses + + def postprocess(self, questions: List[str], predictions: List[str], + question_types: List[str], + languages: List[str]) -> List[str]: + if self.use_extract_model: + assert len(self.post_url) > 0 and self.post_model_name != '' + post_model = [ + MODELS.build( + dict( + type=OpenAISDK, + path=self.post_model_name, + openai_api_base=url, + key='EMPTY', + query_per_second=2, + meta_template=self.api_meta_template, + temperature=0.01, + max_seq_len=1024, + )) for url in self.post_url + ] + + input_prompts = [] + for question, prediction, question_type, language in zip( + questions, predictions, question_types, languages): + prompt = (EXTRACT_PROMPT_EN + if language == 'en' else EXTRACT_PROMPT_CN) + input_prompts.append( + prompt.format(question=question, + 
response=prediction, + question_type=question_type)) + + result_responses = self.batch_response(post_model, input_prompts) + + return result_responses + + def last_boxed_only_string(string): + idx = string.rfind('\\boxed') + if idx < 0: + idx = string.rfind('\\fbox') + if idx < 0: + return None + + i = idx + right_brace_idx = None + num_left_braces_open = 0 + while i < len(string): + if string[i] == '{': + num_left_braces_open += 1 + if string[i] == '}': + num_left_braces_open -= 1 + if num_left_braces_open == 0: + right_brace_idx = i + break + i += 1 + + if right_brace_idx is None: + retval = None + else: + retval = string[idx:right_brace_idx + 1] + + return retval + + def remove_boxed(s): + left = '\\boxed{' + try: + assert s[:len(left)] == left + assert s[-1] == '}' + return s[len(left):-1] + except Exception: + return None + + def extract_boxed_answer(pred_str, strip_double_curly_brace=False): + boxed_str = last_boxed_only_string(pred_str) + if boxed_str is None: + return None + answer = remove_boxed(boxed_str) + if answer is None: + return None + if strip_double_curly_brace: + match = re.match('^\{(.*)\}$', answer) # noqa: W605 + if match: + answer = match.group(1) + return answer + + predictions = [ + extract_boxed_answer(prediction) for prediction in predictions + ] + return predictions + + def extract_boxed_answer(self, text): + match = re.findall(r'\\boxed{(.+?)}', text) + if match: + return match[-1] + + return None + + def score(self, predictions, references, origin_prompt, test_set): + if len(predictions) != len(references): + return {'error': 'preds and refrs have different length'} + + questions = test_set['question'] + question_types = test_set['question_type'] + languages = [key.split('_')[1] for key in test_set['dataset_key']] + + if self.with_postprocess: + predictions = self.postprocess(questions, predictions, + question_types, languages) + + inputs = [] + for prediction, reference, question, language in zip( + predictions, references, questions, languages): + prompt = JUDGE_PROMPT_EN if language == 'en' else JUDGE_PROMPT_CN + inputs.append( + prompt.format(answer=prediction, + gold_answer=reference, + question=question)) + result_responses = self.batch_response(self.model, inputs) + results = [ + self.extract_boxed_answer(result) == 'yes' + for result in result_responses + ] + + K = test_set['k'][0] + N = test_set['n'][0] + key2example = {} + + for example, result_response, result, prediction in zip( + test_set, result_responses, results, predictions): + if example['dataset_key'] not in key2example: + key2example[example['dataset_key']] = [] + example.update({ + 'eval_response': result_response, + 'prediction': prediction, + 'correct': result + }) + key2example[example['dataset_key']].append(example) + for key in key2example: + key2example[key] = [ + key2example[key][i * K:(i + 1) * K] for i in range(N) + ] + + count = [] + total_pass_num = [] + details = [] + for key, examples in key2example.items(): + detail = { + 'question': examples[0][0]['question'], + 'answer': examples[0][0]['answer'], + 'responses': [] + } + if_pass_list = [] + for single_run_examples in examples: + detail['responses'].append([]) + if_pass_list.append([]) + for example in single_run_examples: + detail['responses'][-1].append({ + 'prediction': + example['prediction'], + 'eval_response': + example['eval_response'] + }) + if_pass_list[-1].append(1.0 if example['correct'] else 0.0) + + if_pass_list = [ + sorted(if_pass, reverse=True) for if_pass in if_pass_list + ] + if_pass_list = 
np.array(if_pass_list) + i = 1 + while i <= K: + detail.update({ + f'{i}@pass': + if_pass_list[:, :i].mean(axis=1).mean(axis=0).item(), + f'{i}@pass/std': + if_pass_list[:, :i].mean(axis=1).std(axis=0).item() + }) + i = i * 2 + + count.append(np.ones_like(if_pass_list).sum(axis=1)) + total_pass_num.append(if_pass_list.sum(axis=1)) + + details.append(detail) + + detailed_result = {'details': details} + i = 1 + while i <= K: + detailed_result.update({ + f'{i}@pass': + 100. * np.mean([detail[f'{i}@pass'] for detail in details]), + f'{i}@pass/std': + 100. * np.mean([detail[f'{i}@pass/std'] for detail in details]) + }) + i = i * 2 + detailed_result.update( + {'pass-rate': 100. * np.mean(sum(total_pass_num) / sum(count))}) + + return detailed_result diff --git a/opencompass/datasets/livemathbench/prompts.py b/opencompass/datasets/livemathbench/prompts.py new file mode 100644 index 00000000..540cfaa2 --- /dev/null +++ b/opencompass/datasets/livemathbench/prompts.py @@ -0,0 +1,70 @@ +# flake8: noqa + +EXTRACT_PROMPT_CN = '''你是一个乐于助人的助手,任务是从给定的回答句子中提取精确的关键答案。你必须只提供提取的关键答案,不包括任何额外的文字。 +— +我将为你提供一个问题、回答句子和问题类型。回答句子是对所提供问题的回应。利用提供的信息,你必须准确而精确地确定并从回答句子中提取预期的关键答案。请不要对问题发表主观看法。 + +对于单选题,答案应该是选项字母,例如 "A"; +对于多选题,答案应该是一个选项字母的列表,例如 ["A"] 或 ["A", "B", "C"]; +对于填空题,答案应该是一个填入空白处的答案列表,列表的数量应该与问题中的空白数量相同,同一空白的答案可能有多个,请在同一个 string 中用逗号隔开表示,如 ['sqrt(x) 且 x > 10', '1/2, 1/3', '1/4'] 代表问题包含三小问,第一小问包含取值范围信息,第二小问有两个答案,第三小问有一个答案。 +对于解答题,类似填空题,答案应该是一个答案列表,每小问的答案间用逗号隔开,同样需要注意某些小问答案多个的情况。 + +如果回答句子提供了多个不同的答案,请仔细判断后面提供的答案是否是对前面答案的修正或修改。如果是这样,提取这个修正或修改后的答案作为最终答案。相反,如果回答句子在多个答案之间波动而没有明确的最终答案,你应该输出 [No valid answer]。 +— +问题类型: {question_type} +原始问题: {question} +回答: {response} +提取的关键答案: +''' + +EXTRACT_PROMPT_EN = '''You are a helpful assistant whose task is to extract precise key answers from given response sentences. You must only provide the extracted key answers without any additional text. +— +I will provide you with a question, a response sentence, and the question type. The response sentence is a reply to the provided question. Using the provided information, you must accurately and precisely identify and extract the expected key answers from the response sentence. Please do not provide subjective opinions about the question. + +For single-choice questions, the answer should be the letter of the option, such as "A". +For multiple-choice questions, the answer should be a list of option letters, such as ["A"] or ["A", "B", "C"]. +For fill-in-the-blank questions, the answer should be a list of answers to fill in the blanks. The number of items in the list should match the number of blanks in the question. If there are multiple answers for the same blank, separate them with a comma within the same string, like ['sqrt(x) and x > 10', '1/2, 1/3', '1/4'], which represents three sub-questions where the first sub-question includes a range, the second sub-question has two answers, and the third sub-question has one answer. +For problem-solving questions, similar to fill-in-the-blank questions, the answer should be a list of answers. Separate answers for different sub-questions with commas, and note that some sub-questions may have multiple answers. + +If the response sentence provides multiple different answers, carefully determine whether a later provided answer is a correction or modification of an earlier answer. If so, extract this corrected or modified answer as the final answer. Conversely, if the response sentence fluctuates between multiple answers without a clear final answer, you should output [No valid answer]. 
+— +Question type: {question_type} +Question: {question} +Output sentences: {response} +Key extracted answer: +''' + +JUDGE_PROMPT_CN = '''请你作为一个数学阅卷专家,判断下面的答案是否与标准答案一致,即考生是否回答正确。下面是一些评判标准: +1. 有些答案可能包含多项内容,可能有单选题,多选题,填空题和问答题,只要答案与标准答案一致即可, 对于多选题和多个空的填空题,需要考生对应的选项或空都回答正确才算正确。 +2. 有些答案可能通过不同的方式表达,比如有些答案可能是一个数学表达式,有些答案可能是一个文字描述,只要表达的意思一致即可。且有些公式通过不同的方式表达,但等价,也是正确的。 +3. 你不需要重新计算问题答案,因为标准答案已经给出,只需要根据问题形式来判断考生的答案是否与标准答案一致,是否正确即可。 + +请你根据上述标准,判断下面的答案是否与标准答案一致,如果一致,请在最后输出\\boxed{{yes}}, 否则输出\\boxed{{no}}, 如果难以判断,请输出\\boxed{{no}}. +原问题:{question} +标准答案:{gold_answer} +考生答案:{answer} + +分析: +''' + +JUDGE_PROMPT_EN = '''Please act as an expert in grading mathematics exam papers, and judge whether the following answers match the standard answers, i.e., whether the examinee answered correctly. Here are some evaluation criteria: + +1. Some answers may contain multiple parts, such as single-choice questions, multiple-choice questions, fill-in-the-blank questions, and problem-solving questions. As long as the answer matches the standard answer, it is considered correct. For multiple-choice questions and fill-in-the-blank questions with multiple blanks, the examinee must answer all corresponding options or blanks correctly to be considered correct. +2. Some answers may be expressed in different ways; for example, some answers may be mathematical expressions, while others may be textual descriptions. As long as the meaning conveyed is consistent, it is considered correct. Additionally, some formulas may be expressed differently but are equivalent, which is also considered correct. +3. You do not need to recalculate the problem answers, as the standard answers are already provided. You only need to judge whether the examinee's answer matches the standard answer based on the form of the question and whether it is correct. + +Please judge whether the following answer matches the standard answer according to the above criteria. If they match, output \\boxed{{yes}}, otherwise output \\boxed{{no}}. If it is difficult to judge, also output \\boxed{{no}}. +Original Question: {question} +Standard Answer: {gold_answer} +Examinee's Answer: {answer} + +Analysis: +''' + +PROMPT_CN = '''下面是一个{question_type}类型的数学问题,请逐步推理,并把最终答案放置于\\boxed{{}}中。 +{question} +''' + +PROMPT_EN = '''Here is a {question_type} type math problem, please reasoning step by step, and put your answer in \\boxed{{}}. 
+{question} +''' From 9de27b4d85d9fd8439d43f6fe133338c741b427e Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Mon, 2 Dec 2024 11:42:07 +0800 Subject: [PATCH 13/19] [Update] Update max_out_len for datasets (#1726) * [Update] Update max_out_len for datasets * Update eval_regression_chat_objective_fullbench.py * Update eval_regression_chat.py * Update eval_regression_chat.py * Update oc_score_baseline_fullbench.yaml --------- Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> --- ...val_regression_chat_objective_fullbench.py | 4 +- .../scripts/oc_score_baseline_fullbench.yaml | 2 +- .../compassarena/compassarena_compare.py | 2 +- .../gsm8k/gsm8k_0shot_v2_gen_6e39a4.py | 37 +++++ .../configs/datasets/musr/musr_gen_3622bb.py | 135 ++++++++++++++++++ .../compassarena/compassarena_compare.py | 2 +- .../lmdeploy_mistral_large_instruct_2411.py | 22 +++ opencompass/datasets/__init__.py | 1 + opencompass/models/gemini_api.py | 31 ++-- 9 files changed, 218 insertions(+), 18 deletions(-) create mode 100644 opencompass/configs/datasets/gsm8k/gsm8k_0shot_v2_gen_6e39a4.py create mode 100644 opencompass/configs/datasets/musr/musr_gen_3622bb.py create mode 100644 opencompass/configs/models/mistral/lmdeploy_mistral_large_instruct_2411.py diff --git a/.github/scripts/eval_regression_chat_objective_fullbench.py b/.github/scripts/eval_regression_chat_objective_fullbench.py index c66fba33..368fe040 100644 --- a/.github/scripts/eval_regression_chat_objective_fullbench.py +++ b/.github/scripts/eval_regression_chat_objective_fullbench.py @@ -22,7 +22,7 @@ with read_base(): from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \ gpqa_datasets # noqa: F401, E501 # new datasets in Fullbench v1.1 - from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \ + from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_6e39a4 import \ gsm8k_datasets # noqa: F401, E501 from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \ hellaswag_datasets # noqa: F401, E501 @@ -46,7 +46,7 @@ with read_base(): mmlu_pro_datasets # noqa: F401, E501 from opencompass.configs.datasets.mmmlu_lite.mmmlu_lite_gen_c51a84 import \ mmmlu_lite_datasets # noqa: F401, E501 - from opencompass.configs.datasets.musr.musr_gen_3c6e15 import \ + from opencompass.configs.datasets.musr.musr_gen_3622bb import \ musr_datasets # noqa: F401, E501 from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \ nq_datasets # noqa: F401, E501 diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index 413a99a3..49393e05 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -70,7 +70,7 @@ internlm2_5-7b-chat-turbomind_fullbench: drop: 75 hellaswag: 81.25 TheoremQA: 6.25 - musr_average: 39.58 + musr_average: 37.5 gsm8k: 68.75 math: 75 GPQA_diamond: 25 diff --git a/configs/datasets/subjective/compassarena/compassarena_compare.py b/configs/datasets/subjective/compassarena/compassarena_compare.py index e175a787..2c9b3e9b 100644 --- a/configs/datasets/subjective/compassarena/compassarena_compare.py +++ b/configs/datasets/subjective/compassarena/compassarena_compare.py @@ -118,7 +118,7 @@ for _name, _prompt in sub_map.items(): ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048), + inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096), ) subjective_eval_cfg = 
dict( diff --git a/opencompass/configs/datasets/gsm8k/gsm8k_0shot_v2_gen_6e39a4.py b/opencompass/configs/datasets/gsm8k/gsm8k_0shot_v2_gen_6e39a4.py new file mode 100644 index 00000000..3888678c --- /dev/null +++ b/opencompass/configs/datasets/gsm8k/gsm8k_0shot_v2_gen_6e39a4.py @@ -0,0 +1,37 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator +from opencompass.datasets import MATHEvaluator, math_postprocess_v2 + +gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') + +gsm8k_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048), +) + +gsm8k_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2), + dataset_postprocessor=dict(type=gsm8k_dataset_postprocess), +) + +gsm8k_datasets = [ + dict( + abbr='gsm8k', + type=GSM8KDataset, + path='opencompass/gsm8k', + reader_cfg=gsm8k_reader_cfg, + infer_cfg=gsm8k_infer_cfg, + eval_cfg=gsm8k_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/musr/musr_gen_3622bb.py b/opencompass/configs/datasets/musr/musr_gen_3622bb.py new file mode 100644 index 00000000..93c065f0 --- /dev/null +++ b/opencompass/configs/datasets/musr/musr_gen_3622bb.py @@ -0,0 +1,135 @@ +from opencompass.datasets import MusrDataset, MusrEvaluator +from opencompass.openicl import PromptTemplate, ZeroRetriever, GenInferencer + + +DATASET_CONFIGS = { + 'murder_mysteries': { + 'abbr': 'musr_murder_mysteries', + 'name': 'murder_mysteries', + 'path': 'opencompass/musr', + 'reader_cfg': dict( + input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], + output_column='gold_answer', + ), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048), + ), + 'eval_cfg': dict( + evaluator=dict( + type=MusrEvaluator, + answer_index_modifier=1, + self_consistency_n=1 + ), + ), + }, + 'object_placements': { + 'abbr': 'musr_object_placements', + 'name': 'object_placements', + 'path': 'opencompass/musr', + 'reader_cfg': dict( + input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], + output_column='gold_answer', + ), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ), + 'eval_cfg': dict( + 
evaluator=dict( + type=MusrEvaluator, + answer_index_modifier=1, + self_consistency_n=1 + ), + ), + }, + 'team_allocation': { + 'abbr': 'musr_team_allocation', + 'name': 'team_allocation', + 'path': 'opencompass/musr', + 'reader_cfg': dict( + input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], + output_column='gold_answer', + ), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ), + 'eval_cfg': dict( + evaluator=dict( + type=MusrEvaluator, + answer_index_modifier=1, + self_consistency_n=1 + ), + ), + }, +} + + +musr_datasets = [] + +for config in DATASET_CONFIGS.values(): + dataset = dict( + abbr=config['abbr'], + type=MusrDataset, + path=config['path'], + name=config['name'], + reader_cfg=config['reader_cfg'], + infer_cfg=config['infer_cfg'], + eval_cfg=config['eval_cfg'], + ) + musr_datasets.append(dataset) diff --git a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py index e175a787..2c9b3e9b 100644 --- a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py +++ b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py @@ -118,7 +118,7 @@ for _name, _prompt in sub_map.items(): ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048), + inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/models/mistral/lmdeploy_mistral_large_instruct_2411.py b/opencompass/configs/models/mistral/lmdeploy_mistral_large_instruct_2411.py new file mode 100644 index 00000000..205dc27b --- /dev/null +++ b/opencompass/configs/models/mistral/lmdeploy_mistral_large_instruct_2411.py @@ -0,0 +1,22 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='mixtral-large-instruct-2411-turbomind', + path='mistralai/Mistral-Large-Instruct-2411', + engine_config=dict( + session_len=32768, + max_batch_size=16, + tp=4, + cache_max_entry_count=0.7, + ), + gen_config=dict( + top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096 + ), + max_seq_len=32768, + max_out_len=4096, + batch_size=8, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 20e6fecb..a590b8dd 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -6,6 +6,7 @@ from .anli import AnliDataset # noqa: F401, F403 from .anthropics_evals import * # noqa: F401, F403 from .apps import * # noqa: F401, F403 from .arc import * # noqa: F401, F403 +from .arc_prize_public_evaluation import * # noqa: F401, F403 from .ax import * # noqa: F401, F403 from .babilong import * # noqa: F401, F403 from .bbh import * # noqa: F401, F403 diff --git a/opencompass/models/gemini_api.py b/opencompass/models/gemini_api.py index 7695b218..0020cf22 100644 --- a/opencompass/models/gemini_api.py +++ b/opencompass/models/gemini_api.py @@ -44,11 +44,13 @@ class 
Gemini(BaseAPIModel): top_p: float = 0.8, top_k: float = 10.0, ): - super().__init__(path=path, - max_seq_len=max_seq_len, - query_per_second=query_per_second, - meta_template=meta_template, - retry=retry) + super().__init__( + path=path, + max_seq_len=max_seq_len, + query_per_second=query_per_second, + meta_template=meta_template, + retry=retry, + ) assert isinstance(key, str) if key == 'ENV': if 'GEMINI_API_KEY' not in os.environ: @@ -56,8 +58,11 @@ class Gemini(BaseAPIModel): key = os.getenv('GEMINI_API_KEY') assert path in [ - 'gemini-1.0-pro', 'gemini-pro', 'gemini-1.5-flash', - 'gemini-1.5-pro' + 'gemini-1.0-pro', + 'gemini-pro', + 'gemini-1.5-flash', + 'gemini-1.5-pro', + 'gemini-1.5-pro-latest', ] # https://ai.google.dev/gemini-api/docs/models/gemini#model-variations self.url = f'https://generativelanguage.googleapis.com/v1beta/models/{path}:generateContent?key={key}' @@ -147,19 +152,19 @@ class Gemini(BaseAPIModel): 'safetySettings': [ { 'category': 'HARM_CATEGORY_DANGEROUS_CONTENT', - 'threshold': 'BLOCK_NONE' + 'threshold': 'BLOCK_NONE', }, { 'category': 'HARM_CATEGORY_HATE_SPEECH', - 'threshold': 'BLOCK_NONE' + 'threshold': 'BLOCK_NONE', }, { 'category': 'HARM_CATEGORY_HARASSMENT', - 'threshold': 'BLOCK_NONE' + 'threshold': 'BLOCK_NONE', }, { 'category': 'HARM_CATEGORY_DANGEROUS_CONTENT', - 'threshold': 'BLOCK_NONE' + 'threshold': 'BLOCK_NONE', }, ], 'generationConfig': { @@ -167,8 +172,8 @@ class Gemini(BaseAPIModel): 'temperature': self.temperature, 'maxOutputTokens': 2048, 'topP': self.top_p, - 'topK': self.top_k - } + 'topK': self.top_k, + }, } for _ in range(self.retry): From 98c4666d657619eb5ef99bb2fdbb665ab65dc777 Mon Sep 17 00:00:00 2001 From: Yufeng Zhao <115388472+epsilondylan@users.noreply.github.com> Date: Mon, 2 Dec 2024 16:20:58 +0800 Subject: [PATCH 14/19] [Update] Update Korbench dataset abbr (#1729) Co-authored-by: yufeng zhao --- .../configs/datasets/korbench/korbench_single_0_shot_gen.py | 2 +- .../configs/datasets/korbench/korbench_single_3_shot_gen.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py b/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py index d04c9f60..8a7824b7 100644 --- a/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py +++ b/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py @@ -48,7 +48,7 @@ for category in categories: korbench_dataset = dict( type=korbenchDataset, - abbr=f"korbench_{category}_0shot", + abbr=f"korbench_{category}", path="opencompass/korbench", mode='0_shot', category=category, diff --git a/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py b/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py index 0d70f5f8..dc959189 100644 --- a/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py +++ b/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py @@ -49,7 +49,7 @@ for category in categories: korbench_dataset = dict( type=korbenchDataset, - abbr=f"korbench_{category}_3shot", + abbr=f"korbench_{category}", path="opencompass/korbench", mode='3_shot', category=category, From e2a290fd46f900cfb7a7e86f79d2c763dad17c43 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Tue, 3 Dec 2024 19:34:57 +0800 Subject: [PATCH 15/19] [Bump] Bump version to 0.3.7 (#1733) --- opencompass/__init__.py | 6 ++++-- .../deepseek/lmdeploy_deepseek_v2_lite.py | 2 +- .../configs/models/gemma/lmdeploy_gemma_27b.py | 17 +++++++++++++++++ 
.../configs/models/gemma/lmdeploy_gemma_9b.py | 17 +++++++++++++++++ 4 files changed, 39 insertions(+), 3 deletions(-) create mode 100644 opencompass/configs/models/gemma/lmdeploy_gemma_27b.py create mode 100644 opencompass/configs/models/gemma/lmdeploy_gemma_9b.py diff --git a/opencompass/__init__.py b/opencompass/__init__.py index 0172d916..fba123f3 100644 --- a/opencompass/__init__.py +++ b/opencompass/__init__.py @@ -1,8 +1,9 @@ -__version__ = '0.3.6' +__version__ = '0.3.7' def _warn_about_config_migration(): import warnings + warnings.warn( 'Starting from v0.4.0, all AMOTIC configuration files currently ' 'located in `./configs/datasets`, `./configs/models`, and ' @@ -10,7 +11,8 @@ def _warn_about_config_migration(): '`opencompass/configs/` package. Please update your configuration ' 'file paths accordingly.', UserWarning, # Changed to UserWarning - stacklevel=2) + stacklevel=2, + ) # Trigger the warning diff --git a/opencompass/configs/models/deepseek/lmdeploy_deepseek_v2_lite.py b/opencompass/configs/models/deepseek/lmdeploy_deepseek_v2_lite.py index bd67b684..0623ee02 100644 --- a/opencompass/configs/models/deepseek/lmdeploy_deepseek_v2_lite.py +++ b/opencompass/configs/models/deepseek/lmdeploy_deepseek_v2_lite.py @@ -3,7 +3,7 @@ from opencompass.models import TurboMindModelwithChatTemplate models = [ dict( type=TurboMindModelwithChatTemplate, - abbr='deepseek-v2_lite-turbomind', + abbr='deepseek-v2_lite-chat-turbomind', path='deepseek-ai/DeepSeek-V2-Lite-Chat', engine_config=dict( session_len=7168, diff --git a/opencompass/configs/models/gemma/lmdeploy_gemma_27b.py b/opencompass/configs/models/gemma/lmdeploy_gemma_27b.py new file mode 100644 index 00000000..27867ef9 --- /dev/null +++ b/opencompass/configs/models/gemma/lmdeploy_gemma_27b.py @@ -0,0 +1,17 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='gemma-2-27b-turbomind', + path='google/gemma-2-27b', + engine_config=dict(session_len=16384, max_batch_size=16, tp=2), + gen_config=dict( + top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096 + ), + max_seq_len=16384, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/opencompass/configs/models/gemma/lmdeploy_gemma_9b.py b/opencompass/configs/models/gemma/lmdeploy_gemma_9b.py new file mode 100644 index 00000000..44ac3501 --- /dev/null +++ b/opencompass/configs/models/gemma/lmdeploy_gemma_9b.py @@ -0,0 +1,17 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='gemma-2-9b-turbomind', + path='google/gemma-2-9b', + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict( + top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096 + ), + max_seq_len=16384, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] From a011be6798dac2e16f76630b6816d86acfa2f1e0 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Wed, 4 Dec 2024 18:03:12 +0800 Subject: [PATCH 16/19] [Feature] DLC runner Lark report (#1735) * [Bump] Bump version to 0.3.7 * DLC lark report update --- opencompass/runners/base.py | 3 ++- opencompass/runners/dlc.py | 54 +++++++++++++++++++++++-------------- opencompass/runners/volc.py | 2 +- 3 files changed, 37 insertions(+), 22 deletions(-) diff --git a/opencompass/runners/base.py b/opencompass/runners/base.py index 44bb2ac9..3cd2820f 100644 --- a/opencompass/runners/base.py +++ b/opencompass/runners/base.py @@ -77,7 +77,8 @@ class BaseRunner: else: content = f'{getpass.getuser()}\'s ' content += 
f'{self.task_cfg.type} tasks finished. ' - content += f'{num_succeeded} tasks succeeded.' + content += f'{num_succeeded} tasks succeeded.\n' + content += '\n'.join([task for task, _ in status]) self.lark_reporter.post(title='Great news: all tasks ' 'finished!', content=content) diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py index 45f7ec82..22a189d5 100644 --- a/opencompass/runners/dlc.py +++ b/opencompass/runners/dlc.py @@ -15,7 +15,7 @@ from mmengine.config import ConfigDict from mmengine.utils import track_parallel_progress from opencompass.registry import RUNNERS, TASKS -from opencompass.utils import get_logger +from opencompass.utils import LarkReporter, get_logger from .base import BaseRunner @@ -35,15 +35,17 @@ class DLCRunner(BaseRunner): lark_bot_url (str): Lark bot url. Default: None. """ - def __init__(self, - task: ConfigDict, - aliyun_cfg: ConfigDict, - max_num_workers: int = 32, - eval_with_gpu: list = ['plugin_eval'], - retry: int = 2, - debug: bool = False, - lark_bot_url: str = None, - keep_tmp_file: bool = False): + def __init__( + self, + task: ConfigDict, + aliyun_cfg: ConfigDict, + max_num_workers: int = 32, + eval_with_gpu: list = ['plugin_eval'], + retry: int = 2, + debug: bool = False, + lark_bot_url: str = None, + keep_tmp_file: bool = True, + ): super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url) self.aliyun_cfg = aliyun_cfg self.max_num_workers = max_num_workers @@ -51,6 +53,10 @@ class DLCRunner(BaseRunner): self.eval_with_gpu = eval_with_gpu self.keep_tmp_file = keep_tmp_file + if lark_bot_url: + self.lark_reporter = LarkReporter(lark_bot_url) + else: + self.lark_reporter = None logger = get_logger() logger.warning( 'To ensure the integrity of the log results, the log displayed ' @@ -68,10 +74,12 @@ class DLCRunner(BaseRunner): """ if not self.debug: - status = track_parallel_progress(self._launch, - tasks, - nproc=self.max_num_workers, - keep_order=False) + status = track_parallel_progress( + self._launch, + tasks, + nproc=self.max_num_workers, + keep_order=False, + ) else: status = [self._launch(task, random_sleep=False) for task in tasks] return status @@ -92,7 +100,7 @@ class DLCRunner(BaseRunner): tuple[str, int]: Task name and exit code. """ if random_sleep is None: - random_sleep = (self.max_num_workers > 32) + random_sleep = self.max_num_workers > 32 task = TASKS.build(dict(cfg=cfg, type=self.task_cfg['type'])) num_gpus = task.num_gpus @@ -109,6 +117,7 @@ class DLCRunner(BaseRunner): mmengine.mkdir_or_exist('tmp/') # Using uuid to avoid filename conflict import uuid + uuid_str = str(uuid.uuid4()) param_file = f'tmp/{uuid_str}_params.py' pwd = os.getcwd() @@ -201,7 +210,8 @@ class DLCRunner(BaseRunner): if self.aliyun_cfg['python_env_path']: cmd = cmd.replace( sys.executable, - f'{self.aliyun_cfg["python_env_path"]}/bin/python') + f'{self.aliyun_cfg["python_env_path"]}/bin/python', + ) logger = get_logger() logger.debug(f'Running command: {cmd}') @@ -259,10 +269,9 @@ class DLCRunner(BaseRunner): try: raw_job_info = subprocess.getoutput( f'dlc get job {job_id}{config_path}') - if raw_job_info.startswith( - '/bin/bash') or raw_job_info.startswith( - '[OK]') or raw_job_info.startswith( - '[FAILED]'): + if (raw_job_info.startswith('/bin/bash') + or raw_job_info.startswith('[OK]') + or raw_job_info.startswith('[FAILED]')): raw_job_info = raw_job_info[raw_job_info. 
index('\n') + 1:] job_info = json.loads(raw_job_info) @@ -325,6 +334,11 @@ class DLCRunner(BaseRunner): else: pass + # Lark Report when failed + if return_code == -1: + content = f'DLC job failed. Task name: {task_name}' + self.lark_reporter.post(title='DLC job failed', content=content) + return task_name, return_code def _job_failed(self, return_code: int, output_paths: List[str]) -> bool: diff --git a/opencompass/runners/volc.py b/opencompass/runners/volc.py index d48f7a43..b5f38223 100644 --- a/opencompass/runners/volc.py +++ b/opencompass/runners/volc.py @@ -48,7 +48,7 @@ class VOLCRunner(BaseRunner): retry: int = 2, debug: bool = False, lark_bot_url: str = None, - keep_tmp_file: bool = False): + keep_tmp_file: bool = True): super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url) self.volcano_cfg = volcano_cfg self.max_num_workers = max_num_workers From 4d773904d418c2cdbce78f454a154a0528b72057 Mon Sep 17 00:00:00 2001 From: Yufeng Zhao <115388472+epsilondylan@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:24:35 +0800 Subject: [PATCH 17/19] [Update] Korbench readme supplementation (#1734) * renewed * readme --------- Co-authored-by: yufeng zhao --- .../configs/datasets/korbench/readme.md | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 opencompass/configs/datasets/korbench/readme.md diff --git a/opencompass/configs/datasets/korbench/readme.md b/opencompass/configs/datasets/korbench/readme.md new file mode 100644 index 00000000..48dd4855 --- /dev/null +++ b/opencompass/configs/datasets/korbench/readme.md @@ -0,0 +1,71 @@ +# KOR-Bench: Benchmarking Language Models on Knowledge-Orthogonal Reasoning Tasks + +KOR-Bench is a dataset designed to evaluate large language models (LLMs) on tasks that require reasoning independent of prior knowledge. Created to assess reasoning and planning abilities, KOR-Bench introduces rule-based tasks that minimize the influence of pretrained knowledge, enabling a focused evaluation of intrinsic model capabilities. + +## Overview + +### Purpose + +Large language models, such as GPT-4 and Claude, excel in knowledge-based tasks but face challenges in applying reasoning skills to unfamiliar scenarios. KOR-Bench is built to evaluate such reasoning capabilities across five categories: +- **Operation**: Arithmetic and logical operations. +- **Logic**: Complex deductive and inductive reasoning. +- **Cipher**: Code-breaking and pattern discovery. +- **Puzzle**: Problem-solving with creative and logical reasoning. +- **Counterfactual**: Hypothetical reasoning in alternate scenarios. + +### Dataset Construction + +KOR-Bench tasks are designed with novel rules and configurations, ensuring no reliance on pretrained knowledge. Each task includes: +- **Rules**: Custom rule sets to guide reasoning. +- **Questions**: Carefully crafted problems that require the application of rules. +- **Evaluation Scenarios**: Zero-shot, three-shot, and subquestion-specific configurations. + +The dataset is structured to assess multistep reasoning, pattern recognition, and adaptability to new rules. + +### Dataset Access + +KOR-Bench is publicly available with detailed usage instructions in the [GitHub Repository](https://github.com/KOR-Bench/KOR-Bench). Download the dataset and leverage predefined evaluation scripts or customize your own. + +### Evaluation + +1. Install dependencies and configure your environment. +2. Run evaluations using `opencompass configs/eval_korbench.py` to assess LLM performance. +3. 
Analyze model performance across various reasoning tasks. + +### Example Command +```bash +opencompass configs/eval_korbench.py +``` + +## Baselines and Results +KOR-Bench includes baseline results for leading LLMs evaluated across various configurations, including zero-shot (gen) and few-shot modes. Below is a summary of the results. +| dataset | version | metric | mode | internlm2_5-7b-chat-turbomind | internlm2_5-1_8b-chat-turbomind | llama-3_1-8b-instruct-turbomind | glm-4-9b-chat-turbomind | gemma-2-9b-it-turbomind | +|---------|---------|--------|------|--------------------------------|---------------------------------|---------------------------------|--------------------------|--------------------------| +| korbench_mixed_Multi-Q | 21f998 | accuracy | gen | 0.60 | 0.20 | 9.60 | 8.70 | 7.80 | +| korbench_mixed_Multi-R | 21f998 | accuracy | gen | 1.70 | 0.10 | 8.80 | 12.10 | 9.80 | +| korbench_mixed_Multi-RQ | 21f998 | accuracy | gen | 1.50 | 0.10 | 6.40 | 8.60 | 6.00 | +| korbench_cipher | 21f998 | accuracy | gen | 8.80 | 0.80 | 14.00 | 6.80 | 6.40 | +| korbench_counterfactual | 21f998 | accuracy | gen | 83.60 | 17.20 | 88.80 | 90.40 | 87.60 | +| korbench_logic | 21f998 | accuracy | gen | 8.40 | 3.60 | 37.60 | 38.80 | 40.80 | +| korbench_operation | 21f998 | accuracy | gen | 56.00 | 25.20 | 68.40 | 63.60 | 67.60 | +| korbench_puzzle | 21f998 | accuracy | gen | 3.60 | 0.00 | 3.20 | 3.20 | 5.60 | +| korbench_cipher | 21f998 | accuracy | fewshot | 8.40 | 3.20 | 9.60 | 9.20 | 9.60 | +| korbench_counterfactual | 21f998 | accuracy | fewshot | 87.60 | 58.00 | 23.60 | 89.60 | 84.40 | +| korbench_logic | 21f998 | accuracy | fewshot | 45.20 | 19.60 | 24.40 | 38.40 | 54.00 | +| korbench_operation | 21f998 | accuracy | fewshot | 24.80 | 11.20 | 73.20 | 67.20 | 23.20 | +| korbench_puzzle | 21f998 | accuracy | fewshot | 4.80 | 2.40 | 1.60 | 3.60 | 6.80 | + +### Citation + +**BibTeX:** +```bibtex +@misc{ma2024korbenchbenchmarkinglanguagemodels, +title={KOR-Bench: Benchmarking Language Models on Knowledge-Orthogonal Reasoning Tasks}, +author={Kaijing Ma and Xinrun Du and Yunran Wang and Haoran Zhang and Zhoufutu Wen and Xingwei Qu and Jian Yang and Jiaheng Liu and Minghao Liu and Xiang Yue and Wenhao Huang and Ge Zhang}, +year={2024}, +eprint={2410.06526}, +archivePrefix={arXiv}, +primaryClass={cs.DB}, +url={https://arxiv.org/abs/2410.06526}, +} +``` From ac23f0ce1f5a822c3538005b7310f2444b2bc483 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Thu, 5 Dec 2024 11:26:00 +0800 Subject: [PATCH 18/19] [Update] Update init file for Korbench (#1737) --- opencompass/datasets/korbench/korbench_dataset_config/__init__.py | 0 .../datasets/korbench/korbench_dataset_config/prompt/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 opencompass/datasets/korbench/korbench_dataset_config/__init__.py create mode 100644 opencompass/datasets/korbench/korbench_dataset_config/prompt/__init__.py diff --git a/opencompass/datasets/korbench/korbench_dataset_config/__init__.py b/opencompass/datasets/korbench/korbench_dataset_config/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/opencompass/datasets/korbench/korbench_dataset_config/prompt/__init__.py b/opencompass/datasets/korbench/korbench_dataset_config/prompt/__init__.py new file mode 100644 index 00000000..e69de29b From 4f317d1bd511c0fe78ecd621add6ed2b84913440 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Thu, 5 Dec 2024 13:59:56 +0800 Subject: [PATCH 19/19] [Update] Update Manifest (#1738) --- MANIFEST.in | 1 + 
1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 63d03d04..5d1a70ef 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ recursive-include opencompass/configs *.py *.yml *.json *.txt *.md recursive-include opencompass/openicl/icl_evaluator/hf_metrics *.py +recursive-include opencompass/datasets *.py *.yml *.json *.txt *.md *.yaml
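
The Lark failure report added to `DLCRunner` in PATCH 16/19 is driven by the `lark_bot_url` constructor argument shown in the diff above. A minimal sketch of a user config wiring this up follows; the webhook URL, the `aliyun_cfg` contents, and the partitioner choice are illustrative placeholders, not values taken from these patches.

```python
# Sketch only: enabling the DLC Lark failure reports from PATCH 16/19.
# Every value below is a placeholder; adapt it to your own Aliyun DLC setup.
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import DLCRunner
from opencompass.tasks import OpenICLInferTask

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=DLCRunner,
        # Only `python_env_path` appears in the patched code above; fill in the
        # remaining cluster-specific fields required by your DLC deployment.
        aliyun_cfg=dict(python_env_path='/path/to/your/conda/env'),
        max_num_workers=16,
        retry=2,
        # The runner builds a LarkReporter from this webhook and posts a
        # message when a job returns -1.
        lark_bot_url='https://open.feishu.cn/open-apis/bot/v2/hook/<your-token>',
        task=dict(type=OpenICLInferTask),
    ),
)
```

The "all tasks finished" summary in `BaseRunner` (also touched in PATCH 16/19) uses the same reporter, so a single webhook covers both the per-job failure posts and the end-of-run summary.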