From 8a3c6e51ed3eebc66bbb253e1bae23d961dae8d0 Mon Sep 17 00:00:00 2001
From: Connor-Shen <0xconnor3@gmail.com>
Date: Tue, 19 Mar 2024 15:47:05 +0800
Subject: [PATCH] [Feature] Update APPS (#985)

* update post process

* update post process
---
 configs/datasets/apps/README.md               |  2 ++
 configs/datasets/apps/apps_gen.py             |  2 +-
 ...{apps_gen_d82929.py => apps_gen_c7893a.py} |  2 +-
 opencompass/datasets/apps.py                  | 20 +++++++++++---------
 4 files changed, 15 insertions(+), 11 deletions(-)
 rename configs/datasets/apps/{apps_gen_d82929.py => apps_gen_c7893a.py} (79%)

diff --git a/configs/datasets/apps/README.md b/configs/datasets/apps/README.md
index da8e8595..abe77bf4 100644
--- a/configs/datasets/apps/README.md
+++ b/configs/datasets/apps/README.md
@@ -15,6 +15,8 @@ DatasetDict({
     })
 })
 ```
+We also offer an apps_mini subset, which includes 1500 questions drawn evenly from the introductory, interview, and competition categories in a 1:1:1 ratio (500 questions each).
+
 ## How to Use
 You can also filter the dataset based on difficulty level: introductory, interview and competition. Just pass a list of difficulty levels to the filter. For example, if you want the most challenging questions, you need to select the competition level:
 ```python
diff --git a/configs/datasets/apps/apps_gen.py b/configs/datasets/apps/apps_gen.py
index 446901b2..810e2724 100644
--- a/configs/datasets/apps/apps_gen.py
+++ b/configs/datasets/apps/apps_gen.py
@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .apps_gen_d82929 import APPS_datasets  # noqa: F401, F403
+    from .apps_gen_c7893a import APPS_datasets  # noqa: F401, F403
diff --git a/configs/datasets/apps/apps_gen_d82929.py b/configs/datasets/apps/apps_gen_c7893a.py
similarity index 79%
rename from configs/datasets/apps/apps_gen_d82929.py
rename to configs/datasets/apps/apps_gen_c7893a.py
index 9fa7b908..03c03a70 100644
--- a/configs/datasets/apps/apps_gen_d82929.py
+++ b/configs/datasets/apps/apps_gen_c7893a.py
@@ -8,7 +8,7 @@ APPS_reader_cfg = dict(input_columns=["question", "starter"], output_column="pro
 APPS_infer_cfg = dict(
     prompt_template=dict(
         type=PromptTemplate,
-        template="\nQUESTION:\n{question} {starter}\nANSWER:\n"),
+        template="Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. \nQUESTION:\n{question} {starter}\nANSWER:\n"),
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=GenInferencer, max_out_len=512),
 )
diff --git a/opencompass/datasets/apps.py b/opencompass/datasets/apps.py
index 85e074f5..5a90c150 100644
--- a/opencompass/datasets/apps.py
+++ b/opencompass/datasets/apps.py
@@ -89,14 +89,16 @@ EOF_STRINGS = ['\nQUESTION', '\n---', '\nANSWER', '<|endoftext|>']
 @ICL_EVALUATORS.register_module()
 class APPSEvaluator(BaseEvaluator):

-    def truncate_after_eof_strings(self, text):
-        pattern = '|'.join(re.escape(s) for s in EOF_STRINGS)
-        match = re.search(pattern, text)
-
-        if match:
-            return text[:match.start()]
-        else:
-            return text
+    def post_process(self, text):
+        if '```' in text:
+            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+            if len(blocks) == 0:
+                text = text.split('```')[1]  # fall back to default strategy
+            else:
+                text = blocks[0]  # fetch the first code block
+                if not text.startswith('\n'):  # starting with ```python
+                    text = text[max(text.find('\n') + 1, 0):]
+        return text


 TIMEOUT = 10
@@ -226,7 +228,7 @@ class APPSEvaluator(BaseEvaluator):
         assert len(predictions) == len(references)
         generations = defaultdict(list)
         for refer, pred in zip(references, predictions):
-            pred = self.truncate_after_eof_strings(pred)
+            pred = self.post_process(pred)
             generations[refer].append(pred)
         # convert to non-duplicated version
         test_set = test_set.to_pandas()
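For reference, below is a minimal standalone sketch of how the new `post_process` treats completions elicited by the updated prompt. The function body mirrors the diff above; the sample completion strings are illustrative only, not part of the patch:

```python
import re


def post_process(text):
    """Standalone copy of the APPSEvaluator.post_process added in this patch."""
    if '```' in text:
        blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
        if len(blocks) == 0:
            # Unpaired fence: fall back to everything after the first ```
            text = text.split('```')[1]
        else:
            # Keep only the first fenced block
            text = blocks[0]
            if not text.startswith('\n'):
                # Block opened with a language tag (```python): drop that line
                text = text[max(text.find('\n') + 1, 0):]
    return text


# A completion that follows the new prompt's code-block instruction:
print(repr(post_process('ANSWER:\n```python\nprint(input()[::-1])\n```')))
# -> 'print(input()[::-1])\n'

# An unpaired fence falls back to everything after the first ```:
print(repr(post_process('Here is my solution: ```python\nx = 1')))
# -> 'python\nx = 1'

# Text without fences is returned unchanged:
print(repr(post_process('print(42)')))
# -> 'print(42)'
```

Note the asymmetry shown by the second example: the fallback branch keeps whatever follows the first fence, including any language tag, while a well-formed pair of fences has its leading `python` tag stripped.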