From 8a3c6e51ed3eebc66bbb253e1bae23d961dae8d0 Mon Sep 17 00:00:00 2001
From: Connor-Shen <0xconnor3@gmail.com>
Date: Tue, 19 Mar 2024 15:47:05 +0800
Subject: [PATCH] [Feature] Update APPS (#985)

* update post process

* update post process
---
 configs/datasets/apps/README.md               |  2 ++
 configs/datasets/apps/apps_gen.py             |  2 +-
 ...{apps_gen_d82929.py => apps_gen_c7893a.py} |  2 +-
 opencompass/datasets/apps.py                  | 20 +++++++++++---------
 4 files changed, 15 insertions(+), 11 deletions(-)
 rename configs/datasets/apps/{apps_gen_d82929.py => apps_gen_c7893a.py} (79%)

diff --git a/configs/datasets/apps/README.md b/configs/datasets/apps/README.md
index da8e8595..abe77bf4 100644
--- a/configs/datasets/apps/README.md
+++ b/configs/datasets/apps/README.md
@@ -15,6 +15,8 @@ DatasetDict({
     })
 })
 ```
+We also offer an apps_mini subset, which includes 1500 questions drawn evenly from the introductory, interview, and competition categories in a 1:1:1 ratio (500 questions each).
+
 ## How to Use
 You can also filter the dataset based on difficulty level: introductory, interview and competition. Just pass a list of difficulty levels to the filter. For example, if you want the most challenging questions, you need to select the competition level:
 ```python
diff --git a/configs/datasets/apps/apps_gen.py b/configs/datasets/apps/apps_gen.py
index 446901b2..810e2724 100644
--- a/configs/datasets/apps/apps_gen.py
+++ b/configs/datasets/apps/apps_gen.py
@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .apps_gen_d82929 import APPS_datasets  # noqa: F401, F403
+    from .apps_gen_c7893a import APPS_datasets  # noqa: F401, F403
diff --git a/configs/datasets/apps/apps_gen_d82929.py b/configs/datasets/apps/apps_gen_c7893a.py
similarity index 79%
rename from configs/datasets/apps/apps_gen_d82929.py
rename to configs/datasets/apps/apps_gen_c7893a.py
index 9fa7b908..03c03a70 100644
--- a/configs/datasets/apps/apps_gen_d82929.py
+++ b/configs/datasets/apps/apps_gen_c7893a.py
@@ -8,7 +8,7 @@ APPS_reader_cfg = dict(input_columns=["question", "starter"], output_column="pro
 APPS_infer_cfg = dict(
     prompt_template=dict(
         type=PromptTemplate,
-        template="\nQUESTION:\n{question} {starter}\nANSWER:\n"),
+        template="Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. \nQUESTION:\n{question} {starter}\nANSWER:\n"),
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=GenInferencer, max_out_len=512),
 )
diff --git a/opencompass/datasets/apps.py b/opencompass/datasets/apps.py
index 85e074f5..5a90c150 100644
--- a/opencompass/datasets/apps.py
+++ b/opencompass/datasets/apps.py
@@ -89,14 +89,16 @@ EOF_STRINGS = ['\nQUESTION', '\n---', '\nANSWER', '<|endoftext|>']
 @ICL_EVALUATORS.register_module()
 class APPSEvaluator(BaseEvaluator):

-    def truncate_after_eof_strings(self, text):
-        pattern = '|'.join(re.escape(s) for s in EOF_STRINGS)
-        match = re.search(pattern, text)
-
-        if match:
-            return text[:match.start()]
-        else:
-            return text
+    def post_process(self, text):
+        if '```' in text:
+            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+            if len(blocks) == 0:
+                text = text.split('```')[1]  # fall back to default strategy
+            else:
+                text = blocks[0]  # fetch the first code block
+                if not text.startswith('\n'):  # starting with ```python
+                    text = text[max(text.find('\n') + 1, 0):]
+        return text


 TIMEOUT = 10
@@ -226,7 +228,7 @@ class APPSEvaluator(BaseEvaluator):
         assert len(predictions) == len(references)
         generations = defaultdict(list)
         for refer, pred in zip(references, predictions):
-            pred = self.truncate_after_eof_strings(pred)
+            pred = self.post_process(pred)
             generations[refer].append(pred)
         # convert to non-duplicated version
         test_set = test_set.to_pandas()
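For reference, below is a minimal standalone sketch of how the new `post_process` treats completions elicited by the updated prompt. The function body mirrors the diff above; the sample completion strings are illustrative only, not part of the patch:

```python
import re


def post_process(text):
    """Standalone copy of the APPSEvaluator.post_process added in this patch."""
    if '```' in text:
        blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
        if len(blocks) == 0:
            # Unpaired fence: fall back to everything after the first ```
            text = text.split('```')[1]
        else:
            # Keep only the first fenced block
            text = blocks[0]
            if not text.startswith('\n'):
                # Block opened with a language tag (```python): drop that line
                text = text[max(text.find('\n') + 1, 0):]
    return text


# A completion that follows the new prompt's code-block instruction:
print(repr(post_process('ANSWER:\n```python\nprint(input()[::-1])\n```')))
# -> 'print(input()[::-1])\n'

# An unpaired fence falls back to everything after the first ```:
print(repr(post_process('Here is my solution: ```python\nx = 1')))
# -> 'python\nx = 1'

# Text without fences is returned unchanged:
print(repr(post_process('print(42)')))
# -> 'print(42)'
```

Note the asymmetry shown by the second example: the fallback branch keeps whatever follows the first fence, including any language tag, while a well-formed pair of fences has its leading `python` tag stripped.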