mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Feature] Update APPS (#985)
* update post process * update post process
This commit is contained in:
parent
d92595b671
commit
8a3c6e51ed
@ -15,6 +15,8 @@ DatasetDict({
|
|||||||
})
|
})
|
||||||
})
|
})
|
||||||
```
|
```
|
||||||
|
We also offer an apps_mini subset, which contains 1500 questions drawn equally from the introductory, interview, and competition categories, in a 1:1:1 ratio (500 questions each).
|
||||||
|
|
||||||
## How to Use
|
## How to Use
|
||||||
You can also filter the dataset based on difficulty level: introductory, interview and competition. Just pass a list of difficulty levels to the filter. For example, if you want the most challenging questions, you need to select the competition level:
|
You can also filter the dataset based on difficulty level: introductory, interview and competition. Just pass a list of difficulty levels to the filter. For example, if you want the most challenging questions, you need to select the competition level:
|
||||||
```python
|
```python
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from mmengine.config import read_base
|
from mmengine.config import read_base
|
||||||
|
|
||||||
with read_base():
|
with read_base():
|
||||||
from .apps_gen_d82929 import APPS_datasets # noqa: F401, F403
|
from .apps_gen_c7893a import APPS_datasets # noqa: F401, F403
|
||||||
|
@ -8,7 +8,7 @@ APPS_reader_cfg = dict(input_columns=["question", "starter"], output_column="pro
|
|||||||
APPS_infer_cfg = dict(
|
APPS_infer_cfg = dict(
|
||||||
prompt_template=dict(
|
prompt_template=dict(
|
||||||
type=PromptTemplate,
|
type=PromptTemplate,
|
||||||
template="\nQUESTION:\n{question} {starter}\nANSWER:\n"),
|
template="Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. \nQUESTION:\n{question} {starter}\nANSWER:\n"),
|
||||||
retriever=dict(type=ZeroRetriever),
|
retriever=dict(type=ZeroRetriever),
|
||||||
inferencer=dict(type=GenInferencer, max_out_len=512),
|
inferencer=dict(type=GenInferencer, max_out_len=512),
|
||||||
)
|
)
|
@ -89,14 +89,16 @@ EOF_STRINGS = ['\nQUESTION', '\n---', '\nANSWER', '<|endoftext|>']
|
|||||||
@ICL_EVALUATORS.register_module()
|
@ICL_EVALUATORS.register_module()
|
||||||
class APPSEvaluator(BaseEvaluator):
|
class APPSEvaluator(BaseEvaluator):
|
||||||
|
|
||||||
def truncate_after_eof_strings(self, text):
|
def post_process(self, text):
|
||||||
pattern = '|'.join(re.escape(s) for s in EOF_STRINGS)
|
if '```' in text:
|
||||||
match = re.search(pattern, text)
|
blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
|
||||||
|
if len(blocks) == 0:
|
||||||
if match:
|
text = text.split('```')[1] # fall back to default strategy
|
||||||
return text[:match.start()]
|
else:
|
||||||
else:
|
text = blocks[0] # fetch the first code block
|
||||||
return text
|
if not text.startswith('\n'): # starting with ```python
|
||||||
|
text = text[max(text.find('\n') + 1, 0):]
|
||||||
|
return text
|
||||||
|
|
||||||
TIMEOUT = 10
|
TIMEOUT = 10
|
||||||
|
|
||||||
@ -226,7 +228,7 @@ class APPSEvaluator(BaseEvaluator):
|
|||||||
assert len(predictions) == len(references)
|
assert len(predictions) == len(references)
|
||||||
generations = defaultdict(list)
|
generations = defaultdict(list)
|
||||||
for refer, pred in zip(references, predictions):
|
for refer, pred in zip(references, predictions):
|
||||||
pred = self.truncate_after_eof_strings(pred)
|
pred = self.post_process(pred)
|
||||||
generations[refer].append(pred)
|
generations[refer].append(pred)
|
||||||
# convert to non-duplicated version
|
# convert to non-duplicated version
|
||||||
test_set = test_set.to_pandas()
|
test_set = test_set.to_pandas()
|
||||||
|
Loading…
Reference in New Issue
Block a user