2023-07-04 21:34:55 +08:00
|
|
|
|
import re
|
2023-11-27 19:57:36 +08:00
|
|
|
|
from typing import Callable, Optional, Union
|
2023-07-04 21:34:55 +08:00
|
|
|
|
|
|
|
|
|
from opencompass.registry import TEXT_POSTPROCESSORS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@TEXT_POSTPROCESSORS.register_module('general')
|
|
|
|
|
def general_postprocess(text: str) -> str:
    """Normalize a free-form answer for loose string comparison.

    Keeps only the part of ``text`` before the first newline, period or
    comma, strips punctuation, drops the English articles "a", "an" and
    "the" (case-insensitively) and collapses runs of whitespace.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text, with single spaces between words and no leading
        or trailing whitespace.
    """
    # Cut off at the first newline, period, or comma. ``maxsplit`` is passed
    # by keyword: passing it positionally is deprecated since Python 3.13.
    truncated_text = re.split(r'[\n.,]', text, maxsplit=1)[0]

    # Remove punctuation (anything that is neither a word char nor a space)
    no_punctuation = re.sub(r'[^\w\s]', '', truncated_text)

    # Remove articles
    no_articles = re.sub(r'\b(a|an|the)\b',
                         '',
                         no_punctuation,
                         flags=re.IGNORECASE)

    # Remove duplicated blank spaces
    cleaned_text = re.sub(r'\s+', ' ', no_articles).strip()

    return cleaned_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@TEXT_POSTPROCESSORS.register_module('general_cn')
|
|
|
|
|
def general_cn_postprocess(text: str) -> str:
    """Normalize an answer and segment it into space-separated words.

    Applies the same cleaning steps as the general post-processor (truncate
    at the first newline/period/comma, strip punctuation, drop English
    articles, collapse whitespace) and then joins the tokens produced by
    ``jieba.cut`` with single spaces.

    Args:
        text: Raw text to clean and segment.

    Returns:
        The tokens yielded by ``jieba.cut`` for the cleaned text, joined
        with a single space.
    """
    # Imported lazily so the module can be loaded without jieba installed.
    import jieba

    # ``maxsplit`` by keyword: positional use is deprecated since Python 3.13.
    truncated_text = re.split(r'[\n.,]', text, maxsplit=1)[0]

    no_punctuation = re.sub(r'[^\w\s]', '', truncated_text)

    no_articles = re.sub(r'\b(a|an|the)\b',
                         '',
                         no_punctuation,
                         flags=re.IGNORECASE)

    cleaned_text = re.sub(r'\s+', ' ', no_articles).strip()

    # Segment the *cleaned* text. Previously the raw ``text`` was segmented,
    # which silently discarded every cleaning step above.
    cleaned_text = ' '.join(jieba.cut(cleaned_text))

    return cleaned_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@TEXT_POSTPROCESSORS.register_module('first-capital')
|
|
|
|
|
def first_capital_postprocess(text: str) -> str:
    """Return the first uppercase character found in ``text``.

    Args:
        text: Text to scan from left to right.

    Returns:
        The first character for which ``str.isupper`` is true, or an empty
        string when there is none.
    """
    return next((char for char in text if char.isupper()), '')
|
|
|
|
|
|
|
|
|
|
|
2023-12-11 17:42:53 +08:00
|
|
|
|
@TEXT_POSTPROCESSORS.register_module('last-capital')
|
|
|
|
|
def last_capital_postprocess(text: str) -> str:
    """Return the last uppercase character found in ``text``.

    Args:
        text: Text to scan from right to left.

    Returns:
        The last character for which ``str.isupper`` is true, or an empty
        string when there is none.
    """
    return next((char for char in reversed(text) if char.isupper()), '')
|
|
|
|
|
|
|
|
|
|
|
2024-01-08 22:07:24 +08:00
|
|
|
|
def first_option_postprocess(text: str, options: str, cushion=True) -> str:
    """Find first valid option for text.

    The patterns below are tried in order against the stripped ``text``.
    For the first pattern whose match contains one of the option
    characters, that character is returned.

    Args:
        text: Text to extract the chosen option from.
        options: The valid option labels as one string; each character is
            one option (e.g. ``'ABCD'``). It is interpolated verbatim into
            regex character classes.
        cushion: When true, two loose fallback patterns (an option followed
            by ``:`` and a bare option character) are tried last.

    Returns:
        The matched option character, or an empty string if nothing matched.
    """
    # Raw f-strings keep regex escapes such as ``\s`` intact without relying
    # on Python's deprecated handling of invalid string escape sequences.
    # Both ASCII and full-width ``:`` / ``,`` are accepted where a separator
    # is expected.
    # yapf: disable
    patterns = [
        rf'答案是?\s*([{options}])',
        rf'答案是?\s*:\s*([{options}])',
        rf'答案是?\s*:\s*([{options}])',
        rf'答案选项应?该?是\s*([{options}])',
        rf'答案选项应?该?为\s*([{options}])',
        rf'答案应该?是\s*([{options}])',
        rf'答案应该?选\s*([{options}])',
        rf'答案选项为?\s*[::]\s*([{options}])',
        rf'答案选项为?\s+\(?\*?\*?([{options}])\*?\*?\)?',
        rf'答案选项是?\s*[::]\s*([{options}])',
        rf'答案为\s*([{options}])',
        rf'答案选\s*([{options}])',
        rf'选择?\s*([{options}])',
        # A comma was missing after this entry, which fused it with the
        # next pattern through implicit string concatenation.
        rf'故选?\s*([{options}])',
        rf'只有选?项?\s?([{options}])\s?是?对',
        rf'只有选?项?\s?([{options}])\s?是?错',
        rf'只有选?项?\s?([{options}])\s?不?正确',
        rf'只有选?项?\s?([{options}])\s?错误',
        rf'说法不?对选?项?的?是\s?([{options}])',
        rf'说法不?正确选?项?的?是\s?([{options}])',
        rf'说法错误选?项?的?是\s?([{options}])',
        rf'([{options}])\s?是正确的',
        rf'([{options}])\s?是正确答案',
        rf'选项\s?([{options}])\s?正确',
        rf'所以答\s?([{options}])',
        rf'所以\s?([{options}][.。$]?$)',
        rf'所有\s?([{options}][.。$]?$)',
        rf'[\s,::,]([{options}])[。,,\.]?$',
        rf'[\s,,::][故即]([{options}])[。\.]?$',
        rf'[\s,,::]因此([{options}])[。\.]?$',
        rf'[是为。]\s?([{options}])[。\.]?$',
        rf'因此\s?([{options}])[。\.]?$',
        rf'显然\s?([{options}])[。\.]?$',
        r'答案是\s?(\S+)(?:。|$)',
        r'答案应该是\s?(\S+)(?:。|$)',
        r'答案为\s?(\S+)(?:。|$)',
        rf'(?i)ANSWER\s*:\s*([{options}])',
        rf'[Tt]he answer is:?\s+\(?([{options}])\)?',
        rf'[Tt]he answer is:?\s+\(?\*?\*?([{options}])\*?\*?\)?',
        rf'[Tt]he answer is option:?\s+\(?([{options}])\)?',
        rf'[Tt]he correct answer is:?\s+\(?([{options}])\)?',
        rf'[Tt]he correct answer is option:?\s+\(?([{options}])\)?',
        rf'[Tt]he correct answer is:?.*?boxed{{([{options}])}}',
        rf'[Tt]he correct option is:?.*?boxed{{([{options}])}}',
        rf'[Tt]he correct answer option is:?.*?boxed{{([{options}])}}',
        rf'[Tt]he answer to the question is:?\s+\(?([{options}])\)?',
        rf'^选项\s?([{options}])',
        rf'^([{options}])\s?选?项',
        rf'(\s|^)[{options}][\s。,,::\.$]',
        r'1.\s?(.*?)$',
        rf'1.\s?([{options}])[.。$]?$',
    ]
    cushion_patterns = [
        rf'([{options}]):',
        rf'([{options}])',
    ]
    # yapf: enable

    if cushion:
        patterns.extend(cushion_patterns)

    # Stripping is loop-invariant, so do it once up front.
    text = text.strip()
    for pattern in patterns:
        match = re.search(pattern, text, re.DOTALL)
        if not match:
            continue
        # Group 1 can be empty (e.g. the ``(\s|^)`` alternative); in that
        # case look for the option in the whole match instead.
        outputs = match.group(1) or match.group(0)
        for option in options:
            if option in outputs:
                return option
    return ''
|
|
|
|
|
|
|
|
|
|
|
2023-07-04 21:34:55 +08:00
|
|
|
|
@TEXT_POSTPROCESSORS.register_module('first-capital-multi')
|
|
|
|
|
def first_capital_postprocess_multi(text: str) -> str:
    """Return the first run of consecutive letters in the range A-D.

    Args:
        text: Text to search.

    Returns:
        The first contiguous sequence of the characters ``A``-``D``, or an
        empty string when ``text`` contains none of them.
    """
    found = re.search(r'([A-D]+)', text)
    return found.group(1) if found else ''
|
2023-08-25 16:12:21 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def last_option_postprocess(text: str, options: str) -> str:
    """Return the last option character that occurs in ``text``.

    Args:
        text: Text to search.
        options: The valid option labels as one string (e.g. ``'ABCD'``);
            it is interpolated into a regex character class.

    Returns:
        The last character of ``text`` that is one of ``options``, or an
        empty string when none occurs.
    """
    hits = re.findall(rf'([{options}])', text)
    return hits[-1] if hits else ''
|
2023-09-22 15:42:31 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def first_number_postprocess(text: str) -> Optional[float]:
    """Return the first number in a string.

    Args:
        text: Text to search.

    Returns:
        The first integer or decimal found in ``text`` (with an optional
        leading minus sign) converted to ``float``, or ``None`` when the
        text contains no number.
    """
    # regex pattern to match numbers (both integers and decimals)
    pattern = r'(-?\d*\.?\d+)'

    # search the string for the pattern
    match = re.search(pattern, text)

    # if a match is found, return it. Otherwise, return None.
    return float(match.group(1)) if match else None
|
2023-11-13 00:09:05 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@TEXT_POSTPROCESSORS.register_module('multiple-select')
|
|
|
|
|
def multiple_select_postprocess(text: str) -> str:
    """Collect the distinct uppercase characters of ``text`` in sorted order.

    Args:
        text: Text to scan.

    Returns:
        A string made of every distinct character of ``text`` for which
        ``str.isupper`` is true, sorted ascending; empty if there is none.
    """
    # Set comprehension: de-duplicates without building a throwaway list.
    ret = {t for t in text if t.isupper()}
    return ''.join(sorted(ret))
|
2023-11-27 19:57:36 +08:00
|
|
|
|
|
|
|
|
|
|
2025-01-07 00:14:32 +08:00
|
|
|
|
@TEXT_POSTPROCESSORS.register_module('specific-xml-tag')
|
|
|
|
|
def xml_tag_postprocessor(text, tag):
    """Extracts content enclosed within a specified XML-style tag from a
    string.

    Args:
        text: The input string containing XML-style tags.
        tag: The opening XML-style tag to extract content from
            (e.g., "<conclude>"). Must include the angle brackets; the
            closing tag is derived from it by inserting "/" after "<".

    Returns:
        The whitespace-stripped content of the last occurrence of the tag,
        or the string 'NO ANSWER FOUND' if the tag is not found.
    """
    # Escape the tag so that regex metacharacters in it are matched literally.
    opening = re.escape(tag)
    closing = re.escape(f'</{tag[1:-1]}>')

    # The tag may appear several times; collect every non-greedy match.
    # re.DOTALL allows . to match newline characters.
    matches = re.findall(opening + r'(.*?)' + closing, text, re.DOTALL)

    if not matches:
        return 'NO ANSWER FOUND'

    # Only keep the last one, without leading/trailing whitespace.
    return matches[-1].strip()
|
|
|
|
|
|
|
|
|
|
|
2023-11-27 19:57:36 +08:00
|
|
|
|
def general_eval_wrapper_postprocess(text: str,
                                     postprocess: Optional[Union[
                                         str, Callable]] = None,
                                     **kwargs) -> str:
    """Wrapper for eval text repr. Especially for chatglmpro.

    Tries to evaluate ``text`` as a Python expression (e.g. to turn the
    ``repr`` of a string back into the string) and then applies the optional
    post-processing function to the result.

    Args:
        text(str): Text to be postprocessed.
        postprocess(Callable, optional): Original post processing function,
            or the name it is registered under in ``TEXT_POSTPROCESSORS``.
            Defaults to None.
        **kwargs: Other necessary kwargs for post processing function.

    Returns:
        The output of ``postprocess`` when one is given, otherwise the
        (possibly evaluated) text.
    """
    try:
        # SECURITY NOTE(review): ``eval`` executes arbitrary Python code. If
        # ``text`` can come from an untrusted source (e.g. raw model output),
        # consider ``ast.literal_eval`` instead. Left as-is to keep behavior.
        evaluated = eval(text)
    except Exception:
        # in case empty input or other error, skip eval
        pass
    else:
        # Only adopt the result when it is still text: input such as '42'
        # evaluates to an int, which would break string post-processors and
        # contradict the declared ``str`` return type.
        if isinstance(evaluated, str):
            text = evaluated

    if not postprocess:
        return text
    if isinstance(postprocess, str):
        postprocess = TEXT_POSTPROCESSORS.get(postprocess)
    return postprocess(text, **kwargs)
|
2024-05-06 13:26:26 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def match_answer_pattern(response_text: str, answer_pattern: str):
    """Extract the first capture group of a pattern from a response.

    Args:
        response_text: Text to search.
        answer_pattern: Regular expression; its first capture group is the
            answer to extract.

    Returns:
        The text captured by group 1 of the first match, or an empty string
        when the pattern does not match.
    """
    found = re.search(answer_pattern, response_text)
    if found is None:
        return ''
    return found.group(1)
|