2023-07-04 21:34:55 +08:00
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
from opencompass.registry import TEXT_POSTPROCESSORS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@TEXT_POSTPROCESSORS.register_module('general')
|
|
|
|
|
def general_postprocess(text: str) -> str:
    """Normalize a free-form answer into a short, comparable string.

    The steps are applied in this order:

    1. Keep only the text before the first newline, period, or comma.
    2. Drop every character that is neither a word character nor whitespace.
    3. Remove the standalone English articles "a", "an" and "the"
       (case-insensitive).
    4. Collapse each run of whitespace into one space and strip both ends.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized text. The result is an empty string when ``text`` is
        empty or begins with a newline, period, or comma, because the first
        segment is then empty.
    """
    # ``maxsplit`` is passed by keyword: passing it positionally to
    # ``re.split`` is deprecated since Python 3.13.
    first_segment = re.split(r'[\n.,]', text, maxsplit=1)[0]

    # Remove punctuation (anything that is not a word char or whitespace).
    words_only = re.sub(r'[^\w\s]', '', first_segment)

    # Remove articles; ``\b`` keeps words such as "theatre" intact.
    without_articles = re.sub(r'\b(a|an|the)\b',
                              '',
                              words_only,
                              flags=re.IGNORECASE)

    # Removing articles can leave doubled or leading blanks behind.
    return re.sub(r'\s+', ' ', without_articles).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@TEXT_POSTPROCESSORS.register_module('general_cn')
|
|
|
|
|
def general_cn_postprocess(text: str) -> str:
    """Normalize an answer and split it into space-separated jieba tokens.

    The text is first cleaned with these steps, in order:

    1. Keep only the text before the first newline, period, or comma.
    2. Drop every character that is neither a word character nor whitespace.
    3. Remove the standalone English articles "a", "an" and "the"
       (case-insensitive).
    4. Collapse each run of whitespace into one space and strip both ends.

    The cleaned text is then segmented with ``jieba.cut`` and the resulting
    tokens are joined with single spaces.

    Args:
        text: Raw text to normalize.

    Returns:
        The tokens of the cleaned text joined by spaces.
    """
    # ``maxsplit`` is passed by keyword: passing it positionally to
    # ``re.split`` is deprecated since Python 3.13.
    first_segment = re.split(r'[\n.,]', text, maxsplit=1)[0]

    words_only = re.sub(r'[^\w\s]', '', first_segment)

    without_articles = re.sub(r'\b(a|an|the)\b',
                              '',
                              words_only,
                              flags=re.IGNORECASE)

    cleaned_text = re.sub(r'\s+', ' ', without_articles).strip()

    # Imported lazily so that jieba is only required when this
    # post-processor is actually used.
    import jieba

    # Segment the *cleaned* text. The previous code segmented the raw
    # ``text`` here, which silently discarded every cleaning step above.
    return ' '.join(jieba.cut(cleaned_text))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@TEXT_POSTPROCESSORS.register_module('first-capital')
|
|
|
|
|
def first_capital_postprocess(text: str) -> str:
    """Return the first uppercase character found in ``text``.

    Args:
        text: Text to scan from left to right.

    Returns:
        The first character for which ``str.isupper()`` is true, or an
        empty string when ``text`` contains no such character.
    """
    uppercase_chars = (char for char in text if char.isupper())
    # ``next`` with a default stops at the first hit and falls back to ''.
    return next(uppercase_chars, '')
|
|
|
|
|
|
|
|
|
|
|
2023-08-11 17:33:00 +08:00
|
|
|
|
def first_option_postprocess(text: str, options: str) -> str:
    """Find the first valid option for text.

    A list of patterns is tried in priority order, from explicit answer
    phrases (English and Chinese) down to "any option character anywhere".
    The option captured by the first pattern that matches is returned.

    Args:
        text: Text to search for an option.
        options: The candidate option characters, e.g. ``'ABCD'``. The
            string is interpolated unescaped into a regex character class.

    Returns:
        The matched option character, or an empty string when ``options``
        is empty or no pattern matches.
    """
    # An empty character class is not a valid regex, and with no
    # candidates there is nothing that could be found anyway.
    if not options:
        return ''

    # Raw f-strings keep ``\s`` as a regex escape instead of an invalid
    # string escape. Every pattern captures the option in group 1 so that
    # characters of the surrounding phrase are never mistaken for it.
    patterns = [
        rf'[Tt]he answer is ([{options}])',
        rf'[Tt]he correct answer is ([{options}])',
        rf'答案是.*?([{options}])',
        rf'答案为.*?([{options}])',
        # "故选" is the usual spelling; "固选" is kept for compatibility.
        rf'[固故]选.*?([{options}])',
        rf'答案应该是.*?([{options}])',
        # A standalone option: preceded by whitespace or the start of the
        # text, followed by whitespace, punctuation, or the end of the
        # text. Inside the class ``$`` is a literal dollar sign (kept for
        # compatibility); the end of the text needs the ``|$`` alternative.
        rf'(?:\s|^)([{options}])(?:[\s。,，.$]|$)',
        # Last resort: the first option character anywhere in the text.
        rf'([{options}])',
    ]

    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1)
    return ''
|
|
|
|
|
|
|
|
|
|
|
2023-07-04 21:34:55 +08:00
|
|
|
|
@TEXT_POSTPROCESSORS.register_module('first-capital-multi')
|
|
|
|
|
def first_capital_postprocess_multi(text: str) -> str:
    """Return the first run of consecutive letters A-D found in ``text``.

    Args:
        text: Text to search.

    Returns:
        The leftmost, longest run of the characters ``A``, ``B``, ``C``
        and ``D``, or an empty string when ``text`` contains none of them.
    """
    found = re.search(r'([A-D]+)', text)
    return '' if found is None else found.group(1)
|