OpenCompass/opencompass/utils/text_postprocessors.py
Tong Gao f480b72703
[Feature] Support model-bound prediction postprocessor, use it in Claude (#268)
* [Feature] Support model-bound text postprocessor, add claude as an example

* update

* update

* minor fix

---------

Co-authored-by: zhoufengzhe <zhoufengzhe@pjlab.org.cn>
2023-08-25 16:12:21 +08:00

89 lines
2.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from opencompass.registry import TEXT_POSTPROCESSORS
@TEXT_POSTPROCESSORS.register_module('general')
def general_postprocess(text: str) -> str:
    """Reduce a free-form prediction to a short, normalized answer string.

    The text is cut at the first newline, period, or comma. Every character
    that is neither a word character nor whitespace is then removed, the
    English articles "a", "an" and "the" are dropped (case-insensitively),
    and runs of whitespace are collapsed to a single space.

    Args:
        text: Raw prediction text.

    Returns:
        The cleaned text without leading or trailing whitespace.
    """
    # Keep only what precedes the first newline, period, or comma.
    # ``maxsplit`` is passed by keyword because positional use is deprecated
    # since Python 3.13.
    truncated_text = re.split(r'[\n.,]', text, maxsplit=1)[0]
    # Remove punctuation (anything that is not a word character or space).
    no_punctuation = re.sub(r'[^\w\s]', '', truncated_text)
    # Remove English articles.
    no_articles = re.sub(r'\b(a|an|the)\b',
                         '',
                         no_punctuation,
                         flags=re.IGNORECASE)
    # Collapse duplicated blank spaces left behind by the removals above.
    cleaned_text = re.sub(r'\s+', ' ', no_articles).strip()
    return cleaned_text
@TEXT_POSTPROCESSORS.register_module('general_cn')
def general_cn_postprocess(text: str) -> str:
    """Normalize a prediction and segment it into words with jieba.

    The text is cut at the first newline, period, or comma; characters that
    are neither word characters nor whitespace are removed; the English
    articles "a", "an" and "the" are dropped; whitespace runs are collapsed.
    The cleaned text is then passed to ``jieba.cut`` and the resulting
    segments are joined with single spaces.

    Args:
        text: Raw prediction text.

    Returns:
        The segments produced by ``jieba.cut`` for the cleaned text, joined
        with single spaces.
    """
    # ``maxsplit`` is passed by keyword because positional use is deprecated
    # since Python 3.13.
    truncated_text = re.split(r'[\n.,]', text, maxsplit=1)[0]
    no_punctuation = re.sub(r'[^\w\s]', '', truncated_text)
    no_articles = re.sub(r'\b(a|an|the)\b',
                         '',
                         no_punctuation,
                         flags=re.IGNORECASE)
    cleaned_text = re.sub(r'\s+', ' ', no_articles).strip()
    # Imported inside the function so jieba is only needed when this
    # postprocessor is actually used.
    import jieba
    # Segment the cleaned text. Segmenting the raw ``text`` here would
    # silently throw away every normalization step performed above.
    return ' '.join(jieba.cut(cleaned_text))
@TEXT_POSTPROCESSORS.register_module('first-capital')
def first_capital_postprocess(text: str) -> str:
    """Return the first uppercase character of ``text``.

    Args:
        text: Raw prediction text.

    Returns:
        The first character for which ``str.isupper()`` is true, or an empty
        string when ``text`` contains no such character.
    """
    uppercase_chars = (char for char in text if char.isupper())
    return next(uppercase_chars, '')
def first_option_postprocess(text: str, options: str) -> str:
    """Find first valid option for text.

    A list of patterns is tried in priority order: explicit answer phrases
    in English and Chinese first, then an option standing on its own
    (preceded by whitespace or the start of the text and followed by
    whitespace, one of ``。 , . $`` or the end of the text), and finally any
    occurrence of an option character. The option captured by the first
    pattern that matches is returned.

    Args:
        text: Raw prediction text.
        options: The valid option characters, e.g. ``'ABCD'``. Each character
            is treated literally.

    Returns:
        The matched option character, or an empty string when nothing
        matches or ``options`` is empty.
    """
    if not options:
        # An empty character class is not a valid regex, and nothing could
        # match it anyway.
        return ''

    # Escape the options so characters such as ``]``, ``^`` or ``-`` cannot
    # change the meaning of the character class. The named group lets us
    # return exactly the option that matched instead of re-scanning the
    # matched text (which may contain option letters in the cue phrase
    # itself, e.g. the "T" of "The answer is F").
    option = f'(?P<option>[{re.escape(options)}])'
    patterns = [
        f'[Tt]he answer is {option}',
        f'[Tt]he correct answer is {option}',
        f'答案是.*?{option}',
        f'答案为.*?{option}',
        # "故选" ("therefore choose") is the usual phrase; "固选" is kept
        # because it was the spelling matched historically.
        f'故选.*?{option}',
        f'固选.*?{option}',
        f'答案应该是.*?{option}',
        # Inside a character class ``$`` is a literal dollar sign, so the
        # end of the text has to be accepted through a separate alternative.
        rf'(?:\s|^){option}(?:[\s。,\.$]|$)',
        option,
    ]
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group('option')
    return ''
@TEXT_POSTPROCESSORS.register_module('first-capital-multi')
def first_capital_postprocess_multi(text: str) -> str:
    """Return the first consecutive run of the letters A-D in ``text``.

    Args:
        text: Raw prediction text.

    Returns:
        The first maximal run of characters in the range ``A``-``D``, or an
        empty string when ``text`` contains none of them.
    """
    found = re.search(r'([A-D]+)', text)
    if found is None:
        return ''
    return found.group(1)
def last_option_postprocess(text: str, options: str) -> str:
    """Return the last option character that occurs in ``text``.

    Args:
        text: Raw prediction text.
        options: The valid option characters, e.g. ``'ABCD'``. Each character
            is treated literally.

    Returns:
        The last character of ``text`` that is one of ``options``, or an
        empty string when there is none or ``options`` is empty.
    """
    if not options:
        # An empty character class is not a valid regex, and nothing could
        # match it anyway.
        return ''
    # Escape the options so characters such as ``]``, ``^`` or ``-`` are
    # matched literally instead of altering the character class.
    matches = re.findall(f'[{re.escape(options)}]', text)
    return matches[-1] if matches else ''