# Mirror of https://github.com/open-compass/opencompass.git
# Synced 2025-05-30 16:03:24 +08:00
# 57 lines, 1.5 KiB, Python
import re
|
|
|
|
from opencompass.registry import TEXT_POSTPROCESSORS
|
|
|
|
|
|
@TEXT_POSTPROCESSORS.register_module('general')
|
|
def general_postprocess(text: str) -> str:
    """Normalize a free-form answer string for comparison.

    Only the part of ``text`` before the first newline, period, or comma
    is kept. Punctuation is then stripped, the English articles "a",
    "an" and "the" are dropped (case-insensitively), and whitespace runs
    are collapsed to single spaces.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned first segment. This is an empty string when nothing
        is left, e.g. when ``text`` starts with one of the delimiters.
    """
    # Cut off at the first newline, period, or comma. ``maxsplit`` is
    # passed by keyword: the positional form is deprecated since
    # Python 3.13.
    truncated_text = re.split(r'[\n.,]', text, maxsplit=1)[0]

    # Remove punctuation, i.e. every character that is neither a word
    # character (letters, digits, underscore) nor whitespace.
    no_punctuation = re.sub(r'[^\w\s]', '', truncated_text)

    # Remove articles; ``\b`` keeps words such as "another" intact.
    no_articles = re.sub(r'\b(a|an|the)\b',
                         '',
                         no_punctuation,
                         flags=re.IGNORECASE)

    # Remove duplicated blank spaces (also left behind by the removed
    # articles) and trim both ends.
    cleaned_text = re.sub(r'\s+', ' ', no_articles).strip()

    return cleaned_text
|
|
|
|
|
|
@TEXT_POSTPROCESSORS.register_module('general_cn')
|
|
def general_cn_postprocess(text: str) -> str:
    """Normalize an answer string and segment it into words with jieba.

    The text is first cleaned: it is cut at the first newline, period,
    or comma, punctuation is stripped, the English articles "a", "an"
    and "the" are dropped (case-insensitively), and whitespace runs are
    collapsed. The cleaned text is then segmented with ``jieba.cut`` and
    the tokens are joined with single spaces.

    Args:
        text: Raw text to clean and segment.

    Returns:
        The space-separated tokens of the cleaned text.
    """
    # Cut off at the first newline, period, or comma. Only these ASCII
    # delimiters truncate; other punctuation is removed by the next step.
    # ``maxsplit`` is passed by keyword: the positional form is
    # deprecated since Python 3.13.
    truncated_text = re.split(r'[\n.,]', text, maxsplit=1)[0]

    # Remove punctuation (anything that is neither a word character nor
    # whitespace).
    no_punctuation = re.sub(r'[^\w\s]', '', truncated_text)

    # Remove articles
    no_articles = re.sub(r'\b(a|an|the)\b',
                         '',
                         no_punctuation,
                         flags=re.IGNORECASE)

    # Remove duplicated blank spaces
    cleaned_text = re.sub(r'\s+', ' ', no_articles).strip()

    # Kept function-local, so importing this module does not require
    # jieba to be installed.
    import jieba

    # Segment the cleaned text. Segmenting the raw ``text`` here would
    # silently discard every normalization step above.
    return ' '.join(jieba.cut(cleaned_text))
|
|
|
|
|
|
@TEXT_POSTPROCESSORS.register_module('first-capital')
|
|
def first_capital_postprocess(text: str) -> str:
    """Return the first uppercase character found in ``text``.

    Args:
        text: Text to scan from left to right.

    Returns:
        The first character for which ``str.isupper()`` is true, or an
        empty string when ``text`` has no such character.
    """
    # ``next`` with a default stops at the first hit and falls back to ''
    # when the generator is exhausted.
    return next((char for char in text if char.isupper()), '')
|
|
|
|
|
|
@TEXT_POSTPROCESSORS.register_module('first-capital-multi')
|
|
def first_capital_postprocess_multi(text: str) -> str:
    """Return the first contiguous run of the letters A-D in ``text``.

    Args:
        text: Text to search.

    Returns:
        The leftmost run of one or more characters from ``A``-``D``, or
        an empty string when ``text`` contains none of them.
    """
    found = re.search(r'([A-D]+)', text)
    return found.group(1) if found else ''
|