OpenCompass/opencompass/utils/text_postprocessors.py

import re

from opencompass.registry import TEXT_POSTPROCESSORS


@TEXT_POSTPROCESSORS.register_module('general')
def general_postprocess(text: str) -> str:
    """Normalize a free-form answer into a short, comparable string.

    Only the part of ``text`` before the first newline, period or comma is
    kept. Punctuation and the English articles "a", "an" and "the" are then
    removed, and runs of whitespace are collapsed to single spaces.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text, or an empty string if nothing is left.
    """
    # Cut off at the first newline, period, or comma. ``maxsplit`` is passed
    # by keyword because passing it positionally is deprecated since
    # Python 3.13.
    truncated_text = re.split(r'[\n.,]', text, maxsplit=1)[0]

    # Remove punctuation: every character that is neither a word character
    # nor whitespace.
    no_punctuation = re.sub(r'[^\w\s]', '', truncated_text)

    # Remove articles, case-insensitively and only as whole words.
    no_articles = re.sub(r'\b(a|an|the)\b',
                         '',
                         no_punctuation,
                         flags=re.IGNORECASE)

    # Collapse duplicated blank spaces and trim both ends.
    return re.sub(r'\s+', ' ', no_articles).strip()


@TEXT_POSTPROCESSORS.register_module('general_cn')
def general_cn_postprocess(text: str) -> str:
    """Segment ``text`` with jieba and join the tokens with single spaces.

    Args:
        text: Raw text to segment.

    Returns:
        The tokens yielded by ``jieba.cut(text)`` joined by ``' '``.
    """
    # NOTE(review): an earlier version of this function also ran a
    # truncation / punctuation / article / whitespace cleanup on ``text``
    # but then discarded the result and segmented the raw ``text``. That
    # dead computation has been removed; the returned value is unchanged.
    # If the cleanup was meant to feed the segmenter, switching to it would
    # change the output and should be done deliberately - TODO confirm.

    # Function-scope import: jieba is only loaded when this postprocessor
    # is actually called.
    import jieba

    return ' '.join(jieba.cut(text))


@TEXT_POSTPROCESSORS.register_module('first-capital')
def first_capital_postprocess(text: str) -> str:
    """Return the first uppercase character in ``text``.

    Args:
        text: Text to scan from left to right.

    Returns:
        The first character for which ``str.isupper()`` is true, or an
        empty string when ``text`` contains no such character.
    """
    # ``next`` with a default stops at the first hit and falls back to ''.
    return next((char for char in text if char.isupper()), '')


@TEXT_POSTPROCESSORS.register_module('first-capital-multi')
def first_capital_postprocess_multi(text: str) -> str:
    """Return the first run of consecutive ``A``-``D`` letters in ``text``.

    Args:
        text: Text to search.

    Returns:
        The leftmost maximal run of the uppercase letters A, B, C and D,
        or an empty string when ``text`` contains none of them.
    """
    found = re.search(r'([A-D]+)', text)
    return '' if found is None else found.group(1)
initial commit 2023-07-04 21:34:55 +08:00			`import re`

			`from opencompass.registry import TEXT_POSTPROCESSORS`


			`@TEXT_POSTPROCESSORS.register_module('general')`
			`def general_postprocess(text: str) -> str:`
			`# Cut off the first newline, period, or comma`
			`truncated_text = re.split(r'[\n.,]', text, 1)[0]`

			`# Remove punctuation`
			`no_punctuation = re.sub(r'[^\w\s]', '', truncated_text)`

			`# Remove article`
			`no_articles = re.sub(r'\b(a\|an\|the)\b',`
			`'',`
			`no_punctuation,`
			`flags=re.IGNORECASE)`

			`# Remove duplicated blank spaces`
			`cleaned_text = re.sub(r'\s+', ' ', no_articles).strip()`

			`return cleaned_text`


			`@TEXT_POSTPROCESSORS.register_module('general_cn')`
			`def general_cn_postprocess(text: str) -> str:`
			`truncated_text = re.split(r'[\n.,]', text, 1)[0]`

			`no_punctuation = re.sub(r'[^\w\s]', '', truncated_text)`

			`no_articles = re.sub(r'\b(a\|an\|the)\b',`
			`'',`
			`no_punctuation,`
			`flags=re.IGNORECASE)`

			`cleaned_text = re.sub(r'\s+', ' ', no_articles).strip()`
			`import jieba`
			`cleaned_text = ' '.join(jieba.cut(text))`
			`return cleaned_text`


			`@TEXT_POSTPROCESSORS.register_module('first-capital')`
			`def first_capital_postprocess(text: str) -> str:`
			`for t in text:`
			`if t.isupper():`
			`return t`
			`return ''`


			`@TEXT_POSTPROCESSORS.register_module('first-capital-multi')`
			`def first_capital_postprocess_multi(text: str) -> str:`
			`match = re.search(r'([A-D]+)', text)`
			`if match:`
			`return match.group(1)`
			`return ''`