# Source: OpenCompass/opencompass/datasets/IFEval/instructions_util.py

# flake8: noqa
# yapf: disable
# Copyright 2023 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility library of instructions."""
import functools
import random
import re
try:
import immutabledict
except ImportError:
immutabledict = None
import nltk
# Pool of common English words used by `generate_keywords`.
# NOTE(review): the upstream word list is longer; this copy was truncated
# mid-literal during extraction, leaving the list unterminated (a syntax
# error). Closing it here keeps the module importable with the words that
# survived — TODO: restore the full list from upstream.
WORD_LIST = ['western', 'sentence', 'signal', 'dump', 'spot', 'opposite', 'bottom', 'potato', 'administration', 'working', 'welcome', 'morning', 'good', 'agency', 'primary', 'wish', 'responsibility', 'press', 'problem', 'president', 'steal', 'brush', 'read', 'type', 'beat', 'trainer', 'growth', 'lock', 'bone', 'case', 'equal', 'comfortable', 'region', 'replacement', 'performance', 'mate', 'walk', 'medicine', 'film', 'thing', 'rock', 'tap', 'total', 'competition', 'ease', 'south', 'establishment', 'gather', 'parking', 'world', 'plenty', 'breath', 'claim', 'alcohol', 'trade', 'dear', 'highlight', 'street', 'matter', 'decision', 'mess', 'agreement', 'studio', 'coach', 'assist', 'brain', 'wing', 'style', 'private', 'top', 'brown', 'leg', 'buy', 'procedure', 'method', 'speed', 'high', 'company', 'valuable', 'pie', 'analyst', 'session', 'pattern', 'district', 'pleasure', 'dinner', 'swimming', 'joke', 'order', 'plate', 'department', 'motor', 'cell', 'spend', 'cabinet', 'difference', 'power', 'examination', 'engine', 'horse', 'dimension', 'pay', 'toe', 'curve', 'literature', 'bother', 'fire', 'possibility', 'debate', 'activity', 'passage', 'hello', 'cycle', 'background', 'quiet', 'author', 'effect', 'actor', 'page', 'bicycle', 'error', 'throat', 'attack', 'character', 'phone', 'tea', 'increase', 'outcome', 'file', 'specific', 'inspector', 'internal', 'potential', 'staff', 'building', 'employer', 'shoe', 'hand', 'direction', 'garden', 'purchase', 'interview', 'study', 'recognition', 'member', 'spiritual', 'oven', 'sandwich', 'weird', 'passenger', 'particular', 'response', 'reaction', 'size', 'variation', 'a', 'cancel', 'candy', 'exit', 'guest', 'condition', 'fly', 'price', 'weakness', 'convert', 'hotel', 'great', 'mouth', 'mind', 'song', 'sugar', 'suspect', 'telephone', 'ear', 'roof', 'paint', 'refrigerator', 'organization', 'jury', 'reward', 'engineering', 'day', 'possession', 'crew', 'bar', 'road', 'description', 'celebration', 'score', 'mark', 'letter', 'shower',
'suggestion', 'sir', 'luck', 'national', 'progress', 'hall', 'stroke', 'theory', 'offer', 'story', 'tax', 'definition', 'history', 'ride', 'medium', 'opening', 'glass', 'elevator', 'stomach', 'question', 'ability', 'leading', 'village', 'computer', 'city', 'grand', 'confidence', 'candle', 'priest', 'recommendation', 'point', 'necessary', 'body', 'desk', 'secret', 'horror', 'noise', 'culture', 'warning', 'water', 'round', 'diet', 'flower', 'bus', 'tough', 'permission', 'week', 'prompt', 'connection', 'abuse', 'height', 'save', 'corner', 'border', 'stress', 'drive', 'stop', 'rip', 'meal', 'listen', 'confusion', 'girlfriend', 'living', 'relation', 'significance', 'plan', 'creative', 'atmosphere', 'blame', 'invite', 'housing', 'paper', 'drink', 'roll', 'silver', 'drunk', 'age', 'damage', 'smoke', 'environment', 'pack', 'savings', 'influence', 'tourist', 'rain', 'post', 'sign', 'grandmother', 'run', 'profit', 'push', 'clerk', 'final', 'wine', 'swim', 'pause', 'stuff', 'singer', 'funeral', 'average', 'source', 'scene', 'tradition', 'personal', 'snow', 'nobody', 'distance', 'sort', 'sensitive', 'animal', 'major', 'negotiation', 'click', 'mood', 'period', 'arrival', 'expression', 'holiday', 'repeat', 'dust', 'closet', 'gold', 'bad', 'sail', 'combination', 'clothes', 'emphasis', 'duty', 'black', 'step', 'school', 'jump', 'document', 'professional', 'lip', 'chemical', 'front', 'wake', 'while', 'inside', 'watch', 'row', 'subject', 'penalty', 'balance', 'possible', 'adult', 'aside', 'sample', 'appeal', 'wedding', 'depth', 'king', 'award', 'wife', 'blow', 'site', 'camp', 'music', 'safe', 'gift', 'fault', 'guess', 'act', 'shame', 'drama', 'capital', 'exam', 'stupid', 'record', 'sound', 'swing', 'novel', 'minimum', 'ratio', 'machine', 'shape', 'lead', 'operation', 'salary', 'cloud', 'affair', 'hit', 'chapter', 'stage', 'quantity', 'access', 'army', 'chain', 'traffic', 'kick', 'analysis', 'airport', 'time', 'vacation', 'philosophy', 'ball', 'chest', 'thanks', 'place', 'mountain',
'advertising', 'red', 'past', 'rent', 'return', 'tour', 'house', 'construction', 'net', 'native', 'war']
# ISO 639-1 codes to language names.
# NOTE(review): `immutabledict` is an optional dependency (the top-of-file
# import guards it and leaves the name bound to None when missing).  The
# original code called `immutabledict.immutabledict(...)` unconditionally,
# which raises AttributeError on None.  Fall back to a plain dict instead;
# lookups and iteration behave identically for read-only use.
try:
    import immutabledict as _immutabledict
except ImportError:
    _immutabledict = None

_LANGUAGE_CODE_MAP = {
    'en': 'English',
    'es': 'Spanish',
    'pt': 'Portuguese',
    'ar': 'Arabic',
    'hi': 'Hindi',
    'fr': 'French',
    'ru': 'Russian',
    'de': 'German',
    'ja': 'Japanese',
    'it': 'Italian',
    'bn': 'Bengali',
    'uk': 'Ukrainian',
    'th': 'Thai',
    'ur': 'Urdu',
    'ta': 'Tamil',
    'te': 'Telugu',
    'bg': 'Bulgarian',
    'ko': 'Korean',
    'pl': 'Polish',
    'he': 'Hebrew',
    'fa': 'Persian',
    'vi': 'Vietnamese',
    'ne': 'Nepali',
    'sw': 'Swahili',
    'kn': 'Kannada',
    'mr': 'Marathi',
    'gu': 'Gujarati',
    'pa': 'Punjabi',
    'ml': 'Malayalam',
    'fi': 'Finnish',
}

# Immutable when the package is available, plain dict otherwise.
LANGUAGE_CODES = (_immutabledict.immutabledict(_LANGUAGE_CODE_MAP)
                  if _immutabledict is not None else _LANGUAGE_CODE_MAP)
# Building blocks for the regex-based sentence splitter below.
_ALPHABETS = '([A-Za-z])'
# Honorific prefixes whose trailing period does not end a sentence.
_PREFIXES = '(Mr|St|Mrs|Ms|Dr)[.]'
# Company/name suffixes whose trailing period does not end a sentence.
_SUFFIXES = '(Inc|Ltd|Jr|Sr|Co)'
# Words/titles that typically start a new sentence.
_STARTERS = r'(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)'
# Dotted acronyms such as U.S. or U.S.A.
_ACRONYMS = '([A-Z][.][A-Z][.](?:[A-Z][.])?)'
# Website TLDs whose leading dot is not a sentence boundary.
_WEBSITES = '[.](com|net|org|io|gov|edu|me)'
_DIGITS = '([0-9])'
# Two or more consecutive dots (ellipsis).
_MULTIPLE_DOTS = r'\.{2,}'
def split_into_sentences(text):
    """Split the text into sentences.

    Works via a placeholder scheme: periods that do NOT end a sentence
    (abbreviations, decimals, acronyms, websites) are temporarily rewritten
    as '<prd>', true boundaries are marked '<stop>', and the text is finally
    split on the '<stop>' markers.

    Args:
      text: A string that consists of more than or equal to one sentences.

    Returns:
      A list of strings where each string is a sentence.
    """
    text = ' ' + text + ' '
    text = text.replace('\n', ' ')
    # Protect periods in honorifics (Mr., Dr., ...).
    text = re.sub(_PREFIXES, '\\1<prd>', text)
    # Protect the dot before website TLDs (.com, .org, ...).
    text = re.sub(_WEBSITES, '<prd>\\1', text)
    # Protect decimal points between digits (e.g. 3.14).
    text = re.sub(_DIGITS + '[.]' + _DIGITS, '\\1<prd>\\2', text)
    # An ellipsis ends a sentence but keeps its dots literal.
    text = re.sub(
        _MULTIPLE_DOTS,
        lambda match: '<prd>' * len(match.group(0)) + '<stop>',
        text,
    )
    if 'Ph.D' in text:
        text = text.replace('Ph.D.', 'Ph<prd>D<prd>')
    # Protect single-letter initials ("J. Smith").
    text = re.sub(r'\s' + _ALPHABETS + '[.] ', ' \\1<prd> ', text)
    # An acronym followed by a sentence starter marks a boundary.
    text = re.sub(_ACRONYMS + ' ' + _STARTERS, '\\1<stop> \\2', text)
    # Protect three- and two-letter dotted acronyms (U.S.A., U.S.).
    text = re.sub(
        _ALPHABETS + '[.]' + _ALPHABETS + '[.]' + _ALPHABETS + '[.]',
        '\\1<prd>\\2<prd>\\3<prd>',
        text,
    )
    text = re.sub(_ALPHABETS + '[.]' + _ALPHABETS + '[.]', '\\1<prd>\\2<prd>',
                  text)
    # A suffix followed by a starter is a boundary; otherwise its period is
    # protected.
    text = re.sub(' ' + _SUFFIXES + '[.] ' + _STARTERS, ' \\1<stop> \\2', text)
    text = re.sub(' ' + _SUFFIXES + '[.]', ' \\1<prd>', text)
    text = re.sub(' ' + _ALPHABETS + '[.]', ' \\1<prd>', text)
    # Move terminal punctuation outside closing quotes so the stop marker
    # lands after the quote.  NOTE(review): the upstream check is for the
    # curly close quote '”'; the extracted copy had it garbled to the empty
    # string (a vacuously true test).  Restoring the intended character is
    # behavior-preserving since the replace is a no-op when '.”' is absent.
    if '”' in text:
        text = text.replace('.”', '”.')
    if '"' in text:
        text = text.replace('."', '".')
    if '!' in text:
        text = text.replace('!"', '"!')
    if '?' in text:
        text = text.replace('?"', '"?')
    # Mark real sentence terminators, then restore the protected periods.
    text = text.replace('.', '.<stop>')
    text = text.replace('?', '?<stop>')
    text = text.replace('!', '!<stop>')
    text = text.replace('<prd>', '.')
    sentences = text.split('<stop>')
    sentences = [s.strip() for s in sentences]
    # Splitting on a trailing marker leaves one empty tail element.
    if sentences and not sentences[-1]:
        sentences = sentences[:-1]
    return sentences
def count_words(text):
    r"""Counts the number of words.

    A word is a maximal run of word characters (letters, digits and
    underscore), i.e. a ``\w+`` match.  This is exactly what
    ``nltk.tokenize.RegexpTokenizer(r'\w+')`` produced, but implemented with
    the stdlib ``re`` module so no tokenizer object is built on every call.

    Args:
      text: The string whose words are counted.

    Returns:
      The number of ``\w+`` tokens in `text`.
    """
    return len(re.findall(r'\w+', text))
@functools.lru_cache(maxsize=None)
def _get_sentence_tokenizer():
    """Load NLTK's English Punkt sentence tokenizer (memoized after first use)."""
    tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    return tokenizer
def count_sentences(text):
    """Count the number of sentences.

    Delegates to the cached NLTK Punkt tokenizer and returns how many
    sentences it produces for `text`.
    """
    return len(_get_sentence_tokenizer().tokenize(text))
def generate_keywords(num_keywords):
    """Randomly generates a few keywords.

    Draws `num_keywords` distinct words from WORD_LIST without replacement.
    """
    return random.sample(WORD_LIST, num_keywords)