# Source: OpenCompass/opencompass/datasets/IFEval/instructions_util.py

# flake8: noqa
# yapf: disable
# Copyright 2023 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility library of instructions."""
import functools
import random
import re
try:
import immutabledict
except ImportError:
immutabledict = None
import nltk
# Pool of common English words used by `generate_keywords`.
# NOTE(review): the upstream word list is longer; this copy was truncated
# mid-literal during extraction, leaving the list unterminated (a syntax
# error). Closing it here keeps the module importable with the words that
# survived — TODO: restore the full list from upstream.
WORD_LIST = ['western', 'sentence', 'signal', 'dump', 'spot', 'opposite', 'bottom', 'potato', 'administration', 'working', 'welcome', 'morning', 'good', 'agency', 'primary', 'wish', 'responsibility', 'press', 'problem', 'president', 'steal', 'brush', 'read', 'type', 'beat', 'trainer', 'growth', 'lock', 'bone', 'case', 'equal', 'comfortable', 'region', 'replacement', 'performance', 'mate', 'walk', 'medicine', 'film', 'thing', 'rock', 'tap', 'total', 'competition', 'ease', 'south', 'establishment', 'gather', 'parking', 'world', 'plenty', 'breath', 'claim', 'alcohol', 'trade', 'dear', 'highlight', 'street', 'matter', 'decision', 'mess', 'agreement', 'studio', 'coach', 'assist', 'brain', 'wing', 'style', 'private', 'top', 'brown', 'leg', 'buy', 'procedure', 'method', 'speed', 'high', 'company', 'valuable', 'pie', 'analyst', 'session', 'pattern', 'district', 'pleasure', 'dinner', 'swimming', 'joke', 'order', 'plate', 'department', 'motor', 'cell', 'spend', 'cabinet', 'difference', 'power', 'examination', 'engine', 'horse', 'dimension', 'pay', 'toe', 'curve', 'literature', 'bother', 'fire', 'possibility', 'debate', 'activity', 'passage', 'hello', 'cycle', 'background', 'quiet', 'author', 'effect', 'actor', 'page', 'bicycle', 'error', 'throat', 'attack', 'character', 'phone', 'tea', 'increase', 'outcome', 'file', 'specific', 'inspector', 'internal', 'potential', 'staff', 'building', 'employer', 'shoe', 'hand', 'direction', 'garden', 'purchase', 'interview', 'study', 'recognition', 'member', 'spiritual', 'oven', 'sandwich', 'weird', 'passenger', 'particular', 'response', 'reaction', 'size', 'variation', 'a', 'cancel', 'candy', 'exit', 'guest', 'condition', 'fly', 'price', 'weakness', 'convert', 'hotel', 'great', 'mouth', 'mind', 'song', 'sugar', 'suspect', 'telephone', 'ear', 'roof', 'paint', 'refrigerator', 'organization', 'jury', 'reward', 'engineering', 'day', 'possession', 'crew', 'bar', 'road', 'description', 'celebration', 'score', 'mark', 'letter', 'shower',
'suggestion', 'sir', 'luck', 'national', 'progress', 'hall', 'stroke', 'theory', 'offer', 'story', 'tax', 'definition', 'history', 'ride', 'medium', 'opening', 'glass', 'elevator', 'stomach', 'question', 'ability', 'leading', 'village', 'computer', 'city', 'grand', 'confidence', 'candle', 'priest', 'recommendation', 'point', 'necessary', 'body', 'desk', 'secret', 'horror', 'noise', 'culture', 'warning', 'water', 'round', 'diet', 'flower', 'bus', 'tough', 'permission', 'week', 'prompt', 'connection', 'abuse', 'height', 'save', 'corner', 'border', 'stress', 'drive', 'stop', 'rip', 'meal', 'listen', 'confusion', 'girlfriend', 'living', 'relation', 'significance', 'plan', 'creative', 'atmosphere', 'blame', 'invite', 'housing', 'paper', 'drink', 'roll', 'silver', 'drunk', 'age', 'damage', 'smoke', 'environment', 'pack', 'savings', 'influence', 'tourist', 'rain', 'post', 'sign', 'grandmother', 'run', 'profit', 'push', 'clerk', 'final', 'wine', 'swim', 'pause', 'stuff', 'singer', 'funeral', 'average', 'source', 'scene', 'tradition', 'personal', 'snow', 'nobody', 'distance', 'sort', 'sensitive', 'animal', 'major', 'negotiation', 'click', 'mood', 'period', 'arrival', 'expression', 'holiday', 'repeat', 'dust', 'closet', 'gold', 'bad', 'sail', 'combination', 'clothes', 'emphasis', 'duty', 'black', 'step', 'school', 'jump', 'document', 'professional', 'lip', 'chemical', 'front', 'wake', 'while', 'inside', 'watch', 'row', 'subject', 'penalty', 'balance', 'possible', 'adult', 'aside', 'sample', 'appeal', 'wedding', 'depth', 'king', 'award', 'wife', 'blow', 'site', 'camp', 'music', 'safe', 'gift', 'fault', 'guess', 'act', 'shame', 'drama', 'capital', 'exam', 'stupid', 'record', 'sound', 'swing', 'novel', 'minimum', 'ratio', 'machine', 'shape', 'lead', 'operation', 'salary', 'cloud', 'affair', 'hit', 'chapter', 'stage', 'quantity', 'access', 'army', 'chain', 'traffic', 'kick', 'analysis', 'airport', 'time', 'vacation', 'philosophy', 'ball', 'chest', 'thanks', 'place', 'mountain',
'advertising', 'red', 'past', 'rent', 'return', 'tour', 'house', 'construction', 'net', 'native', 'war']
# ISO 639-1 codes to language names.
# NOTE(review): `immutabledict` is an optional dependency (the top-of-file
# import guards it and leaves the name bound to None when missing).  The
# original code called `immutabledict.immutabledict(...)` unconditionally,
# which raises AttributeError on None.  Fall back to a plain dict instead;
# lookups and iteration behave identically for read-only use.
try:
    import immutabledict as _immutabledict
except ImportError:
    _immutabledict = None

_LANGUAGE_CODE_MAP = {
    'en': 'English',
    'es': 'Spanish',
    'pt': 'Portuguese',
    'ar': 'Arabic',
    'hi': 'Hindi',
    'fr': 'French',
    'ru': 'Russian',
    'de': 'German',
    'ja': 'Japanese',
    'it': 'Italian',
    'bn': 'Bengali',
    'uk': 'Ukrainian',
    'th': 'Thai',
    'ur': 'Urdu',
    'ta': 'Tamil',
    'te': 'Telugu',
    'bg': 'Bulgarian',
    'ko': 'Korean',
    'pl': 'Polish',
    'he': 'Hebrew',
    'fa': 'Persian',
    'vi': 'Vietnamese',
    'ne': 'Nepali',
    'sw': 'Swahili',
    'kn': 'Kannada',
    'mr': 'Marathi',
    'gu': 'Gujarati',
    'pa': 'Punjabi',
    'ml': 'Malayalam',
    'fi': 'Finnish',
}

# Immutable when the package is available, plain dict otherwise.
LANGUAGE_CODES = (_immutabledict.immutabledict(_LANGUAGE_CODE_MAP)
                  if _immutabledict is not None else _LANGUAGE_CODE_MAP)
# Building blocks for the regex-based sentence splitter below.
_ALPHABETS = '([A-Za-z])'
# Honorific prefixes whose trailing period does not end a sentence.
_PREFIXES = '(Mr|St|Mrs|Ms|Dr)[.]'
# Company/name suffixes whose trailing period does not end a sentence.
_SUFFIXES = '(Inc|Ltd|Jr|Sr|Co)'
# Words/titles that typically start a new sentence.
_STARTERS = r'(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)'
# Dotted acronyms such as U.S. or U.S.A.
_ACRONYMS = '([A-Z][.][A-Z][.](?:[A-Z][.])?)'
# Website TLDs whose leading dot is not a sentence boundary.
_WEBSITES = '[.](com|net|org|io|gov|edu|me)'
_DIGITS = '([0-9])'
# Two or more consecutive dots (ellipsis).
_MULTIPLE_DOTS = r'\.{2,}'
def split_into_sentences(text):
    """Split the text into sentences.

    Works via a placeholder scheme: periods that do NOT end a sentence
    (abbreviations, decimals, acronyms, websites) are temporarily rewritten
    as '<prd>', true boundaries are marked '<stop>', and the text is finally
    split on the '<stop>' markers.

    Args:
      text: A string that consists of more than or equal to one sentences.

    Returns:
      A list of strings where each string is a sentence.
    """
    text = ' ' + text + ' '
    text = text.replace('\n', ' ')
    # Protect periods in honorifics (Mr., Dr., ...).
    text = re.sub(_PREFIXES, '\\1<prd>', text)
    # Protect the dot before website TLDs (.com, .org, ...).
    text = re.sub(_WEBSITES, '<prd>\\1', text)
    # Protect decimal points between digits (e.g. 3.14).
    text = re.sub(_DIGITS + '[.]' + _DIGITS, '\\1<prd>\\2', text)
    # An ellipsis ends a sentence but keeps its dots literal.
    text = re.sub(
        _MULTIPLE_DOTS,
        lambda match: '<prd>' * len(match.group(0)) + '<stop>',
        text,
    )
    if 'Ph.D' in text:
        text = text.replace('Ph.D.', 'Ph<prd>D<prd>')
    # Protect single-letter initials ("J. Smith").
    text = re.sub(r'\s' + _ALPHABETS + '[.] ', ' \\1<prd> ', text)
    # An acronym followed by a sentence starter marks a boundary.
    text = re.sub(_ACRONYMS + ' ' + _STARTERS, '\\1<stop> \\2', text)
    # Protect three- and two-letter dotted acronyms (U.S.A., U.S.).
    text = re.sub(
        _ALPHABETS + '[.]' + _ALPHABETS + '[.]' + _ALPHABETS + '[.]',
        '\\1<prd>\\2<prd>\\3<prd>',
        text,
    )
    text = re.sub(_ALPHABETS + '[.]' + _ALPHABETS + '[.]', '\\1<prd>\\2<prd>',
                  text)
    # A suffix followed by a starter is a boundary; otherwise its period is
    # protected.
    text = re.sub(' ' + _SUFFIXES + '[.] ' + _STARTERS, ' \\1<stop> \\2', text)
    text = re.sub(' ' + _SUFFIXES + '[.]', ' \\1<prd>', text)
    text = re.sub(' ' + _ALPHABETS + '[.]', ' \\1<prd>', text)
    # Move terminal punctuation outside closing quotes so the stop marker
    # lands after the quote.  NOTE(review): the upstream check is for the
    # curly close quote '”'; the extracted copy had it garbled to the empty
    # string (a vacuously true test).  Restoring the intended character is
    # behavior-preserving since the replace is a no-op when '.”' is absent.
    if '”' in text:
        text = text.replace('.”', '”.')
    if '"' in text:
        text = text.replace('."', '".')
    if '!' in text:
        text = text.replace('!"', '"!')
    if '?' in text:
        text = text.replace('?"', '"?')
    # Mark real sentence terminators, then restore the protected periods.
    text = text.replace('.', '.<stop>')
    text = text.replace('?', '?<stop>')
    text = text.replace('!', '!<stop>')
    text = text.replace('<prd>', '.')
    sentences = text.split('<stop>')
    sentences = [s.strip() for s in sentences]
    # Splitting on a trailing marker leaves one empty tail element.
    if sentences and not sentences[-1]:
        sentences = sentences[:-1]
    return sentences
def count_words(text):
    r"""Counts the number of words.

    A word is a maximal run of word characters (letters, digits and
    underscore), i.e. a ``\w+`` match.  This is exactly what
    ``nltk.tokenize.RegexpTokenizer(r'\w+')`` produced, but implemented with
    the stdlib ``re`` module so no tokenizer object is built on every call.

    Args:
      text: The string whose words are counted.

    Returns:
      The number of ``\w+`` tokens in `text`.
    """
    return len(re.findall(r'\w+', text))
@functools.lru_cache(maxsize=None)
def _get_sentence_tokenizer():
    """Load NLTK's English Punkt sentence tokenizer (memoized after first use)."""
    tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    return tokenizer
def count_sentences(text):
    """Count the number of sentences.

    Delegates to the cached NLTK Punkt tokenizer and returns how many
    sentences it produces for `text`.
    """
    return len(_get_sentence_tokenizer().tokenize(text))
def generate_keywords(num_keywords):
    """Randomly generates a few keywords.

    Draws `num_keywords` distinct words from WORD_LIST without replacement.
    """
    return random.sample(WORD_LIST, num_keywords)