OpenCompass/opencompass/datasets/xIFEval/instructions_util.py
2025-02-10 03:27:55 +01:00

1738 lines
28 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Copyright 2023 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility library of instructions."""
import functools
import random
import re
import immutabledict
import nltk
import pkg_resources
from packaging import version
# Downloading 'punkt' with nltk<3.9 exposes a remote code execution
# vulnerability.
# See https://github.com/EleutherAI/lm-evaluation-harness/issues/2210
# and https://github.com/nltk/nltk/issues/3266
# for more information.
# Oldest nltk release that download_nltk_resources() accepts.
NLTK_MIN_VERSION = "3.9.1"
def download_nltk_resources():
    """Download 'punkt_tab' if not already installed.

    The installed ``nltk`` version is checked first, because older releases
    are vulnerable to remote code execution when downloading tokenizer data.

    Raises:
        AssertionError: If the installed ``nltk`` is older than
            ``NLTK_MIN_VERSION``.
    """
    nltk_version = pkg_resources.get_distribution("nltk").version
    # Raised explicitly instead of through an ``assert`` statement: asserts
    # are removed when Python runs with -O, which would silently disable this
    # security gate. The exception type and message are unchanged.
    if not version.parse(nltk_version) >= version.parse(NLTK_MIN_VERSION):
        raise AssertionError(
            f"`nltk` version {nltk_version} is not >= {NLTK_MIN_VERSION}. Please update `nltk` before proceeding--older versions are vulnerable to a remote code execution vulnerability."
        )
    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        # The resource is not available locally yet; fetch it.
        nltk.download("punkt_tab")


# Executed at import time so the version check (and, if needed, the download)
# happens before any function in this module is used.
download_nltk_resources()
# Vocabulary pool that generate_keywords() samples from.
WORD_LIST = [
"western",
"sentence",
"signal",
"dump",
"spot",
"opposite",
"bottom",
"potato",
"administration",
"working",
"welcome",
"morning",
"good",
"agency",
"primary",
"wish",
"responsibility",
"press",
"problem",
"president",
"steal",
"brush",
"read",
"type",
"beat",
"trainer",
"growth",
"lock",
"bone",
"case",
"equal",
"comfortable",
"region",
"replacement",
"performance",
"mate",
"walk",
"medicine",
"film",
"thing",
"rock",
"tap",
"total",
"competition",
"ease",
"south",
"establishment",
"gather",
"parking",
"world",
"plenty",
"breath",
"claim",
"alcohol",
"trade",
"dear",
"highlight",
"street",
"matter",
"decision",
"mess",
"agreement",
"studio",
"coach",
"assist",
"brain",
"wing",
"style",
"private",
"top",
"brown",
"leg",
"buy",
"procedure",
"method",
"speed",
"high",
"company",
"valuable",
"pie",
"analyst",
"session",
"pattern",
"district",
"pleasure",
"dinner",
"swimming",
"joke",
"order",
"plate",
"department",
"motor",
"cell",
"spend",
"cabinet",
"difference",
"power",
"examination",
"engine",
"horse",
"dimension",
"pay",
"toe",
"curve",
"literature",
"bother",
"fire",
"possibility",
"debate",
"activity",
"passage",
"hello",
"cycle",
"background",
"quiet",
"author",
"effect",
"actor",
"page",
"bicycle",
"error",
"throat",
"attack",
"character",
"phone",
"tea",
"increase",
"outcome",
"file",
"specific",
"inspector",
"internal",
"potential",
"staff",
"building",
"employer",
"shoe",
"hand",
"direction",
"garden",
"purchase",
"interview",
"study",
"recognition",
"member",
"spiritual",
"oven",
"sandwich",
"weird",
"passenger",
"particular",
"response",
"reaction",
"size",
"variation",
"a",
"cancel",
"candy",
"exit",
"guest",
"condition",
"fly",
"price",
"weakness",
"convert",
"hotel",
"great",
"mouth",
"mind",
"song",
"sugar",
"suspect",
"telephone",
"ear",
"roof",
"paint",
"refrigerator",
"organization",
"jury",
"reward",
"engineering",
"day",
"possession",
"crew",
"bar",
"road",
"description",
"celebration",
"score",
"mark",
"letter",
"shower",
"suggestion",
"sir",
"luck",
"national",
"progress",
"hall",
"stroke",
"theory",
"offer",
"story",
"tax",
"definition",
"history",
"ride",
"medium",
"opening",
"glass",
"elevator",
"stomach",
"question",
"ability",
"leading",
"village",
"computer",
"city",
"grand",
"confidence",
"candle",
"priest",
"recommendation",
"point",
"necessary",
"body",
"desk",
"secret",
"horror",
"noise",
"culture",
"warning",
"water",
"round",
"diet",
"flower",
"bus",
"tough",
"permission",
"week",
"prompt",
"connection",
"abuse",
"height",
"save",
"corner",
"border",
"stress",
"drive",
"stop",
"rip",
"meal",
"listen",
"confusion",
"girlfriend",
"living",
"relation",
"significance",
"plan",
"creative",
"atmosphere",
"blame",
"invite",
"housing",
"paper",
"drink",
"roll",
"silver",
"drunk",
"age",
"damage",
"smoke",
"environment",
"pack",
"savings",
"influence",
"tourist",
"rain",
"post",
"sign",
"grandmother",
"run",
"profit",
"push",
"clerk",
"final",
"wine",
"swim",
"pause",
"stuff",
"singer",
"funeral",
"average",
"source",
"scene",
"tradition",
"personal",
"snow",
"nobody",
"distance",
"sort",
"sensitive",
"animal",
"major",
"negotiation",
"click",
"mood",
"period",
"arrival",
"expression",
"holiday",
"repeat",
"dust",
"closet",
"gold",
"bad",
"sail",
"combination",
"clothes",
"emphasis",
"duty",
"black",
"step",
"school",
"jump",
"document",
"professional",
"lip",
"chemical",
"front",
"wake",
"while",
"inside",
"watch",
"row",
"subject",
"penalty",
"balance",
"possible",
"adult",
"aside",
"sample",
"appeal",
"wedding",
"depth",
"king",
"award",
"wife",
"blow",
"site",
"camp",
"music",
"safe",
"gift",
"fault",
"guess",
"act",
"shame",
"drama",
"capital",
"exam",
"stupid",
"record",
"sound",
"swing",
"novel",
"minimum",
"ratio",
"machine",
"shape",
"lead",
"operation",
"salary",
"cloud",
"affair",
"hit",
"chapter",
"stage",
"quantity",
"access",
"army",
"chain",
"traffic",
"kick",
"analysis",
"airport",
"time",
"vacation",
"philosophy",
"ball",
"chest",
"thanks",
"place",
"mountain",
"advertising",
"red",
"past",
"rent",
"return",
"tour",
"house",
"construction",
"net",
"native",
"war",
"figure",
"fee",
"spray",
"user",
"dirt",
"shot",
"task",
"stick",
"friend",
"software",
"promotion",
"interaction",
"surround",
"block",
"purpose",
"practice",
"conflict",
"routine",
"requirement",
"bonus",
"hole",
"state",
"junior",
"sweet",
"catch",
"tear",
"fold",
"wall",
"editor",
"life",
"position",
"pound",
"respect",
"bathroom",
"coat",
"script",
"job",
"teach",
"birth",
"view",
"resolve",
"theme",
"employee",
"doubt",
"market",
"education",
"serve",
"recover",
"tone",
"harm",
"miss",
"union",
"understanding",
"cow",
"river",
"association",
"concept",
"training",
"recipe",
"relationship",
"reserve",
"depression",
"proof",
"hair",
"revenue",
"independent",
"lift",
"assignment",
"temporary",
"amount",
"loss",
"edge",
"track",
"check",
"rope",
"estimate",
"pollution",
"stable",
"message",
"delivery",
"perspective",
"mirror",
"assistant",
"representative",
"witness",
"nature",
"judge",
"fruit",
"tip",
"devil",
"town",
"emergency",
"upper",
"drop",
"stay",
"human",
"neck",
"speaker",
"network",
"sing",
"resist",
"league",
"trip",
"signature",
"lawyer",
"importance",
"gas",
"choice",
"engineer",
"success",
"part",
"external",
"worker",
"simple",
"quarter",
"student",
"heart",
"pass",
"spite",
"shift",
"rough",
"lady",
"grass",
"community",
"garage",
"youth",
"standard",
"skirt",
"promise",
"blind",
"television",
"disease",
"commission",
"positive",
"energy",
"calm",
"presence",
"tune",
"basis",
"preference",
"head",
"common",
"cut",
"somewhere",
"presentation",
"current",
"thought",
"revolution",
"effort",
"master",
"implement",
"republic",
"floor",
"principle",
"stranger",
"shoulder",
"grade",
"button",
"tennis",
"police",
"collection",
"account",
"register",
"glove",
"divide",
"professor",
"chair",
"priority",
"combine",
"peace",
"extension",
"maybe",
"evening",
"frame",
"sister",
"wave",
"code",
"application",
"mouse",
"match",
"counter",
"bottle",
"half",
"cheek",
"resolution",
"back",
"knowledge",
"make",
"discussion",
"screw",
"length",
"accident",
"battle",
"dress",
"knee",
"log",
"package",
"it",
"turn",
"hearing",
"newspaper",
"layer",
"wealth",
"profile",
"imagination",
"answer",
"weekend",
"teacher",
"appearance",
"meet",
"bike",
"rise",
"belt",
"crash",
"bowl",
"equivalent",
"support",
"image",
"poem",
"risk",
"excitement",
"remote",
"secretary",
"public",
"produce",
"plane",
"display",
"money",
"sand",
"situation",
"punch",
"customer",
"title",
"shake",
"mortgage",
"option",
"number",
"pop",
"window",
"extent",
"nothing",
"experience",
"opinion",
"departure",
"dance",
"indication",
"boy",
"material",
"band",
"leader",
"sun",
"beautiful",
"muscle",
"farmer",
"variety",
"fat",
"handle",
"director",
"opportunity",
"calendar",
"outside",
"pace",
"bath",
"fish",
"consequence",
"put",
"owner",
"go",
"doctor",
"information",
"share",
"hurt",
"protection",
"career",
"finance",
"force",
"golf",
"garbage",
"aspect",
"kid",
"food",
"boot",
"milk",
"respond",
"objective",
"reality",
"raw",
"ring",
"mall",
"one",
"impact",
"area",
"news",
"international",
"series",
"impress",
"mother",
"shelter",
"strike",
"loan",
"month",
"seat",
"anything",
"entertainment",
"familiar",
"clue",
"year",
"glad",
"supermarket",
"natural",
"god",
"cost",
"conversation",
"tie",
"ruin",
"comfort",
"earth",
"storm",
"percentage",
"assistance",
"budget",
"strength",
"beginning",
"sleep",
"other",
"young",
"unit",
"fill",
"store",
"desire",
"hide",
"value",
"cup",
"maintenance",
"nurse",
"function",
"tower",
"role",
"class",
"camera",
"database",
"panic",
"nation",
"basket",
"ice",
"art",
"spirit",
"chart",
"exchange",
"feedback",
"statement",
"reputation",
"search",
"hunt",
"exercise",
"nasty",
"notice",
"male",
"yard",
"annual",
"collar",
"date",
"platform",
"plant",
"fortune",
"passion",
"friendship",
"spread",
"cancer",
"ticket",
"attitude",
"island",
"active",
"object",
"service",
"buyer",
"bite",
"card",
"face",
"steak",
"proposal",
"patient",
"heat",
"rule",
"resident",
"broad",
"politics",
"west",
"knife",
"expert",
"girl",
"design",
"salt",
"baseball",
"grab",
"inspection",
"cousin",
"couple",
"magazine",
"cook",
"dependent",
"security",
"chicken",
"version",
"currency",
"ladder",
"scheme",
"kitchen",
"employment",
"local",
"attention",
"manager",
"fact",
"cover",
"sad",
"guard",
"relative",
"county",
"rate",
"lunch",
"program",
"initiative",
"gear",
"bridge",
"breast",
"talk",
"dish",
"guarantee",
"beer",
"vehicle",
"reception",
"woman",
"substance",
"copy",
"lecture",
"advantage",
"park",
"cold",
"death",
"mix",
"hold",
"scale",
"tomorrow",
"blood",
"request",
"green",
"cookie",
"church",
"strip",
"forever",
"beyond",
"debt",
"tackle",
"wash",
"following",
"feel",
"maximum",
"sector",
"sea",
"property",
"economics",
"menu",
"bench",
"try",
"language",
"start",
"call",
"solid",
"address",
"income",
"foot",
"senior",
"honey",
"few",
"mixture",
"cash",
"grocery",
"link",
"map",
"form",
"factor",
"pot",
"model",
"writer",
"farm",
"winter",
"skill",
"anywhere",
"birthday",
"policy",
"release",
"husband",
"lab",
"hurry",
"mail",
"equipment",
"sink",
"pair",
"driver",
"consideration",
"leather",
"skin",
"blue",
"boat",
"sale",
"brick",
"two",
"feed",
"square",
"dot",
"rush",
"dream",
"location",
"afternoon",
"manufacturer",
"control",
"occasion",
"trouble",
"introduction",
"advice",
"bet",
"eat",
"kill",
"category",
"manner",
"office",
"estate",
"pride",
"awareness",
"slip",
"crack",
"client",
"nail",
"shoot",
"membership",
"soft",
"anybody",
"web",
"official",
"individual",
"pizza",
"interest",
"bag",
"spell",
"profession",
"queen",
"deal",
"resource",
"ship",
"guy",
"chocolate",
"joint",
"formal",
"upstairs",
"car",
"resort",
"abroad",
"dealer",
"associate",
"finger",
"surgery",
"comment",
"team",
"detail",
"crazy",
"path",
"tale",
"initial",
"arm",
"radio",
"demand",
"single",
"draw",
"yellow",
"contest",
"piece",
"quote",
"pull",
"commercial",
"shirt",
"contribution",
"cream",
"channel",
"suit",
"discipline",
"instruction",
"concert",
"speech",
"low",
"effective",
"hang",
"scratch",
"industry",
"breakfast",
"lay",
"join",
"metal",
"bedroom",
"minute",
"product",
"rest",
"temperature",
"many",
"give",
"argument",
"print",
"purple",
"laugh",
"health",
"credit",
"investment",
"sell",
"setting",
"lesson",
"egg",
"middle",
"marriage",
"level",
"evidence",
"phrase",
"love",
"self",
"benefit",
"guidance",
"affect",
"you",
"dad",
"anxiety",
"special",
"boyfriend",
"test",
"blank",
"payment",
"soup",
"obligation",
"reply",
"smile",
"deep",
"complaint",
"addition",
"review",
"box",
"towel",
"minor",
"fun",
"soil",
"issue",
"cigarette",
"internet",
"gain",
"tell",
"entry",
"spare",
"incident",
"family",
"refuse",
"branch",
"can",
"pen",
"grandfather",
"constant",
"tank",
"uncle",
"climate",
"ground",
"volume",
"communication",
"kind",
"poet",
"child",
"screen",
"mine",
"quit",
"gene",
"lack",
"charity",
"memory",
"tooth",
"fear",
"mention",
"marketing",
"reveal",
"reason",
"court",
"season",
"freedom",
"land",
"sport",
"audience",
"classroom",
"law",
"hook",
"win",
"carry",
"eye",
"smell",
"distribution",
"research",
"country",
"dare",
"hope",
"whereas",
"stretch",
"library",
"if",
"delay",
"college",
"plastic",
"book",
"present",
"use",
"worry",
"champion",
"goal",
"economy",
"march",
"election",
"reflection",
"midnight",
"slide",
"inflation",
"action",
"challenge",
"guitar",
"coast",
"apple",
"campaign",
"field",
"jacket",
"sense",
"way",
"visual",
"remove",
"weather",
"trash",
"cable",
"regret",
"buddy",
"beach",
"historian",
"courage",
"sympathy",
"truck",
"tension",
"permit",
"nose",
"bed",
"son",
"person",
"base",
"meat",
"usual",
"air",
"meeting",
"worth",
"game",
"independence",
"physical",
"brief",
"play",
"raise",
"board",
"she",
"key",
"writing",
"pick",
"command",
"party",
"yesterday",
"spring",
"candidate",
"physics",
"university",
"concern",
"development",
"change",
"string",
"target",
"instance",
"room",
"bitter",
"bird",
"football",
"normal",
"split",
"impression",
"wood",
"long",
"meaning",
"stock",
"cap",
"leadership",
"media",
"ambition",
"fishing",
"essay",
"salad",
"repair",
"today",
"designer",
"night",
"bank",
"drawing",
"inevitable",
"phase",
"vast",
"chip",
"anger",
"switch",
"cry",
"twist",
"personality",
"attempt",
"storage",
"being",
"preparation",
"bat",
"selection",
"white",
"technology",
"contract",
"side",
"section",
"station",
"till",
"structure",
"tongue",
"taste",
"truth",
"difficulty",
"group",
"limit",
"main",
"move",
"feeling",
"light",
"example",
"mission",
"might",
"wait",
"wheel",
"shop",
"host",
"classic",
"alternative",
"cause",
"agent",
"consist",
"table",
"airline",
"text",
"pool",
"craft",
"range",
"fuel",
"tool",
"partner",
"load",
"entrance",
"deposit",
"hate",
"article",
"video",
"summer",
"feature",
"extreme",
"mobile",
"hospital",
"flight",
"fall",
"pension",
"piano",
"fail",
"result",
"rub",
"gap",
"system",
"report",
"suck",
"ordinary",
"wind",
"nerve",
"ask",
"shine",
"note",
"line",
"mom",
"perception",
"brother",
"reference",
"bend",
"charge",
"treat",
"trick",
"term",
"homework",
"bake",
"bid",
"status",
"project",
"strategy",
"orange",
"let",
"enthusiasm",
"parent",
"concentrate",
"device",
"travel",
"poetry",
"business",
"society",
"kiss",
"end",
"vegetable",
"employ",
"schedule",
"hour",
"brave",
"focus",
"process",
"movie",
"illegal",
"general",
"coffee",
"ad",
"highway",
"chemistry",
"psychology",
"hire",
"bell",
"conference",
"relief",
"show",
"neat",
"funny",
"weight",
"quality",
"club",
"daughter",
"zone",
"touch",
"tonight",
"shock",
"burn",
"excuse",
"name",
"survey",
"landscape",
"advance",
"satisfaction",
"bread",
"disaster",
"item",
"hat",
"prior",
"shopping",
"visit",
"east",
"photo",
"home",
"idea",
"father",
"comparison",
"cat",
"pipe",
"winner",
"count",
"lake",
"fight",
"prize",
"foundation",
"dog",
"keep",
"ideal",
"fan",
"struggle",
"peak",
"safety",
"solution",
"hell",
"conclusion",
"population",
"strain",
"alarm",
"measurement",
"second",
"train",
"race",
"due",
"insurance",
"boss",
"tree",
"monitor",
"sick",
"course",
"drag",
"appointment",
"slice",
"still",
"care",
"patience",
"rich",
"escape",
"emotion",
"royal",
"female",
"childhood",
"government",
"picture",
"will",
"sock",
"big",
"gate",
"oil",
"cross",
"pin",
"improvement",
"championship",
"silly",
"help",
"sky",
"pitch",
"man",
"diamond",
"most",
"transition",
"work",
"science",
"committee",
"moment",
"fix",
"teaching",
"dig",
"specialist",
"complex",
"guide",
"people",
"dead",
"voice",
"original",
"break",
"topic",
"data",
"degree",
"reading",
"recording",
"bunch",
"reach",
"judgment",
"lie",
"regular",
"set",
"painting",
"mode",
"list",
"player",
"bear",
"north",
"wonder",
"carpet",
"heavy",
"officer",
"negative",
"clock",
"unique",
"baby",
"pain",
"assumption",
"disk",
"iron",
"bill",
"drawer",
"look",
"double",
"mistake",
"finish",
"future",
"brilliant",
"contact",
"math",
"rice",
"leave",
"restaurant",
"discount",
"sex",
"virus",
"bit",
"trust",
"event",
"wear",
"juice",
"failure",
"bug",
"context",
"mud",
"whole",
"wrap",
"intention",
"draft",
"pressure",
"cake",
"dark",
"explanation",
"space",
"angle",
"word",
"efficiency",
"management",
"habit",
"star",
"chance",
"finding",
"transportation",
"stand",
"criticism",
"flow",
"door",
"injury",
"insect",
"surprise",
"apartment",
] # pylint: disable=line-too-long
# ISO 639-1 codes to language names.
# Every value is the capitalised English name of the language; "ja" previously
# mapped to lowercase "japanese", unlike all other entries.
LANGUAGE_CODES = immutabledict.immutabledict(
    {
        "en": "English",
        "es": "Spanish",
        "pt": "Portuguese",
        "ar": "Arabic",
        "hi": "Hindi",
        "fr": "French",
        "ru": "Russian",
        "de": "German",
        "ja": "Japanese",
        "it": "Italian",
        "bn": "Bengali",
        "uk": "Ukrainian",
        "th": "Thai",
        "ur": "Urdu",
        "ta": "Tamil",
        "te": "Telugu",
        "bg": "Bulgarian",
        "ko": "Korean",
        "pl": "Polish",
        "he": "Hebrew",
        "fa": "Persian",
        "vi": "Vietnamese",
        "ne": "Nepali",
        "sw": "Swahili",
        "kn": "Kannada",
        "mr": "Marathi",
        "gu": "Gujarati",
        "pa": "Punjabi",
        "ml": "Malayalam",
        "fi": "Finnish",
    }
)
# Regex fragments used by split_into_sentences().
# A single ASCII letter (captured).
_ALPHABETS = "([A-Za-z])"
# Titles whose trailing period does not end a sentence.
_PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]"
# Company / name suffixes that may be followed by a period.
_SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)"
# Words that signal a new sentence when they follow an acronym or suffix.
_STARTERS = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
# Two- or three-letter dotted upper-case acronyms such as "U.S." or "U.S.A.".
_ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
# A period followed by one of these top-level domains.
_WEBSITES = "[.](com|net|org|io|gov|edu|me)"
# A single decimal digit (captured).
_DIGITS = "([0-9])"
# A run of two or more periods (e.g. an ellipsis).
_MULTIPLE_DOTS = r"\.{2,}"


def split_into_sentences(text):
    """Split the text into sentences.

    Periods that should not end a sentence (titles, web domains, decimals,
    initials, acronyms, company suffixes, runs of dots) are temporarily
    replaced by the placeholder ``<prd>``. Every remaining ``.``, ``?`` and
    ``!`` is then followed by a ``<stop>`` marker, the placeholders are turned
    back into periods, and the text is split on ``<stop>``.

    Args:
      text: A string that consists of more than or equal to one sentences.

    Returns:
      A list of strings where each string is a sentence, stripped of
      surrounding whitespace. An empty trailing piece is dropped, so an empty
      input yields an empty list.
    """
    # Pad with spaces so the patterns that require a leading or trailing
    # space also match at the very start and end of the text.
    text = " " + text + " "
    text = text.replace("\n", " ")

    # Protect periods that are part of a token rather than a sentence end.
    text = re.sub(_PREFIXES, r"\1<prd>", text)
    text = re.sub(_WEBSITES, r"<prd>\1", text)
    text = re.sub(_DIGITS + "[.]" + _DIGITS, r"\1<prd>\2", text)
    # A run of dots keeps its length but ends the sentence after the run.
    text = re.sub(
        _MULTIPLE_DOTS,
        lambda match: "<prd>" * len(match.group(0)) + "<stop>",
        text,
    )
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + _ALPHABETS + "[.] ", r" \1<prd> ", text)
    # An acronym followed by a sentence starter does end the sentence.
    text = re.sub(_ACRONYMS + " " + _STARTERS, r"\1<stop> \2", text)
    text = re.sub(
        _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]",
        r"\1<prd>\2<prd>\3<prd>",
        text,
    )
    text = re.sub(
        _ALPHABETS + "[.]" + _ALPHABETS + "[.]", r"\1<prd>\2<prd>", text
    )
    text = re.sub(
        " " + _SUFFIXES + "[.] " + _STARTERS, r" \1<stop> \2", text
    )
    text = re.sub(" " + _SUFFIXES + "[.]", r" \1<prd>", text)
    text = re.sub(" " + _ALPHABETS + "[.]", r" \1<prd>", text)

    # Move terminal punctuation outside a closing quote so that the <stop>
    # marker lands after the quote. str.replace() is already a no-op when the
    # pattern is absent, so no membership guards are needed (the former guard
    # for the curly quote tested for the empty string and was always true).
    # \u201d is the right double quotation mark.
    text = text.replace(".\u201d", "\u201d.")
    text = text.replace('."', '".')
    text = text.replace('!"', '"!')
    text = text.replace('?"', '"?')

    # Mark the real sentence ends, then restore the protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    sentences = [piece.strip() for piece in text.split("<stop>")]
    # Whatever follows the last <stop> is empty when the text ends with
    # punctuation; drop it.
    if sentences and not sentences[-1]:
        sentences = sentences[:-1]
    return sentences
def count_words(text):
    r"""Counts the number of words.

    A word is a maximal run of regex word characters (``\w+``). This is the
    same token set the previous per-call ``nltk`` ``RegexpTokenizer(r"\w+")``
    produced, obtained directly from the standard ``re`` module so no
    tokenizer object has to be built on every call.

    Args:
      text: The string to count words in.

    Returns:
      The number of ``\w+`` matches in ``text``.
    """
    return len(re.findall(r"\w+", text))
def count_chinese_words(text):
    """Return the number of space-separated pieces TokenizerZh yields for text.

    The text is stripped, passed through sacrebleu's ``TokenizerZh`` and the
    tokenized string is split on whitespace.
    """
    # Function-local import: sacrebleu is only loaded when this is called.
    from sacrebleu.tokenizers.tokenizer_zh import TokenizerZh

    segmented = TokenizerZh()(text.strip())
    return len(segmented.split())
def count_japanese_words(text):
    """Return the number of space-separated pieces TokenizerJaMecab yields.

    The text is stripped, passed through sacrebleu's ``TokenizerJaMecab`` and
    the tokenized string is split on whitespace.
    """
    # Function-local import: sacrebleu is only loaded when this is called.
    from sacrebleu.tokenizers.tokenizer_ja_mecab import TokenizerJaMecab

    segmented = TokenizerJaMecab()(text.strip())
    return len(segmented.split())
def count_korean_words(text):
    """Return the number of space-separated pieces TokenizerKoMecab yields.

    The text is stripped, passed through sacrebleu's ``TokenizerKoMecab`` and
    the tokenized string is split on whitespace.
    """
    # Function-local import: sacrebleu is only loaded when this is called.
    from sacrebleu.tokenizers.tokenizer_ko_mecab import TokenizerKoMecab

    segmented = TokenizerKoMecab()(text.strip())
    return len(segmented.split())
def count_words_by_space(text):
    """Return how many whitespace-delimited chunks ``text`` contains."""
    chunks = text.strip().split()
    return len(chunks)
def count_words_by_spm(text):
    """Return the number of space-separated pieces Flores101Tokenizer yields.

    The text is stripped, passed through sacrebleu's ``Flores101Tokenizer``
    and the tokenized string is split on whitespace.
    """
    # Function-local import: sacrebleu is only loaded when this is called.
    from sacrebleu.tokenizers.tokenizer_spm import Flores101Tokenizer

    segmented = Flores101Tokenizer()(text.strip())
    return len(segmented.split())
@functools.lru_cache(maxsize=None)
def _get_sentence_tokenizer():
    """Load the English Punkt tokenizer resource via ``nltk.data.load``.

    The result is memoized by ``lru_cache``, so the resource is loaded once.
    """
    punkt_resource = "nltk:tokenizers/punkt/english.pickle"
    return nltk.data.load(punkt_resource)
def count_sentences(text):
    """Count the number of sentences.

    Uses the cached tokenizer from ``_get_sentence_tokenizer()`` and returns
    the length of the list its ``tokenize`` method produces for ``text``.
    """
    return len(_get_sentence_tokenizer().tokenize(text))
def count_chinese_sentences(text):
    """Count the sentences in Chinese ``text``.

    A line break is inserted after every sentence terminator (the CJK full
    stop, a full-width or ASCII exclamation or question mark, six ASCII dots,
    or a double ellipsis), or after the closing quote that directly follows
    the terminator. The non-empty lines that result are counted.

    Args:
      text: The string to analyse.

    Returns:
      The number of non-empty lines after splitting; 0 for empty or
      whitespace-only text.
    """
    # Non-ASCII punctuation is written with escapes so the character classes
    # cannot be altered by Unicode normalisation of this file:
    #   \u3002 ideographic full stop      \uFF01 full-width exclamation mark
    #   \uFF1F full-width question mark   \uFF0C full-width comma
    #   \u2026 horizontal ellipsis        \u201D right double quotation mark
    #   \u2019 right single quote         \u300D right corner bracket
    terminator = r"[\u3002\uFF01!\uFF1F\?]|\.{6}|\u2026\u2026"

    # Terminator followed by ordinary text: break between them. A following
    # closing quote is excluded here and handled by the second pattern.
    text = re.sub(
        r"(" + terminator + r')\s*([^\u201D\u2019"\s])',
        r"\1\n\2",
        text,
    )
    # Terminator plus closing quote: break after the quote, unless the next
    # character is a comma or another terminator.
    text = re.sub(
        r"((" + terminator + r')\s*[\u201D\u2019"\u300D])'
        r"\s*([^,\uFF0C\u3002!\uFF01\?\uFF1F\s])",
        r"\1\n\3",
        text,
    )
    # Count only lines with content, so empty input and blank lines in the
    # original text are not reported as sentences.
    return sum(1 for line in text.split("\n") if line.strip())
def count_thai_sentences(text):
    """Return the length of the list pythainlp's ``sent_tokenize`` returns."""
    # Function-local import: pythainlp is only loaded when this is called.
    from pythainlp import sent_tokenize

    thai_sentences = sent_tokenize(text)
    return len(thai_sentences)
def count_bengali_sentences(text):
    """Count the sentences in Bengali ``text``.

    A line break is inserted after every sentence terminator (``!``, ``?``,
    the danda, or six ASCII dots), or after the closing quote that directly
    follows the terminator. The non-empty lines that result are counted.

    Args:
      text: The string to analyse.

    Returns:
      The number of non-empty lines after splitting; 0 for empty or
      whitespace-only text.
    """
    # Non-ASCII punctuation is written with escapes:
    #   \u0964 danda   \u201D right double quotation mark
    #   \u2019 right single quotation mark
    terminator = r"[!\?\u0964]|\.{6}"

    # Terminator followed by ordinary text: break between them. A following
    # closing quote is excluded here and handled by the second pattern.
    text = re.sub(
        r"(" + terminator + r')\s*([^\u201D\u2019"\s])',
        r"\1\n\2",
        text,
    )
    # Terminator plus closing quote: break after the quote, unless the next
    # character is a comma or another terminator.
    text = re.sub(
        r"((" + terminator + r')\s*[\u201D\u2019"])'
        r"\s*([^,!\?\u0964\s])",
        r"\1\n\3",
        text,
    )
    # Count only lines with content, so empty input and blank lines in the
    # original text are not reported as sentences.
    return sum(1 for line in text.split("\n") if line.strip())
def count_korean_sentences(text):
    """Return the length of the list ``kss.split_sentences`` returns."""
    # Function-local import: kss is only loaded when this is called.
    import kss

    korean_sentences = kss.split_sentences(text)
    return len(korean_sentences)
def generate_keywords(num_keywords):
    """Randomly generates a few keywords.

    Args:
      num_keywords: How many entries to draw from ``WORD_LIST``.

    Returns:
      A list of ``num_keywords`` entries chosen by ``random.sample`` (that
      is, without reusing a list position).
    """
    chosen = random.sample(WORD_LIST, k=num_keywords)
    return chosen