MaiziXiao 2025-05-27 08:39:17 +00:00
parent f93668337a
commit f92dca7444
32 changed files with 112 additions and 155 deletions

View File

@@ -50,8 +50,9 @@ for m in _origin_models:
datasets = teval_en_datasets + teval_zh_datasets
work_dir = './outputs/teval'
'''
dataset version metric mode qwen-7b-chat-hf internlm2-chat-7b-hf llama-2-7b-chat-hf
"""Dataset version metric mode
qwen-7b-chat-hf internlm2-chat-7b-hf llama-2-7b-chat-hf.
------------------------------------------- --------- -------------- ------- ----------------- ---------------------- --------------------
teval - naive_average unknown 57.69 78.18 36.63
teval-instruct_v1 10482d string_metric unknown 28.83 98.08 50.27
@@ -77,4 +78,4 @@ teval-reason_retrieve_understand_json_v1_zh 10482d name unknown
teval-understand_str_v1_zh 10482d args unknown 84.39 88.62 77.29
teval-reason_retrieve_understand_json_v1_zh 10482d args unknown 48.71 72.71 28.83
teval-review_str_v1_zh 10482d review_quality unknown 56.67 60.57 27.1
'''
"""

View File

@@ -22,14 +22,18 @@ humaneval_multi_eval_cfg = {
pred_role='BOT',
) for lang in ['cpp', 'cs', 'd', 'go', 'java', 'jl', 'js', 'lua', 'php', 'pl', 'py', 'r', 'rb', 'rkt', 'rs', 'scala', 'sh', 'swift', 'ts']
}
"""There are four versions of humaneval-{LANG}-{version}.jsonl:
'''there are four versions of humaneval-{LANG}-{version}.jsonl:
['keep', 'transform', 'reworded', 'remove']
SRCDATA-LANG-keep is the same as SRCDATA-LANG, but the text of the prompt is totally unchanged. If the original prompt had Python doctests, they remain as Python instead of being translated to LANG. If the original prompt had Python-specific terminology, e.g., 'list', it remains 'list', instead of being translated, e.g., to 'vector' for C++.
SRCDATA-LANG-transform transforms the doctests to LANG but leaves the natural language text of the prompt unchanged.
SRCDATA-LANG-reworded transforms both the doctests and the natural language text of the prompt to LANG.
SRCDATA-LANG-remove removes the doctests from the prompt.
'''
['keep', 'transform', 'reworded', 'remove'] SRCDATA-LANG-keep is the same as
SRCDATA-LANG, but the text of the prompt is totally unchanged. If the original
prompt had Python doctests, they remain as Python instead of being translated
to LANG. If the original prompt had Python-specific terminology, e.g., 'list',
it remains 'list', instead of being translated, e.g., to 'vector' for C++.
SRCDATA-LANG-transform transforms the doctests to LANG but leaves the natural
language text of the prompt unchanged. SRCDATA-LANG-reworded transforms both
the doctests and the natural language text of the prompt to LANG. SRCDATA-LANG-
remove removes the doctests from the prompt.
"""
humaneval_multi_datasets = [
dict(
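
The four version suffixes above fan out into one JSONL file per language per
version. A minimal sketch of how those file names compose, assuming a
hypothetical root directory (the language list and version names come from
this config; the path layout does not):

LANGS = ['cpp', 'cs', 'd', 'go', 'java', 'jl', 'js', 'lua', 'php', 'pl',
         'py', 'r', 'rb', 'rkt', 'rs', 'scala', 'sh', 'swift', 'ts']
VERSIONS = ['keep', 'transform', 'reworded', 'remove']

def multi_files(root='data/humaneval_multi'):  # root is an assumption
    """Yield (lang, version, path) for every humaneval-{LANG}-{version}.jsonl."""
    for lang in LANGS:
        for version in VERSIONS:
            yield lang, version, f'{root}/humaneval-{lang}-{version}.jsonl'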

View File

@@ -53,22 +53,18 @@ summarizer = dict(
['sanitized_mbpp', 'score'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
'',
'mmlu',
'mmlu-stem',
'mmlu-social-science',
'mmlu-humanities',
'mmlu-other',
'cmmlu',
'cmmlu-stem',
'cmmlu-social-science',
'cmmlu-humanities',
'cmmlu-other',
'cmmlu-china-specific',
'ceval',
'ceval-stem',
'ceval-social-science',

View File

@@ -157,7 +157,6 @@ summarizer = dict(
'Accuracy Average',
'English Average',
'Chinese Average',
'###### CALM-Lite Errors ######',
'Same response to all questions Average',
'Language inconsistency Average',

View File

@@ -29,14 +29,12 @@ summarizer = dict(
'primary',
'arithmetic',
'mathbench-a (average)',
'###### MathBench-T: Theory Part ######',
'college_knowledge',
'high_knowledge',
'middle_knowledge',
'primary_knowledge',
'mathbench-t (average)',
'###### Overall: Average between MathBench-A and MathBench-T ######',
'Overall',
],

View File

@@ -80,7 +80,6 @@ summarizer = dict(
'mathbench-a-middle-en',
'mathbench-a-primary-en',
'mathbench-a-en-average',
'#########################################################',
'###### MathBench-T-CN: Theory Part (Chinese) ############',
'mathbench-t-college-cn',
@@ -88,14 +87,12 @@ summarizer = dict(
'mathbench-t-middle-cn',
'mathbench-t-primary-cn',
'mathbench-t-cn-average',
'###### MathBench-T-EN: Theory Part (English) ############',
'mathbench-t-college-en',
'mathbench-t-high-en',
'mathbench-t-middle-en',
'mathbench-t-primary-en',
'mathbench-t-en-average',
'#########################################################',
'###### MathBench-CN ############',
'college-cn',
@@ -103,7 +100,6 @@ summarizer = dict(
'middle-cn',
'primary-cn',
'cn-avarage',
'###### MathBench-EN ############',
'college-en',
'high-en',

View File

@@ -320,14 +320,14 @@ class BulletListChecker(Instruction):
def check_following(self, value):
r"""Check if the number of bullet lists meets the requirement.
Args:
value: A string representing the response. The response is expected to
contain some bullet lists that start with `\*`.
Args:
value: A string representing the response. The response is expected to
contain some bullet lists that start with `\*`.
Returns:
True if the actual number of bullet lists in the response meets the
requirement.
"""
Returns:
True if the actual number of bullet lists in the response meets the
requirement.
"""
bullet_lists = re.findall(r'^\s*\*[^\*].*$', value, flags=re.MULTILINE)
bullet_lists_2 = re.findall(r'^\s*-.*$', value, flags=re.MULTILINE)
num_bullet_lists = len(bullet_lists) + len(bullet_lists_2)
@@ -687,14 +687,14 @@ class RephraseChecker(Instruction):
def check_following(self, value):
r"""Checks if the rephrasing follows the instruction.
Args:
value: A string representing the response, which is expected to rephras
the string of `instruction_args`.
Args:
value: A string representing the response, which is expected to rephras
the string of `instruction_args`.
Returns:
True if `value` and `instruction_args` only differ by the words/sentences
in between two asterisks such as *change me*; otherwise, False.
"""
Returns:
True if `value` and `instruction_args` only differ by the words/sentences
in between two asterisks such as *change me*; otherwise, False.
"""
if not self.is_change(value):
raise ValueError(f'value {value} does not contain '
@@ -930,17 +930,17 @@ class ParagraphFirstWordCheck(Instruction):
first_word=None):
r"""Build the instruction description.
Args:
num_paragraphs: An integer indicating the number of paragraphs expected
in the response. A paragraph is a subset of the string that is
expected to be separated by '\n\n'.
nth_paragraph: An integer indicating the paragraph number that we look at.
Note that n starts from 1.
first_word: A string that represent the first word of the bth paragraph.
Args:
num_paragraphs: An integer indicating the number of paragraphs expected
in the response. A paragraph is a subset of the string that is
expected to be separated by '\n\n'.
nth_paragraph: An integer indicating the paragraph number that we look at.
Note that n starts from 1.
first_word: A string that represent the first word of the bth paragraph.
Returns:
A string representing the instruction description.
"""
Returns:
A string representing the instruction description.
"""
self._num_paragraphs = num_paragraphs
if self._num_paragraphs is None or self._num_paragraphs < 0:
self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS)
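
The checkers above share a count-and-compare pattern. A self-contained sketch
of the bullet-list count, reusing the two regexes shown in
BulletListChecker.check_following:

import re

def count_bullet_lists(value: str) -> int:
    # Lines starting with '*' (but not '**'), plus lines starting with '-'.
    starred = re.findall(r'^\s*\*[^\*].*$', value, flags=re.MULTILINE)
    dashed = re.findall(r'^\s*-.*$', value, flags=re.MULTILINE)
    return len(starred) + len(dashed)

assert count_bullet_lists('* first\n- second\nplain text\n') == 2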

View File

@@ -422,11 +422,9 @@ class MathJudger:
expression2: str,
include_percentage: bool = True,
):
"""
(默认 expression1 Ground_Truth)
函数: 判读两个数值是否在误差允许范围内相等
步骤1: 将可能出现的百分号的情况包含进来
步骤2: 使用 math.isclose 函数判断是否相等
"""(默认 expression1 为 Ground_Truth) 函数: 判读两个数值是否在误差允许范围内相等 步骤1:
将可能出现的百分号的情况包含进来 步骤2: 使用 math.isclose 函数判断是否相等.
"""
reference = float(expression1)
prediction = float(expression2)
@@ -442,12 +440,8 @@ class MathJudger:
return False
def expression_equal(self, exp1, exp2):
"""
(默认 expression1 Ground_Truth)
函数: 判断两个表达式是否在数学意义上等价
步骤1: 提取表达式, 防止有的模型会给出"x=1"而不是"1"
步骤2: 使用 sympy 库进行等价判断
"""
"""(默认 expression1 为 Ground_Truth) 函数: 判断两个表达式是否在数学意义上等价 步骤1: 提取表达式,
防止有的模型会给出"x=1"而不是"1" 步骤2: 使用 sympy 库进行等价判断."""
# 只提取等号右边的表达式
def extract_expression(expression):
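
A runnable sketch of the numeric check described above (the tolerance value is
an assumption; the diff does not show it):

import math

def numbers_equal(reference: float, prediction: float,
                  include_percentage: bool = True,
                  rel_tol: float = 1e-4) -> bool:
    # Step 1: a bare "50" may stand for "50%", so also compare against the
    # reference divided and multiplied by 100.
    candidates = ([reference / 100, reference, reference * 100]
                  if include_percentage else [reference])
    # Step 2: math.isclose decides equality within the tolerance.
    return any(math.isclose(c, prediction, rel_tol=rel_tol) for c in candidates)

assert numbers_equal(50.0, 0.5)  # "50" read as 50% equals 0.5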

View File

@@ -25,11 +25,7 @@ langs_dict = {
def extract_choice(gen, lang):
r"""
{
"answer": "A|B|C|D"
}
"""
r"""{ "answer": "A|B|C|D" }"""
patterns = [
r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}",
r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}",

View File

@@ -25,11 +25,7 @@ langs_dict = {
def extract_choice(gen, lang):
r"""
{
"answer": "A|B|C|D"
}
"""
r"""{ "answer": "A|B|C|D" }"""
patterns = [
r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}",
r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}",

View File

@@ -25,11 +25,7 @@ langs_dict = {
def extract_choice(gen, lang):
r"""
{
"answer": "A|B|C|D"
}
"""
r"""{ "answer": "A|B|C|D" }"""
patterns = [
r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}",
r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}",

View File

@@ -26,11 +26,7 @@ langs_dict = {
def extract_choice(gen, lang):
r"""
{
"answer": "A|B|C|D"
}
"""
r"""{ "answer": "A|B|C|D" }"""
patterns = [
r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}",
r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}",

View File

@@ -342,7 +342,7 @@ class Capturing(list):
def run_test(sample, test=None, debug=False):
"""if test(generated_code) is not None it'll try to run the code.
"""If test(generated_code) is not None it'll try to run the code.
otherwise it'll just return an input and output pair.
"""

View File

@@ -66,8 +66,8 @@ def codeexecute_check_correctness(check_program, timeout=3):
"""Evaluates the functional correctness of a completion by running the test
suite provided in the problem.
:param completion_id: an optional completion ID so we can match
the results later even if execution finishes asynchronously.
:param completion_id: an optional completion ID so we can match the results
later even if execution finishes asynchronously.
"""
manager = multiprocessing.Manager()
result = manager.list()
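
The manager list above collects results across a process boundary; a minimal
sketch of the whole pattern (real harnesses also sandbox I/O and builtins
before exec'ing untrusted code; this sketch does not):

import multiprocessing

def _unsafe_execute(program: str, result) -> None:
    try:
        exec(program, {'__name__': '__main__'})
        result.append('passed')
    except BaseException as exc:  # any failure raised by the test suite
        result.append(f'failed: {exc}')

def check_correctness(check_program: str, timeout: float = 3.0) -> bool:
    manager = multiprocessing.Manager()
    result = manager.list()
    proc = multiprocessing.Process(target=_unsafe_execute,
                                   args=(check_program, result))
    proc.start()
    proc.join(timeout + 1)
    if proc.is_alive():
        proc.kill()  # a hung test suite counts as a timeout
    return bool(result) and result[0] == 'passed'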

View File

@@ -135,7 +135,7 @@ def get_generic_question_template_test_completion(question_content,
testcase_input: str):
def format_testcase_func_name_input(function_name, testcase):
"""use the form of "assert func_name(input) == "."""
"""Use the form of "assert func_name(input) == "."""
# TODO should there be a space after the == ?
input_str = ', '.join(testcase.split('\n'))
return f'assert {function_name}({input_str}) == # TODO'
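
For a concrete sense of the output, the helper joins newline-separated
testcase lines into an argument list:

print(format_testcase_func_name_input('add', '1\n2'))
# -> assert add(1, 2) == # TODO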

View File

@@ -79,7 +79,7 @@ def combined_int_check(val):
def run_test(sample, test=None, debug=False, timeout=6):
"""if test(generated_code) is not None it'll try to run the code.
"""If test(generated_code) is not None it'll try to run the code.
otherwise it'll just return an input and output pair.
"""

View File

@@ -512,7 +512,7 @@ class MATHEvaluator(BaseEvaluator):
@ICL_EVALUATORS.register_module()
class MATHAgentEvaluator(MATHEvaluator):
"""math agent evaluator for soft condition.
"""Math agent evaluator for soft condition.
Args:
action (str): Action for catching internal prediction.

View File

@@ -38,7 +38,8 @@ class LogicNodeFactType:
class LogicNodeConstraints:
"""Useful for things like children = ['X is the murderer', 'Y is the murderer', 'Z is the murderer'], we no longer use this structure though."""
"""Useful for things like children = ['X is the murderer', 'Y is the
murderer', 'Z is the murderer'], we no longer use this structure though."""
ONLY_ONE_CAN_BE_TRUE = 'Only one child can be true'
@@ -244,8 +245,10 @@ class LogicTree:
explicit leaf nodes.
:param include_cs: Include the commonsense nodes from all levels.
:param include_deductions_from_level: Include any intermediate deduction nodes from the specified level and deeper.
:param no_facts_after_depth: Essentially tree the deductions at the specified depth as leaf nodes.
:param include_deductions_from_level: Include any intermediate
deduction nodes from the specified level and deeper.
:param no_facts_after_depth: Essentially tree the deductions at the
specified depth as leaf nodes.
"""
def recurse_facts(_node: LogicNode, depth: int = 0) -> List[str]:
@@ -302,16 +305,21 @@ class LogicTree:
However, more complex arguments can be used to control what is printed.
This returns a string that must be printed (don't be confused by the method name.)
This returns a string that must be printed (don't be confused by the
method name.)
:param node: Start at a specific node.
:param level: Controls how much tabbing is done when printing the current node.
:param pad_char: Char to use that specifies depth ('> ' at depth 3 will look like '> > > ' if you have pad_space equal to 1 for example)
:param level: Controls how much tabbing is done when printing the
current node.
:param pad_char: Char to use that specifies depth ('> ' at depth 3 will
look like '> > > ' if you have pad_space equal to 1 for example)
:param pad_space: How many spaces to include between pad_chars
:param print_forward: Print the tree with parent nodes first.
:param print_conjection_types: Print the Ands and Ors per deduction (not used)
:param print_conjection_types: Print the Ands and Ors per deduction
(not used)
:param print_reasoning_types: Print the deduction types (not used)
:param ignore_value_after_depth: Ignore content of the nodes once a depth is met
:param ignore_value_after_depth: Ignore content of the nodes once a
depth is met
:param print_only_nodes_with_value: Ignore nodes without content.
"""

View File

@@ -42,7 +42,8 @@ class AlpacaEvalDataset(BaseDataset):
def post_process_alpacav2(completion: str):
r"""Parse a completion that contains 'm' or 'M' and returns the rank of the model1.
r"""Parse a completion that contains 'm' or 'M' and returns the rank of the
model1.
Examples
--------

View File

@@ -292,7 +292,7 @@ class Capturing(list):
def run_test(sample, test=None, debug=False):
"""if test(generated_code) is not None it'll try to run the code.
"""If test(generated_code) is not None it'll try to run the code.
otherwise it'll just return an input and output pair.
"""

View File

@@ -128,18 +128,11 @@ class ERNIEBot(BaseAPIModel):
str: The generated string.
"""
assert isinstance(input, (str, PromptList))
"""
{
"messages": [
{"role":"user","content":"请介绍一下你自己"},
{"role":"assistant","content":"我是百度公司开发的人工智能语言模型"},
{"role":"user","content": "我在上海,周末可以去哪里玩?"},
{"role":"assistant","content": "上海是一个充满活力和文化氛围的城市"},
{"role":"user","content": "周末这里的天气怎么样?"}
]
}
"""
"""{ "messages": [ {"role":"user","content":"请介绍一下你自己"},
{"role":"assistant","content":"我是百度公司开发的人工智能语言模型"},
{"role":"user","content": "我在上海,周末可以去哪里玩?"},
{"role":"assistant","content": "上海是一个充满活力和文化氛围的城市"},
{"role":"user","content": "周末这里的天气怎么样?"} ] }"""
if isinstance(input, str):
messages = [{'role': 'user', 'content': input}]
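
A sketch of the normalization this branch performs, turning either a plain
string or a PromptList into the "messages" payload shown in the docstring
(the 'role'/'prompt' item keys are assumptions for illustration):

def build_messages(input):
    if isinstance(input, str):
        return [{'role': 'user', 'content': input}]
    role_map = {'HUMAN': 'user', 'BOT': 'assistant'}  # assumed mapping
    return [{'role': role_map.get(item['role'], 'user'),
             'content': item['prompt']} for item in input]

payload = {'messages': build_messages('请介绍一下你自己')}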

View File

@@ -86,17 +86,11 @@ class Qwen(BaseAPIModel):
str: The generated string.
"""
assert isinstance(input, (str, PromptList))
"""
{
"messages": [
{"role":"user","content":"请介绍一下你自己"},
{"role":"assistant","content":"我是通义千问"},
{"role":"user","content": "我在上海,周末可以去哪里玩?"},
{"role":"assistant","content": "上海是一个充满活力和文化氛围的城市"},
{"role":"user","content": "周末这里的天气怎么样?"}
]
}
"""{ "messages": [ {"role":"user","content":"请介绍一下你自己"},
{"role":"assistant","content":"我是通义千问"}, {"role":"user","content":
"我在上海,周末可以去哪里玩?"}, {"role":"assistant","content": "上海是一个充满活力和文化氛围的城市"},
{"role":"user","content": "周末这里的天气怎么样?"} ] }
"""
if isinstance(input, str):

View File

@@ -13,7 +13,7 @@ PromptType = Union[PromptList, str]
def valid_str(string, coding='utf-8'):
"""decode text according to its encoding type."""
"""Decode text according to its encoding type."""
invalid_chars = [b'\xef\xbf\xbd']
bstr = bytes(string, coding)
for invalid_char in invalid_chars:
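
Completed for reference, a sketch of the full function (the replace-with-empty
step is an assumption based on the loop shown above):

def valid_str(string, coding='utf-8'):
    """Decode text according to its encoding type."""
    invalid_chars = [b'\xef\xbf\xbd']  # UTF-8 bytes of U+FFFD REPLACEMENT CHARACTER
    bstr = bytes(string, coding)
    for invalid_char in invalid_chars:
        bstr = bstr.replace(invalid_char, b'')
    return bstr.decode(coding)

assert valid_str('ok\ufffdtext') == 'oktext'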

View File

@@ -10,7 +10,7 @@ PromptType = Union[PromptList, str]
def valid_str(string, coding='utf-8'):
"""decode text according to its encoding type."""
"""Decode text according to its encoding type."""
invalid_chars = [b'\xef\xbf\xbd']
bstr = bytes(string, coding)
for invalid_char in invalid_chars:

View File

@@ -16,7 +16,7 @@ PromptType = Union[PromptList, str]
def valid_str(string, coding='utf-8'):
"""decode text according to its encoding type."""
"""Decode text according to its encoding type."""
invalid_chars = [b'\xef\xbf\xbd']
bstr = bytes(string, coding)
for invalid_char in invalid_chars:

View File

@@ -40,10 +40,7 @@ def get_current_time(format='%Y-%m-%d %H:%M:%S'):
def get_current_timestamp():
"""
获取当前时间时间戳
:return:
"""
"""获取当前时间时间戳 :return:"""
timestamp_str = int(round(time.time() * 1000))
return str(timestamp_str)
@@ -59,10 +56,7 @@ def encode_base64_string(s):
def get_current_time_gmt_format():
"""
获取当前时间的GMT 时间
:return:
"""
"""获取当前时间的GMT 时间 :return:"""
GMT_FORMAT = '%a, %d %b %Y %H:%M:%SGMT+00:00'
now = datetime.now()
time_str = now.strftime(GMT_FORMAT)
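
Both helpers in one short demo (datetime.now() is local time, reproduced as in
the code above even though the format string labels it GMT; note also the
missing space before 'GMT' in the original format string):

import time
from datetime import datetime

ts = str(int(round(time.time() * 1000)))       # millisecond timestamp
GMT_FORMAT = '%a, %d %b %Y %H:%M:%SGMT+00:00'  # verbatim from the code above
time_str = datetime.now().strftime(GMT_FORMAT)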

View File

@@ -12,9 +12,9 @@ from .icl_base_evaluator import BaseEvaluator
class AUCROCEvaluator(BaseEvaluator):
"""Calculate AUC-ROC scores and accuracy according the prediction.
For some dataset, the accuracy cannot reveal the difference between
models because of the saturation. AUC-ROC scores can further exam
model abilities to distinguish different labels. More details can refer to
For some dataset, the accuracy cannot reveal the difference between models
because of the saturation. AUC-ROC scores can further exam model abilities
to distinguish different labels. More details can refer to
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html
""" # noqa

View File

@@ -283,25 +283,19 @@ class MultiModelSummarizer:
self.models_summary_group_metrics[new_model_name] = summarizer.models_summary_group_metrics[new_model_name]
def summarize(self):
"""
Format in self.table
[
['dataset', 'version', 'metric', 'mode', 'model_name'],
['--------- 考试 Exam ---------', '-', '-', '-', '-'],
['ARC-c', '1e0de5', 'accuracy', 'gen', '79.32'],
['ARC-e', '1e0de5', 'accuracy', 'gen', '85.36'],
['--------- 语言 Language ---------', '-', '-', '-', '-'],
['WiC', 'd06864', 'accuracy', 'gen', '55.64'],
['chid-dev', '211ee7', 'accuracy', 'gen', '52.97'],
['--------- 知识 Knowledge ---------', '-', '-', '-', '-'],
['BoolQ', '883d50', 'accuracy', 'gen', '86.06'],
['--------- 理解 Understanding ---------', '-', '-', '-', '-'],
['C3', '8c358f', 'accuracy', 'gen', '88.33'],
['race-middle', '9a54b6', 'accuracy', 'gen', '90.32'],
['--------- 推理 Reasoning ---------', '-', '-', '-', '-'],
['cmnli', '1abf97', 'accuracy', 'gen', '38.26'],
['ocnli', 'c4cb6c', 'accuracy', 'gen', '32.92'],
]
"""Format in self.table [ ['dataset', 'version', 'metric', 'mode',
'model_name'], ['--------- 考试 Exam ---------', '-', '-', '-', '-'],
['ARC-c', '1e0de5', 'accuracy', 'gen', '79.32'], ['ARC-e', '1e0de5',
'accuracy', 'gen', '85.36'], ['--------- 语言 Language ---------', '-',
'-', '-', '-'], ['WiC', 'd06864', 'accuracy', 'gen', '55.64'], ['chid-
dev', '211ee7', 'accuracy', 'gen', '52.97'], ['--------- 知识 Knowledge
---------', '-', '-', '-', '-'], ['BoolQ', '883d50', 'accuracy', 'gen',
'86.06'], ['--------- 理解 Understanding ---------', '-', '-', '-', '-'],
['C3', '8c358f', 'accuracy', 'gen', '88.33'], ['race-middle', '9a54b6',
'accuracy', 'gen', '90.32'], ['--------- 推理 Reasoning ---------', '-',
'-', '-', '-'], ['cmnli', '1abf97', 'accuracy', 'gen', '38.26'],
['ocnli', 'c4cb6c', 'accuracy', 'gen', '32.92'], ]
"""
table = Table()
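
A sketch of rendering that row layout, with header, '-'-padded section
separators, and one row per dataset (column widths here are arbitrary):

header = ['dataset', 'version', 'metric', 'mode', 'model_name']
rows = [
    header,
    ['--------- 考试 Exam ---------', '-', '-', '-', '-'],
    ['ARC-c', '1e0de5', 'accuracy', 'gen', '79.32'],
]
for row in rows:
    print('{:<34}{:<10}{:<10}{:<6}{:<12}'.format(*row))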

View File

@@ -19,7 +19,8 @@ from .utils import get_judgeanswer_and_reference, get_outdir
def post_process_alpacav1(completion: str):
r"""Parse a completion that contains a list of dictionary and returns the rank of the model1.
r"""Parse a completion that contains a list of dictionary and returns the
rank of the model1.
Examples
--------
@@ -47,7 +48,8 @@ def post_process_alpacav1(completion: str):
def post_process_alpacav2(completion: str):
r"""Parse a completion that contains 'm' or 'M' and returns the rank of the model1.
r"""Parse a completion that contains 'm' or 'M' and returns the rank of the
model1.
Examples
--------

View File

@@ -61,7 +61,7 @@ def preprocess_for_elo(df):
def preprocess_for_bt(df):
"""in BT we only need the unique (matchup,outcome) sets along with the
"""In BT we only need the unique (matchup,outcome) sets along with the
weights of how often they occur."""
n_rows = len(df)
# the 3 columns of schedule represent: model_a id, model_b id, outcome_id
@@ -179,7 +179,7 @@ def fit_vectorized_elo(
init_rating: float = 1000.0,
scale: float = 400.0,
):
"""fit multiple sets of Elo ratings on different samples of the data at the
"""Fit multiple sets of Elo ratings on different samples of the data at the
same time."""
alpha = math.log(base) / scale
num_samples = sample_indices.shape[1]
@@ -282,7 +282,7 @@ def scale_and_offset(
baseline_model: str = None,
baseline_rating: float = 1000.0,
):
"""convert ratings from the natural scale to the Elo rating scale with an
"""Convert ratings from the natural scale to the Elo rating scale with an
anchored baseline."""
scaled_ratings = (ratings * scale) + init_rating
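
A sketch of the scale-and-offset step as documented, kept to 1-D ratings for
simplicity (the function body beyond the line shown above is an assumption):

import numpy as np

def scale_and_offset(ratings, models, scale=400.0, init_rating=1000.0,
                     baseline_model=None, baseline_rating=1000.0):
    # Natural (log-odds) scale -> Elo scale.
    scaled = ratings * scale + init_rating
    if baseline_model is not None:
        # Shift all ratings so the baseline model sits at baseline_rating.
        scaled = scaled + (baseline_rating - scaled[models.index(baseline_model)])
    return scaled

print(scale_and_offset(np.array([0.0, 0.25]), ['m_a', 'm_b'],
                       baseline_model='m_a'))  # [1000. 1100.]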

View File

@@ -50,7 +50,6 @@ MAP = {
'language/compassbench_v2501_language_en_nlp_sub',
'language/compassbench_v2501_language_en_creation_sub',
],
'code': [
'总分',
'中文总分',

View File

@@ -15,8 +15,8 @@ class LarkReporter:
title: Optional[str] = None):
"""Post a message to Lark.
When title is None, message must be a str.
otherwise msg can be in rich text format (see
When title is None, message must be a str. otherwise msg can be in rich
text format (see
https://open.feishu.cn/document/uAjLw4CM/ukTMukTMukTM/im-v1/message/create_json#45e0953e
for details).
"""