From f92dca7444dd6969f66e41647b68c0b85ab10944 Mon Sep 17 00:00:00 2001
From: MaiziXiao
Date: Tue, 27 May 2025 08:39:17 +0000
Subject: [PATCH] fix lint

---
 examples/eval_teval.py | 7 +--
 .../humaneval_multi_gen_82cf85.py | 18 ++++---
 opencompass/configs/summarizers/chat_OC15.py | 4 --
 .../configs/summarizers/groups/calm.py | 1 -
 .../summarizers/groups/mathbench_v1_2024.py | 2 -
 .../groups/mathbench_v1_2024_lang.py | 4 --
 opencompass/datasets/IFEval/instructions.py | 48 +++++++++----------
 opencompass/datasets/OlympiadBench.py | 16 ++-----
 opencompass/datasets/PMMEval/mhellaswag.py | 6 +--
 opencompass/datasets/PMMEval/mlogiqa.py | 6 +--
 opencompass/datasets/PMMEval/mmmlu.py | 6 +--
 opencompass/datasets/PMMEval/xnli.py | 6 +--
 opencompass/datasets/apps.py | 2 +-
 .../datasets/livecodebench/execute_utils.py | 4 +-
 opencompass/datasets/livecodebench/prompts.py | 2 +-
 .../datasets/livecodebench/testing_util.py | 2 +-
 opencompass/datasets/math.py | 2 +-
 opencompass/datasets/musr/tree.py | 24 ++++++----
 opencompass/datasets/subjective/alpacaeval.py | 3 +-
 opencompass/datasets/taco.py | 2 +-
 opencompass/models/baidu_api.py | 17 ++-----
 opencompass/models/qwen_api.py | 14 ++----
 opencompass/models/turbomind.py | 2 +-
 opencompass/models/turbomind_api.py | 2 +-
 .../models/turbomind_with_tf_above_v4_33.py | 2 +-
 opencompass/models/yayi_api.py | 10 +---
 .../icl_evaluator/icl_aucroc_evaluator.py | 6 +--
 opencompass/summarizers/multi_model.py | 32 +++++--------
 .../summarizers/subjective/alpacaeval.py | 6 ++-
 .../subjective/compass_arena_bradley_terry.py | 6 +--
 .../summarizers/subjective/compassbench.py | 1 -
 opencompass/utils/lark.py | 4 +-
 32 files changed, 112 insertions(+), 155 deletions(-)

diff --git a/examples/eval_teval.py b/examples/eval_teval.py
index 75312fc6..d786c03a 100644
--- a/examples/eval_teval.py
+++ b/examples/eval_teval.py
@@ -50,8 +50,9 @@ for m in _origin_models:
 datasets = teval_en_datasets + teval_zh_datasets

 work_dir = './outputs/teval'

-'''
-dataset version metric mode qwen-7b-chat-hf internlm2-chat-7b-hf llama-2-7b-chat-hf
+"""
+dataset version metric mode qwen-7b-chat-hf internlm2-chat-7b-hf llama-2-7b-chat-hf
+ ------------------------------------------- --------- -------------- ------- ----------------- ---------------------- -------------------- teval - naive_average unknown 57.69 78.18 36.63 teval-instruct_v1 10482d string_metric unknown 28.83 98.08 50.27 @@ -77,4 +78,4 @@ teval-reason_retrieve_understand_json_v1_zh 10482d name unknown teval-understand_str_v1_zh 10482d args unknown 84.39 88.62 77.29 teval-reason_retrieve_understand_json_v1_zh 10482d args unknown 48.71 72.71 28.83 teval-review_str_v1_zh 10482d review_quality unknown 56.67 60.57 27.1 -''' +""" diff --git a/opencompass/configs/datasets/humaneval_multi/humaneval_multi_gen_82cf85.py b/opencompass/configs/datasets/humaneval_multi/humaneval_multi_gen_82cf85.py index 8063ee2d..7be1a235 100644 --- a/opencompass/configs/datasets/humaneval_multi/humaneval_multi_gen_82cf85.py +++ b/opencompass/configs/datasets/humaneval_multi/humaneval_multi_gen_82cf85.py @@ -22,14 +22,18 @@ humaneval_multi_eval_cfg = { pred_role='BOT', ) for lang in ['cpp', 'cs', 'd', 'go', 'java', 'jl', 'js', 'lua', 'php', 'pl', 'py', 'r', 'rb', 'rkt', 'rs', 'scala', 'sh', 'swift', 'ts'] } +"""There are four versions of humaneval-{LANG}-{version}.jsonl: -'''there are four versions of humaneval-{LANG}-{version}.jsonl: -['keep', 'transform', 'reworded', 'remove'] -SRCDATA-LANG-keep is the same as SRCDATA-LANG, but the text of the prompt is totally unchanged. If the original prompt had Python doctests, they remain as Python instead of being translated to LANG. If the original prompt had Python-specific terminology, e.g., 'list', it remains 'list', instead of being translated, e.g., to 'vector' for C++. -SRCDATA-LANG-transform transforms the doctests to LANG but leaves the natural language text of the prompt unchanged. -SRCDATA-LANG-reworded transforms both the doctests and the natural language text of the prompt to LANG. -SRCDATA-LANG-remove removes the doctests from the prompt. -''' +['keep', 'transform', 'reworded', 'remove'] SRCDATA-LANG-keep is the same as +SRCDATA-LANG, but the text of the prompt is totally unchanged. If the original +prompt had Python doctests, they remain as Python instead of being translated +to LANG. If the original prompt had Python-specific terminology, e.g., 'list', +it remains 'list', instead of being translated, e.g., to 'vector' for C++. +SRCDATA-LANG-transform transforms the doctests to LANG but leaves the natural +language text of the prompt unchanged. SRCDATA-LANG-reworded transforms both +the doctests and the natural language text of the prompt to LANG. SRCDATA-LANG- +remove removes the doctests from the prompt. 
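For reference, the config above builds one evaluation entry per target language with a single comprehension over the 19 languages. A minimal sketch of that pattern, with `make_eval_cfg` as a hypothetical stand-in for the evaluator dict assembled in the real config (which also wires in the post-processor and pred_role='BOT'):

LANGS = ['cpp', 'cs', 'd', 'go', 'java', 'jl', 'js', 'lua', 'php', 'pl',
         'py', 'r', 'rb', 'rkt', 'rs', 'scala', 'sh', 'swift', 'ts']

def make_eval_cfg(lang):
    # Hypothetical helper: stands in for the evaluator dict built per
    # language in the config above.
    return {'evaluator': {'language': lang}, 'pred_role': 'BOT'}

humaneval_multi_eval_cfg = {lang: make_eval_cfg(lang) for lang in LANGS}
assert len(humaneval_multi_eval_cfg) == 19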
+""" humaneval_multi_datasets = [ dict( diff --git a/opencompass/configs/summarizers/chat_OC15.py b/opencompass/configs/summarizers/chat_OC15.py index 7a02e33e..c2e85263 100644 --- a/opencompass/configs/summarizers/chat_OC15.py +++ b/opencompass/configs/summarizers/chat_OC15.py @@ -53,22 +53,18 @@ summarizer = dict( ['sanitized_mbpp', 'score'], ['GPQA_diamond', 'accuracy'], ['IFEval', 'Prompt-level-strict-accuracy'], - '', - 'mmlu', 'mmlu-stem', 'mmlu-social-science', 'mmlu-humanities', 'mmlu-other', - 'cmmlu', 'cmmlu-stem', 'cmmlu-social-science', 'cmmlu-humanities', 'cmmlu-other', 'cmmlu-china-specific', - 'ceval', 'ceval-stem', 'ceval-social-science', diff --git a/opencompass/configs/summarizers/groups/calm.py b/opencompass/configs/summarizers/groups/calm.py index b497eef5..9f52aeea 100644 --- a/opencompass/configs/summarizers/groups/calm.py +++ b/opencompass/configs/summarizers/groups/calm.py @@ -157,7 +157,6 @@ summarizer = dict( 'Accuracy Average', 'English Average', 'Chinese Average', - '###### CALM-Lite Errors ######', 'Same response to all questions Average', 'Language inconsistency Average', diff --git a/opencompass/configs/summarizers/groups/mathbench_v1_2024.py b/opencompass/configs/summarizers/groups/mathbench_v1_2024.py index c34df7d7..5a77acc3 100644 --- a/opencompass/configs/summarizers/groups/mathbench_v1_2024.py +++ b/opencompass/configs/summarizers/groups/mathbench_v1_2024.py @@ -29,14 +29,12 @@ summarizer = dict( 'primary', 'arithmetic', 'mathbench-a (average)', - '###### MathBench-T: Theory Part ######', 'college_knowledge', 'high_knowledge', 'middle_knowledge', 'primary_knowledge', 'mathbench-t (average)', - '###### Overall: Average between MathBench-A and MathBench-T ######', 'Overall', ], diff --git a/opencompass/configs/summarizers/groups/mathbench_v1_2024_lang.py b/opencompass/configs/summarizers/groups/mathbench_v1_2024_lang.py index 0495b106..5c17c1cd 100644 --- a/opencompass/configs/summarizers/groups/mathbench_v1_2024_lang.py +++ b/opencompass/configs/summarizers/groups/mathbench_v1_2024_lang.py @@ -80,7 +80,6 @@ summarizer = dict( 'mathbench-a-middle-en', 'mathbench-a-primary-en', 'mathbench-a-en-average', - '#########################################################', '###### MathBench-T-CN: Theory Part (Chinese) ############', 'mathbench-t-college-cn', @@ -88,14 +87,12 @@ summarizer = dict( 'mathbench-t-middle-cn', 'mathbench-t-primary-cn', 'mathbench-t-cn-average', - '###### MathBench-T-EN: Theory Part (English) ############', 'mathbench-t-college-en', 'mathbench-t-high-en', 'mathbench-t-middle-en', 'mathbench-t-primary-en', 'mathbench-t-en-average', - '#########################################################', '###### MathBench-CN ############', 'college-cn', @@ -103,7 +100,6 @@ summarizer = dict( 'middle-cn', 'primary-cn', 'cn-avarage', - '###### MathBench-EN ############', 'college-en', 'high-en', diff --git a/opencompass/datasets/IFEval/instructions.py b/opencompass/datasets/IFEval/instructions.py index c1c05bc0..5287556e 100644 --- a/opencompass/datasets/IFEval/instructions.py +++ b/opencompass/datasets/IFEval/instructions.py @@ -320,14 +320,14 @@ class BulletListChecker(Instruction): def check_following(self, value): r"""Check if the number of bullet lists meets the requirement. - Args: - value: A string representing the response. The response is expected to - contain some bullet lists that start with `\*`. + Args: + value: A string representing the response. The response is expected to + contain some bullet lists that start with `\*`. 
- Returns: - True if the actual number of bullet lists in the response meets the - requirement. - """ + Returns: + True if the actual number of bullet lists in the response meets the + requirement. + """ bullet_lists = re.findall(r'^\s*\*[^\*].*$', value, flags=re.MULTILINE) bullet_lists_2 = re.findall(r'^\s*-.*$', value, flags=re.MULTILINE) num_bullet_lists = len(bullet_lists) + len(bullet_lists_2) @@ -687,14 +687,14 @@ class RephraseChecker(Instruction): def check_following(self, value): r"""Checks if the rephrasing follows the instruction. - Args: - value: A string representing the response, which is expected to rephras - the string of `instruction_args`. + Args: + value: A string representing the response, which is expected to rephras + the string of `instruction_args`. - Returns: - True if `value` and `instruction_args` only differ by the words/sentences - in between two asterisks such as *change me*; otherwise, False. - """ + Returns: + True if `value` and `instruction_args` only differ by the words/sentences + in between two asterisks such as *change me*; otherwise, False. + """ if not self.is_change(value): raise ValueError(f'value {value} does not contain ' @@ -930,17 +930,17 @@ class ParagraphFirstWordCheck(Instruction): first_word=None): r"""Build the instruction description. - Args: - num_paragraphs: An integer indicating the number of paragraphs expected - in the response. A paragraph is a subset of the string that is - expected to be separated by '\n\n'. - nth_paragraph: An integer indicating the paragraph number that we look at. - Note that n starts from 1. - first_word: A string that represent the first word of the bth paragraph. + Args: + num_paragraphs: An integer indicating the number of paragraphs expected + in the response. A paragraph is a subset of the string that is + expected to be separated by '\n\n'. + nth_paragraph: An integer indicating the paragraph number that we look at. + Note that n starts from 1. + first_word: A string that represent the first word of the bth paragraph. - Returns: - A string representing the instruction description. - """ + Returns: + A string representing the instruction description. + """ self._num_paragraphs = num_paragraphs if self._num_paragraphs is None or self._num_paragraphs < 0: self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS) diff --git a/opencompass/datasets/OlympiadBench.py b/opencompass/datasets/OlympiadBench.py index 6fda55a5..2b4cb984 100644 --- a/opencompass/datasets/OlympiadBench.py +++ b/opencompass/datasets/OlympiadBench.py @@ -422,11 +422,9 @@ class MathJudger: expression2: str, include_percentage: bool = True, ): - """ - (默认 expression1 为 Ground_Truth) - 函数: 判读两个数值是否在误差允许范围内相等 - 步骤1: 将可能出现的百分号的情况包含进来 - 步骤2: 使用 math.isclose 函数判断是否相等 + """(默认 expression1 为 Ground_Truth) 函数: 判读两个数值是否在误差允许范围内相等 步骤1: + + 将可能出现的百分号的情况包含进来 步骤2: 使用 math.isclose 函数判断是否相等. 
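The docstring above is in Chinese; it says: expression1 is treated as the ground truth, and the function judges whether the two values are equal within the allowed error. Step 1 covers the case where one side may carry a percent sign; step 2 compares with math.isclose. A sketch under those assumptions (the tolerance default is illustrative; the real method uses the judger's configured precision):

import math

def numerical_equal(expression1: str, expression2: str,
                    include_percentage: bool = True,
                    precision: float = 1e-8) -> bool:
    # expression1 is the ground truth; expression2 is the model prediction.
    reference = float(expression1)
    prediction = float(expression2)
    # Step 1: the reference may have been stated as a percentage.
    candidates = ([reference / 100, reference, reference * 100]
                  if include_percentage else [reference])
    # Step 2: accept if any candidate is close enough to the prediction.
    return any(math.isclose(c, prediction, rel_tol=precision)
               for c in candidates)

assert numerical_equal('0.5', '50', include_percentage=True)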
""" reference = float(expression1) prediction = float(expression2) @@ -442,12 +440,8 @@ class MathJudger: return False def expression_equal(self, exp1, exp2): - """ - (默认 expression1 为 Ground_Truth) - 函数: 判断两个表达式是否在数学意义上等价 - 步骤1: 提取表达式, 防止有的模型会给出"x=1"而不是"1" - 步骤2: 使用 sympy 库进行等价判断 - """ + """(默认 expression1 为 Ground_Truth) 函数: 判断两个表达式是否在数学意义上等价 步骤1: 提取表达式, + 防止有的模型会给出"x=1"而不是"1" 步骤2: 使用 sympy 库进行等价判断.""" # 只提取等号右边的表达式 def extract_expression(expression): diff --git a/opencompass/datasets/PMMEval/mhellaswag.py b/opencompass/datasets/PMMEval/mhellaswag.py index 75e8a5bb..b23f3e2e 100755 --- a/opencompass/datasets/PMMEval/mhellaswag.py +++ b/opencompass/datasets/PMMEval/mhellaswag.py @@ -25,11 +25,7 @@ langs_dict = { def extract_choice(gen, lang): - r""" - { - "answer": "A|B|C|D" - } - """ + r"""{ "answer": "A|B|C|D" }""" patterns = [ r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}", r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}", diff --git a/opencompass/datasets/PMMEval/mlogiqa.py b/opencompass/datasets/PMMEval/mlogiqa.py index 089759c6..23b7d9c9 100755 --- a/opencompass/datasets/PMMEval/mlogiqa.py +++ b/opencompass/datasets/PMMEval/mlogiqa.py @@ -25,11 +25,7 @@ langs_dict = { def extract_choice(gen, lang): - r""" - { - "answer": "A|B|C|D" - } - """ + r"""{ "answer": "A|B|C|D" }""" patterns = [ r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}", r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}", diff --git a/opencompass/datasets/PMMEval/mmmlu.py b/opencompass/datasets/PMMEval/mmmlu.py index a71ab5c5..77ca561c 100755 --- a/opencompass/datasets/PMMEval/mmmlu.py +++ b/opencompass/datasets/PMMEval/mmmlu.py @@ -25,11 +25,7 @@ langs_dict = { def extract_choice(gen, lang): - r""" - { - "answer": "A|B|C|D" - } - """ + r"""{ "answer": "A|B|C|D" }""" patterns = [ r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}", r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}", diff --git a/opencompass/datasets/PMMEval/xnli.py b/opencompass/datasets/PMMEval/xnli.py index 33afa411..68268e70 100755 --- a/opencompass/datasets/PMMEval/xnli.py +++ b/opencompass/datasets/PMMEval/xnli.py @@ -26,11 +26,7 @@ langs_dict = { def extract_choice(gen, lang): - r""" - { - "answer": "A|B|C|D" - } - """ + r"""{ "answer": "A|B|C|D" }""" patterns = [ r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}", r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}", diff --git a/opencompass/datasets/apps.py b/opencompass/datasets/apps.py index f8ea0ec2..59d336f0 100644 --- a/opencompass/datasets/apps.py +++ b/opencompass/datasets/apps.py @@ -342,7 +342,7 @@ class Capturing(list): def run_test(sample, test=None, debug=False): - """if test(generated_code) is not None it'll try to run the code. + """If test(generated_code) is not None it'll try to run the code. otherwise it'll just return an input and output pair. """ diff --git a/opencompass/datasets/livecodebench/execute_utils.py b/opencompass/datasets/livecodebench/execute_utils.py index c6af4e1c..dd84fae7 100644 --- a/opencompass/datasets/livecodebench/execute_utils.py +++ b/opencompass/datasets/livecodebench/execute_utils.py @@ -66,8 +66,8 @@ def codeexecute_check_correctness(check_program, timeout=3): """Evaluates the functional correctness of a completion by running the test suite provided in the problem. - :param completion_id: an optional completion ID so we can match - the results later even if execution finishes asynchronously. 
+ :param completion_id: an optional completion ID so we can match the results + later even if execution finishes asynchronously. """ manager = multiprocessing.Manager() result = manager.list() diff --git a/opencompass/datasets/livecodebench/prompts.py b/opencompass/datasets/livecodebench/prompts.py index 406ff1a3..4fa817b7 100644 --- a/opencompass/datasets/livecodebench/prompts.py +++ b/opencompass/datasets/livecodebench/prompts.py @@ -135,7 +135,7 @@ def get_generic_question_template_test_completion(question_content, testcase_input: str): def format_testcase_func_name_input(function_name, testcase): - """use the form of "assert func_name(input) == ".""" + """Use the form of "assert func_name(input) == ".""" # TODO should there be a space after the == ? input_str = ', '.join(testcase.split('\n')) return f'assert {function_name}({input_str}) == # TODO' diff --git a/opencompass/datasets/livecodebench/testing_util.py b/opencompass/datasets/livecodebench/testing_util.py index 6583b17b..8ce38adf 100644 --- a/opencompass/datasets/livecodebench/testing_util.py +++ b/opencompass/datasets/livecodebench/testing_util.py @@ -79,7 +79,7 @@ def combined_int_check(val): def run_test(sample, test=None, debug=False, timeout=6): - """if test(generated_code) is not None it'll try to run the code. + """If test(generated_code) is not None it'll try to run the code. otherwise it'll just return an input and output pair. """ diff --git a/opencompass/datasets/math.py b/opencompass/datasets/math.py index 674f1b39..9001833d 100644 --- a/opencompass/datasets/math.py +++ b/opencompass/datasets/math.py @@ -512,7 +512,7 @@ class MATHEvaluator(BaseEvaluator): @ICL_EVALUATORS.register_module() class MATHAgentEvaluator(MATHEvaluator): - """math agent evaluator for soft condition. + """Math agent evaluator for soft condition. Args: action (str): Action for catching internal prediction. diff --git a/opencompass/datasets/musr/tree.py b/opencompass/datasets/musr/tree.py index 5d80618f..0733f0a7 100644 --- a/opencompass/datasets/musr/tree.py +++ b/opencompass/datasets/musr/tree.py @@ -38,7 +38,8 @@ class LogicNodeFactType: class LogicNodeConstraints: - """Useful for things like children = ['X is the murderer', 'Y is the murderer', 'Z is the murderer'], we no longer use this structure though.""" + """Useful for things like children = ['X is the murderer', 'Y is the + murderer', 'Z is the murderer'], we no longer use this structure though.""" ONLY_ONE_CAN_BE_TRUE = 'Only one child can be true' @@ -244,8 +245,10 @@ class LogicTree: explicit leaf nodes. :param include_cs: Include the commonsense nodes from all levels. - :param include_deductions_from_level: Include any intermediate deduction nodes from the specified level and deeper. - :param no_facts_after_depth: Essentially tree the deductions at the specified depth as leaf nodes. + :param include_deductions_from_level: Include any intermediate + deduction nodes from the specified level and deeper. + :param no_facts_after_depth: Essentially tree the deductions at the + specified depth as leaf nodes. """ def recurse_facts(_node: LogicNode, depth: int = 0) -> List[str]: @@ -302,16 +305,21 @@ class LogicTree: However, more complex arguments can be used to control what is printed. - This returns a string that must be printed (don't be confused by the method name.) + This returns a string that must be printed (don't be confused by the + method name.) :param node: Start at a specific node. - :param level: Controls how much tabbing is done when printing the current node. 
- :param pad_char: Char to use that specifies depth ('> ' at depth 3 will look like '> > > ' if you have pad_space equal to 1 for example) + :param level: Controls how much tabbing is done when printing the + current node. + :param pad_char: Char to use that specifies depth ('> ' at depth 3 will + look like '> > > ' if you have pad_space equal to 1 for example) :param pad_space: How many spaces to include between pad_chars :param print_forward: Print the tree with parent nodes first. - :param print_conjection_types: Print the Ands and Ors per deduction (not used) + :param print_conjection_types: Print the Ands and Ors per deduction + (not used) :param print_reasoning_types: Print the deduction types (not used) - :param ignore_value_after_depth: Ignore content of the nodes once a depth is met + :param ignore_value_after_depth: Ignore content of the nodes once a + depth is met :param print_only_nodes_with_value: Ignore nodes without content. """ diff --git a/opencompass/datasets/subjective/alpacaeval.py b/opencompass/datasets/subjective/alpacaeval.py index 30e72bdd..67a49478 100644 --- a/opencompass/datasets/subjective/alpacaeval.py +++ b/opencompass/datasets/subjective/alpacaeval.py @@ -42,7 +42,8 @@ class AlpacaEvalDataset(BaseDataset): def post_process_alpacav2(completion: str): - r"""Parse a completion that contains 'm' or 'M' and returns the rank of the model1. + r"""Parse a completion that contains 'm' or 'M' and returns the rank of the + model1. Examples -------- diff --git a/opencompass/datasets/taco.py b/opencompass/datasets/taco.py index e48cd260..a08f33f0 100644 --- a/opencompass/datasets/taco.py +++ b/opencompass/datasets/taco.py @@ -292,7 +292,7 @@ class Capturing(list): def run_test(sample, test=None, debug=False): - """if test(generated_code) is not None it'll try to run the code. + """If test(generated_code) is not None it'll try to run the code. otherwise it'll just return an input and output pair. """ diff --git a/opencompass/models/baidu_api.py b/opencompass/models/baidu_api.py index f1a126ef..9becf018 100644 --- a/opencompass/models/baidu_api.py +++ b/opencompass/models/baidu_api.py @@ -128,18 +128,11 @@ class ERNIEBot(BaseAPIModel): str: The generated string. """ assert isinstance(input, (str, PromptList)) - """ - { - "messages": [ - {"role":"user","content":"请介绍一下你自己"}, - {"role":"assistant","content":"我是百度公司开发的人工智能语言模型"}, - {"role":"user","content": "我在上海,周末可以去哪里玩?"}, - {"role":"assistant","content": "上海是一个充满活力和文化氛围的城市"}, - {"role":"user","content": "周末这里的天气怎么样?"} - ] - } - - """ + """{ "messages": [ {"role":"user","content":"请介绍一下你自己"}, + {"role":"assistant","content":"我是百度公司开发的人工智能语言模型"}, + {"role":"user","content": "我在上海,周末可以去哪里玩?"}, + {"role":"assistant","content": "上海是一个充满活力和文化氛围的城市"}, + {"role":"user","content": "周末这里的天气怎么样?"} ] }""" if isinstance(input, str): messages = [{'role': 'user', 'content': input}] diff --git a/opencompass/models/qwen_api.py b/opencompass/models/qwen_api.py index d22c0785..36648383 100644 --- a/opencompass/models/qwen_api.py +++ b/opencompass/models/qwen_api.py @@ -86,17 +86,11 @@ class Qwen(BaseAPIModel): str: The generated string. 
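Both this Qwen client and the ERNIEBot client above convert a plain string or a PromptList into the role/content message list illustrated in the docstring below. A sketch of that conversion; the 'HUMAN'/'BOT' item keys are an assumption based on the surrounding model classes:

def to_messages(input) -> list:
    # A bare string becomes a single-turn user message.
    if isinstance(input, str):
        return [{'role': 'user', 'content': input}]
    # Otherwise map each PromptList item onto the API's role names.
    role_map = {'HUMAN': 'user', 'BOT': 'assistant', 'SYSTEM': 'system'}
    return [{'role': role_map[item['role']], 'content': item['prompt']}
            for item in input]

# '请介绍一下你自己' means 'please introduce yourself', as in the sample payload.
assert to_messages('请介绍一下你自己') == [{'role': 'user',
                                    'content': '请介绍一下你自己'}]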
""" assert isinstance(input, (str, PromptList)) - """ - { - "messages": [ - {"role":"user","content":"请介绍一下你自己"}, - {"role":"assistant","content":"我是通义千问"}, - {"role":"user","content": "我在上海,周末可以去哪里玩?"}, - {"role":"assistant","content": "上海是一个充满活力和文化氛围的城市"}, - {"role":"user","content": "周末这里的天气怎么样?"} - ] - } + """{ "messages": [ {"role":"user","content":"请介绍一下你自己"}, + {"role":"assistant","content":"我是通义千问"}, {"role":"user","content": + "我在上海,周末可以去哪里玩?"}, {"role":"assistant","content": "上海是一个充满活力和文化氛围的城市"}, + {"role":"user","content": "周末这里的天气怎么样?"} ] } """ if isinstance(input, str): diff --git a/opencompass/models/turbomind.py b/opencompass/models/turbomind.py index 92cd0950..c8c777f2 100644 --- a/opencompass/models/turbomind.py +++ b/opencompass/models/turbomind.py @@ -13,7 +13,7 @@ PromptType = Union[PromptList, str] def valid_str(string, coding='utf-8'): - """decode text according to its encoding type.""" + """Decode text according to its encoding type.""" invalid_chars = [b'\xef\xbf\xbd'] bstr = bytes(string, coding) for invalid_char in invalid_chars: diff --git a/opencompass/models/turbomind_api.py b/opencompass/models/turbomind_api.py index a7dc0f85..32d9aec4 100644 --- a/opencompass/models/turbomind_api.py +++ b/opencompass/models/turbomind_api.py @@ -10,7 +10,7 @@ PromptType = Union[PromptList, str] def valid_str(string, coding='utf-8'): - """decode text according to its encoding type.""" + """Decode text according to its encoding type.""" invalid_chars = [b'\xef\xbf\xbd'] bstr = bytes(string, coding) for invalid_char in invalid_chars: diff --git a/opencompass/models/turbomind_with_tf_above_v4_33.py b/opencompass/models/turbomind_with_tf_above_v4_33.py index cbf14263..1d481aea 100644 --- a/opencompass/models/turbomind_with_tf_above_v4_33.py +++ b/opencompass/models/turbomind_with_tf_above_v4_33.py @@ -16,7 +16,7 @@ PromptType = Union[PromptList, str] def valid_str(string, coding='utf-8'): - """decode text according to its encoding type.""" + """Decode text according to its encoding type.""" invalid_chars = [b'\xef\xbf\xbd'] bstr = bytes(string, coding) for invalid_char in invalid_chars: diff --git a/opencompass/models/yayi_api.py b/opencompass/models/yayi_api.py index 73ba3ea0..1b2a5ecb 100644 --- a/opencompass/models/yayi_api.py +++ b/opencompass/models/yayi_api.py @@ -40,10 +40,7 @@ def get_current_time(format='%Y-%m-%d %H:%M:%S'): def get_current_timestamp(): - """ - 获取当前时间时间戳 - :return: - """ + """获取当前时间时间戳 :return:""" timestamp_str = int(round(time.time() * 1000)) return str(timestamp_str) @@ -59,10 +56,7 @@ def encode_base64_string(s): def get_current_time_gmt_format(): - """ - 获取当前时间的GMT 时间 - :return: - """ + """获取当前时间的GMT 时间 :return:""" GMT_FORMAT = '%a, %d %b %Y %H:%M:%SGMT+00:00' now = datetime.now() time_str = now.strftime(GMT_FORMAT) diff --git a/opencompass/openicl/icl_evaluator/icl_aucroc_evaluator.py b/opencompass/openicl/icl_evaluator/icl_aucroc_evaluator.py index 4e86789d..0e12de9e 100644 --- a/opencompass/openicl/icl_evaluator/icl_aucroc_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_aucroc_evaluator.py @@ -12,9 +12,9 @@ from .icl_base_evaluator import BaseEvaluator class AUCROCEvaluator(BaseEvaluator): """Calculate AUC-ROC scores and accuracy according the prediction. - For some dataset, the accuracy cannot reveal the difference between - models because of the saturation. AUC-ROC scores can further exam - model abilities to distinguish different labels. 
More details can refer to + For some dataset, the accuracy cannot reveal the difference between models + because of the saturation. AUC-ROC scores can further exam model abilities + to distinguish different labels. More details can refer to https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html """ # noqa diff --git a/opencompass/summarizers/multi_model.py b/opencompass/summarizers/multi_model.py index 45d159d5..c5a0d64f 100644 --- a/opencompass/summarizers/multi_model.py +++ b/opencompass/summarizers/multi_model.py @@ -283,25 +283,19 @@ class MultiModelSummarizer: self.models_summary_group_metrics[new_model_name] = summarizer.models_summary_group_metrics[new_model_name] def summarize(self): - """ - Format in self.table - [ - ['dataset', 'version', 'metric', 'mode', 'model_name'], - ['--------- 考试 Exam ---------', '-', '-', '-', '-'], - ['ARC-c', '1e0de5', 'accuracy', 'gen', '79.32'], - ['ARC-e', '1e0de5', 'accuracy', 'gen', '85.36'], - ['--------- 语言 Language ---------', '-', '-', '-', '-'], - ['WiC', 'd06864', 'accuracy', 'gen', '55.64'], - ['chid-dev', '211ee7', 'accuracy', 'gen', '52.97'], - ['--------- 知识 Knowledge ---------', '-', '-', '-', '-'], - ['BoolQ', '883d50', 'accuracy', 'gen', '86.06'], - ['--------- 理解 Understanding ---------', '-', '-', '-', '-'], - ['C3', '8c358f', 'accuracy', 'gen', '88.33'], - ['race-middle', '9a54b6', 'accuracy', 'gen', '90.32'], - ['--------- 推理 Reasoning ---------', '-', '-', '-', '-'], - ['cmnli', '1abf97', 'accuracy', 'gen', '38.26'], - ['ocnli', 'c4cb6c', 'accuracy', 'gen', '32.92'], - ] + """Format in self.table [ ['dataset', 'version', 'metric', 'mode', + 'model_name'], ['--------- 考试 Exam ---------', '-', '-', '-', '-'], + ['ARC-c', '1e0de5', 'accuracy', 'gen', '79.32'], ['ARC-e', '1e0de5', + 'accuracy', 'gen', '85.36'], ['--------- 语言 Language ---------', '-', + '-', '-', '-'], ['WiC', 'd06864', 'accuracy', 'gen', '55.64'], ['chid- + + dev', '211ee7', 'accuracy', 'gen', '52.97'], ['--------- 知识 Knowledge + ---------', '-', '-', '-', '-'], ['BoolQ', '883d50', 'accuracy', 'gen', + '86.06'], ['--------- 理解 Understanding ---------', '-', '-', '-', '-'], + ['C3', '8c358f', 'accuracy', 'gen', '88.33'], ['race-middle', '9a54b6', + 'accuracy', 'gen', '90.32'], ['--------- 推理 Reasoning ---------', '-', + '-', '-', '-'], ['cmnli', '1abf97', 'accuracy', 'gen', '38.26'], + ['ocnli', 'c4cb6c', 'accuracy', 'gen', '32.92'], ] """ table = Table() diff --git a/opencompass/summarizers/subjective/alpacaeval.py b/opencompass/summarizers/subjective/alpacaeval.py index 528aa03e..81ab8394 100644 --- a/opencompass/summarizers/subjective/alpacaeval.py +++ b/opencompass/summarizers/subjective/alpacaeval.py @@ -19,7 +19,8 @@ from .utils import get_judgeanswer_and_reference, get_outdir def post_process_alpacav1(completion: str): - r"""Parse a completion that contains a list of dictionary and returns the rank of the model1. + r"""Parse a completion that contains a list of dictionary and returns the + rank of the model1. Examples -------- @@ -47,7 +48,8 @@ def post_process_alpacav1(completion: str): def post_process_alpacav2(completion: str): - r"""Parse a completion that contains 'm' or 'M' and returns the rank of the model1. + r"""Parse a completion that contains 'm' or 'M' and returns the rank of the + model1. 
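post_process_alpacav2 maps the judge's single-letter verdict onto a rank for model1. A sketch of the parsing rule, assuming it matches the function's doctest examples (lowercase 'm' ranks model1 first, uppercase 'M' ranks it second, anything else is unparsable):

def parse_rank(completion: str):
    # 'm' -> model1 wins (rank 1); 'M' -> model1 loses (rank 2); else None.
    verdict = completion.strip()
    if verdict.startswith('m'):
        return 1
    if verdict.startswith('M'):
        return 2
    return None

assert parse_rank('m') == 1 and parse_rank('M') == 2 and parse_rank('?') is None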
Examples -------- diff --git a/opencompass/summarizers/subjective/compass_arena_bradley_terry.py b/opencompass/summarizers/subjective/compass_arena_bradley_terry.py index 3ae67e8c..21d2fd01 100644 --- a/opencompass/summarizers/subjective/compass_arena_bradley_terry.py +++ b/opencompass/summarizers/subjective/compass_arena_bradley_terry.py @@ -61,7 +61,7 @@ def preprocess_for_elo(df): def preprocess_for_bt(df): - """in BT we only need the unique (matchup,outcome) sets along with the + """In BT we only need the unique (matchup,outcome) sets along with the weights of how often they occur.""" n_rows = len(df) # the 3 columns of schedule represent: model_a id, model_b id, outcome_id @@ -179,7 +179,7 @@ def fit_vectorized_elo( init_rating: float = 1000.0, scale: float = 400.0, ): - """fit multiple sets of Elo ratings on different samples of the data at the + """Fit multiple sets of Elo ratings on different samples of the data at the same time.""" alpha = math.log(base) / scale num_samples = sample_indices.shape[1] @@ -282,7 +282,7 @@ def scale_and_offset( baseline_model: str = None, baseline_rating: float = 1000.0, ): - """convert ratings from the natural scale to the Elo rating scale with an + """Convert ratings from the natural scale to the Elo rating scale with an anchored baseline.""" scaled_ratings = (ratings * scale) + init_rating diff --git a/opencompass/summarizers/subjective/compassbench.py b/opencompass/summarizers/subjective/compassbench.py index 7ed1ee53..3646994f 100644 --- a/opencompass/summarizers/subjective/compassbench.py +++ b/opencompass/summarizers/subjective/compassbench.py @@ -50,7 +50,6 @@ MAP = { 'language/compassbench_v2501_language_en_nlp_sub', 'language/compassbench_v2501_language_en_creation_sub', ], - 'code': [ '总分', '中文总分', diff --git a/opencompass/utils/lark.py b/opencompass/utils/lark.py index ad5e9904..8b2abb21 100644 --- a/opencompass/utils/lark.py +++ b/opencompass/utils/lark.py @@ -15,8 +15,8 @@ class LarkReporter: title: Optional[str] = None): """Post a message to Lark. - When title is None, message must be a str. - otherwise msg can be in rich text format (see + When title is None, message must be a str. otherwise msg can be in rich + text format (see https://open.feishu.cn/document/uAjLw4CM/ukTMukTMukTM/im-v1/message/create_json#45e0953e for details). """
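The Bradley-Terry summarizer above reduces the battle log to unique (matchup, outcome) pairs with occurrence weights, fits ratings on a natural scale, and then rescales them onto an anchored Elo scale. A compact sketch of the underlying rating idea using plain sequential Elo updates; the k-factor and match layout are assumptions, and the real code instead fits many bootstrap samples at once in vectorized form:

def fit_elo(matches, base=10.0, scale=400.0, k=4.0, init_rating=1000.0):
    # matches: iterable of (model_a, model_b, score) with score 1.0 if A wins,
    # 0.0 if B wins, and 0.5 for a tie, mirroring the outcome ids above.
    ratings = {}
    for a, b, score in matches:
        ra = ratings.setdefault(a, init_rating)
        rb = ratings.setdefault(b, init_rating)
        expected_a = 1.0 / (1.0 + base ** ((rb - ra) / scale))
        ratings[a] = ra + k * (score - expected_a)
        ratings[b] = rb + k * (expected_a - score)
    return ratings

ratings = fit_elo([('x', 'y', 1.0)] * 50 + [('x', 'y', 0.5)] * 10)
assert ratings['x'] > ratings['y']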