MaiziXiao 2025-05-27 08:39:17 +00:00
parent f93668337a
commit f92dca7444
32 changed files with 112 additions and 155 deletions

View File

@@ -50,8 +50,9 @@ for m in _origin_models:
datasets = teval_en_datasets + teval_zh_datasets
work_dir = './outputs/teval'
'''
dataset version metric mode qwen-7b-chat-hf internlm2-chat-7b-hf llama-2-7b-chat-hf
"""Dataset version metric mode
qwen-7b-chat-hf internlm2-chat-7b-hf llama-2-7b-chat-hf.
------------------------------------------- --------- -------------- ------- ----------------- ---------------------- --------------------
teval - naive_average unknown 57.69 78.18 36.63
teval-instruct_v1 10482d string_metric unknown 28.83 98.08 50.27
@@ -77,4 +78,4 @@ teval-reason_retrieve_understand_json_v1_zh 10482d name unknown
teval-understand_str_v1_zh 10482d args unknown 84.39 88.62 77.29
teval-reason_retrieve_understand_json_v1_zh 10482d args unknown 48.71 72.71 28.83
teval-review_str_v1_zh 10482d review_quality unknown 56.67 60.57 27.1
'''
"""

View File

@@ -22,14 +22,18 @@ humaneval_multi_eval_cfg = {
pred_role='BOT',
) for lang in ['cpp', 'cs', 'd', 'go', 'java', 'jl', 'js', 'lua', 'php', 'pl', 'py', 'r', 'rb', 'rkt', 'rs', 'scala', 'sh', 'swift', 'ts']
}
"""There are four versions of humaneval-{LANG}-{version}.jsonl:
'''there are four versions of humaneval-{LANG}-{version}.jsonl:
['keep', 'transform', 'reworded', 'remove']
SRCDATA-LANG-keep is the same as SRCDATA-LANG, but the text of the prompt is totally unchanged. If the original prompt had Python doctests, they remain as Python instead of being translated to LANG. If the original prompt had Python-specific terminology, e.g., 'list', it remains 'list', instead of being translated, e.g., to 'vector' for C++.
SRCDATA-LANG-transform transforms the doctests to LANG but leaves the natural language text of the prompt unchanged.
SRCDATA-LANG-reworded transforms both the doctests and the natural language text of the prompt to LANG.
SRCDATA-LANG-remove removes the doctests from the prompt.
'''
['keep', 'transform', 'reworded', 'remove'] SRCDATA-LANG-keep is the same as
SRCDATA-LANG, but the text of the prompt is totally unchanged. If the original
prompt had Python doctests, they remain as Python instead of being translated
to LANG. If the original prompt had Python-specific terminology, e.g., 'list',
it remains 'list', instead of being translated, e.g., to 'vector' for C++.
SRCDATA-LANG-transform transforms the doctests to LANG but leaves the natural
language text of the prompt unchanged. SRCDATA-LANG-reworded transforms both
the doctests and the natural language text of the prompt to LANG. SRCDATA-LANG-
remove removes the doctests from the prompt.
"""
humaneval_multi_datasets = [
dict(
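
The four version suffixes above fan out into one JSONL file per language per
version. A minimal sketch of how those file names compose, assuming a
hypothetical root directory (the language list and version names come from
this config; the path layout does not):

LANGS = ['cpp', 'cs', 'd', 'go', 'java', 'jl', 'js', 'lua', 'php', 'pl',
         'py', 'r', 'rb', 'rkt', 'rs', 'scala', 'sh', 'swift', 'ts']
VERSIONS = ['keep', 'transform', 'reworded', 'remove']

def multi_files(root='data/humaneval_multi'):  # root is an assumption
    """Yield (lang, version, path) for every humaneval-{LANG}-{version}.jsonl."""
    for lang in LANGS:
        for version in VERSIONS:
            yield lang, version, f'{root}/humaneval-{lang}-{version}.jsonl'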

View File

@@ -53,22 +53,18 @@ summarizer = dict(
['sanitized_mbpp', 'score'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
'',
'mmlu',
'mmlu-stem',
'mmlu-social-science',
'mmlu-humanities',
'mmlu-other',
'cmmlu',
'cmmlu-stem',
'cmmlu-social-science',
'cmmlu-humanities',
'cmmlu-other',
'cmmlu-china-specific',
'ceval',
'ceval-stem',
'ceval-social-science',

View File

@@ -157,7 +157,6 @@ summarizer = dict(
'Accuracy Average',
'English Average',
'Chinese Average',
'###### CALM-Lite Errors ######',
'Same response to all questions Average',
'Language inconsistency Average',

View File

@@ -29,14 +29,12 @@ summarizer = dict(
'primary',
'arithmetic',
'mathbench-a (average)',
'###### MathBench-T: Theory Part ######',
'college_knowledge',
'high_knowledge',
'middle_knowledge',
'primary_knowledge',
'mathbench-t (average)',
'###### Overall: Average between MathBench-A and MathBench-T ######',
'Overall',
],

View File

@@ -80,7 +80,6 @@ summarizer = dict(
'mathbench-a-middle-en',
'mathbench-a-primary-en',
'mathbench-a-en-average',
'#########################################################',
'###### MathBench-T-CN: Theory Part (Chinese) ############',
'mathbench-t-college-cn',
@@ -88,14 +87,12 @@ summarizer = dict(
'mathbench-t-middle-cn',
'mathbench-t-primary-cn',
'mathbench-t-cn-average',
'###### MathBench-T-EN: Theory Part (English) ############',
'mathbench-t-college-en',
'mathbench-t-high-en',
'mathbench-t-middle-en',
'mathbench-t-primary-en',
'mathbench-t-en-average',
'#########################################################',
'###### MathBench-CN ############',
'college-cn',
@@ -103,7 +100,6 @@ summarizer = dict(
'middle-cn',
'primary-cn',
'cn-avarage',
'###### MathBench-EN ############',
'college-en',
'high-en',

View File

@@ -320,14 +320,14 @@ class BulletListChecker(Instruction):
def check_following(self, value):
r"""Check if the number of bullet lists meets the requirement.
Args:
value: A string representing the response. The response is expected to
contain some bullet lists that start with `\*`.
Args:
value: A string representing the response. The response is expected to
contain some bullet lists that start with `\*`.
Returns:
True if the actual number of bullet lists in the response meets the
requirement.
"""
Returns:
True if the actual number of bullet lists in the response meets the
requirement.
"""
bullet_lists = re.findall(r'^\s*\*[^\*].*$', value, flags=re.MULTILINE)
bullet_lists_2 = re.findall(r'^\s*-.*$', value, flags=re.MULTILINE)
num_bullet_lists = len(bullet_lists) + len(bullet_lists_2)
@@ -687,14 +687,14 @@ class RephraseChecker(Instruction):
def check_following(self, value):
r"""Checks if the rephrasing follows the instruction.
Args:
value: A string representing the response, which is expected to rephras
the string of `instruction_args`.
Args:
value: A string representing the response, which is expected to rephras
the string of `instruction_args`.
Returns:
True if `value` and `instruction_args` only differ by the words/sentences
in between two asterisks such as *change me*; otherwise, False.
"""
Returns:
True if `value` and `instruction_args` only differ by the words/sentences
in between two asterisks such as *change me*; otherwise, False.
"""
if not self.is_change(value):
raise ValueError(f'value {value} does not contain '
@@ -930,17 +930,17 @@ class ParagraphFirstWordCheck(Instruction):
first_word=None):
r"""Build the instruction description.
Args:
num_paragraphs: An integer indicating the number of paragraphs expected
in the response. A paragraph is a subset of the string that is
expected to be separated by '\n\n'.
nth_paragraph: An integer indicating the paragraph number that we look at.
Note that n starts from 1.
first_word: A string that represent the first word of the bth paragraph.
Args:
num_paragraphs: An integer indicating the number of paragraphs expected
in the response. A paragraph is a subset of the string that is
expected to be separated by '\n\n'.
nth_paragraph: An integer indicating the paragraph number that we look at.
Note that n starts from 1.
first_word: A string that represent the first word of the bth paragraph.
Returns:
A string representing the instruction description.
"""
Returns:
A string representing the instruction description.
"""
self._num_paragraphs = num_paragraphs
if self._num_paragraphs is None or self._num_paragraphs < 0:
self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS)
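
The checkers above share a count-and-compare pattern. A self-contained sketch
of the bullet-list count, reusing the two regexes shown in
BulletListChecker.check_following:

import re

def count_bullet_lists(value: str) -> int:
    # Lines starting with '*' (but not '**'), plus lines starting with '-'.
    starred = re.findall(r'^\s*\*[^\*].*$', value, flags=re.MULTILINE)
    dashed = re.findall(r'^\s*-.*$', value, flags=re.MULTILINE)
    return len(starred) + len(dashed)

assert count_bullet_lists('* first\n- second\nplain text\n') == 2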

View File

@@ -422,11 +422,9 @@ class MathJudger:
expression2: str,
include_percentage: bool = True,
):
"""
(默认 expression1 Ground_Truth)
函数: 判读两个数值是否在误差允许范围内相等
步骤1: 将可能出现的百分号的情况包含进来
步骤2: 使用 math.isclose 函数判断是否相等
"""(默认 expression1 为 Ground_Truth) 函数: 判读两个数值是否在误差允许范围内相等 步骤1:
将可能出现的百分号的情况包含进来 步骤2: 使用 math.isclose 函数判断是否相等.
"""
reference = float(expression1)
prediction = float(expression2)
@@ -442,12 +440,8 @@ class MathJudger:
return False
def expression_equal(self, exp1, exp2):
"""
(默认 expression1 Ground_Truth)
函数: 判断两个表达式是否在数学意义上等价
步骤1: 提取表达式, 防止有的模型会给出"x=1"而不是"1"
步骤2: 使用 sympy 库进行等价判断
"""
"""(默认 expression1 为 Ground_Truth) 函数: 判断两个表达式是否在数学意义上等价 步骤1: 提取表达式,
防止有的模型会给出"x=1"而不是"1" 步骤2: 使用 sympy 库进行等价判断."""
# 只提取等号右边的表达式
def extract_expression(expression):
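
A runnable sketch of the numeric check described above (the tolerance value is
an assumption; the diff does not show it):

import math

def numbers_equal(reference: float, prediction: float,
                  include_percentage: bool = True,
                  rel_tol: float = 1e-4) -> bool:
    # Step 1: a bare "50" may stand for "50%", so also compare against the
    # reference divided and multiplied by 100.
    candidates = ([reference / 100, reference, reference * 100]
                  if include_percentage else [reference])
    # Step 2: math.isclose decides equality within the tolerance.
    return any(math.isclose(c, prediction, rel_tol=rel_tol) for c in candidates)

assert numbers_equal(50.0, 0.5)  # "50" read as 50% equals 0.5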

View File

@@ -25,11 +25,7 @@ langs_dict = {
def extract_choice(gen, lang):
r"""
{
"answer": "A|B|C|D"
}
"""
r"""{ "answer": "A|B|C|D" }"""
patterns = [
r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}",
r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}",

View File

@@ -25,11 +25,7 @@ langs_dict = {
def extract_choice(gen, lang):
r"""
{
"answer": "A|B|C|D"
}
"""
r"""{ "answer": "A|B|C|D" }"""
patterns = [
r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}",
r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}",

View File

@@ -25,11 +25,7 @@ langs_dict = {
def extract_choice(gen, lang):
r"""
{
"answer": "A|B|C|D"
}
"""
r"""{ "answer": "A|B|C|D" }"""
patterns = [
r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}",
r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}",

View File

@@ -26,11 +26,7 @@ langs_dict = {
def extract_choice(gen, lang):
r"""
{
"answer": "A|B|C|D"
}
"""
r"""{ "answer": "A|B|C|D" }"""
patterns = [
r"\{\s*?\"answer\"\s*?\:\s*?\"?(A|B|C|D).*?\"?\s*?\}",
r"\{\s*?[\'\"]answer[\'\"]\s*?\:\s*?[\'\"](A|B|C|D).*?[\'\"]\s*?\}",

View File

@@ -342,7 +342,7 @@ class Capturing(list):
def run_test(sample, test=None, debug=False):
"""if test(generated_code) is not None it'll try to run the code.
"""If test(generated_code) is not None it'll try to run the code.
otherwise it'll just return an input and output pair.
"""

View File

@@ -66,8 +66,8 @@ def codeexecute_check_correctness(check_program, timeout=3):
"""Evaluates the functional correctness of a completion by running the test
suite provided in the problem.
:param completion_id: an optional completion ID so we can match
the results later even if execution finishes asynchronously.
:param completion_id: an optional completion ID so we can match the results
later even if execution finishes asynchronously.
"""
manager = multiprocessing.Manager()
result = manager.list()
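
The manager list above collects results across a process boundary; a minimal
sketch of the whole pattern (real harnesses also sandbox I/O and builtins
before exec'ing untrusted code; this sketch does not):

import multiprocessing

def _unsafe_execute(program: str, result) -> None:
    try:
        exec(program, {'__name__': '__main__'})
        result.append('passed')
    except BaseException as exc:  # any failure raised by the test suite
        result.append(f'failed: {exc}')

def check_correctness(check_program: str, timeout: float = 3.0) -> bool:
    manager = multiprocessing.Manager()
    result = manager.list()
    proc = multiprocessing.Process(target=_unsafe_execute,
                                   args=(check_program, result))
    proc.start()
    proc.join(timeout + 1)
    if proc.is_alive():
        proc.kill()  # a hung test suite counts as a timeout
    return bool(result) and result[0] == 'passed'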

View File

@@ -135,7 +135,7 @@ def get_generic_question_template_test_completion(question_content,
testcase_input: str):
def format_testcase_func_name_input(function_name, testcase):
"""use the form of "assert func_name(input) == "."""
"""Use the form of "assert func_name(input) == "."""
# TODO should there be a space after the == ?
input_str = ', '.join(testcase.split('\n'))
return f'assert {function_name}({input_str}) == # TODO'
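
For a concrete sense of the output, the helper joins newline-separated
testcase lines into an argument list:

print(format_testcase_func_name_input('add', '1\n2'))
# -> assert add(1, 2) == # TODO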

View File

@@ -79,7 +79,7 @@ def combined_int_check(val):
def run_test(sample, test=None, debug=False, timeout=6):
"""if test(generated_code) is not None it'll try to run the code.
"""If test(generated_code) is not None it'll try to run the code.
otherwise it'll just return an input and output pair.
"""

View File

@@ -512,7 +512,7 @@ class MATHEvaluator(BaseEvaluator):
@ICL_EVALUATORS.register_module()
class MATHAgentEvaluator(MATHEvaluator):
"""math agent evaluator for soft condition.
"""Math agent evaluator for soft condition.
Args:
action (str): Action for catching internal prediction.

View File

@@ -38,7 +38,8 @@ class LogicNodeFactType:
class LogicNodeConstraints:
"""Useful for things like children = ['X is the murderer', 'Y is the murderer', 'Z is the murderer'], we no longer use this structure though."""
"""Useful for things like children = ['X is the murderer', 'Y is the
murderer', 'Z is the murderer'], we no longer use this structure though."""
ONLY_ONE_CAN_BE_TRUE = 'Only one child can be true'
@@ -244,8 +245,10 @@ class LogicTree:
explicit leaf nodes.
:param include_cs: Include the commonsense nodes from all levels.
:param include_deductions_from_level: Include any intermediate deduction nodes from the specified level and deeper.
:param no_facts_after_depth: Essentially tree the deductions at the specified depth as leaf nodes.
:param include_deductions_from_level: Include any intermediate
deduction nodes from the specified level and deeper.
:param no_facts_after_depth: Essentially tree the deductions at the
specified depth as leaf nodes.
"""
def recurse_facts(_node: LogicNode, depth: int = 0) -> List[str]:
@@ -302,16 +305,21 @@ class LogicTree:
However, more complex arguments can be used to control what is printed.
This returns a string that must be printed (don't be confused by the method name.)
This returns a string that must be printed (don't be confused by the
method name.)
:param node: Start at a specific node.
:param level: Controls how much tabbing is done when printing the current node.
:param pad_char: Char to use that specifies depth ('> ' at depth 3 will look like '> > > ' if you have pad_space equal to 1 for example)
:param level: Controls how much tabbing is done when printing the
current node.
:param pad_char: Char to use that specifies depth ('> ' at depth 3 will
look like '> > > ' if you have pad_space equal to 1 for example)
:param pad_space: How many spaces to include between pad_chars
:param print_forward: Print the tree with parent nodes first.
:param print_conjection_types: Print the Ands and Ors per deduction (not used)
:param print_conjection_types: Print the Ands and Ors per deduction
(not used)
:param print_reasoning_types: Print the deduction types (not used)
:param ignore_value_after_depth: Ignore content of the nodes once a depth is met
:param ignore_value_after_depth: Ignore content of the nodes once a
depth is met
:param print_only_nodes_with_value: Ignore nodes without content.
"""

View File

@@ -42,7 +42,8 @@ class AlpacaEvalDataset(BaseDataset):
def post_process_alpacav2(completion: str):
r"""Parse a completion that contains 'm' or 'M' and returns the rank of the model1.
r"""Parse a completion that contains 'm' or 'M' and returns the rank of the
model1.
Examples
--------

View File

@@ -292,7 +292,7 @@ class Capturing(list):
def run_test(sample, test=None, debug=False):
"""if test(generated_code) is not None it'll try to run the code.
"""If test(generated_code) is not None it'll try to run the code.
otherwise it'll just return an input and output pair.
"""

View File

@@ -128,18 +128,11 @@ class ERNIEBot(BaseAPIModel):
str: The generated string.
"""
assert isinstance(input, (str, PromptList))
"""
{
"messages": [
{"role":"user","content":"请介绍一下你自己"},
{"role":"assistant","content":"我是百度公司开发的人工智能语言模型"},
{"role":"user","content": "我在上海,周末可以去哪里玩?"},
{"role":"assistant","content": "上海是一个充满活力和文化氛围的城市"},
{"role":"user","content": "周末这里的天气怎么样?"}
]
}
"""
"""{ "messages": [ {"role":"user","content":"请介绍一下你自己"},
{"role":"assistant","content":"我是百度公司开发的人工智能语言模型"},
{"role":"user","content": "我在上海,周末可以去哪里玩?"},
{"role":"assistant","content": "上海是一个充满活力和文化氛围的城市"},
{"role":"user","content": "周末这里的天气怎么样?"} ] }"""
if isinstance(input, str):
messages = [{'role': 'user', 'content': input}]
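
A sketch of the normalization this branch performs, turning either a plain
string or a PromptList into the "messages" payload shown in the docstring
(the 'role'/'prompt' item keys are assumptions for illustration):

def build_messages(input):
    if isinstance(input, str):
        return [{'role': 'user', 'content': input}]
    role_map = {'HUMAN': 'user', 'BOT': 'assistant'}  # assumed mapping
    return [{'role': role_map.get(item['role'], 'user'),
             'content': item['prompt']} for item in input]

payload = {'messages': build_messages('请介绍一下你自己')}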

View File

@@ -86,17 +86,11 @@ class Qwen(BaseAPIModel):
str: The generated string.
"""
assert isinstance(input, (str, PromptList))
"""
{
"messages": [
{"role":"user","content":"请介绍一下你自己"},
{"role":"assistant","content":"我是通义千问"},
{"role":"user","content": "我在上海,周末可以去哪里玩?"},
{"role":"assistant","content": "上海是一个充满活力和文化氛围的城市"},
{"role":"user","content": "周末这里的天气怎么样?"}
]
}
"""{ "messages": [ {"role":"user","content":"请介绍一下你自己"},
{"role":"assistant","content":"我是通义千问"}, {"role":"user","content":
"我在上海,周末可以去哪里玩?"}, {"role":"assistant","content": "上海是一个充满活力和文化氛围的城市"},
{"role":"user","content": "周末这里的天气怎么样?"} ] }
"""
if isinstance(input, str):

View File

@@ -13,7 +13,7 @@ PromptType = Union[PromptList, str]
def valid_str(string, coding='utf-8'):
"""decode text according to its encoding type."""
"""Decode text according to its encoding type."""
invalid_chars = [b'\xef\xbf\xbd']
bstr = bytes(string, coding)
for invalid_char in invalid_chars:
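
Completed for reference, a sketch of the full function (the replace-with-empty
step is an assumption based on the loop shown above):

def valid_str(string, coding='utf-8'):
    """Decode text according to its encoding type."""
    invalid_chars = [b'\xef\xbf\xbd']  # UTF-8 bytes of U+FFFD REPLACEMENT CHARACTER
    bstr = bytes(string, coding)
    for invalid_char in invalid_chars:
        bstr = bstr.replace(invalid_char, b'')
    return bstr.decode(coding)

assert valid_str('ok\ufffdtext') == 'oktext'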

View File

@@ -10,7 +10,7 @@ PromptType = Union[PromptList, str]
def valid_str(string, coding='utf-8'):
"""decode text according to its encoding type."""
"""Decode text according to its encoding type."""
invalid_chars = [b'\xef\xbf\xbd']
bstr = bytes(string, coding)
for invalid_char in invalid_chars:

View File

@@ -16,7 +16,7 @@ PromptType = Union[PromptList, str]
def valid_str(string, coding='utf-8'):
"""decode text according to its encoding type."""
"""Decode text according to its encoding type."""
invalid_chars = [b'\xef\xbf\xbd']
bstr = bytes(string, coding)
for invalid_char in invalid_chars:

View File

@@ -40,10 +40,7 @@ def get_current_time(format='%Y-%m-%d %H:%M:%S'):
def get_current_timestamp():
"""
获取当前时间时间戳
:return:
"""
"""获取当前时间时间戳 :return:"""
timestamp_str = int(round(time.time() * 1000))
return str(timestamp_str)
@@ -59,10 +56,7 @@ def encode_base64_string(s):
def get_current_time_gmt_format():
"""
获取当前时间的GMT 时间
:return:
"""
"""获取当前时间的GMT 时间 :return:"""
GMT_FORMAT = '%a, %d %b %Y %H:%M:%SGMT+00:00'
now = datetime.now()
time_str = now.strftime(GMT_FORMAT)
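
Both helpers in one short demo (datetime.now() is local time, reproduced as in
the code above even though the format string labels it GMT; note also the
missing space before 'GMT' in the original format string):

import time
from datetime import datetime

ts = str(int(round(time.time() * 1000)))       # millisecond timestamp
GMT_FORMAT = '%a, %d %b %Y %H:%M:%SGMT+00:00'  # verbatim from the code above
time_str = datetime.now().strftime(GMT_FORMAT)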

View File

@@ -12,9 +12,9 @@ from .icl_base_evaluator import BaseEvaluator
class AUCROCEvaluator(BaseEvaluator):
"""Calculate AUC-ROC scores and accuracy according the prediction.
For some dataset, the accuracy cannot reveal the difference between
models because of the saturation. AUC-ROC scores can further exam
model abilities to distinguish different labels. More details can refer to
For some dataset, the accuracy cannot reveal the difference between models
because of the saturation. AUC-ROC scores can further exam model abilities
to distinguish different labels. More details can refer to
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html
""" # noqa

View File

@@ -283,25 +283,19 @@ class MultiModelSummarizer:
self.models_summary_group_metrics[new_model_name] = summarizer.models_summary_group_metrics[new_model_name]
def summarize(self):
"""
Format in self.table
[
['dataset', 'version', 'metric', 'mode', 'model_name'],
['--------- 考试 Exam ---------', '-', '-', '-', '-'],
['ARC-c', '1e0de5', 'accuracy', 'gen', '79.32'],
['ARC-e', '1e0de5', 'accuracy', 'gen', '85.36'],
['--------- 语言 Language ---------', '-', '-', '-', '-'],
['WiC', 'd06864', 'accuracy', 'gen', '55.64'],
['chid-dev', '211ee7', 'accuracy', 'gen', '52.97'],
['--------- 知识 Knowledge ---------', '-', '-', '-', '-'],
['BoolQ', '883d50', 'accuracy', 'gen', '86.06'],
['--------- 理解 Understanding ---------', '-', '-', '-', '-'],
['C3', '8c358f', 'accuracy', 'gen', '88.33'],
['race-middle', '9a54b6', 'accuracy', 'gen', '90.32'],
['--------- 推理 Reasoning ---------', '-', '-', '-', '-'],
['cmnli', '1abf97', 'accuracy', 'gen', '38.26'],
['ocnli', 'c4cb6c', 'accuracy', 'gen', '32.92'],
]
"""Format in self.table [ ['dataset', 'version', 'metric', 'mode',
'model_name'], ['--------- 考试 Exam ---------', '-', '-', '-', '-'],
['ARC-c', '1e0de5', 'accuracy', 'gen', '79.32'], ['ARC-e', '1e0de5',
'accuracy', 'gen', '85.36'], ['--------- 语言 Language ---------', '-',
'-', '-', '-'], ['WiC', 'd06864', 'accuracy', 'gen', '55.64'], ['chid-
dev', '211ee7', 'accuracy', 'gen', '52.97'], ['--------- 知识 Knowledge
---------', '-', '-', '-', '-'], ['BoolQ', '883d50', 'accuracy', 'gen',
'86.06'], ['--------- 理解 Understanding ---------', '-', '-', '-', '-'],
['C3', '8c358f', 'accuracy', 'gen', '88.33'], ['race-middle', '9a54b6',
'accuracy', 'gen', '90.32'], ['--------- 推理 Reasoning ---------', '-',
'-', '-', '-'], ['cmnli', '1abf97', 'accuracy', 'gen', '38.26'],
['ocnli', 'c4cb6c', 'accuracy', 'gen', '32.92'], ]
"""
table = Table()
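
A sketch of rendering that row layout, with header, '-'-padded section
separators, and one row per dataset (column widths here are arbitrary):

header = ['dataset', 'version', 'metric', 'mode', 'model_name']
rows = [
    header,
    ['--------- 考试 Exam ---------', '-', '-', '-', '-'],
    ['ARC-c', '1e0de5', 'accuracy', 'gen', '79.32'],
]
for row in rows:
    print('{:<34}{:<10}{:<10}{:<6}{:<12}'.format(*row))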

View File

@@ -19,7 +19,8 @@ from .utils import get_judgeanswer_and_reference, get_outdir
def post_process_alpacav1(completion: str):
r"""Parse a completion that contains a list of dictionary and returns the rank of the model1.
r"""Parse a completion that contains a list of dictionary and returns the
rank of the model1.
Examples
--------
@@ -47,7 +48,8 @@ def post_process_alpacav1(completion: str):
def post_process_alpacav2(completion: str):
r"""Parse a completion that contains 'm' or 'M' and returns the rank of the model1.
r"""Parse a completion that contains 'm' or 'M' and returns the rank of the
model1.
Examples
--------

View File

@@ -61,7 +61,7 @@ def preprocess_for_elo(df):
def preprocess_for_bt(df):
"""in BT we only need the unique (matchup,outcome) sets along with the
"""In BT we only need the unique (matchup,outcome) sets along with the
weights of how often they occur."""
n_rows = len(df)
# the 3 columns of schedule represent: model_a id, model_b id, outcome_id
@@ -179,7 +179,7 @@ def fit_vectorized_elo(
init_rating: float = 1000.0,
scale: float = 400.0,
):
"""fit multiple sets of Elo ratings on different samples of the data at the
"""Fit multiple sets of Elo ratings on different samples of the data at the
same time."""
alpha = math.log(base) / scale
num_samples = sample_indices.shape[1]
@@ -282,7 +282,7 @@ def scale_and_offset(
baseline_model: str = None,
baseline_rating: float = 1000.0,
):
"""convert ratings from the natural scale to the Elo rating scale with an
"""Convert ratings from the natural scale to the Elo rating scale with an
anchored baseline."""
scaled_ratings = (ratings * scale) + init_rating
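
A sketch of the scale-and-offset step as documented, kept to 1-D ratings for
simplicity (the function body beyond the line shown above is an assumption):

import numpy as np

def scale_and_offset(ratings, models, scale=400.0, init_rating=1000.0,
                     baseline_model=None, baseline_rating=1000.0):
    # Natural (log-odds) scale -> Elo scale.
    scaled = ratings * scale + init_rating
    if baseline_model is not None:
        # Shift all ratings so the baseline model sits at baseline_rating.
        scaled = scaled + (baseline_rating - scaled[models.index(baseline_model)])
    return scaled

print(scale_and_offset(np.array([0.0, 0.25]), ['m_a', 'm_b'],
                       baseline_model='m_a'))  # [1000. 1100.]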

View File

@@ -50,7 +50,6 @@ MAP = {
'language/compassbench_v2501_language_en_nlp_sub',
'language/compassbench_v2501_language_en_creation_sub',
],
'code': [
'总分',
'中文总分',

View File

@@ -15,8 +15,8 @@ class LarkReporter:
title: Optional[str] = None):
"""Post a message to Lark.
When title is None, message must be a str.
otherwise msg can be in rich text format (see
When title is None, message must be a str. otherwise msg can be in rich
text format (see
https://open.feishu.cn/document/uAjLw4CM/ukTMukTMukTM/im-v1/message/create_json#45e0953e
for details).
"""