# OpenCompass/.github/scripts/oc_score_baseline_fullbench.yaml

# Score table for the `internlm2_5-7b-chat-hf_fullbench` entry.
# Each key is a dataset/metric name; each value is the numeric score
# recorded for it.
# NOTE(review): the file name suggests these are baselines that fresh
# results are compared against — confirm against the consuming script.
internlm2_5-7b-chat-hf_fullbench:
    race-high: 93.75
    ARC-c: 93.75
    BoolQ: 81.25
    triviaqa_wiki_1shot: 50
    nq_open_1shot: 25
    IFEval: 50
    drop: 81.25
    GPQA_diamond: 25
    hellaswag: 87.5
    TheoremQA: 18.75
    musr_average: 39.58
    gsm8k: 56.25
    math: 75
    cmo_fib: 6.25
    aime2024: 6.25
    wikibench-wiki-single_choice_cncircular: 50
    sanitized_mbpp: 68.75
    ds1000: 16.96
    lcb_code_generation: 12.5
    lcb_code_execution: 43.75
    lcb_test_output: 18.75
    bbh-logical_deduction_seven_objects: 50
    bbh-multistep_arithmetic_two: 68.75
    mmlu-other: 72.6
    cmmlu-china-specific: 76.25
    mmlu_pro_math: 25
    # `ds1000_*` entries, one per library name; the plain `ds1000` key
    # is listed separately above.
    ds1000_Pandas: 12.5
    ds1000_Numpy: 0
    ds1000_Tensorflow: 12.5
    ds1000_Scipy: 18.75
    ds1000_Sklearn: 18.75
    ds1000_Pytorch: 12.5
    ds1000_Matplotlib: 43.75
    openai_mmmlu_lite_AR-XY: 37.5
    college: 12.5
    college_knowledge: 87.5
    # Keys below are a benchmark name concatenated directly with a
    # metric name (no separator); the Alignbench metric names are Chinese.
    # NOTE(review): presumably judge-based evaluations — confirm.
    Alignbench总分: 0.65
    Alignbench专业能力: 7.83
    AlpacaEvaltotal: 0
    AlpacaEvalhelpful_base: 0
    CompassArenacompassarena_language: 60
    CompassArenacompassarena_knowledge: 56
    CompassArenacompassarena_reason_v2: 50
    CompassArenacompassarena_math_v2: 53.5
    CompassArenacompassarena_creationv2_zh: 48.75
    Fofofofo_test_prompts: 1
    # followbench HSR/SSR entries: the AVG keys, then levels L1-L5.
    followbenchHSR_AVG: 1
    followbenchSSR_AVG: 1
    followbenchHSR_L1: 1
    followbenchHSR_L2: 1
    followbenchHSR_L3: 1
    followbenchHSR_L4: 1
    followbenchHSR_L5: 1
    followbenchSSR_L1: 1
    followbenchSSR_L2: 1
    followbenchSSR_L3: 1
    followbenchSSR_L4: 1
    followbenchSSR_L5: 1
    MTBench101average: 8.1
    # This value is negative, unlike every other entry in this mapping.
    Wildbenchscore: -3.3333333333333335

# Score table for the `internlm2_5-7b-chat-turbomind_fullbench` entry.
# Each key is a dataset/metric name; each value is the numeric score
# recorded for it.
# NOTE(review): the file name suggests these are baselines that fresh
# results are compared against — confirm against the consuming script.
internlm2_5-7b-chat-turbomind_fullbench:
    race-high: 93.75
    ARC-c: 87.5
    BoolQ: 68.75
    triviaqa_wiki_1shot: 50
    nq_open_1shot: 25
    IFEval: 50
    drop: 75
    hellaswag: 81.25
    TheoremQA: 6.25
    musr_average: 39.58
    gsm8k: 68.75
    math: 75
    GPQA_diamond: 25
    cmo_fib: 6.25
    aime2024: 6.25
    wikibench-wiki-single_choice_cncircular: 25
    sanitized_mbpp: 68.75
    ds1000: 13.39
    lcb_code_generation: 12.5
    lcb_code_execution: 43.75
    lcb_test_output: 12.5
    bbh-logical_deduction_seven_objects: 56.25
    bbh-multistep_arithmetic_two: 68.75
    mmlu-other: 74.04
    cmmlu-china-specific: 76.25
    mmlu_pro_math: 25
    # `ds1000_*` entries, one per library name; the plain `ds1000` key
    # is listed separately above.
    ds1000_Pandas: 0
    ds1000_Numpy: 0
    ds1000_Tensorflow: 12.5
    ds1000_Scipy: 18.75
    ds1000_Sklearn: 18.75
    ds1000_Pytorch: 6.25
    ds1000_Matplotlib: 37.5
    openai_mmmlu_lite_AR-XY: 37.5
    college: 0
    college_knowledge: 87.5
    # Keys below are a benchmark name concatenated directly with a
    # metric name (no separator); the Alignbench metric names are Chinese.
    # NOTE(review): presumably judge-based evaluations — confirm.
    Alignbench总分: 0.64
    Alignbench专业能力: 7.6
    AlpacaEvaltotal: 10
    AlpacaEvalhelpful_base: 10
    CompassArenacompassarena_language: 59
    CompassArenacompassarena_knowledge: 57
    CompassArenacompassarena_reason_v2: 49.5
    CompassArenacompassarena_math_v2: 51
    CompassArenacompassarena_creationv2_zh: 43.75
    Fofofofo_test_prompts: 1
    # followbench HSR/SSR entries: the AVG keys, then levels L1-L5.
    followbenchHSR_AVG: 1
    followbenchSSR_AVG: 1
    followbenchHSR_L1: 1
    followbenchHSR_L2: 1
    followbenchHSR_L3: 1
    followbenchHSR_L4: 1
    followbenchHSR_L5: 1
    followbenchSSR_L1: 1
    followbenchSSR_L2: 1
    followbenchSSR_L3: 1
    followbenchSSR_L4: 1
    followbenchSSR_L5: 1
    MTBench101average: 8.1
    # This value is negative, unlike every other entry in this mapping.
    Wildbenchscore: -8.333333333333334

# Score table for the `internlm2_5-7b-hf_fullbench` entry.
# Each key is a dataset/metric name; each value is the numeric score
# recorded for it.
# NOTE(review): the file name suggests these are baselines that fresh
# results are compared against — confirm against the consuming script.
internlm2_5-7b-hf_fullbench:
    race-high: 100
    ARC-c: 68.75
    BoolQ: 87.5
    GPQA_diamond: 62.5
    drop: 62.5
    math: 12.5
    wikibench-wiki-single_choice_cncircular: 25
    sanitized_mbpp: 56.25
    gsm8k: 37.5
    triviaqa_wiki_1shot: 43.75
    nq_open_1shot: 43.75
    winogrande: 75
    hellaswag: 93.75
    TheoremQA: 25
    # `dingo_*` entries, keyed with `en`/`zh` and a numeric suffix.
    # NOTE(review): presumably language and sample count — confirm.
    dingo_en_192: 37.5
    dingo_zh_170: 100
    college: 12.5
    college_knowledge: 87.5
    bbh-logical_deduction_seven_objects: 43.75
    bbh-multistep_arithmetic_two: 56.25
    mmlu-other: 76.92
    cmmlu-china-specific: 84.17
    mmlu_pro_math: 18.75

# Score table for the `internlm2_5-7b-turbomind_fullbench` entry.
# Each key is a dataset/metric name; each value is the numeric score
# recorded for it.
# NOTE(review): the file name suggests these are baselines that fresh
# results are compared against — confirm against the consuming script.
internlm2_5-7b-turbomind_fullbench:
    race-high: 100
    ARC-c: 68.75
    BoolQ: 87.5
    GPQA_diamond: 62.5
    drop: 62.5
    math: 18.75
    wikibench-wiki-single_choice_cncircular: 25
    sanitized_mbpp: 56.25
    gsm8k: 68.75
    triviaqa_wiki_1shot: 43.75
    nq_open_1shot: 43.75
    winogrande: 87.5
    hellaswag: 93.75
    TheoremQA: 31.25
    # `dingo_*` entries, keyed with `en`/`zh` and a numeric suffix.
    # NOTE(review): presumably language and sample count — confirm.
    dingo_en_192: 43.75
    dingo_zh_170: 100
    college: 12.5
    college_knowledge: 87.5
    bbh-logical_deduction_seven_objects: 50
    bbh-multistep_arithmetic_two: 56.25
    mmlu-other: 76.92
    cmmlu-china-specific: 84.17
    mmlu_pro_math: 18.75