diff --git a/.github/scripts/eval_regression_chat_models.py b/.github/scripts/eval_regression_chat_models.py index 6e04afe3..7a6aaa25 100644 --- a/.github/scripts/eval_regression_chat_models.py +++ b/.github/scripts/eval_regression_chat_models.py @@ -101,8 +101,6 @@ with read_base(): models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501 from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \ models as hf_mistral_small_instruct_2409_model # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \ - models as hf_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \ models as hf_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \ @@ -113,10 +111,15 @@ with read_base(): from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \ models as \ lmdeploy_mistral_small_instruct_2409_model # noqa: F401, E501 + from opencompass.configs.models.mistral.lmdeploy_mixtral_8x22b_instruct_v0_1 import \ + models as \ + lmdeploy_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \ models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \ models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501 + from opencompass.configs.models.mistral.vllm_mixtral_8x22b_instruct_v0_1 import \ + models as vllm_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \ models as lmdeploy_nemotron_70b_instruct_hf_model # noqa: F401, E501 from opencompass.configs.models.phi.hf_phi_3_5_mini_instruct import \ @@ -170,7 +173,7 @@ with read_base(): from ...volc import infer as volc_infer # noqa: F401, E501 -hf_glm4_9b_chat_model[0]['abbr'] = 'THUDM/glm-4-9b-chat-hf' +hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf' race_datasets = [race_datasets[1]] datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index c0e735fb..dd943e26 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -39,15 +39,15 @@ internlm2_5-7b-chat-hf_fullbench: college_knowledge_naive_average: 87.5 subjective: alignment_bench_v1_1_总分: 0.66 - alpaca_eval_total: 20 + alpaca_eval_total: 0 arenahard_score: 50 Followbench_naive_average: 1 CompassArena_naive_average: 43 mtbench101_avg: 7.8 - wildbench_average: -12.78 + wildbench_average: -15.56 simpleqa_accuracy_given_attempted: 0 chinese_simpleqa_given_attempted_accuracy: 1 - alignment_bench_v1_1_专业能力: 7.90 + alignment_bench_v1_1_专业能力: 8.00 alignment_bench_v1_1_数学计算: 0 alignment_bench_v1_1_基本任务: 0 alignment_bench_v1_1_逻辑推理: 0 @@ -55,7 +55,7 @@ internlm2_5-7b-chat-hf_fullbench: alignment_bench_v1_1_文本写作: 0 alignment_bench_v1_1_角色扮演: 0 alignment_bench_v1_1_综合问答: 0 - alpaca_eval_helpful_base: 20 + alpaca_eval_helpful_base: 0 compassarena_language_naive_average: 35 compassarena_knowledge_naive_average: 55 compassarena_reason_v2_naive_average: 40 @@ -115,16 +115,16 @@ internlm2_5-7b-chat-turbomind_fullbench: college_naive_average: 12.50 college_knowledge_naive_average: 87.5 subjective: - alignment_bench_v1_1_总分: 0.70 + alignment_bench_v1_1_总分: 0.66 alpaca_eval_total: 0 arenahard_score: 50 Followbench_naive_average: 1 - CompassArena_naive_average: 38 - mtbench101_avg: 7.80 - wildbench_average: -4.86 + CompassArena_naive_average: 40 + mtbench101_avg: 8 + wildbench_average: -6.81 simpleqa_accuracy_given_attempted: 0 chinese_simpleqa_given_attempted_accuracy: 1 - alignment_bench_v1_1_专业能力: 8.4 + alignment_bench_v1_1_专业能力: 7.9 alignment_bench_v1_1_数学计算: 0 alignment_bench_v1_1_基本任务: 0 alignment_bench_v1_1_逻辑推理: 0 @@ -134,10 +134,10 @@ internlm2_5-7b-chat-turbomind_fullbench: alignment_bench_v1_1_综合问答: 0 alpaca_eval_helpful_base: 0 compassarena_language_naive_average: 35 - compassarena_knowledge_naive_average: 50 - compassarena_reason_v2_naive_average: 30 - compassarena_math_v2_naive_average: 50 - compassarena_creationv2_zh_naive_average: 25 + compassarena_knowledge_naive_average: 45 + compassarena_reason_v2_naive_average: 25 + compassarena_math_v2_naive_average: 60 + compassarena_creationv2_zh_naive_average: 35 followbench_llmeval_en_HSR_AVG: 1 followbench_llmeval_en_SSR_AVG: 1 followbench_llmeval_en_HSR_L1: 1 @@ -409,7 +409,7 @@ internlm2_5-7b-chat-turbomind: alpaca_eval_koala: 28.21 alpaca_eval_oasst: 23.4 alpaca_eval_selfinstruct: 30.95 - alpaca_eval_vicuna: 25 + alpaca_eval_vicuna: 33.75 compassarena_language_naive_average: 52.5 compassarena_knowledge_naive_average: 36 compassarena_reason_v2_naive_average: 35 diff --git a/opencompass/configs/models/mistral/lmdeploy_mixtral_8x22b_instruct_v0_1.py b/opencompass/configs/models/mistral/lmdeploy_mixtral_8x22b_instruct_v0_1.py new file mode 100644 index 00000000..1ffef256 --- /dev/null +++ b/opencompass/configs/models/mistral/lmdeploy_mixtral_8x22b_instruct_v0_1.py @@ -0,0 +1,22 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='mixtral-8x22b-instruct-v0.1-turbomind', + path='mistralai/Mixtral-8x22B-Instruct-v0.1', + engine_config=dict( + session_len=32768, + max_batch_size=16, + tp=8, + cache_max_entry_count=0.7, + ), + gen_config=dict( + top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096 + ), + max_seq_len=32768, + max_out_len=4096, + batch_size=8, + run_cfg=dict(num_gpus=8), + ) +] diff --git a/opencompass/summarizers/subjective/common_summarizer.py b/opencompass/summarizers/subjective/common_summarizer.py index ccb8d139..de917f44 100644 --- a/opencompass/summarizers/subjective/common_summarizer.py +++ b/opencompass/summarizers/subjective/common_summarizer.py @@ -147,6 +147,5 @@ class CommonSummarizer(CompassArenaSummarizer): f.write(','.join(new_header) + '\n') for line in new_table: f.write(','.join(map(str, line)) + '\n') - print(t) print(output_file) return {'qa_bench_' + show_dataset_abbr:json_result}