mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
update
This commit is contained in:
parent
f8a60d36f4
commit
780bc1dd1e
@ -101,8 +101,6 @@ with read_base():
|
|||||||
models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501
|
models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501
|
||||||
from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
|
from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
|
||||||
models as hf_mistral_small_instruct_2409_model # noqa: F401, E501
|
models as hf_mistral_small_instruct_2409_model # noqa: F401, E501
|
||||||
from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
|
|
||||||
models as hf_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501
|
|
||||||
from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \
|
from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \
|
||||||
models as hf_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
|
models as hf_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
|
||||||
from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
|
from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
|
||||||
@ -113,10 +111,15 @@ with read_base():
|
|||||||
from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
|
from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
|
||||||
models as \
|
models as \
|
||||||
lmdeploy_mistral_small_instruct_2409_model # noqa: F401, E501
|
lmdeploy_mistral_small_instruct_2409_model # noqa: F401, E501
|
||||||
|
from opencompass.configs.models.mistral.lmdeploy_mixtral_8x22b_instruct_v0_1 import \
|
||||||
|
models as \
|
||||||
|
lmdeploy_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
|
||||||
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \
|
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \
|
||||||
models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501
|
models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501
|
||||||
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
|
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
|
||||||
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
|
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
|
||||||
|
from opencompass.configs.models.mistral.vllm_mixtral_8x22b_instruct_v0_1 import \
|
||||||
|
models as vllm_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
|
||||||
from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
|
from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
|
||||||
models as lmdeploy_nemotron_70b_instruct_hf_model # noqa: F401, E501
|
models as lmdeploy_nemotron_70b_instruct_hf_model # noqa: F401, E501
|
||||||
from opencompass.configs.models.phi.hf_phi_3_5_mini_instruct import \
|
from opencompass.configs.models.phi.hf_phi_3_5_mini_instruct import \
|
||||||
@ -170,7 +173,7 @@ with read_base():
|
|||||||
|
|
||||||
from ...volc import infer as volc_infer # noqa: F401, E501
|
from ...volc import infer as volc_infer # noqa: F401, E501
|
||||||
|
|
||||||
hf_glm4_9b_chat_model[0]['abbr'] = 'THUDM/glm-4-9b-chat-hf'
|
hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf'
|
||||||
|
|
||||||
race_datasets = [race_datasets[1]]
|
race_datasets = [race_datasets[1]]
|
||||||
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
|
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
|
||||||
|
28
.github/scripts/oc_score_baseline_fullbench.yaml
vendored
28
.github/scripts/oc_score_baseline_fullbench.yaml
vendored
@ -39,15 +39,15 @@ internlm2_5-7b-chat-hf_fullbench:
|
|||||||
college_knowledge_naive_average: 87.5
|
college_knowledge_naive_average: 87.5
|
||||||
subjective:
|
subjective:
|
||||||
alignment_bench_v1_1_总分: 0.66
|
alignment_bench_v1_1_总分: 0.66
|
||||||
alpaca_eval_total: 20
|
alpaca_eval_total: 0
|
||||||
arenahard_score: 50
|
arenahard_score: 50
|
||||||
Followbench_naive_average: 1
|
Followbench_naive_average: 1
|
||||||
CompassArena_naive_average: 43
|
CompassArena_naive_average: 43
|
||||||
mtbench101_avg: 7.8
|
mtbench101_avg: 7.8
|
||||||
wildbench_average: -12.78
|
wildbench_average: -15.56
|
||||||
simpleqa_accuracy_given_attempted: 0
|
simpleqa_accuracy_given_attempted: 0
|
||||||
chinese_simpleqa_given_attempted_accuracy: 1
|
chinese_simpleqa_given_attempted_accuracy: 1
|
||||||
alignment_bench_v1_1_专业能力: 7.90
|
alignment_bench_v1_1_专业能力: 8.00
|
||||||
alignment_bench_v1_1_数学计算: 0
|
alignment_bench_v1_1_数学计算: 0
|
||||||
alignment_bench_v1_1_基本任务: 0
|
alignment_bench_v1_1_基本任务: 0
|
||||||
alignment_bench_v1_1_逻辑推理: 0
|
alignment_bench_v1_1_逻辑推理: 0
|
||||||
@ -55,7 +55,7 @@ internlm2_5-7b-chat-hf_fullbench:
|
|||||||
alignment_bench_v1_1_文本写作: 0
|
alignment_bench_v1_1_文本写作: 0
|
||||||
alignment_bench_v1_1_角色扮演: 0
|
alignment_bench_v1_1_角色扮演: 0
|
||||||
alignment_bench_v1_1_综合问答: 0
|
alignment_bench_v1_1_综合问答: 0
|
||||||
alpaca_eval_helpful_base: 20
|
alpaca_eval_helpful_base: 0
|
||||||
compassarena_language_naive_average: 35
|
compassarena_language_naive_average: 35
|
||||||
compassarena_knowledge_naive_average: 55
|
compassarena_knowledge_naive_average: 55
|
||||||
compassarena_reason_v2_naive_average: 40
|
compassarena_reason_v2_naive_average: 40
|
||||||
@ -115,16 +115,16 @@ internlm2_5-7b-chat-turbomind_fullbench:
|
|||||||
college_naive_average: 12.50
|
college_naive_average: 12.50
|
||||||
college_knowledge_naive_average: 87.5
|
college_knowledge_naive_average: 87.5
|
||||||
subjective:
|
subjective:
|
||||||
alignment_bench_v1_1_总分: 0.70
|
alignment_bench_v1_1_总分: 0.66
|
||||||
alpaca_eval_total: 0
|
alpaca_eval_total: 0
|
||||||
arenahard_score: 50
|
arenahard_score: 50
|
||||||
Followbench_naive_average: 1
|
Followbench_naive_average: 1
|
||||||
CompassArena_naive_average: 38
|
CompassArena_naive_average: 40
|
||||||
mtbench101_avg: 7.80
|
mtbench101_avg: 8
|
||||||
wildbench_average: -4.86
|
wildbench_average: -6.81
|
||||||
simpleqa_accuracy_given_attempted: 0
|
simpleqa_accuracy_given_attempted: 0
|
||||||
chinese_simpleqa_given_attempted_accuracy: 1
|
chinese_simpleqa_given_attempted_accuracy: 1
|
||||||
alignment_bench_v1_1_专业能力: 8.4
|
alignment_bench_v1_1_专业能力: 7.9
|
||||||
alignment_bench_v1_1_数学计算: 0
|
alignment_bench_v1_1_数学计算: 0
|
||||||
alignment_bench_v1_1_基本任务: 0
|
alignment_bench_v1_1_基本任务: 0
|
||||||
alignment_bench_v1_1_逻辑推理: 0
|
alignment_bench_v1_1_逻辑推理: 0
|
||||||
@ -134,10 +134,10 @@ internlm2_5-7b-chat-turbomind_fullbench:
|
|||||||
alignment_bench_v1_1_综合问答: 0
|
alignment_bench_v1_1_综合问答: 0
|
||||||
alpaca_eval_helpful_base: 0
|
alpaca_eval_helpful_base: 0
|
||||||
compassarena_language_naive_average: 35
|
compassarena_language_naive_average: 35
|
||||||
compassarena_knowledge_naive_average: 50
|
compassarena_knowledge_naive_average: 45
|
||||||
compassarena_reason_v2_naive_average: 30
|
compassarena_reason_v2_naive_average: 25
|
||||||
compassarena_math_v2_naive_average: 50
|
compassarena_math_v2_naive_average: 60
|
||||||
compassarena_creationv2_zh_naive_average: 25
|
compassarena_creationv2_zh_naive_average: 35
|
||||||
followbench_llmeval_en_HSR_AVG: 1
|
followbench_llmeval_en_HSR_AVG: 1
|
||||||
followbench_llmeval_en_SSR_AVG: 1
|
followbench_llmeval_en_SSR_AVG: 1
|
||||||
followbench_llmeval_en_HSR_L1: 1
|
followbench_llmeval_en_HSR_L1: 1
|
||||||
@ -409,7 +409,7 @@ internlm2_5-7b-chat-turbomind:
|
|||||||
alpaca_eval_koala: 28.21
|
alpaca_eval_koala: 28.21
|
||||||
alpaca_eval_oasst: 23.4
|
alpaca_eval_oasst: 23.4
|
||||||
alpaca_eval_selfinstruct: 30.95
|
alpaca_eval_selfinstruct: 30.95
|
||||||
alpaca_eval_vicuna: 25
|
alpaca_eval_vicuna: 33.75
|
||||||
compassarena_language_naive_average: 52.5
|
compassarena_language_naive_average: 52.5
|
||||||
compassarena_knowledge_naive_average: 36
|
compassarena_knowledge_naive_average: 36
|
||||||
compassarena_reason_v2_naive_average: 35
|
compassarena_reason_v2_naive_average: 35
|
||||||
|
@ -0,0 +1,22 @@
|
|||||||
|
from opencompass.models import TurboMindModelwithChatTemplate
|
||||||
|
|
||||||
|
models = [
|
||||||
|
dict(
|
||||||
|
type=TurboMindModelwithChatTemplate,
|
||||||
|
abbr='mixtral-8x22b-instruct-v0.1-turbomind',
|
||||||
|
path='mistralai/Mixtral-8x22B-Instruct-v0.1',
|
||||||
|
engine_config=dict(
|
||||||
|
session_len=32768,
|
||||||
|
max_batch_size=16,
|
||||||
|
tp=8,
|
||||||
|
cache_max_entry_count=0.7,
|
||||||
|
),
|
||||||
|
gen_config=dict(
|
||||||
|
top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
|
||||||
|
),
|
||||||
|
max_seq_len=32768,
|
||||||
|
max_out_len=4096,
|
||||||
|
batch_size=8,
|
||||||
|
run_cfg=dict(num_gpus=8),
|
||||||
|
)
|
||||||
|
]
|
@ -147,6 +147,5 @@ class CommonSummarizer(CompassArenaSummarizer):
|
|||||||
f.write(','.join(new_header) + '\n')
|
f.write(','.join(new_header) + '\n')
|
||||||
for line in new_table:
|
for line in new_table:
|
||||||
f.write(','.join(map(str, line)) + '\n')
|
f.write(','.join(map(str, line)) + '\n')
|
||||||
print(t)
|
|
||||||
print(output_file)
|
print(output_file)
|
||||||
return {'qa_bench_' + show_dataset_abbr:json_result}
|
return {'qa_bench_' + show_dataset_abbr:json_result}
|
||||||
|
Loading…
Reference in New Issue
Block a user