2024-08-16 15:54:07 +08:00
|
|
|
from mmengine.config import read_base
|
|
|
|
|
|
|
|
with read_base():
    # Imports inside ``read_base()`` pull names in from other config files.
    # The local names matter: the aggregation statements further down pick up
    # every module-level name ending in ``_datasets`` / ``_model``, so each
    # import here must keep (or be aliased to) a name with that suffix.

    # datasets
    from opencompass.configs.datasets.bbh.bbh_gen import bbh_datasets
    from opencompass.configs.datasets.commonsenseqa.commonsenseqa_7shot_cot_gen_734a22 import \
        commonsenseqa_datasets
    from opencompass.configs.datasets.FewCLUE_chid.FewCLUE_chid_gen import \
        chid_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets
    from opencompass.configs.datasets.humaneval.humaneval_gen import \
        humaneval_datasets
    from opencompass.configs.datasets.longbench.longbench import \
        longbench_datasets
    from opencompass.configs.datasets.truthfulqa.truthfulqa_gen import \
        truthfulqa_datasets

    # models
    # Each model config exports a generic ``models`` list; alias it to a
    # unique ``*_model`` name so the lists do not overwrite one another.
    from opencompass.configs.models.hf_llama.hf_llama3_8b import \
        models as hf_llama3_8b_model
    from opencompass.configs.models.others.hf_phi_2 import \
        models as hf_phi_2_model
    from opencompass.configs.models.qwen.hf_qwen2_7b import \
        models as hf_qwen2_7b_model
|
2024-08-16 15:54:07 +08:00
|
|
|
|
2025-01-20 19:17:38 +08:00
|
|
|
# Flatten every dataset list visible at module level into one list: any name
# ending in ``_datasets`` is included, as is a pre-existing ``datasets`` name.
# ``sum(..., [])`` concatenates the individual lists in namespace order.
datasets = sum([
    v
    for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
|
2024-08-16 15:54:07 +08:00
|
|
|
# Concatenate every model list visible at module level whose name ends in
# ``_model`` into the single ``models`` list.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

# Working directory for this config; presumably where run outputs are
# written (confirm against the OpenCompass ``work_dir`` documentation).
work_dir = './outputs/edgellm/'
|
|
|
|
|
|
|
|
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset version metric mode phi-2_hf
# ------------------------------------------- --------- ---------------- ------ ----------
# commonsense_qa c946f2 accuracy gen 65.19
# openai_humaneval 8e312c humaneval_pass@1 gen 30.49
# truthful_qa 5ddc62 rouge_max gen 0.08
# truthful_qa 5ddc62 rouge_diff gen -0.00
# truthful_qa 5ddc62 rouge_acc gen 0.41
# gsm8k 1d7fe4 accuracy gen 62.40
# chid-dev 211ee7 accuracy gen 12.87
# chid-test 211ee7 accuracy gen 14.34
# bbh - naive_average gen 59.50
|
|
|
|
|
|
|
|
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset version metric mode Meta-Llama-3-8B_hf
# ------------------------------------------- --------- ---------------- ------ --------------------
# commonsense_qa c946f2 accuracy gen 70.11
# openai_humaneval 8e312c humaneval_pass@1 gen 26.22
# truthful_qa 5ddc62 rouge_max gen 0.07
# truthful_qa 5ddc62 rouge_diff gen -0.01
# truthful_qa 5ddc62 rouge_acc gen 0.41
# gsm8k 1d7fe4 accuracy gen 55.80
# chid-dev 211ee7 accuracy gen 40.59
# chid-test 211ee7 accuracy gen 36.66
# bbh - naive_average gen 61.62
|
|
|
|
# 20240816_060452
# tabulate format
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset version metric mode qwen2-7b-hf
# -------------- --------- ---------- ------ -------------
# commonsense_qa 734a22 accuracy gen 65.19
# truthful_qa 5ddc62 rouge_max gen 0.08
# truthful_qa 5ddc62 rouge_diff gen -0.02
# truthful_qa 5ddc62 rouge_acc gen 0.44
|