from mmengine.config import read_base

with read_base():
    # datasets
    from opencompass.configs.datasets.bbh.bbh_gen import bbh_datasets
    from opencompass.configs.datasets.commonsenseqa.commonsenseqa_7shot_cot_gen_734a22 import \
        commonsenseqa_datasets
    from opencompass.configs.datasets.FewCLUE_chid.FewCLUE_chid_gen import \
        chid_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets
    from opencompass.configs.datasets.humaneval.humaneval_gen import \
        humaneval_datasets
    from opencompass.configs.datasets.longbench.longbench import \
        longbench_datasets
    from opencompass.configs.datasets.truthfulqa.truthfulqa_gen import \
        truthfulqa_datasets
    # models
    from opencompass.configs.models.hf_llama.hf_llama3_8b import \
        models as hf_llama3_8b_model
    from opencompass.configs.models.others.hf_phi_2 import \
        models as hf_phi_2_model
    from opencompass.configs.models.qwen.hf_qwen2_7b import \
        models as hf_qwen2_7b_model

# Flatten every imported `*_datasets` list (plus any pre-existing `datasets`
# list) into a single `datasets` list, and every `*_model` list into `models`.
datasets = sum([
    v for k, v in locals().items()
    if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

work_dir = './outputs/edgellm/'

# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset                                      version    metric            mode      phi-2_hf
# -------------------------------------------  ---------  ----------------  ------  ----------
# commonsense_qa                               c946f2     accuracy          gen          65.19
# openai_humaneval                             8e312c     humaneval_pass@1  gen          30.49
# truthful_qa                                  5ddc62     rouge_max         gen           0.08
# truthful_qa                                  5ddc62     rouge_diff        gen          -0.00
# truthful_qa                                  5ddc62     rouge_acc         gen           0.41
# gsm8k                                        1d7fe4     accuracy          gen          62.40
# chid-dev                                     211ee7     accuracy          gen          12.87
# chid-test                                    211ee7     accuracy          gen          14.34
# bbh                                          -          naive_average     gen          59.50
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset                                      version    metric            mode      Meta-Llama-3-8B_hf
# -------------------------------------------  ---------  ----------------  ------  --------------------
# commonsense_qa                               c946f2     accuracy          gen                    70.11
# openai_humaneval                             8e312c     humaneval_pass@1  gen                    26.22
# truthful_qa                                  5ddc62     rouge_max         gen                     0.07
# truthful_qa                                  5ddc62     rouge_diff        gen                    -0.01
# truthful_qa                                  5ddc62     rouge_acc         gen                     0.41
# gsm8k                                        1d7fe4     accuracy          gen                    55.80
# chid-dev                                     211ee7     accuracy          gen                    40.59
# chid-test                                    211ee7     accuracy          gen                    36.66
# bbh                                          -          naive_average     gen                    61.62

# 20240816_060452
# tabulate format
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset         version    metric      mode      qwen2-7b-hf
# --------------  ---------  ----------  ------  -------------
# commonsense_qa  734a22     accuracy    gen             65.19
# truthful_qa     5ddc62     rouge_max   gen              0.08
# truthful_qa     5ddc62     rouge_diff  gen             -0.02
# truthful_qa     5ddc62     rouge_acc   gen              0.44
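
# A minimal usage sketch, assuming this config is saved as
# `examples/eval_edgellm.py` inside an OpenCompass checkout (the filename is
# an assumption) and the HuggingFace weights for the three models are
# available locally or downloadable:
#
#     python run.py examples/eval_edgellm.py
#
# OpenCompass evaluates every entry in `models` on every entry in `datasets`
# and writes timestamped summary tables like the ones above under `work_dir`.
# To evaluate a single model instead of all three, reassign the aggregated
# list after it is built, e.g. `models = hf_phi_2_model`.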