"""OpenCompass evaluation config for small/edge LLMs.

Imports dataset and model configs via mmengine's ``read_base`` context
manager (which re-executes the imports so their module-level variables
land in this module's namespace), then aggregates them into the
``datasets`` / ``models`` lists that OpenCompass expects.
"""
from mmengine.config import read_base

with read_base():
    # datasets
    from opencompass.configs.datasets.commonsenseqa.commonsenseqa_7shot_cot_gen_734a22 import (
        commonsenseqa_datasets,
    )
    from opencompass.configs.datasets.longbench.longbench import longbench_datasets
    from opencompass.configs.datasets.bbh.bbh_gen import bbh_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets
    from opencompass.configs.datasets.humaneval.humaneval_gen import humaneval_datasets
    from opencompass.configs.datasets.FewCLUE_chid.FewCLUE_chid_gen import chid_datasets
    from opencompass.configs.datasets.truthfulqa.truthfulqa_gen import truthfulqa_datasets

    # models
    from opencompass.configs.models.hf_llama.hf_llama3_8b import models as hf_llama3_8b_model
    from opencompass.configs.models.qwen.hf_qwen2_7b import models as hf_qwen2_7b_model
    from opencompass.configs.models.others.hf_phi_2 import models as hf_phi_2_model

# Flatten every imported ``*_datasets`` list into one list.
# FIX: the original predicate also tested ``k == 'datasets'``; that name
# cannot exist in module scope before this assignment executes, so the
# condition was dead code (and would double-count if it ever matched).
datasets = sum(
    [v for k, v in locals().items() if k.endswith('_datasets')],
    [],
)

# Flatten every imported ``*_model`` list into the ``models`` list.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

work_dir = './outputs/edgellm/'

# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset                                      version    metric            mode    phi-2_hf
# -------------------------------------------  ---------  ----------------  ------  ----------
# commonsense_qa                               c946f2     accuracy          gen     65.19
# openai_humaneval                             8e312c     humaneval_pass@1  gen     30.49
# truthful_qa                                  5ddc62     rouge_max         gen     0.08
# truthful_qa                                  5ddc62     rouge_diff        gen     -0.00
# truthful_qa                                  5ddc62     rouge_acc         gen     0.41
# gsm8k                                        1d7fe4     accuracy          gen     62.40
# chid-dev                                     211ee7     accuracy          gen     12.87
# chid-test                                    211ee7     accuracy          gen     14.34
# bbh                                          -          naive_average     gen     59.50
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset                                      version    metric            mode    Meta-Llama-3-8B_hf
# -------------------------------------------  ---------  ----------------  ------  --------------------
# commonsense_qa                               c946f2     accuracy          gen     70.11
# openai_humaneval                             8e312c     humaneval_pass@1  gen     26.22
# truthful_qa                                  5ddc62     rouge_max         gen     0.07
# truthful_qa                                  5ddc62     rouge_diff        gen     -0.01
# truthful_qa                                  5ddc62     rouge_acc         gen     0.41
# gsm8k                                        1d7fe4     accuracy          gen     55.80
# chid-dev                                     211ee7     accuracy          gen     40.59
# chid-test                                    211ee7     accuracy          gen     36.66
# bbh                                          -          naive_average     gen     61.62
# 20240816_060452
# tabulate format
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset         version    metric      mode    qwen2-7b-hf
# --------------  ---------  ----------  ------  -------------
# commonsense_qa  734a22     accuracy    gen     65.19
# truthful_qa     5ddc62     rouge_max   gen     0.08
# truthful_qa     5ddc62     rouge_diff  gen     -0.02
# truthful_qa     5ddc62     rouge_acc   gen     0.44