OpenCompass/examples/eval_edgellm_demo.py

from mmengine.config import read_base

with read_base():
    # datasets
    from opencompass.configs.datasets.bbh.bbh_gen import bbh_datasets
    from opencompass.configs.datasets.commonsenseqa.commonsenseqa_7shot_cot_gen_734a22 import \
        commonsenseqa_datasets
    from opencompass.configs.datasets.FewCLUE_chid.FewCLUE_chid_gen import \
        chid_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets
    from opencompass.configs.datasets.humaneval.humaneval_gen import \
        humaneval_datasets
    from opencompass.configs.datasets.longbench.longbench import \
        longbench_datasets
    from opencompass.configs.datasets.truthfulqa.truthfulqa_gen import \
        truthfulqa_datasets
    # models
    from opencompass.configs.models.hf_llama.hf_llama3_8b import \
        models as hf_llama3_8b_model
    from opencompass.configs.models.others.hf_phi_2 import \
        models as hf_phi_2_model
    from opencompass.configs.models.qwen.hf_qwen2_7b import \
        models as hf_qwen2_7b_model

datasets = sum([
    v
    for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
work_dir = './outputs/edgellm/'

# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset                                      version    metric            mode      phi-2_hf
# -------------------------------------------  ---------  ----------------  ------  ----------
# commonsense_qa                               c946f2     accuracy          gen          65.19
# openai_humaneval                             8e312c     humaneval_pass@1  gen          30.49
# truthful_qa                                  5ddc62     rouge_max         gen           0.08
# truthful_qa                                  5ddc62     rouge_diff        gen          -0.00
# truthful_qa                                  5ddc62     rouge_acc         gen           0.41
# gsm8k                                        1d7fe4     accuracy          gen          62.40
# chid-dev                                     211ee7     accuracy          gen          12.87
# chid-test                                    211ee7     accuracy          gen          14.34
# bbh                                          -          naive_average     gen          59.50

# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset                                      version    metric            mode      Meta-Llama-3-8B_hf
# -------------------------------------------  ---------  ----------------  ------  --------------------
# commonsense_qa                               c946f2     accuracy          gen                     70.11
# openai_humaneval                             8e312c     humaneval_pass@1  gen                    26.22
# truthful_qa                                  5ddc62     rouge_max         gen                     0.07
# truthful_qa                                  5ddc62     rouge_diff        gen                    -0.01
# truthful_qa                                  5ddc62     rouge_acc         gen                     0.41
# gsm8k                                        1d7fe4     accuracy          gen                    55.80
# chid-dev                                     211ee7     accuracy          gen                    40.59
# chid-test                                    211ee7     accuracy          gen                    36.66
# bbh                                          -          naive_average     gen                    61.62
# 20240816_060452
# tabulate format
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset         version    metric      mode      qwen2-7b-hf
# --------------  ---------  ----------  ------  -------------
# commonsense_qa  734a22     accuracy    gen             65.19
# truthful_qa     5ddc62     rouge_max   gen              0.08
# truthful_qa     5ddc62     rouge_diff  gen             -0.02
# truthful_qa     5ddc62     rouge_acc   gen              0.44
[Bug] Commonsenseqa dataset fix (#1425) * longbench dataset load fix * update * Update * Update * Update * update * update --------- Co-authored-by: tonysy <sy.zhangbuaa@gmail.com> 2024-08-16 15:54:07 +08:00			`from mmengine.config import read_base`

			`with read_base():`
			`# datasets`
[Doc] Update Readme (#1439) * update * update * update * update * update * update * update * update * update * update * update * update 2024-08-22 14:48:45 +08:00			`from opencompass.configs.datasets.bbh.bbh_gen import bbh_datasets`
[Refactor] Code refactoarization (#1831) * Update * fix lint * update * fix lint 2025-01-20 19:17:38 +08:00			`from opencompass.configs.datasets.commonsenseqa.commonsenseqa_7shot_cot_gen_734a22 import \`
			`commonsenseqa_datasets`
			`from opencompass.configs.datasets.FewCLUE_chid.FewCLUE_chid_gen import \`
			`chid_datasets`
[Doc] Update Readme (#1439) * update * update * update * update * update * update * update * update * update * update * update * update 2024-08-22 14:48:45 +08:00			`from opencompass.configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets`
[Refactor] Code refactoarization (#1831) * Update * fix lint * update * fix lint 2025-01-20 19:17:38 +08:00			`from opencompass.configs.datasets.humaneval.humaneval_gen import \`
			`humaneval_datasets`
			`from opencompass.configs.datasets.longbench.longbench import \`
			`longbench_datasets`
			`from opencompass.configs.datasets.truthfulqa.truthfulqa_gen import \`
			`truthfulqa_datasets`
[Bug] Commonsenseqa dataset fix (#1425) * longbench dataset load fix * update * Update * Update * Update * update * update --------- Co-authored-by: tonysy <sy.zhangbuaa@gmail.com> 2024-08-16 15:54:07 +08:00			`# models`
[Refactor] Code refactoarization (#1831) * Update * fix lint * update * fix lint 2025-01-20 19:17:38 +08:00			`from opencompass.configs.models.hf_llama.hf_llama3_8b import \`
			`models as hf_llama3_8b_model`
			`from opencompass.configs.models.others.hf_phi_2 import \`
			`models as hf_phi_2_model`
			`from opencompass.configs.models.qwen.hf_qwen2_7b import \`
			`models as hf_qwen2_7b_model`
[Bug] Commonsenseqa dataset fix (#1425) * longbench dataset load fix * update * Update * Update * Update * update * update --------- Co-authored-by: tonysy <sy.zhangbuaa@gmail.com> 2024-08-16 15:54:07 +08:00
[Refactor] Code refactoarization (#1831) * Update * fix lint * update * fix lint 2025-01-20 19:17:38 +08:00			`datasets = sum([`
			`v`
			`for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'`
			`], [])`
[Bug] Commonsenseqa dataset fix (#1425) * longbench dataset load fix * update * Update * Update * Update * update * update --------- Co-authored-by: tonysy <sy.zhangbuaa@gmail.com> 2024-08-16 15:54:07 +08:00			`models = sum([v for k, v in locals().items() if k.endswith('_model')], [])`
			`work_dir = './outputs/edgellm/'`

			`# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^`
			`# dataset version metric mode phi-2_hf`
			`# ------------------------------------------- --------- ---------------- ------ ----------`
			`# commonsense_qa c946f2 accuracy gen 65.19`
			`# openai_humaneval 8e312c humaneval_pass@1 gen 30.49`
			`# truthful_qa 5ddc62 rouge_max gen 0.08`
			`# truthful_qa 5ddc62 rouge_diff gen -0.00`
			`# truthful_qa 5ddc62 rouge_acc gen 0.41`
			`# gsm8k 1d7fe4 accuracy gen 62.40`
			`# chid-dev 211ee7 accuracy gen 12.87`
			`# chid-test 211ee7 accuracy gen 14.34`
			`# bbh - naive_average gen 59.50`

			`# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^`
			`# dataset version metric mode Meta-Llama-3-8B_hf`
			`# ------------------------------------------- --------- ---------------- ------ --------------------`
			`# commonsense_qa c946f2 accuracy gen 70.11`
			`# openai_humaneval 8e312c humaneval_pass@1 gen 26.22`
			`# truthful_qa 5ddc62 rouge_max gen 0.07`
			`# truthful_qa 5ddc62 rouge_diff gen -0.01`
			`# truthful_qa 5ddc62 rouge_acc gen 0.41`
			`# gsm8k 1d7fe4 accuracy gen 55.80`
			`# chid-dev 211ee7 accuracy gen 40.59`
			`# chid-test 211ee7 accuracy gen 36.66`
			`# bbh - naive_average gen 61.62`
			`# 20240816_060452`
			`# tabulate format`
			`# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^`
			`# dataset version metric mode qwen2-7b-hf`
			`# -------------- --------- ---------- ------ -------------`
			`# commonsense_qa 734a22 accuracy gen 65.19`
			`# truthful_qa 5ddc62 rouge_max gen 0.08`
			`# truthful_qa 5ddc62 rouge_diff gen -0.02`
			`# truthful_qa 5ddc62 rouge_acc gen 0.44`