Merge branch 'open-compass:main' into main

Authored by bittersweet1999 on 2024-11-25 10:14:43 +08:00; committed by GitHub.
commit 64a34bccaf
60 changed files with 5191 additions and 558 deletions

.github/scripts/eval_regression_api.py (new file, 39 lines)

@@ -0,0 +1,39 @@
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAISDK
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_gen import \
race_datasets # noqa: F401, E501
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
models = [
dict(
abbr='lmdeploy-api-test',
type=OpenAISDK,
key='EMPTY',
openai_api_base='http://localhost:23333/v1',
path='internlm2',
tokenizer_path='internlm/internlm2_5-7b-chat',
rpm_verbose=True,
meta_template=api_meta_template,
query_per_second=128,
max_out_len=1024,
max_seq_len=4096,
temperature=0.01,
batch_size=128,
retry=20,
)
]
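As a side note on the convention used above: the "datasets = sum(...)" line flattens every
variable whose name ends in "_datasets" (brought into scope by the read_base() imports) into a
single list. A minimal, self-contained sketch of that pattern, using hypothetical placeholder
entries rather than the real GSM8K/RACE configs:

# Sketch of the "_datasets" aggregation convention; placeholder entries only.
gsm8k_datasets = [dict(abbr='gsm8k-demo', reader_cfg=dict(test_range=''))]
race_datasets = [dict(abbr='race-high-demo', reader_cfg=dict(test_range=''))]

datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
print([d['abbr'] for d in datasets])  # ['gsm8k-demo', 'race-high-demo']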


@@ -2,15 +2,21 @@ from mmengine.config import read_base
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
gpqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_ppl import \
race_datasets # noqa: F401, E501
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
winogrande_datasets # noqa: F401, E501
# read hf models - chat models
from opencompass.configs.models.chatglm.hf_glm4_9b import \
models as hf_glm4_9b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
models as hf_deepseek_moe_16b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
models as hf_deepseek_v2_lite_model # noqa: F401, E501
# read hf models - chat models
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \
@@ -19,34 +25,58 @@ with read_base():
models as hf_gemma2_2b_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma2_9b import \
models as hf_gemma2_9b_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_2b import \
models as hf_gemma_2b_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_7b import \
models as hf_gemma_7b_model # noqa: F401, E501
from opencompass.configs.models.gemma.vllm_gemma_2b import \
models as vllm_gemma_2b_model # noqa: F401, E501
from opencompass.configs.models.gemma.vllm_gemma_7b import \
models as vllm_gemma_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
models as hf_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
models as hf_internlm2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
models as hf_internlm2_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \
models as hf_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_20b import \
models as hf_internlm2_base_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
models as lmdeploy_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_7b import \
models as lmdeploy_internlm2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_20b import \
models as lmdeploy_internlm2_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama2_7b import \
models as hf_llama2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \
models as hf_llama3_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_8b import \
models as hf_llama3_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
models as lmdeploy_llama3_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
models as lmdeploy_llama3_8b_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \
models as hf_mistral_7b_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \
models as hf_mistral_7b_v0_3_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \
models as vllm_mistral_7b_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mixtral_8x7b_v0_1 import \
models as vllm_mixtral_8x7b_v0_1_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen_2_5_7b import \
models as hf_qwen_2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen_2_5_14b import \
models as hf_qwen_2_5_14b_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \
models as lmdeploy_qwen2_5_1_5b_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import \
models as lmdeploy_qwen2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
models as hf_qwen1_5_moe_a2_7b_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen2_0_5b import \
@@ -65,11 +95,27 @@ with read_base():
models as hf_yi_1_5_6b_model # noqa: F401, E501
from opencompass.configs.models.yi.hf_yi_1_5_9b import \
models as hf_yi_1_5_9b_model # noqa: F401, E501
from opencompass.configs.summarizers.medium import \
summarizer # noqa: F401, E501
race_datasets = [race_datasets[1]]
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
for d in datasets:
d['reader_cfg']['test_range'] = '[0:100]'
d['reader_cfg']['test_range'] = '[0:32]'
for m in models:
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
summarizer = dict(
dataset_abbrs=[
['gsm8k', 'accuracy'],
['GPQA_diamond', 'accuracy'],
['race-high', 'accuracy'],
['winogrande', 'accuracy'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)


@@ -0,0 +1,188 @@
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.ARC_c.ARC_c_few_shot_ppl import \
ARC_c_datasets # noqa: F401, E501
from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import \
bbh_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
cmmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.dingo.dingo_gen import \
datasets as dingo_datasets # noqa: F401, E501
from opencompass.configs.datasets.drop.drop_gen_a2697c import \
drop_datasets # noqa: F401, E501
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \
GaokaoBench_datasets # noqa: F401, E501
from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import \
gpqa_datasets # noqa: F401, E501
# Corebench v1.7
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \
hellaswag_datasets # noqa: F401, E501
from opencompass.configs.datasets.humaneval.internal_humaneval_gen_ce6b06 import \
humaneval_datasets as humaneval_v2_datasets # noqa: F401, E501
from opencompass.configs.datasets.humaneval.internal_humaneval_gen_d2537e import \
humaneval_datasets # noqa: F401, E501
from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import \
math_datasets # noqa: F401, E501
from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \
mathbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import \
sanitized_mbpp_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import \
mmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \
mmlu_pro_datasets # noqa: F401, E501
from opencompass.configs.datasets.nq.nq_open_1shot_gen_20a989 import \
nq_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_few_shot_ppl import \
race_datasets # noqa: F401, E501
from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_few_shot_ppl import \
BoolQ_datasets # noqa: F401, E501
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
TheoremQA_datasets # noqa: F401, E501
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import \
triviaqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.wikibench.wikibench_few_shot_ppl_c23d79 import \
wikibench_datasets # noqa: F401, E501
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
winogrande_datasets # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
models as hf_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
models as lmdeploy_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.summarizers.groups.bbh import \
bbh_summary_groups # noqa: F401, E501
# Summary Groups
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.GaokaoBench import \
GaokaoBench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
mathbench_2024_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu import \
mmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups # noqa: F401, E501
race_datasets = [race_datasets[1]] # Only take RACE-High
humaneval_v2_datasets[0]['abbr'] = 'openai_humaneval_v2'
bbh_datasets = [
x for x in bbh_datasets if 'logical_deduction_seven_objects' in x['abbr']
or 'multistep_arithmetic_two' in x['abbr']
]
cmmlu_datasets = [
x for x in cmmlu_datasets if x['abbr'].replace('cmmlu-', '') in [
'ancient_chinese', 'chinese_civil_service_exam',
'chinese_driving_rule', 'chinese_food_culture',
'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
'chinese_teacher_qualification', 'construction_project_management',
'elementary_chinese', 'elementary_commonsense', 'ethnology',
'high_school_politics', 'modern_chinese',
'traditional_chinese_medicine'
]
]
mmlu_datasets = [
x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [
'business_ethics', 'clinical_knowledge', 'college_medicine',
'global_facts', 'human_aging', 'management', 'marketing',
'medical_genetics', 'miscellaneous', 'nutrition',
'professional_accounting', 'professional_medicine', 'virology'
]
]
mmlu_pro_datasets = [mmlu_pro_datasets[0]]
mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']]
GaokaoBench_datasets = [
x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr']
or '2010-2022_Math_II_Fill-in-the-Blank' in x['abbr']
]
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
summarizer = dict(
dataset_abbrs=[
['race-high', 'accuracy'],
['ARC-c', 'accuracy'],
['BoolQ', 'accuracy'],
['mmlu_pro', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['cmmlu', 'naive_average'],
['mmlu', 'naive_average'],
['drop', 'accuracy'],
['bbh', 'naive_average'],
['math', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['openai_humaneval_v2', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['wikibench-wiki-single_choice_cncircular', 'perf_4'],
['gsm8k', 'accuracy'],
['GaokaoBench', 'weighted_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['winogrande', 'accuracy'],
['hellaswag', 'accuracy'],
['TheoremQA', 'score'],
['dingo_en_192', 'score'],
['dingo_zh_170', 'score'],
'###### MathBench-A: Application Part ######',
'college',
'high',
'middle',
'primary',
'arithmetic',
'mathbench-a (average)',
'###### MathBench-T: Theory Part ######',
'college_knowledge',
'high_knowledge',
'middle_knowledge',
'primary_knowledge',
'mathbench-t (average)',
'###### Overall: Average between MathBench-A and MathBench-T ######',
'Overall',
'',
'bbh-logical_deduction_seven_objects',
'bbh-multistep_arithmetic_two',
'',
'mmlu',
'mmlu-stem',
'mmlu-social-science',
'mmlu-humanities',
['mmlu-other', 'accuracy'],
'cmmlu',
'cmmlu-stem',
'cmmlu-social-science',
'cmmlu-humanities',
'cmmlu-other',
['cmmlu-china-specific', 'accuracy'],
'mmlu_pro',
'mmlu_pro_biology',
'mmlu_pro_business',
'mmlu_pro_chemistry',
'mmlu_pro_computer_science',
'mmlu_pro_economics',
'mmlu_pro_engineering',
'mmlu_pro_health',
'mmlu_pro_history',
'mmlu_pro_law',
'mmlu_pro_math',
'mmlu_pro_philosophy',
'mmlu_pro_physics',
'mmlu_pro_psychology',
'mmlu_pro_other',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
for d in datasets:
d['reader_cfg']['test_range'] = '[0:16]'
for m in models:
m['abbr'] = m['abbr'] + '_fullbench'
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])


@@ -1,7 +1,5 @@
from mmengine.config import read_base
from opencompass.models import OpenAISDK
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
@@ -29,6 +27,12 @@ with read_base():
models as hf_gemma2_2b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma2_9b_it import \
models as hf_gemma2_9b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_2b_it import \
models as hf_gemma_2b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_7b_it import \
models as hf_gemma_7b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \
models as lmdeploy_gemma_9b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.vllm_gemma_7b_it import \
models as vllm_gemma_7b_it_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
@@ -51,18 +55,37 @@ with read_base():
models as vllm_internlm2_chat_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \
models as hf_llama3_1_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_2_3b_instruct import \
models as hf_llama3_2_3b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
models as hf_llama3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import \
models as lmdeploy_llama2_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
models as lmdeploy_llama3_1_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \
models as lmdeploy_llama3_2_3b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
models as lmdeploy_llama3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
models as hf_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_3 import \
models as hf_mistral_7b_instruct_v0_3_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_nemo_instruct_2407 import \
models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
models as hf_mistral_small_instruct_2409_model # noqa: F401, E501
from opencompass.configs.models.mistral.lmdeploy_mistral_nemo_instruct_2407 import \
models as lmdeploy_mistral_nemo_instruct_2407_model # noqa: F401, E501
from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
models as \
lmdeploy_mistral_small_instruct_2409_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \
models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mixtral_8x7b_instruct_v0_1 import \
models as vllm_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.openbmb.hf_minicpm3_4b import \
models as hf_minicpm3_4b_model # noqa: F401, E501
from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
models as hf_minicpm_2b_dpo_fp32_model # noqa: F401, E501
from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
@@ -73,6 +96,10 @@ with read_base():
models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \
models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen2_5_14b_instruct import \
models as hf_qwen2_5_14b_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \
models as lmdeploy_qwen2_5_14b_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \
models as hf_qwen1_5_0_5b_chat_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
@@ -89,10 +116,8 @@ with read_base():
models as hf_yi_1_5_6b_chat_model # noqa: F401, E501
from opencompass.configs.models.yi.hf_yi_1_5_9b_chat import \
models as hf_yi_1_5_9b_chat_model # noqa: F401, E501
from opencompass.configs.summarizers.medium import \
summarizer # noqa: F401, E501
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
race_datasets = [race_datasets[1]]
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
api_meta_template = dict(
@@ -103,25 +128,24 @@ api_meta_template = dict(
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
model_name = ''
models.append(
dict(
abbr='lmdeploy-api-test',
type=OpenAISDK,
key='EMPTY',
openai_api_base='http://judgemodel:10001/v1',
path='compass_judger_internlm2_102b_0508',
tokenizer_path='internlm/internlm2_5-20b-chat',
rpm_verbose=True,
meta_template=api_meta_template,
query_per_second=50,
max_out_len=1024,
max_seq_len=4096,
temperature=0.01,
batch_size=128,
retry=3,
))
for d in datasets:
d['reader_cfg']['test_range'] = '[0:100]'
d['reader_cfg']['test_range'] = '[0:32]'
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for m in models:
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
summarizer = dict(
dataset_abbrs=[
'gsm8k',
'race-middle',
'race-high',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)


@@ -0,0 +1,300 @@
from mmengine.config import read_base
with read_base():
# read hf models - chat models
# Dataset
from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import \
aime2024_datasets # noqa: F401, E501
from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \
ARC_c_datasets # noqa: F401, E501
from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \
bbh_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
cmmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import \
cmo_fib_datasets # noqa: F401, E501
from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \
drop_datasets # noqa: F401, E501
from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import \
ds1000_datasets # noqa: F401, E501
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \
GaokaoBench_datasets # noqa: F401, E501
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
gpqa_datasets # noqa: F401, E501
# new datasets in Fullbench v1.1
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
hellaswag_datasets # noqa: F401, E501
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import \
humaneval_datasets # noqa: F401, E501
from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import \
humanevalx_datasets # noqa: F401, E501
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \
ifeval_datasets # noqa: F401, E501
from opencompass.configs.datasets.livecodebench.livecodebench_gen_b2b0fd import \
LCB_datasets # noqa: F401, E501
from opencompass.configs.datasets.math.math_0shot_gen_393424 import \
math_datasets # noqa: F401, E501
from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
mathbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \
sanitized_mbpp_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import \
mmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
mmlu_pro_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmmlu_lite.mmmlu_lite_gen_c51a84 import \
mmmlu_lite_datasets # noqa: F401, E501
from opencompass.configs.datasets.musr.musr_gen_3c6e15 import \
musr_datasets # noqa: F401, E501
from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \
nq_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_cot_gen_d95929 import \
race_datasets # noqa: F401, E501
from opencompass.configs.datasets.scicode.scicode_gen_085b98 import \
SciCode_datasets # noqa: F401, E501
from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_cot_gen_1d56df import \
BoolQ_datasets # noqa: F401, E501
from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import \
teval_datasets as teval_en_datasets # noqa: F401, E501
from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import \
teval_datasets as teval_zh_datasets # noqa: F401, E501
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
TheoremQA_datasets # noqa: F401, E501
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \
triviaqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.wikibench.wikibench_gen_0978ad import \
wikibench_datasets # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
# Summary Groups
from opencompass.configs.summarizers.groups.bbh import \
bbh_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.ds1000 import \
ds1000_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.GaokaoBench import \
GaokaoBench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.humanevalx import \
humanevalx_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
mathbench_2024_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu import \
mmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.musr_average import \
summarizer as musr_summarizer # noqa: F401, E501
from opencompass.configs.summarizers.groups.scicode import \
scicode_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.teval import \
teval_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.mmmlu_lite import \
mmmlu_summary_groups # noqa: F401, E501
# For HumanEval-X Evaluation
# Set the evaluator ip_address and port
race_datasets = [race_datasets[1]]
for item in humanevalx_datasets:
item['eval_cfg']['evaluator'][
'ip_address'] = 'codeeval.opencompass.org.cn/humanevalx'
item['eval_cfg']['evaluator']['port'] = ''
# For DS-1000 Evaluation
# Set the evaluator ip_address and port
for item in ds1000_datasets:
item['eval_cfg']['evaluator'][
'ip_address'] = 'codeeval.opencompass.org.cn/ds1000'
item['eval_cfg']['evaluator']['port'] = ''
bbh_datasets = [
x for x in bbh_datasets if 'logical_deduction_seven_objects' in x['abbr']
or 'multistep_arithmetic_two' in x['abbr']
]
cmmlu_datasets = [
x for x in cmmlu_datasets if x['abbr'].replace('cmmlu-', '') in [
'ancient_chinese', 'chinese_civil_service_exam',
'chinese_driving_rule', 'chinese_food_culture',
'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
'chinese_teacher_qualification', 'construction_project_management',
'elementary_chinese', 'elementary_commonsense', 'ethnology',
'high_school_politics', 'modern_chinese',
'traditional_chinese_medicine'
]
]
mmlu_datasets = [
x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [
'business_ethics', 'clinical_knowledge', 'college_medicine',
'global_facts', 'human_aging', 'management', 'marketing',
'medical_genetics', 'miscellaneous', 'nutrition',
'professional_accounting', 'professional_medicine', 'virology'
]
]
mmlu_pro_datasets = [mmlu_pro_datasets[0]]
mmmlu_lite_datasets = [
x for x in mmmlu_lite_datasets if 'mmlu_lite_AR-XY' in x['abbr']
]
mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']]
GaokaoBench_datasets = [
x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr']
or '2010-2022_Math_II_Fill-in-the-Blank' in x['abbr']
]
datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')
and 'scicode' not in k.lower() and 'teval' not in k),
[],
)
datasets += teval_en_datasets
datasets += teval_zh_datasets
# datasets += SciCode_datasets
musr_summary_groups = musr_summarizer['summary_groups']
summary_groups = sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], [])
summary_groups.append(
{
'name': 'Mathbench',
'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
}, )
# Summarizer
summarizer = dict(
dataset_abbrs=[
'Language',
['race-high', 'accuracy'],
['ARC-c', 'accuracy'],
['BoolQ', 'accuracy'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['mmmlu_lite', 'naive_average'],
'',
'Instruction Following',
['IFEval', 'Prompt-level-strict-accuracy'],
'',
'General Reasoning',
['drop', 'accuracy'],
['bbh', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['hellaswag', 'accuracy'],
['TheoremQA', 'score'],
['musr_average', 'naive_average'],
'',
'Math Calculation',
['gsm8k', 'accuracy'],
['GaokaoBench', 'weighted_average'],
['math', 'accuracy'],
['cmo_fib', 'accuracy'],
['aime2024', 'accuracy'],
['Mathbench', 'naive_average'],
'',
'Knowledge',
['wikibench-wiki-single_choice_cncircular', 'perf_4'],
['cmmlu', 'naive_average'],
['mmlu', 'naive_average'],
['mmlu_pro', 'naive_average'],
'',
'Code',
['openai_humaneval', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['humanevalx', 'naive_average'],
['ds1000', 'naive_average'],
['lcb_code_generation', 'pass@1'],
['lcb_code_execution', 'pass@1'],
['lcb_test_output', 'pass@1'],
'',
'Agent',
['teval', 'naive_average'],
['SciCode', 'accuracy'],
['SciCode', 'sub_accuracy'],
'',
'bbh-logical_deduction_seven_objects',
'bbh-multistep_arithmetic_two',
'',
'mmlu',
'mmlu-stem',
'mmlu-social-science',
'mmlu-humanities',
'mmlu-other',
'',
'cmmlu',
'cmmlu-stem',
'cmmlu-social-science',
'cmmlu-humanities',
'cmmlu-other',
'cmmlu-china-specific',
'',
'mmlu_pro',
'mmlu_pro_biology',
'mmlu_pro_business',
'mmlu_pro_chemistry',
'mmlu_pro_computer_science',
'mmlu_pro_economics',
'mmlu_pro_engineering',
'mmlu_pro_health',
'mmlu_pro_history',
'mmlu_pro_law',
'mmlu_pro_math',
'mmlu_pro_philosophy',
'mmlu_pro_physics',
'mmlu_pro_psychology',
'mmlu_pro_other',
'',
'ds1000_Pandas',
'ds1000_Numpy',
'ds1000_Tensorflow',
'ds1000_Scipy',
'ds1000_Sklearn',
'ds1000_Pytorch',
'ds1000_Matplotlib',
'',
'mmmlu_lite',
'openai_mmmlu_lite_AR-XY',
'openai_mmmlu_lite_BN-BD',
'openai_mmmlu_lite_DE-DE',
'openai_mmmlu_lite_ES-LA',
'openai_mmmlu_lite_FR-FR',
'openai_mmmlu_lite_HI-IN',
'openai_mmmlu_lite_ID-ID',
'openai_mmmlu_lite_IT-IT',
'openai_mmmlu_lite_JA-JP',
'openai_mmmlu_lite_KO-KR',
'openai_mmmlu_lite_PT-BR',
'openai_mmmlu_lite_SW-KE',
'openai_mmmlu_lite_YO-NG',
'openai_mmmlu_lite_ZH-CN',
'',
'###### MathBench-A: Application Part ######',
'college',
'high',
'middle',
'primary',
'arithmetic',
'mathbench-a (average)',
'###### MathBench-T: Theory Part ######',
'college_knowledge',
'high_knowledge',
'middle_knowledge',
'primary_knowledge',
'mathbench-t (average)',
],
summary_groups=summary_groups,
)
for d in datasets:
d['reader_cfg']['test_range'] = '[0:16]'
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for m in models:
m['abbr'] = m['abbr'] + '_fullbench'
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])


@@ -0,0 +1,70 @@
from copy import deepcopy
from mmengine.config import read_base
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import SubjectiveSummarizer
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
with read_base():
# read hf models - chat models
# Dataset
from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \
alignbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \
alpacav2_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \
arenahard_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \
compassarena_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \
fofo_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.followbench.followbench_llmeval import \
followbench_llmeval_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \
mtbench101_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \
wildbench_datasets # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
summarizer = dict(type=SubjectiveSummarizer, function='subjective')
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')
and 'mtbench101' not in k and 'wildbench' not in k), [])
datasets += mtbench101_datasets # noqa: F401, E501
datasets += wildbench_datasets # noqa: F401, E501
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for m in models:
m['abbr'] = m['abbr'] + '_fullbench'
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
m['engine_config']['max_batch_size'] = 1
m['batch_size'] = 1
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
judge_models = deepcopy([models[1]])
judge_models[0]['abbr'] = judge_models[0]['abbr'] + '-judge'
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
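A note on the judge selection above: instead of pointing at an external judge API, this config
deep-copies the second entry of the sorted model list (with these imports, presumably the
lmdeploy-deployed internlm2_5-7b-chat) and tags it with a '-judge' suffix so its results are kept
separate. A small sketch of that step with hypothetical model dicts:

from copy import deepcopy

# Hypothetical model entries; only the fields touched by the selection logic are shown.
models = [
    dict(abbr='internlm2_5-7b-chat-hf_fullbench', run_cfg=dict(num_gpus=1)),
    dict(abbr='internlm2_5-7b-chat-turbomind_fullbench', run_cfg=dict(num_gpus=1)),
]
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])  # stable sort keeps import order

judge_models = deepcopy([models[1]])  # the second model doubles as the judge
judge_models[0]['abbr'] += '-judge'   # keep judge results separate from candidate results
print(judge_models[0]['abbr'])        # internlm2_5-7b-chat-turbomind_fullbench-judge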


@@ -7,36 +7,56 @@ import yaml
output_path = 'regression_result_daily'
chat_model_list = [
'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
'deepseek-v2-lite-chat-hf', 'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf',
'gemma2-9b-it-hf', 'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf',
'internlm2_5-20b-chat-hf', 'internlm2_5-7b-chat-turbomind',
'internlm2_5-20b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
'baichuan2-7b-chat-hf', 'glm-4-9b-chat-hf', 'glm-4-9b-chat-turbomind',
'glm-4-9b-chat-vllm', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf', 'gemma2-9b-it-hf',
'gemma-2b-it-hf', 'gemma-7b-it-hf', 'gemma-2-9b-it-turbomind',
'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf',
'internlm2_5-7b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-lmdeploy',
'internlm2-chat-7b-sft-turbomind', 'internlm2-chat-7b-vllm',
'llama-3_1-8b-instruct-hf', 'llama-3-8b-instruct-hf',
'llama-3_1-8b-instruct-turbomind', 'llama-3-8b-instruct-turbomind',
'mistral-7b-instruct-v0.3-hf', 'mistral-7b-instruct-v0.2-vllm',
'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
'llama-3_1-8b-instruct-hf', 'llama-3_2-3b-instruct-hf',
'llama-3-8b-instruct-hf', 'llama-3_1-8b-instruct-turbomind',
'llama-3_2-3b-instruct-turbomind', 'llama-3-8b-instruct-turbomind',
'mistral-7b-instruct-v0.2-hf', 'mistral-7b-instruct-v0.3-hf',
'mistral-nemo-instruct-2407-hf', 'mistral-nemo-instruct-2407-turbomind',
'mistral-7b-instruct-v0.1-vllm', 'mistral-7b-instruct-v0.2-vllm',
'MiniCPM3-4B-hf', 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf',
'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-hf', 'qwen2-7b-instruct-hf',
'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind',
'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf',
'lmdeploy-api-test'
'deepseek-v2-lite-chat-hf', 'internlm2_5-20b-chat-hf',
'internlm2_5-20b-chat-turbomind', 'mistral-small-instruct-2409-hf',
'mistral-small-instruct-2409-turbomind', 'qwen2.5-14b-instruct-hf',
'qwen2.5-14b-instruct-turbomind'
]
base_model_list = [
'deepseek-moe-16b-base-hf', 'deepseek-v2-lite-hf',
'deepseek-7b-base-turbomind', 'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf',
'gemma2-9b-hf', 'internlm2_5-7b-hf', 'internlm2-7b-hf',
'internlm2-base-7b-hf', 'internlm2-1.8b-turbomind',
'internlm2_5-7b-turbomind', 'internlm2-7b-turbomind',
'internlm2-base-7b-turbomind', 'llama-2-7b-hf', 'llama-3-8b-hf',
'llama-3.1-8b-turbomind', 'llama-3-8b-turbomind', 'mistral-7b-v0.3-hf',
'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
'glm-4-9b-hf', 'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind',
'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf', 'gemma2-9b-hf',
'gemma-2b-hf', 'gemma-7b-hf', 'gemma-2b-vllm', 'gemma-7b-vllm',
'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf',
'internlm2-1.8b-turbomind', 'internlm2_5-7b-turbomind',
'internlm2-7b-turbomind', 'internlm2-base-7b-turbomind', 'llama-2-7b-hf',
'llama-3_1-8b-hf', 'llama-3-8b-hf', 'llama-3.1-8b-turbomind',
'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'mistral-7b-v0.3-hf',
'mistral-7b-v0.2-vllm', 'qwen2.5-7b-hf', 'qwen2.5-1.5b-turbomind',
'qwen2.5-7b-turbomind', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
'qwen2-1.5b-hf', 'qwen2-7b-hf', 'qwen2-1.5b-turbomind',
'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf',
'deepseek-v2-lite-hf', 'internlm2-20b-hf', 'internlm2-base-20b-hf',
'internlm2-20b-turbomind', 'qwen2.5-14b-hf'
]
dataset_list = ['gsm8k', 'race-middle', 'race-high']
@pytest.fixture()
def baseline_scores_testrange(request):
config_path = os.path.join(
request.config.rootdir,
'.github/scripts/oc_score_baseline_testrange.yaml')
with open(config_path) as f:
config = yaml.load(f.read(), Loader=yaml.SafeLoader)
return config
@pytest.fixture()
@@ -48,6 +68,16 @@ def baseline_scores(request):
return config
@pytest.fixture()
def baseline_scores_fullbench(request):
config_path = os.path.join(
request.config.rootdir,
'.github/scripts/oc_score_baseline_fullbench.yaml')
with open(config_path) as f:
config = yaml.load(f.read(), Loader=yaml.SafeLoader)
return config
@pytest.fixture()
def result_scores():
file = find_csv_files(output_path)
@@ -57,100 +87,231 @@ def result_scores():
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
@pytest.mark.usefixtures('baseline_scores_testrange')
@pytest.mark.chat
class TestChat:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2)
for p1 in chat_model_list
for p2 in dataset_list])
def test_model_dataset_score(self, baseline_scores, result_scores, model,
dataset):
base_score = baseline_scores.get(model).get(dataset)
@pytest.mark.parametrize('model, dataset',
[(p1, p2) for p1 in chat_model_list
for p2 in ['gsm8k', 'race-high']])
def test_model_dataset_score(self, baseline_scores_testrange,
result_scores, model, dataset):
base_score = baseline_scores_testrange.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, base_score)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
@pytest.mark.usefixtures('baseline_scores_testrange')
@pytest.mark.base
class TestBase:
"""Test cases for base model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2)
for p1 in base_model_list
for p2 in dataset_list])
def test_model_dataset_score(self, baseline_scores, result_scores, model,
dataset):
if model == 'mistral-7b-v0.2-vllm' and dataset == 'race-high':
@pytest.mark.parametrize(
'model, dataset',
[(p1, p2) for p1 in base_model_list
for p2 in ['gsm8k', 'GPQA_diamond', 'race-high', 'winogrande']])
def test_model_dataset_score(self, baseline_scores_testrange,
result_scores, model, dataset):
if model in ['gemma-2b-vllm', 'gemma-7b-vllm'] and dataset != 'gsm8k':
return
base_score = baseline_scores.get(model).get(dataset)
base_score = baseline_scores_testrange.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, base_score)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.chat_obj_fullbench
class TestChatObjFullbench:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
'internlm2_5-7b-chat-hf_fullbench',
'internlm2_5-7b-chat-turbomind_fullbench'
] for p2 in [
'race-high', 'ARC-c', 'BoolQ', 'triviaqa_wiki_1shot', 'nq_open_1shot',
'IFEval', 'drop', 'GPQA_diamond', 'hellaswag', 'TheoremQA',
'musr_average', 'gsm8k', 'math', 'cmo_fib', 'aime2024',
'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'ds1000',
'lcb_code_generation', 'lcb_code_execution', 'lcb_test_output',
'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two',
'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math', 'ds1000_Pandas',
'ds1000_Numpy', 'ds1000_Tensorflow', 'ds1000_Scipy', 'ds1000_Sklearn',
'ds1000_Pytorch', 'ds1000_Matplotlib', 'openai_mmmlu_lite_AR-XY',
'college', 'college_knowledge'
]])
def test_model_dataset_score(self, baseline_scores_fullbench,
result_scores, model, dataset):
base_score = baseline_scores_fullbench.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.chat_sub_fullbench
class TestChatSubFullbench:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
'internlm2_5-7b-chat-hf_fullbench',
'internlm2_5-7b-chat-turbomind_fullbench'
] for p2 in [
'Alignbench总分', 'Alignbench专业能力', 'AlpacaEvaltotal',
'AlpacaEvalhelpful_base', 'CompassArenacompassarena_language',
'CompassArenacompassarena_knowledge',
'CompassArenacompassarena_reason_v2',
'CompassArenacompassarena_math_v2',
'CompassArenacompassarena_creationv2_zh', 'Fofofofo_test_prompts',
'followbenchHSR_AVG', 'followbenchSSR_AVG', 'followbenchHSR_L1',
'followbenchHSR_L2', 'followbenchHSR_L3', 'followbenchHSR_L4',
'followbenchHSR_L5', 'followbenchSSR_L1', 'followbenchSSR_L2',
'followbenchSSR_L3', 'followbenchSSR_L4', 'followbenchSSR_L5',
'MTBench101average', 'Wildbenchscore'
]])
def test_model_dataset_score(self, baseline_scores_fullbench,
result_scores, model, dataset):
base_score = baseline_scores_fullbench.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.base_fullbench
class TestBaseFullbench:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
'internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench'
] for p2 in [
'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math',
'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'gsm8k',
'triviaqa_wiki_1shot', 'nq_open_1shot', 'winogrande', 'hellaswag',
'TheoremQA', 'dingo_en_192', 'dingo_zh_170', 'college',
'college_knowledge', 'bbh-logical_deduction_seven_objects',
'bbh-multistep_arithmetic_two', 'mmlu-other', 'cmmlu-china-specific',
'mmlu_pro_math'
]])
def test_model_dataset_score(self, baseline_scores_fullbench,
result_scores, model, dataset):
base_score = baseline_scores_fullbench.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
@pytest.mark.api
class TestApibench:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset',
[('lmdeploy-api-test', 'race-middle'),
('lmdeploy-api-test', 'race-high'),
('lmdeploy-api-test', 'gsm8k')])
def test_api(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score)
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
class TestCmdCase:
@pytest.mark.case1
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-hf', 'race-middle'),
('internlm2_5-7b-hf', 'race-high')])
def test_cmd_case1(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
('internlm2_5-7b-hf', 'race-high'),
('internlm2_5-7b-hf', 'demo_gsm8k'),
('internlm2-1.8b-hf', 'race-middle'),
('internlm2-1.8b-hf', 'race-high'),
('internlm2-1.8b-hf', 'demo_gsm8k')])
def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
assert_score(model, result_score, base_score)
@pytest.mark.case2
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-chat-lmdeploy', 'race-middle'),
('internlm2_5-7b-chat-lmdeploy', 'race-high')])
def test_cmd_case2(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
('internlm2_5-7b-chat-lmdeploy', 'race-high'),
('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k'),
('internlm2-chat-1.8b-lmdeploy', 'race-middle'),
('internlm2-chat-1.8b-lmdeploy', 'race-high'),
('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k')])
def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
assert_score(model + '_batch', result_score, base_score)
@pytest.mark.case3
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b_hf', 'race-middle'),
('internlm2_5-7b_hf', 'race-high')])
def test_cmd_case3(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
('internlm2_5-7b_hf', 'race-high'),
('internlm2_5-7b_hf', 'demo_gsm8k')])
def test_cmd_case3(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
assert_score(model, result_score, base_score)
@pytest.mark.case4
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-chat_hf', 'race-middle'),
('internlm2_5-7b-chat_hf', 'race-high')])
def test_cmd_case4(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
('internlm2_5-7b-chat_hf', 'race-high'),
('internlm2_5-7b-chat_hf', 'demo_gsm8k')])
def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
assert_score(model, result_score, base_score)
def assert_score(score, baseline):
THRESHOLD = 3
def assert_score(model_type, score, baseline):
if score is None or score == '-':
assert False, 'value is none'
if float(score) <= (baseline + 5) and float(score) >= (baseline - 5):
print(score + ' between ' + str(baseline - 5) + ' and ' +
str(baseline + 5))
assert True
if 'batch' not in model_type:
if float(score) <= (baseline + 0.01) and float(score) >= (baseline -
0.01):
print(' '.join([score, 'is equal', str(baseline)]))
assert True
else:
print(' '.join([score, 'is not equal', str(baseline)]))
assert False, ' '.join([score, 'is not equal', str(baseline)])
else:
assert False, score + ' not between ' + str(
baseline - 5) + ' and ' + str(baseline + 5)
if float(score) <= (baseline + THRESHOLD) and float(score) >= (
baseline - THRESHOLD):
print(' '.join([
score, 'is between',
str(baseline - THRESHOLD), 'and',
str(baseline + THRESHOLD)
]))
assert True
else:
print(' '.join([
score, 'is not between',
str(baseline - THRESHOLD), 'and',
str(baseline + THRESHOLD)
]))
assert False, ' '.join([
score, 'is not between',
str(baseline - THRESHOLD), 'and',
str(baseline + THRESHOLD)
])
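The reworked assert_score therefore applies two tolerances: runs whose model_type contains
'batch' may drift within +/- THRESHOLD (3 points) of the baseline, while all other runs must
reproduce the baseline to within 0.01. A condensed sketch of the same decision rule, with
illustrative values:

def within_tolerance(model_type, score, baseline, threshold=3):
    # Condensed restatement of the assert_score branching above.
    if 'batch' not in model_type:
        return abs(float(score) - baseline) <= 0.01   # near-exact reproduction required
    return abs(float(score) - baseline) <= threshold  # batch runs get +/- threshold

print(within_tolerance('internlm2_5-7b-hf', '91.78', 91.78))      # True: matches baseline
print(within_tolerance('lmdeploy-api-test_batch', '89.5', 91.0))  # True: within +/- 3
print(within_tolerance('internlm2_5-7b-hf', '90.5', 91.78))       # False: non-batch must match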
def find_csv_files(directory):
csv_files = []
for root, dirs, files in os.walk(directory):
for file in files:
if file.endswith('.csv'):
if file.endswith('.csv') and (file.startswith('summary') or
file.startswith('Subjective_all')):
csv_files.append(os.path.join(root, file))
csv_files_with_time = {f: os.path.getctime(f) for f in csv_files}
@@ -163,14 +324,24 @@ def read_csv_file(file_path):
with open(file_path, 'r') as csvfile:
reader = csv.DictReader(csvfile)
filtered_data = []
for row in reader:
filtered_row = {
k: v
for k, v in row.items()
if k not in ['version', 'metric', 'mode']
}
filtered_data.append(filtered_row)
if 'Subjective_all' not in file_path:
for row in reader:
if row['metric'] is not None and 'bpb' not in row['metric']:
filtered_row = {
k: v
for k, v in row.items()
if k not in ['version', 'metric', 'mode']
}
filtered_data.append(filtered_row)
else:
for row in reader:
if row['Detailed Scores'] is not None:
filtered_row = row
filtered_row['dataset'] = filtered_row[
'Dataset'] + filtered_row['Detailed Scores']
del filtered_row['Dataset']
del filtered_row['Detailed Scores']
filtered_data.append(filtered_row)
result = {}
for data in filtered_data:


@@ -1,369 +1,34 @@
baichuan2-7b-chat-hf:
gsm8k: 30
race-middle: 74
race-high: 79
internlm2_5-7b-hf:
demo_gsm8k: 42.19
race-middle: 91.78
race-high: 90.02
glm-4-9b-chat-hf:
gsm8k: 75
race-middle: 88
race-high: 88
internlm2_5-7b_hf:
demo_gsm8k: 42.19
race-middle: 91.78
race-high: 90.02
glm-4-9b-chat-turbomind:
gsm8k: 69
race-middle: 82
race-high: 77
internlm2-1.8b-hf:
demo_gsm8k: 15.62
race-middle: 71.66
race-high: 66.38
glm-4-9b-chat-vllm:
gsm8k: 73
race-middle: 87
race-high: 87
internlm2_5-7b-chat-lmdeploy:
demo_gsm8k: 84.38
race-middle: 92.76
race-high: 90.54
deepseek-7b-chat-hf:
gsm8k: 60
race-middle: 74
race-high: 80
internlm2-chat-1.8b-lmdeploy:
demo_gsm8k: 31
race-middle: 81.34
race-high: 73.96
deepseek-moe-16b-chat-hf:
gsm8k: 62
race-middle: 62
race-high: 70
deepseek-v2-lite-chat-hf:
gsm8k: 59
race-middle: 82
race-high: 79
deepseek-7b-chat-vllm:
gsm8k: 63
race-middle: 74
race-high: 79
gemma-2b-it-hf:
gsm8k: 14
race-middle: 62
race-high: 52
gemma-7b-it-hf:
gsm8k: 39
race-middle: 74
race-high: 71
gemma-7b-it-vllm:
gsm8k: 38
race-middle: 75
race-high: 70
gemma2-2b-it-hf:
gsm8k: 62
race-middle: 75
race-high: 67
gemma2-9b-it-hf:
gsm8k: 80
race-middle: 89
race-high: 85
internlm2_5-7b-chat-hf:
gsm8k: 86
race-middle: 92
race-high: 93
internlm2_5-20b-chat-hf:
gsm8k: 91
race-middle: 95
race-high: 91
internlm2_5-7b-chat-turbomind:
gsm8k: 87
race-middle: 92
race-high: 93
internlm2_5-20b-chat-turbomind:
gsm8k: 91
race-middle: 95
race-high: 91
internlm2-chat-1.8b-turbomind:
gsm8k: 40
race-middle: 82
race-high: 83
internlm2-chat-1.8b-sft-turbomind:
gsm8k: 34
race-middle: 81
race-high: 83
internlm2-chat-7b-lmdeploy:
gsm8k: 69
race-middle: 90
race-high: 88
internlm2-chat-7b-sft-turbomind:
gsm8k: 71
race-middle: 91
race-high: 92
internlm2-chat-7b-vllm:
gsm8k: 63
race-middle: 90
race-high: 91
llama-3_1-8b-instruct-hf:
gsm8k: 82
race-middle: 82
race-high: 88
llama-3-8b-instruct-hf:
gsm8k: 77
race-middle: 85
race-high: 87
llama-3_1-8b-instruct-turbomind:
gsm8k: 79
race-middle: 82
race-high: 88
llama-3-8b-instruct-turbomind:
gsm8k: 77
race-middle: 85
race-high: 89
mistral-7b-instruct-v0.2-hf:
gsm8k: 48
race-middle: 82
race-high: 78
mistral-7b-instruct-v0.3-hf:
gsm8k: 53
race-middle: 80
race-high: 78
mistral-7b-instruct-v0.2-vllm:
gsm8k: 49
race-middle: 81
race-high: 77
minicpm-2b-dpo-fp32-hf:
gsm8k: 58
race-middle: 66
race-high: 74
minicpm-2b-sft-bf16-hf:
gsm8k: 58
race-middle: 75
race-high: 81
minicpm-2b-sft-fp32-hf:
gsm8k: 58
race-middle: 75
race-high: 81
phi-3-mini-4k-instruct-hf:
gsm8k: 67
race-middle: 81
race-high: 84
phi-3-small-8k-instruct-hf:
gsm8k: 88
race-middle: 89
race-high: 88
qwen1.5-0.5b-chat-hf:
gsm8k: 5
race-middle: 55
race-high: 50
qwen2-1.5b-instruct-hf:
gsm8k: 63
race-middle: 77
race-high: 86
qwen2-1.5b-instruct-turbomind:
gsm8k: 60
race-middle: 77
race-high: 86
qwen2-7b-instruct-turbomind:
gsm8k: 88
race-middle: 87
race-high: 89
qwen2-7b-instruct-hf:
gsm8k: 85
race-middle: 87
race-high: 91
qwen1.5-0.5b-chat-vllm:
gsm8k: 5
race-middle: 57
race-high: 51
yi-1.5-6b-chat-hf:
gsm8k: 72
race-middle: 88
race-high: 86
yi-1.5-9b-chat-hf:
gsm8k: 81
race-middle: 89
race-high: 91
internlm2_5-7b-chat_hf:
demo_gsm8k: 87.50
race-middle: 92.76
race-high: 90.48
lmdeploy-api-test:
gsm8k: 90
race-middle: 95
race-high: 96
deepseek-moe-16b-base-hf:
gsm8k: 25
race-middle: 35
race-high: 23
deepseek-v2-lite-hf:
gsm8k: 37
race-middle: 56
race-high: 62
deepseek-7b-base-turbomind:
gsm8k: 21
race-middle: 42
race-high: 42
deepseek-moe-16b-base-vllm:
gsm8k: 22
race-middle: 35
race-high: 20
gemma-2b-hf:
gsm8k: 19
race-middle: 33
race-high: 26
gemma-7b-hf:
gsm8k: 65
race-middle: 59
race-high: 66
gemma2-2b-hf:
gsm8k: 33
race-middle: 56
race-high: 58
gemma2-9b-hf:
gsm8k: 70
race-middle: 82
race-high: 84
internlm2_5-7b-hf:
gsm8k: 47
race-middle: 92
race-high: 91
internlm2-7b-hf:
gsm8k: 65
race-middle: 77
race-high: 72
internlm2-base-7b-hf:
gsm8k: 5
race-middle: 71
race-high: 74
internlm2_5-7b-turbomind:
gsm8k: 73
race-middle: 90
race-high: 91
internlm2-1.8b-turbomind:
gsm8k: 25
race-middle: 75
race-high: 72
internlm2-7b-turbomind:
gsm8k: 67
race-middle: 78
race-high: 76
internlm2-base-7b-turbomind:
gsm8k: 39
race-middle: 75
race-high: 81
llama-2-7b-hf:
gsm8k: 17
race-middle: 32
race-high: 38
llama-3-8b-hf:
gsm8k: 48
race-middle: 64
race-high: 70
llama-3.1-8b-turbomind:
gsm8k: 57
race-middle: 67
race-high: 75
llama-3-8b-turbomind:
gsm8k: 52
race-middle: 63
race-high: 70
mistral-7b-v0.2-hf:
gsm8k: 43
race-middle: 42
race-high: 60
mistral-7b-v0.3-hf:
gsm8k: 43
race-middle: 42
race-high: 60
mistral-7b-v0.2-vllm:
gsm8k: 45
race-middle: 42
race-high: 58
qwen1.5-moe-a2.7b-hf:
gsm8k: 64
race-middle: 78
race-high: 90
qwen2-1.5b-hf:
gsm8k: 58
race-middle: 65
race-high: 78
qwen2-0.5b-hf:
gsm8k: 35
race-middle: 52
race-high: 48
qwen2-7b-hf:
gsm8k: 82
race-middle: 88
race-high: 89
qwen2-1.5b-turbomind:
gsm8k: 57
race-middle: 64
race-high: 78
qwen2-7b-turbomind:
gsm8k: 83
race-middle: 88
race-high: 88
qwen1.5-0.5b-vllm:
gsm8k: 12
race-middle: 54
race-high: 59
yi-1.5-6b-hf:
gsm8k: 59
race-middle: 81
race-high: 89
yi-1.5-9b-hf:
gsm8k: 77
race-middle: 90
race-high: 90
gsm8k: 83.78
race-middle: 92.41
race-high: 90.37


@@ -0,0 +1,173 @@
internlm2_5-7b-chat-hf_fullbench:
race-high: 93.75
ARC-c: 93.75
BoolQ: 81.25
triviaqa_wiki_1shot: 50
nq_open_1shot: 25
IFEval: 50
drop: 81.25
GPQA_diamond: 25
hellaswag: 87.5
TheoremQA: 18.75
musr_average: 39.58
gsm8k: 56.25
math: 75
cmo_fib: 6.25
aime2024: 6.25
wikibench-wiki-single_choice_cncircular: 50
sanitized_mbpp: 68.75
ds1000: 16.96
lcb_code_generation: 12.5
lcb_code_execution: 43.75
lcb_test_output: 18.75
bbh-logical_deduction_seven_objects: 50
bbh-multistep_arithmetic_two: 68.75
mmlu-other: 72.6
cmmlu-china-specific: 76.25
mmlu_pro_math: 25
ds1000_Pandas: 12.5
ds1000_Numpy: 0
ds1000_Tensorflow: 12.5
ds1000_Scipy: 18.75
ds1000_Sklearn: 18.75
ds1000_Pytorch: 12.5
ds1000_Matplotlib: 43.75
openai_mmmlu_lite_AR-XY: 37.5
college: 12.5
college_knowledge: 87.5
Alignbench总分: 0.65
Alignbench专业能力: 7.83
AlpacaEvaltotal: 0
AlpacaEvalhelpful_base: 0
CompassArenacompassarena_language: 60
CompassArenacompassarena_knowledge: 56
CompassArenacompassarena_reason_v2: 50
CompassArenacompassarena_math_v2: 53.5
CompassArenacompassarena_creationv2_zh: 48.75
Fofofofo_test_prompts: 1
followbenchHSR_AVG: 1
followbenchSSR_AVG: 1
followbenchHSR_L1: 1
followbenchHSR_L2: 1
followbenchHSR_L3: 1
followbenchHSR_L4: 1
followbenchHSR_L5: 1
followbenchSSR_L1: 1
followbenchSSR_L2: 1
followbenchSSR_L3: 1
followbenchSSR_L4: 1
followbenchSSR_L5: 1
MTBench101average: 8.1
Wildbenchscore: -3.3333333333333335
internlm2_5-7b-chat-turbomind_fullbench:
race-high: 93.75
ARC-c: 87.5
BoolQ: 68.75
triviaqa_wiki_1shot: 50
nq_open_1shot: 25
IFEval: 50
drop: 75
hellaswag: 81.25
TheoremQA: 6.25
musr_average: 39.58
gsm8k: 68.75
math: 75
GPQA_diamond: 25
cmo_fib: 6.25
aime2024: 6.25
wikibench-wiki-single_choice_cncircular: 25
sanitized_mbpp: 68.75
ds1000: 13.39
lcb_code_generation: 12.5
lcb_code_execution: 43.75
lcb_test_output: 12.5
bbh-logical_deduction_seven_objects: 56.25
bbh-multistep_arithmetic_two: 68.75
mmlu-other: 74.04
cmmlu-china-specific: 76.25
mmlu_pro_math: 25
ds1000_Pandas: 0
ds1000_Numpy: 0
ds1000_Tensorflow: 12.5
ds1000_Scipy: 18.75
ds1000_Sklearn: 18.75
ds1000_Pytorch: 6.25
ds1000_Matplotlib: 37.5
openai_mmmlu_lite_AR-XY: 37.5
college: 0
college_knowledge: 87.5
Alignbench总分: 0.64
Alignbench专业能力: 7.6
AlpacaEvaltotal: 10
AlpacaEvalhelpful_base: 10
CompassArenacompassarena_language: 59
CompassArenacompassarena_knowledge: 57
CompassArenacompassarena_reason_v2: 49.5
CompassArenacompassarena_math_v2: 51
CompassArenacompassarena_creationv2_zh: 43.75
Fofofofo_test_prompts: 1
followbenchHSR_AVG: 1
followbenchSSR_AVG: 1
followbenchHSR_L1: 1
followbenchHSR_L2: 1
followbenchHSR_L3: 1
followbenchHSR_L4: 1
followbenchHSR_L5: 1
followbenchSSR_L1: 1
followbenchSSR_L2: 1
followbenchSSR_L3: 1
followbenchSSR_L4: 1
followbenchSSR_L5: 1
MTBench101average: 8.1
Wildbenchscore: -8.333333333333334
internlm2_5-7b-hf_fullbench:
race-high: 100
ARC-c: 68.75
BoolQ: 87.5
GPQA_diamond: 62.5
drop: 62.5
math: 12.5
wikibench-wiki-single_choice_cncircular: 25
sanitized_mbpp: 56.25
gsm8k: 37.5
triviaqa_wiki_1shot: 43.75
nq_open_1shot: 43.75
winogrande: 75
hellaswag: 93.75
TheoremQA: 25
dingo_en_192: 37.5
dingo_zh_170: 100
college: 12.5
college_knowledge: 87.5
bbh-logical_deduction_seven_objects: 43.75
bbh-multistep_arithmetic_two: 56.25
mmlu-other: 76.92
cmmlu-china-specific: 84.17
mmlu_pro_math: 18.75
internlm2_5-7b-turbomind_fullbench:
race-high: 100
ARC-c: 68.75
BoolQ: 87.5
GPQA_diamond: 62.5
drop: 62.5
math: 18.75
wikibench-wiki-single_choice_cncircular: 25
sanitized_mbpp: 56.25
gsm8k: 68.75
triviaqa_wiki_1shot: 43.75
nq_open_1shot: 43.75
winogrande: 87.5
hellaswag: 93.75
TheoremQA: 31.25
dingo_en_192: 43.75
dingo_zh_170: 100
college: 12.5
college_knowledge: 87.5
bbh-logical_deduction_seven_objects: 50
bbh-multistep_arithmetic_two: 56.25
mmlu-other: 76.92
cmmlu-china-specific: 84.17
mmlu_pro_math: 18.75

View File

@ -0,0 +1,459 @@
baichuan2-7b-chat-hf:
gsm8k: 18.75
race-high: 78.12
glm-4-9b-chat-hf:
gsm8k: 68.75
race-high: 90.62
glm-4-9b-chat-turbomind:
gsm8k: 75.00
race-high: 90.62
glm-4-9b-chat-vllm:
gsm8k: 65.62
race-high: 90.62
deepseek-7b-chat-hf:
gsm8k: 46.88
race-high: 81.25
deepseek-moe-16b-chat-hf:
gsm8k: 50
race-high: 68.75
deepseek-7b-chat-vllm:
gsm8k: 43.75
race-high: 75
gemma2-2b-it-hf:
gsm8k: 50
race-high: 71.88
gemma2-9b-it-hf:
gsm8k: 71.88
race-high: 84.38
gemma-2b-it-hf:
gsm8k: 3.12
race-high: 40.62
gemma-7b-it-hf:
gsm8k: 40.62
race-high: 68.75
gemma-2-9b-it-turbomind:
gsm8k: 65.62
race-high: 84.38
gemma-7b-it-vllm:
gsm8k: 34.38
race-high: 68.75
internlm2_5-7b-chat-hf:
gsm8k: 84.38
race-high: 90.62
internlm2_5-7b-chat-turbomind:
gsm8k: 84.38
race-high: 90.62
internlm2-chat-1.8b-turbomind:
gsm8k: 25
race-high: 84.38
internlm2-chat-1.8b-sft-turbomind:
gsm8k: 21.88
race-high: 84.38
internlm2-chat-7b-lmdeploy:
gsm8k: 53.12
race-high: 84.38
internlm2-chat-7b-sft-turbomind:
gsm8k: 50
race-high: 90.62
internlm2-chat-7b-vllm:
gsm8k: 43.75
race-high: 87.5
llama-3_1-8b-instruct-hf:
gsm8k: 84.38
race-high: 90.62
llama-3_2-3b-instruct-hf:
gsm8k: 65.62
race-high: 81.25
llama-3-8b-instruct-hf:
gsm8k: 68.75
race-high: 87.5
llama-3_1-8b-instruct-turbomind:
gsm8k: 78.12
race-high: 90.62
llama-3_2-3b-instruct-turbomind:
gsm8k: 62.50
race-high: 81.25
llama-3-8b-instruct-turbomind:
gsm8k: 68.75
race-high: 87.5
mistral-7b-instruct-v0.2-hf:
gsm8k: 40.62
race-high: 75
mistral-7b-instruct-v0.3-hf:
gsm8k: 40.62
race-high: 75
mistral-nemo-instruct-2407-hf:
gsm8k: 75
race-high: 81.25
mistral-nemo-instruct-2407-turbomind:
gsm8k: 68.75
race-high: 87.50
mistral-7b-instruct-v0.1-vllm:
gsm8k: 34.38
race-high: 68.75
mistral-7b-instruct-v0.2-vllm:
gsm8k: 43.75
race-high: 75
MiniCPM3-4B-hf:
gsm8k: 68.75
race-high: 84.38
minicpm-2b-dpo-fp32-hf:
gsm8k: 56.25
race-high: 53.12
minicpm-2b-sft-bf16-hf:
gsm8k: 46.88
race-high: 65.62
minicpm-2b-sft-fp32-hf:
gsm8k: 46.88
race-high: 65.62
phi-3-mini-4k-instruct-hf:
gsm8k: 56.25
race-high: 84.38
qwen1.5-0.5b-chat-hf:
gsm8k: 0
race-high: 53.12
qwen2-1.5b-instruct-hf:
gsm8k: 62.5
race-high: 84.38
qwen2-7b-instruct-hf:
gsm8k: 68.75
race-high: 90.62
qwen2-1.5b-instruct-turbomind:
gsm8k: 62.50
race-high: 84.38
qwen2-7b-instruct-turbomind:
gsm8k: 81.25
race-high: 87.5
qwen1.5-0.5b-chat-vllm:
gsm8k: 3.12
race-high: 53.12
yi-1.5-6b-chat-hf:
gsm8k: 65.62
race-high: 84.38
yi-1.5-9b-chat-hf:
gsm8k: 75
race-high: 93.75
deepseek-v2-lite-chat-hf:
gsm8k: 43.75
race-high: 71.88
internlm2_5-20b-chat-hf:
gsm8k: 84.38
race-high: 87.5
internlm2_5-20b-chat-turbomind:
gsm8k: 84.38
race-high: 87.5
mistral-small-instruct-2409-hf:
gsm8k: 81.25
race-high: 87.50
mistral-small-instruct-2409-turbomind:
gsm8k: 78.12
race-high: 87.50
qwen2.5-14b-instruct-hf:
gsm8k: 71.88
race-high: 96.88
qwen2.5-14b-instruct-turbomind:
gsm8k: 71.88
race-high: 93.75
glm-4-9b-hf:
gsm8k: 68.75
GPQA_diamond: 31.25
race-high: 93.75
winogrande: 84.38
deepseek-moe-16b-base-hf:
gsm8k: 21.88
GPQA_diamond: 0
race-high: 21.88
winogrande: 65.62
deepseek-7b-base-turbomind:
gsm8k: 21.88
GPQA_diamond: 0
race-high: 46.88
winogrande: 84.38
deepseek-moe-16b-base-vllm:
gsm8k: 21.88
GPQA_diamond: 0
race-high: 25
winogrande: 68.75
gemma2-2b-hf:
gsm8k: 31.25
GPQA_diamond: 3.12
race-high: 56.25
winogrande: 71.88
gemma2-9b-hf:
gsm8k: 68.75
GPQA_diamond: 0
race-high: 81.25
winogrande: 84.38
gemma-2b-hf:
gsm8k: 18.75
GPQA_diamond: 3.12
race-high: 25
winogrande: 53.12
gemma-7b-hf:
gsm8k: 56.25
GPQA_diamond: 6.25
race-high: 65.62
winogrande: 78.12
gemma-2b-vllm:
gsm8k: 15.62
GPQA_diamond: 6.25
race-high:
winogrande:
gemma-7b-vllm:
gsm8k: 53.12
GPQA_diamond: 6.25
race-high:
winogrande:
internlm2_5-7b-hf:
gsm8k: 37.5
GPQA_diamond: 25
race-high: 93.75
winogrande: 71.88
internlm2-7b-hf:
gsm8k: 53.12
GPQA_diamond: 18.75
race-high: 62.5
winogrande: 78.12
internlm2-base-7b-hf:
gsm8k: 3.12
GPQA_diamond: 21.88
race-high: 75
winogrande: 65.62
internlm2-1.8b-turbomind:
gsm8k: 12.5
GPQA_diamond: 12.5
race-high: 71.88
winogrande: 75
internlm2_5-7b-turbomind:
gsm8k: 68.75
GPQA_diamond: 31.25
race-high: 93.75
winogrande: 84.38
internlm2-7b-turbomind:
gsm8k: 56.25
GPQA_diamond: 21.88
race-high: 75
winogrande: 81.25
internlm2-base-7b-turbomind:
gsm8k: 40.62
GPQA_diamond: 28.12
race-high: 84.38
winogrande: 71.88
llama-2-7b-hf:
gsm8k: 21.88
GPQA_diamond: 21.88
race-high: 40.62
winogrande: 71.88
llama-3_1-8b-hf:
gsm8k: 78.12
GPQA_diamond: 25
race-high: 90.62
winogrande: 62.5
llama-3-8b-hf:
gsm8k: 46.88
GPQA_diamond: 6.25
race-high: 65.62
winogrande: 65.62
llama-3.1-8b-turbomind:
gsm8k: 56.25
GPQA_diamond: 6.25
race-high: 78.12
winogrande: 78.12
llama-3-8b-turbomind:
gsm8k: 50
GPQA_diamond: 9.38
race-high: 65.62
winogrande: 78.12
mistral-7b-v0.2-hf:
gsm8k: 31.25
GPQA_diamond: 6.25
race-high: 62.5
winogrande: 59.38
mistral-7b-v0.3-hf:
gsm8k: 31.25
GPQA_diamond: 6.25
race-high: 62.5
winogrande: 59.38
mistral-7b-v0.2-vllm:
gsm8k: 34.38
GPQA_diamond: 6.25
race-high: 62.5
winogrande: 65.62
qwen2.5-7b-hf:
gsm8k: 81.25
GPQA_diamond: 18.75
race-high: 87.5
winogrande: 71.88
qwen2.5-1.5b-turbomind:
gsm8k: 71.88
GPQA_diamond: 15.62
race-high: 78.12
winogrande: 71.88
qwen2.5-7b-turbomind:
gsm8k: 71.88
GPQA_diamond: 25
race-high: 87.5
winogrande: 71.88
qwen1.5-moe-a2.7b-hf:
gsm8k: 62.5
GPQA_diamond: 18.75
race-high: 84.38
winogrande: 75
qwen2-0.5b-hf:
gsm8k: 25
GPQA_diamond: 0
race-high: 40.62
winogrande: 62.5
qwen2-1.5b-hf:
gsm8k: 59.38
GPQA_diamond: 9.38
race-high: 81.25
winogrande: 62.5
qwen2-7b-hf:
gsm8k: 68.75
GPQA_diamond: 9.38
race-high: 87.5
winogrande: 68.75
qwen2-1.5b-turbomind:
gsm8k: 62.50
GPQA_diamond: 6.25
race-high: 81.25
winogrande: 75
qwen2-7b-turbomind:
gsm8k: 68.75
GPQA_diamond: 12.5
race-high: 87.5
winogrande: 71.88
qwen1.5-0.5b-vllm:
gsm8k: 9.38
GPQA_diamond: 0
race-high: 56.25
winogrande: 62.5
yi-1.5-6b-hf:
gsm8k: 62.5
GPQA_diamond: 3.12
race-high: 87.5
winogrande: 62.5
yi-1.5-9b-hf:
gsm8k: 75
GPQA_diamond: 40.62
race-high: 87.5
winogrande: 59.38
deepseek-v2-lite-hf:
gsm8k: 28.12
GPQA_diamond: 21.88
race-high: 59.38
winogrande: 75
internlm2-20b-hf:
gsm8k: 56.25
GPQA_diamond: 15.62
race-high: 68.75
winogrande: 75
internlm2-base-20b-hf:
gsm8k: 12.5
GPQA_diamond: 9.38
race-high: 84.38
winogrande: 65.62
internlm2-20b-turbomind:
gsm8k: 68.75
GPQA_diamond: 15.62
race-high: 68.75
winogrande: 81.25
qwen2.5-14b-hf:
gsm8k: 75
GPQA_diamond: 37.5
race-high: 93.75
winogrande: 84.38

View File

@ -13,13 +13,33 @@ on:
description: 'Set branch or tag or commit id. Default is "main"'
type: string
default: 'main'
build_lmdeploy:
required: false
description: 'whether to build lmdeploy'
type: boolean
default: false
repo_org_lmdeploy:
required: false
description: 'Tested repository organization name. Default is InternLM/lmdeploy'
type: string
default: 'InternLM/lmdeploy'
repo_ref_lmdeploy:
required: false
description: 'Set branch or tag or commit id. Default is "main"'
type: string
default: 'main'
regression_func:
required: true
description: 'regression functions'
type: string
default: "['chat','base','cmd']"
default: "['chat_models','base_models', 'chat_obj_fullbench', 'chat_sub_fullbench', 'base_fullbench','cmd', 'api']"
cuda_env:
required: true
description: "regression conda env, eg. ['dsw_cu11','dsw_cu12']"
type: string
default: "['dsw_cu12']"
schedule:
- cron: '56 16 * * *'
- cron: '15 16 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@ -31,7 +51,7 @@ env:
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
DATEASET_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/llm-evaluation-datasets
COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache
HF_DATASETS_OFFLINE: 1
HF_EVALUATE_OFFLINE: 1
TRANSFORMERS_OFFLINE: 1
@ -39,6 +59,8 @@ env:
LMDEPLOY_USE_MODELSCOPE: false
HF_HUB_OFFLINE: 1
TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas
REPORT_ROOT: /cpfs01/shared/public/qa-llm-cicd/report
OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
jobs:
build-pypi:
@ -64,16 +86,51 @@ jobs:
retention-days: 1
name: my-artifact-${{ github.run_id }}
daily_run_test:
build-pypi-lmdeploy:
if: ${{!cancelled() && (github.event_name != 'schedule' && inputs.build_lmdeploy)}}
strategy:
matrix:
pyver: [py310]
runs-on: ubuntu-latest
env:
PYTHON_VERSION: ${{ matrix.pyver }}
PLAT_NAME: manylinux2014_x86_64
DOCKER_TAG: cuda12.1
steps:
- name: Checkout repository
uses: actions/checkout@v3
with:
repository: ${{ github.event.inputs.repo_org_lmdeploy || 'InternLM/lmdeploy' }}
ref: ${{github.event.inputs.repo_ref_lmdeploy || 'main'}}
- name: Build
run: |
echo ${PYTHON_VERSION}
echo ${PLAT_NAME}
echo ${DOCKER_TAG}
echo ${OUTPUT_FOLDER}
echo ${GITHUB_RUN_ID}
# remove -it
sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh
bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
- name: Upload Artifacts
uses: actions/upload-artifact@v4
with:
if-no-files-found: error
path: builder/manywheel/${{ env.OUTPUT_FOLDER }}
retention-days: 1
name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }}
prepare_env:
if: ${{!cancelled()}}
needs: build-pypi
needs: ['build-pypi', 'build-pypi-lmdeploy']
strategy:
fail-fast: false
matrix:
cuda_env: [dsw_cu11, dsw_cu12]
cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}}
runs-on: ${{ matrix.cuda_env }}
environment: 'prod'
timeout-minutes: 600 #10hours
timeout-minutes: 240 #4hours
steps:
- name: Clone repository
uses: actions/checkout@v2
@ -84,89 +141,169 @@ jobs:
uses: actions/download-artifact@v4
with:
name: my-artifact-${{ github.run_id }}
- name: Remove Conda Env
if: always()
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
- name: Prepare - create conda env and install torch - cu11
if: ${{matrix.cuda_env == 'dsw_cu11'}}
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu11.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip uninstall torch torchvision torchaudio -y
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
uses: nick-fields/retry@v3
id: retry1
with:
max_attempts: 3
timeout_minutes: 40
command: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu11.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip uninstall torch torchvision torchaudio -y
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu11torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
- name: Prepare - create conda env and install torch - cu12
if: ${{matrix.cuda_env == 'dsw_cu12'}}
uses: nick-fields/retry@v3
id: retry2
with:
max_attempts: 3
timeout_minutes: 40
command: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu12.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
- name: Prepare - reinstall lmdeploy - cu12
if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
uses: actions/download-artifact@v4
with:
name: my-artifact-${{ github.run_id }}-py310
- name: Prepare - reinstall lmdeploy - cu12
if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu12.txt --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
pip uninstall torch torchvision torchaudio -y
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
pip install lmdeploy-*.whl --no-deps
daily_run_test:
if: ${{!cancelled()}}
needs: prepare_env
strategy:
fail-fast: false
matrix:
cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}}
regression_func: ${{fromJSON(github.event.inputs.regression_func || '["chat_models","base_models","chat_obj_fullbench","chat_sub_fullbench","base_fullbench","cmd","api"]')}}
runs-on: ${{ matrix.cuda_env }}
environment: 'prod'
timeout-minutes: 240 #4hours
steps:
- name: Clone repository
uses: actions/checkout@v2
with:
repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Prepare - prepare data and hf model
run: |
ln -s ${{env.DATEASET_CACHE_PATH}} data
rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
- name: Run command testcase
if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'cmd')
if: matrix.regression_func == 'cmd'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
export from_tf=TRUE
python tools/list_configs.py internlm2_5 mmlu
opencompass --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run chat model test
if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'chat')
if: matrix.regression_func == 'chat_models'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
sed -i 's/judgemodel/'$(tail -n 1 /cpfs01/shared/public/llmeval/share_info/compassjuder_ip.txt)'/g' .github/scripts/eval_regression_chat.py
opencompass .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass .github/scripts/eval_regression_chat.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run base model test
if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'base')
if: matrix.regression_func == 'base_models'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
opencompass .github/scripts/eval_regression_base.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Remove Conda Env
if: always()
- name: Run chat model test - objective fullbench
if: matrix.regression_func == 'chat_obj_fullbench'
run: |
rm -rf regression_result_daily
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_chat_objective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m chat_obj_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run chat model test - subjective fullbench
if: matrix.regression_func == 'chat_sub_fullbench'
env:
COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache_subset
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_chat_subjective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m chat_sub_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run base model test - fullbench
if: matrix.regression_func == 'base_fullbench'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
opencompass .github/scripts/eval_regression_base_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m base_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run model test - api
if: matrix.regression_func == 'api'
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
echo "restful_pid=$!" >> "$GITHUB_ENV"
sleep 120s
opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run model test - api kill
if: always() && matrix.regression_func == 'api'
run: |
kill -15 "$restful_pid"
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}

View File

@ -10,17 +10,6 @@ on:
- 'tools/**'
workflow_dispatch:
inputs:
repo_org:
required: false
description: 'Tested repository organization name. Default is open-compass/opencompass'
type: string
default: 'open-compass/opencompass'
repo_ref:
required: false
description: 'Set branch or tag or commit id. Default is "main"'
type: string
default: 'main'
schedule:
- cron: '56 22 * * *'
@ -46,9 +35,6 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v2
with:
repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Prepare - Install opencompass
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate

View File

@ -57,6 +57,8 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](configs/eval_musr.py) and give it a spin! 🔥🔥🔥
- **\[2024.11.14\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). Have a look at the [demo](configs/eval_babilong.py) and give it a try! 🔥🔥🔥
- **\[2024.10.14\]** We now support the OpenAI multilingual QA dataset [MMMLU](https://huggingface.co/datasets/openai/MMMLU). Feel free to give it a try! 🔥🔥🔥
- **\[2024.09.19\]** We now support [Qwen2.5](https://huggingface.co/Qwen) (0.5B to 72B) with multiple backends (huggingface/vllm/lmdeploy). Feel free to give them a try! 🔥🔥🔥
- **\[2024.09.17\]** We now support OpenAI o1(`o1-mini-2024-09-12` and `o1-preview-2024-09-12`). Feel free to give them a try! 🔥🔥🔥

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .livecodebench_gen_b2b0fd import LCB_datasets # noqa: F401, F403
from .livecodebench_gen_6966bc import LCB_datasets # noqa: F401, F403

View File

@ -0,0 +1,164 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
LCBCodeGenerationDataset,
LCBCodeExecutionDataset,
LCBTestOutputPredictionDataset,
LCBCodeGenerationEvaluator,
LCBCodeExecutionEvaluator,
LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants
lcb_code_generation_reader_cfg = dict(
input_columns=[
'question_content',
'format_prompt',
],
# output_column='evaluation_sample',
output_column='question_id',
)
SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
'### Answer: (use the provided format with backticks)\n\n'
# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=prompt_template
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
lcb_code_generation_eval_cfg = dict(
evaluator=dict(
type=LCBCodeGenerationEvaluator,
num_process_evaluate=4,
timeout=6,
),
pred_role='BOT',
)
LCBCodeGeneration_dataset = dict(
type=LCBCodeGenerationDataset,
abbr='lcb_code_generation',
path='opencompass/code_generation_lite',
reader_cfg=lcb_code_generation_reader_cfg,
infer_cfg=lcb_code_generation_infer_cfg,
eval_cfg=lcb_code_generation_eval_cfg
)
# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
input_columns=[
'prompt',
],
output_column='evaluation_sample',
)
lcb_code_execution_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
),
],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
lcb_code_execution_eval_cfg = dict(
evaluator=dict(
type=LCBCodeExecutionEvaluator,
),
pred_role='BOT',
)
LCBCodeExecution_dataset = dict(
type=LCBCodeExecutionDataset,
abbr='lcb_code_execution',
path='opencompass/execution-v2',
reader_cfg=lcb_code_execution_reader_cfg,
infer_cfg=lcb_code_execution_infer_cfg,
eval_cfg=lcb_code_execution_eval_cfg,
)
# Test Output Prediction Dataset
lcb_test_output_reader_cfg = dict(
input_columns=[
'prompt',
],
output_column='evaluation_sample',
)
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
lcb_test_output_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
# begin=[
# dict(
# role='SYSTEM',
# prompt=system_prompt
# ),
# ],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
lcb_test_output_eval_cfg = dict(
evaluator=dict(
type=LCBTestOutputEvaluator,
),
pred_role='BOT',
)
LCBTestOutput_dataset = dict(
type=LCBTestOutputPredictionDataset,
abbr='lcb_test_output',
path='opencompass/test_generation',
reader_cfg=lcb_test_output_reader_cfg,
infer_cfg=lcb_test_output_infer_cfg,
eval_cfg=lcb_test_output_eval_cfg,
)
LCB_datasets = [
LCBCodeGeneration_dataset,
LCBCodeExecution_dataset,
LCBTestOutput_dataset,
]

65
configs/eval_babilong.py Normal file
View File

@ -0,0 +1,65 @@
from mmengine.config import read_base
with read_base():
# Models
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
models as lmdeploy_internlm2_5_7b_chat_model,
)
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct_model,
)
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import (
models as lmdeploy_llama3_1_8b_instruct_model,
)
from opencompass.configs.models.mistral.lmdeploy_ministral_8b_instruct_2410 import (
models as lmdeploy_ministral_8b_instruct_2410_model,
)
# Datasets
from opencompass.configs.datasets.babilong.babilong_0k_gen import (
babiLong_0k_datasets,
)
from opencompass.configs.datasets.babilong.babilong_4k_gen import (
babiLong_4k_datasets,
)
from opencompass.configs.datasets.babilong.babilong_16k_gen import (
babiLong_16k_datasets,
)
from opencompass.configs.datasets.babilong.babilong_32k_gen import (
babiLong_32k_datasets,
)
from opencompass.configs.datasets.babilong.babilong_128k_gen import (
babiLong_128k_datasets,
)
from opencompass.configs.datasets.babilong.babilong_256k_gen import (
babiLong_256k_datasets,
)
from opencompass.configs.summarizers.groups.babilong import (
babilong_summary_groups,
)
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
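# The datasets above go up to 256k tokens, so give every model a 1M-token
# session window and shard it across 4 GPUs (tp=4) before running.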
for model in models:
model['engine_config']['session_len'] = 1024 * 1024
model['max_seq_len'] = 1024 * 1024
model['engine_config']['tp'] = 4
model['run_cfg']['num_gpus'] = 4
summarizer = dict(
dataset_abbrs=[
'babilong_0k',
'babilong_4k',
'babilong_16k',
'babilong_32k',
'babilong_128k',
'babilong_256k',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []
),
)
work_dir = './outputs/babilong'

44
configs/eval_musr.py Normal file
View File

@ -0,0 +1,44 @@
from mmengine.config import read_base
import os.path as osp
with read_base():
from opencompass.configs.datasets.musr.musr_gen_3c6e15 import musr_datasets
# from opencompass.configs.models.hf_internlm.hf_internlm2_5_1_8b_chat import models
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
models as lmdeploy_internlm2_5_7b_chat_model,
)
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct_model,
)
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
models as lmdeploy_qwen2_5_14b_instruct_model,
)
from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b_chat import (
models as lmdeploy_yi_1_5_9b_chat_model,
)
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import (
models as lmdeploy_qwen2_5_32b_instruct_model,
)
from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import (
models as lmdeploy_glm4_9b_chat_model,
)
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import (
models as lmdeploy_llama3_1_8b_instruct_model,
)
from opencompass.configs.models.mistral.lmdeploy_ministral_8b_instruct_2410 import (
models as lmdeploy_ministral_8b_instruct_2410_model,
)
from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import (
models as lmdeploy_gemma_9b_it_model,
)
from opencompass.configs.models.gemma.lmdeploy_gemma_27b_it import (
models as lmdeploy_gemma_27b_it_model,
)
from opencompass.configs.summarizers.groups.musr_average import summarizer
datasets = [*musr_datasets]
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
base_exp_dir = 'outputs/musr/'
work_dir = osp.join(base_exp_dir, 'musr_eval')

View File

@ -1,4 +1,4 @@
__version__ = '0.3.5'
__version__ = '0.3.6'
def _warn_about_config_migration():

View File

@ -0,0 +1,37 @@
# BABILong
OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). BABILong provides an evaluation of long-context reasoning across extremely long documents, including a diverse set of 20 reasoning tasks such as fact chaining, simple induction, deduction, counting, and handling lists/sets. This benchmark is designed to test the ability of language models to reason over facts distributed in long natural text, and it allows for the construction of tasks of almost arbitrary length to adapt to the evaluation of new, more powerful models in an extensible and controllable way.
## How to Use
The BABILong dataset is available on Hugging Face: [RMT-team/babilong](https://huggingface.co/datasets/RMT-team/babilong). OpenCompass downloads the BABILong data automatically; because of the dataset size, only splits up to 1M tokens are provided. For longer contexts, you can download the dataset from Hugging Face directly.
The BABILong paper defines 20 tasks in total; OpenCompass ships configurations for 10 of them, organized by context size. You can create your own configurations by following the example in `opencompass/configs/datasets/babilong/babilong_1m_gen.py`, as sketched below.
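For instance, a configuration for another context size can reuse the same dataset and evaluator classes. The following is only a sketch mirroring the shipped configs: the `8k` split name, sequence length, and task subset are assumptions, not an officially released file, so check the Hugging Face repository for the splits that actually exist.

```python
from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

babiLong_8k_datasets = []
split_name = '8k'        # assumed split name; verify it exists in RMT-team/babilong
max_seq_len = 8 * 1024   # matches the assumed context size
tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5']  # any subset of the 10 shipped tasks

for task in tasks:
    babiLong_8k_datasets.append(dict(
        abbr=f'babilong_{task}_{split_name}',
        type=BabiLongDataset,
        path='opencompass/babilong',
        task=task,
        split_name=split_name,
        reader_cfg=dict(input_columns=['prompt'], output_column='answer'),
        infer_cfg=dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(role='HUMAN', prompt='{prompt}'),
                    dict(role='BOT', prompt='{answer}\n'),
                ]),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len),
        ),
        eval_cfg=dict(evaluator=dict(type=BabiLongEvaluator)),
    ))
```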
OpenCompass also provides a demo config for evaluating language models on the BABILong dataset:
```bash
opencompass configs/eval_babilong.py
```
OpenCompass provides results for several models on the BABILong dataset. The evaluations are run with LMDeploy using the default model settings.
| dataset | version | metric | mode | internlm2_5-7b-chat-turbomind | qwen2.5-7b-instruct-turbomind | llama-3_1-8b-instruct-turbomind | ministral-8B-instruct-2410-turbomind |
|----- | ----- | ----- | ----- | ----- | ----- | ----- | -----|
| babilong_0k | - | naive_average | gen | 76.51 | 80.25 | 76.44 | 76.40 |
| babilong_4k | - | naive_average | gen | 67.55 | 70.35 | 67.41 | 67.92 |
| babilong_16k | - | naive_average | gen | 53.78 | 65.83 | 60.26 | 56.58 |
| babilong_32k | - | naive_average | gen | 50.86 | 62.66 | 59.56 | 53.52 |
| babilong_128k | - | naive_average | gen | 39.33 | 27.79 | 52.01 | 3.20 |
| babilong_256k | - | naive_average | gen | 17.31 | 7.30 | 23.35 | 9.50 |
## Citation
```bibtex
@misc{kuratov2024babilong,
title={BABILong: Testing the Limits of LLMs with Long Context Reasoning-in-a-Haystack},
author={Yuri Kuratov and Aydar Bulatov and Petr Anokhin and Ivan Rodkin and Dmitry Sorokin and Artyom Sorokin and Mikhail Burtsev},
year={2024},
eprint={2406.10149},
archivePrefix={arXiv}
}
```

View File

@ -0,0 +1,37 @@
from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
babiLong_0k_datasets = []
split_name='0k'
tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10']
for task in tasks:
tmp_dataset = {
'abbr': f'babilong_{task}_{split_name}',
'type': BabiLongDataset,
'path': 'opencompass/babilong',
'task': task,
'split_name': split_name,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
),
'eval_cfg': dict(
evaluator=dict(type=BabiLongEvaluator),
),
}
babiLong_0k_datasets.append(tmp_dataset)

View File

@ -0,0 +1,38 @@
from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
babiLong_128k_datasets = []
split_name='128k'
max_seq_len = 128*1024
tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10']
for task in tasks:
tmp_dataset = {
'abbr': f'babilong_{task}_{split_name}',
'type': BabiLongDataset,
'path': 'opencompass/babilong',
'task': task,
'split_name': split_name,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len),
),
'eval_cfg': dict(
evaluator=dict(type=BabiLongEvaluator),
),
}
babiLong_128k_datasets.append(tmp_dataset)

View File

@ -0,0 +1,38 @@
from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
babiLong_16k_datasets = []
split_name='16k'
max_seq_len = 16*1024
tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10']
for task in tasks:
tmp_dataset = {
'abbr': f'babilong_{task}_{split_name}',
'type': BabiLongDataset,
'path': 'opencompass/babilong',
'task': task,
'split_name': split_name,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len),
),
'eval_cfg': dict(
evaluator=dict(type=BabiLongEvaluator),
),
}
babiLong_16k_datasets.append(tmp_dataset)

View File

@ -0,0 +1,37 @@
from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
babiLong_1m_datasets = []
split_name='1m'
tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10']
for task in tasks:
tmp_dataset = {
'abbr': f'babilong_{task}_{split_name}',
'type': BabiLongDataset,
'path': 'opencompass/babilong',
'task': task,
'split_name': split_name,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
),
'eval_cfg': dict(
evaluator=dict(type=BabiLongEvaluator),
),
}
babiLong_1m_datasets.append(tmp_dataset)

View File

@ -0,0 +1,38 @@
from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
babiLong_256k_datasets = []
split_name='256k'
max_seq_len = 256*1024
tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10']
for task in tasks:
tmp_dataset = {
'abbr': f'babilong_{task}_{split_name}',
'type': BabiLongDataset,
'path': 'opencompass/babilong',
'task': task,
'split_name': split_name,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len),
),
'eval_cfg': dict(
evaluator=dict(type=BabiLongEvaluator),
),
}
babiLong_256k_datasets.append(tmp_dataset)

View File

@ -0,0 +1,38 @@
from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
babiLong_2k_datasets = []
split_name='2k'
max_seq_len = 2*1024
tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10']
for task in tasks:
tmp_dataset = {
'abbr': f'babilong_{task}_{split_name}',
'type': BabiLongDataset,
'path': 'opencompass/babilong',
'task': task,
'split_name': split_name,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len),
),
'eval_cfg': dict(
evaluator=dict(type=BabiLongEvaluator),
),
}
babiLong_2k_datasets.append(tmp_dataset)

View File

@ -0,0 +1,38 @@
from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
babiLong_32k_datasets = []
split_name='32k'
max_seq_len = 32*1024
tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10']
for task in tasks:
tmp_dataset = {
'abbr': f'babilong_{task}_{split_name}',
'type': BabiLongDataset,
'path': 'opencompass/babilong',
'task': task,
'split_name': split_name,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len),
),
'eval_cfg': dict(
evaluator=dict(type=BabiLongEvaluator),
),
}
babiLong_32k_datasets.append(tmp_dataset)

View File

@ -0,0 +1,38 @@
from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
babiLong_4k_datasets = []
split_name='4k'
max_seq_len=4*1024
tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10']
for task in tasks:
tmp_dataset = {
'abbr': f'babilong_{task}_{split_name}',
'type': BabiLongDataset,
'path': 'opencompass/babilong',
'task': task,
'split_name': split_name,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len),
),
'eval_cfg': dict(
evaluator=dict(type=BabiLongEvaluator),
),
}
babiLong_4k_datasets.append(tmp_dataset)

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .livecodebench_gen_b2b0fd import LCB_datasets # noqa: F401, F403
from .livecodebench_gen_6966bc import LCB_datasets # noqa: F401, F403

View File

@ -0,0 +1,164 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
LCBCodeGenerationDataset,
LCBCodeExecutionDataset,
LCBTestOutputPredictionDataset,
LCBCodeGenerationEvaluator,
LCBCodeExecutionEvaluator,
LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants
lcb_code_generation_reader_cfg = dict(
input_columns=[
'question_content',
'format_prompt',
],
# output_column='evaluation_sample',
output_column='question_id',
)
SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
'### Answer: (use the provided format with backticks)\n\n'
# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=prompt_template
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
lcb_code_generation_eval_cfg = dict(
evaluator=dict(
type=LCBCodeGenerationEvaluator,
num_process_evaluate=4,
timeout=6,
),
pred_role='BOT',
)
LCBCodeGeneration_dataset = dict(
type=LCBCodeGenerationDataset,
abbr='lcb_code_generation',
path='opencompass/code_generation_lite',
reader_cfg=lcb_code_generation_reader_cfg,
infer_cfg=lcb_code_generation_infer_cfg,
eval_cfg=lcb_code_generation_eval_cfg
)
# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
input_columns=[
'prompt',
],
output_column='evaluation_sample',
)
lcb_code_execution_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
),
],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
lcb_code_execution_eval_cfg = dict(
evaluator=dict(
type=LCBCodeExecutionEvaluator,
),
pred_role='BOT',
)
LCBCodeExecution_dataset = dict(
type=LCBCodeExecutionDataset,
abbr='lcb_code_execution',
path='opencompass/execution-v2',
reader_cfg=lcb_code_execution_reader_cfg,
infer_cfg=lcb_code_execution_infer_cfg,
eval_cfg=lcb_code_execution_eval_cfg,
)
# Test Output Prediction Dataset
lcb_test_output_reader_cfg = dict(
input_columns=[
'prompt',
],
output_column='evaluation_sample',
)
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
lcb_test_output_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
# begin=[
# dict(
# role='SYSTEM',
# prompt=system_prompt
# ),
# ],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
lcb_test_output_eval_cfg = dict(
evaluator=dict(
type=LCBTestOutputEvaluator,
),
pred_role='BOT',
)
LCBTestOutput_dataset = dict(
type=LCBTestOutputPredictionDataset,
abbr='lcb_test_output',
path='opencompass/test_generation',
reader_cfg=lcb_test_output_reader_cfg,
infer_cfg=lcb_test_output_infer_cfg,
eval_cfg=lcb_test_output_eval_cfg,
)
LCB_datasets = [
LCBCodeGeneration_dataset,
LCBCodeExecution_dataset,
LCBTestOutput_dataset,
]

View File

@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024),
)
# postprocess v2
math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2),
)
math_datasets = [
dict(
type=MATHDataset,
abbr='math_prm800k_500',
path='opencompass/math',
file_name='test_prm800k_500.json',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]

View File

@ -0,0 +1,75 @@
# MuSR: Multistep Soft Reasoning Dataset
MuSR (Multistep Soft Reasoning) is a dataset designed to evaluate large language models (LLMs) on complex reasoning tasks embedded in natural-language narratives. Created to challenge state-of-the-art models such as GPT-4, MuSR emphasizes nuanced reasoning across several domains, including social and physical reasoning, commonsense reasoning, and planning, with tasks framed within realistic scenarios such as murder mysteries, object placements, and team allocations.
## Overview
### Purpose
Current large language models can perform complex tasks through prompting techniques like chain-of-thought reasoning. However, robust multistep reasoning remains challenging. MuSR addresses these limitations by evaluating LLM performance on tasks involving multistep reasoning in three domains:
- **Murder Mysteries**: Requires social and physical deductive reasoning.
- **Object Placements**: Tests observational and theory-of-mind reasoning.
- **Team Allocations**: Focuses on social reasoning and constraint satisfaction.
### Dataset Construction
MuSR instances are generated using a neurosymbolic synthetic-to-natural narrative generation algorithm. This approach allows for the creation of complex reasoning instances that combine structured reasoning trees with natural language narratives, challenging both direct and nuanced inference capabilities in LLMs.
MuSR's dataset consists of:
- **Murder Mysteries**: Scenarios with suspects, motives, and opportunities requiring deductive inference.
- **Object Placements**: Scenarios where individuals' observations inform reasoning about object locations.
- **Team Allocations**: Scenarios that simulate social relationships and teamwork for optimal task assignments.
### Dataset Access
The MuSR dataset is publicly available, with instructions provided in the [GitHub project](https://github.com/Zayne-Sprague/MuSR). You can download the dataset and use the pre-defined prompts or create your own configurations.
### Evaluation
1. Install dependencies and configure the environment.
2. Run evaluations using `opencompass configs/eval_musr.py` to assess LLM performance.
3. Analyze results against human performance benchmarks.
### Example Command
```bash
opencompass configs/eval_musr.py
```
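If you only want a quick smoke test on one of the domains, a minimal config along the following lines should work. This is a sketch, not part of the release: the internlm2.5-7b-chat model choice, the filter on a single subset, the file name, and the work directory are all arbitrary choices for illustration.

```python
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.musr.musr_gen_3c6e15 import musr_datasets
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
        models as lmdeploy_internlm2_5_7b_chat_model,
    )

# keep only the murder-mysteries split for a quick run
datasets = [d for d in musr_datasets if d['abbr'] == 'musr_murder_mysteries']
models = [*lmdeploy_internlm2_5_7b_chat_model]
work_dir = './outputs/musr_smoke_test'
```

Saved as, say, `configs/eval_musr_smoke.py`, it can be launched with `opencompass configs/eval_musr_smoke.py`.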
## Baselines and Results
MuSR includes baseline results for multiple LLMs evaluated with chain-of-thought and advanced reasoning strategies. These benchmarks assess model accuracy on reasoning tasks across the three domains.
| Domain | Baseline Accuracy (GPT-4) | Human Performance |
|------------------|---------------------------|--------------------|
| Murder Mystery | 80.4% | 94.1% |
| Object Placement | 60.9% | 95.0% |
| Team Allocation | 68.4% | 100% |
| dataset | version | metric | mode | internlm2_5-7b-chat-turbomind | qwen2.5-7b-instruct-turbomind | qwen2.5-14b-instruct-turbomind | yi-1.5-9b-chat-turbomind | qwen2.5-32b-instruct-turbomind | glm-4-9b-chat-turbomind | llama-3_1-8b-instruct-turbomind | ministral-8B-instruct-2410-turbomind | gemma-2-9b-it-turbomind | gemma-2-27b-it-turbomind |
|----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | -----|
| musr_murder_mysteries | a5ce30 | accuracy | gen | 59.20 | 63.20 | 76.00 | 68.80 | 78.80 | 71.20 | 73.60 | 73.60 | 74.80 | 77.20 |
| musr_object_placements | a5ce30 | accuracy | gen | 54.69 | 56.25 | 57.42 | 52.73 | 66.02 | 49.22 | 57.42 | 60.94 | 60.94 | 62.11 |
| musr_team_allocation | a5ce30 | accuracy | gen | 39.20 | 32.40 | 55.60 | 40.00 | 67.60 | 50.40 | 46.00 | 36.40 | 40.80 | 41.20 |
| musr_average | - | naive_average | gen | 51.03 | 50.62 | 63.01 | 53.84 | 70.81 | 56.94 | 59.01 | 56.98 | 58.85 | 60.17 |
## Citation
If you use MuSR in your research, please cite:
```bibtex
@misc{sprague2024musrtestinglimitschainofthought,
title={MuSR: Testing the Limits of Chain-of-thought with Multistep Soft Reasoning},
author={Zayne Sprague and Xi Ye and Kaj Bostrom and Swarat Chaudhuri and Greg Durrett},
year={2024},
eprint={2310.16049},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2310.16049},
}
```
## Details
For further details, please refer to the MuSR paper [here](https://arxiv.org/abs/2310.16049).

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .musr_gen_3c6e15 import musr_datasets # noqa: F401, F403

View File

@ -0,0 +1,135 @@
from opencompass.datasets import MusrDataset, MusrEvaluator
from opencompass.openicl import PromptTemplate, ZeroRetriever, GenInferencer
DATASET_CONFIGS = {
'murder_mysteries': {
'abbr': 'musr_murder_mysteries',
'name': 'murder_mysteries',
'path': 'opencompass/musr',
'reader_cfg': dict(
input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'],
output_column='gold_answer',
),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}'
)
],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
),
'eval_cfg': dict(
evaluator=dict(
type=MusrEvaluator,
answer_index_modifier=1,
self_consistency_n=1
),
),
},
'object_placements': {
'abbr': 'musr_object_placements',
'name': 'object_placements',
'path': 'opencompass/musr',
'reader_cfg': dict(
input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'],
output_column='gold_answer',
),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}'
)
],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
),
'eval_cfg': dict(
evaluator=dict(
type=MusrEvaluator,
answer_index_modifier=1,
self_consistency_n=1
),
),
},
'team_allocation': {
'abbr': 'musr_team_allocation',
'name': 'team_allocation',
'path': 'opencompass/musr',
'reader_cfg': dict(
input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'],
output_column='gold_answer',
),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}'
)
],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
),
'eval_cfg': dict(
evaluator=dict(
type=MusrEvaluator,
answer_index_modifier=1,
self_consistency_n=1
),
),
},
}
musr_datasets = []
for config in DATASET_CONFIGS.values():
dataset = dict(
abbr=config['abbr'],
type=MusrDataset,
path=config['path'],
name=config['name'],
reader_cfg=config['reader_cfg'],
infer_cfg=config['infer_cfg'],
eval_cfg=config['eval_cfg'],
)
musr_datasets.append(dataset)

View File

@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel
models = [
dict(
type=TurboMindModel,
abbr='glm-4-9b-turbomind',
path='THUDM/glm-4-9b',
engine_config=dict(max_batch_size=16, tp=1),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
max_seq_len=8192,
max_out_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel
models = [
dict(
type=TurboMindModel,
abbr='qwen2.5-14b-turbomind',
path='Qwen/Qwen2.5-14B',
engine_config=dict(session_len=7168, max_batch_size=16, tp=2),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
max_seq_len=7168,
max_out_len=1024,
batch_size=16,
run_cfg=dict(num_gpus=2),
)
]

View File

@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel
models = [
dict(
type=TurboMindModel,
abbr='qwen2.5-32b-turbomind',
path='Qwen/Qwen2.5-32B',
engine_config=dict(session_len=7168, max_batch_size=16, tp=2),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
max_seq_len=7168,
max_out_len=1024,
batch_size=16,
run_cfg=dict(num_gpus=2),
)
]

View File

@ -0,0 +1,17 @@
from opencompass.models import TurboMindModel
models = [
dict(
type=TurboMindModel,
abbr='qwen2.5-72b-turbomind',
path='Qwen/Qwen2.5-72B',
engine_config=dict(session_len=7168, max_batch_size=16, tp=4),
gen_config=dict(
top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024
),
max_seq_len=7168,
max_out_len=1024,
batch_size=16,
run_cfg=dict(num_gpus=4),
)
]

View File

@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel
models = [
dict(
type=TurboMindModel,
abbr='yi-1.5-9b-turbomind',
path='01-ai/Yi-1.5-9B',
engine_config=dict(session_len=4096, max_batch_size=16, tp=1),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
max_seq_len=4096,
max_out_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,37 @@
default_babilong_tasks = [
'qa1',
'qa2',
'qa3',
'qa4',
'qa5',
'qa6',
'qa7',
'qa8',
'qa9',
'qa10',
]
context_window_sizes = [
'0k',
'1k',
'2k',
'4k',
'8k',
'16k',
'32k',
'64k',
'128k',
'256k',
'512k',
'1m',
]
babilong_summary_groups = []
for context_window_size in context_window_sizes:
babilong_summary_groups.append(
{
'name': f'babilong_{context_window_size}',
'subsets': [
f'babilong_{task}_{context_window_size}'
for task in default_babilong_tasks
],
}
)

View File

@ -0,0 +1,19 @@
summarizer = dict(
dataset_abbrs=[
'musr_murder_mysteries',
'musr_object_placements',
'musr_team_allocation',
'musr_average'
],
summary_groups=[
{
'name': 'musr_average',
'subsets': [
'musr_murder_mysteries',
'musr_object_placements',
'musr_team_allocation',
],
}
],
)

View File

@ -7,6 +7,7 @@ from .anthropics_evals import * # noqa: F401, F403
from .apps import * # noqa: F401, F403
from .arc import * # noqa: F401, F403
from .ax import * # noqa: F401, F403
from .babilong import * # noqa: F401, F403
from .bbh import * # noqa: F401, F403
from .boolq import * # noqa: F401, F403
from .bustum import * # noqa: F401, F403
@ -86,6 +87,7 @@ from .mmlu_pro import * # noqa: F401, F403
from .MMLUArabic import * # noqa: F401, F403
from .mmmlu import * # noqa: F401, F403
from .multirc import * # noqa: F401, F403
from .musr import * # noqa: F401, F403
from .narrativeqa import * # noqa: F401, F403
from .natural_question import * # noqa: F401, F403
from .natural_question_cn import * # noqa: F401, F403

View File

@ -0,0 +1 @@
from .babilong import * # noqa: F401, F403

View File

@ -0,0 +1,106 @@
# flake8: noqa: F401, E501
import json
import os
from datasets import Dataset
from opencompass.datasets.babilong.babilong_utils import compare_answers
from opencompass.datasets.babilong.prompts import (DEFAULT_PROMPTS,
DEFAULT_TEMPLATE,
get_formatted_input)
from opencompass.datasets.base import BaseDataset
from opencompass.openicl import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path
@LOAD_DATASET.register_module()
class BabiLongDataset(BaseDataset):
@staticmethod
def load(
path,
task,
split_name,
use_instruction=True,
use_examples=True,
use_post_prompt=True,
) -> Dataset:
assert task in [
'qa1',
'qa2',
'qa3',
'qa4',
'qa5',
'qa6',
'qa7',
'qa8',
'qa9',
'qa10',
], f"Task must be in ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10']"
assert split_name in [
'0k',
'1k',
'2k',
'4k',
'8k',
'16k',
'32k',
'64k',
'128k',
'256k',
'512k',
'1m',
], f"Split name must be in ['0k', '1k', '2k', '4k', '8k', '16k', '32k', '64k', '128k', '256k', '512k', '1m']"
# configure the prompt
prompt_cfg = {
'instruction':
(DEFAULT_PROMPTS[task]['instruction'] if use_instruction else ''),
'examples':
(DEFAULT_PROMPTS[task]['examples'] if use_examples else ''),
'post_prompt':
(DEFAULT_PROMPTS[task]['post_prompt'] if use_post_prompt else ''),
'template':
DEFAULT_TEMPLATE,
}
path = get_data_path(path)
file = os.path.join(path, task, f'{split_name}.json')
with open(file, 'r') as f:
task_data = json.load(f)
data = []
for sample in task_data:
tmp_data = {'prompt': [], 'answer': []}
target = sample['target']
context = sample['input']
question = sample['question']
input_text = get_formatted_input(
context,
question,
prompt_cfg['examples'],
prompt_cfg['instruction'],
prompt_cfg['post_prompt'],
template=DEFAULT_TEMPLATE,
)
tmp_data['prompt'].append(input_text)
tmp_data['answer'].append(target)
data.append(tmp_data)
return Dataset.from_list(data)
class BabiLongEvaluator(BaseEvaluator):
def score(self, predictions, gold):
assert len(predictions) == len(gold)
score = (sum([
compare_answers(str(ref[0]), pred)
for pred, ref in zip(predictions, gold)
]) / len(predictions) * 100)
result = {'score': round(score, 2)}
return result

View File

@ -0,0 +1,293 @@
# flake8: noqa: E501
# Modified from https://github.com/booydar/babilong/blob/main/babilong/babilong_utils.py
import re
import nltk
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
def compare_answers(target, output):
"""Compare target and output answers.
Takes only the first sentence from output and filters responses when model
tries to generate examples. We consider prediction correct if target is in
output.
"""
target = target.lower()
output = output.lower()
# take only the first sentence from output
output = output.split('.')[0]
# filter responses when model tries to generate examples
output = output.split('<context>')[0]
output = output.split('<example>')[0]
# we consider prediction correct if target is in output
if target in output:
return True
return False
def get_dataset_df(dataset_path, max_n_facts=None):
"""Preprocess babi text files."""
with open(dataset_path, 'r') as f:
texts = f.read().strip()
texts = texts.split('\n')
df = pd.DataFrame(texts, columns=['text'])
# parse samples
df['phrase_num'] = df.text.apply(lambda x: int(x.split(' ')[0]))
df.text = df.text.apply(lambda x: x[x.index(' ') + 1:])
df['answer'] = df.text.apply(lambda x: x[x.index('\t') + 1:]
if '\t' in x else None)
df['reference_num'] = df.answer.apply(
lambda x: x
if x is None else [int(n) for n in re.split('\t| ', x)[1:]])
df.answer = df.answer.apply(lambda x: x if x is None else x.split('\t')[0])
df.text = df.text.apply(lambda x: x.split('\t')[0] if '\t' in x else x)
# mark each sample
sample_start_inds = list(np.where(df.phrase_num == 1)[0]) + [df.shape[0]]
for i, (start,
end) in enumerate(zip(sample_start_inds, sample_start_inds[1:])):
df.loc[start:end, 'initial_sample_num'] = i
df.initial_sample_num = df.initial_sample_num.astype(int)
# multiple questions in sample -> samples with single question
initial_samples = [
df[df.initial_sample_num == sn]
for sn in df.initial_sample_num.unique()
]
single_question_slices = []
for sample in initial_samples:
answer_positions = sample[~sample.answer.isna()].index
slices = [sample.loc[:ans_pos].copy() for ans_pos in answer_positions]
for i, slc in enumerate(slices):
slices[i] = slc[(slc.answer.isna()) | (slc.index == slc.index[-1])]
if max_n_facts is not None: # drop samples with too many facts
slices = [slc for slc in slices if slc.shape[0] <= max_n_facts]
single_question_slices += slices
df = pd.concat(single_question_slices).reset_index(drop=True)
# mark each sample again
sample_start_inds = list(np.where(df.phrase_num == 1)[0]) + [df.shape[0]]
for i, (start,
end) in enumerate(zip(sample_start_inds, sample_start_inds[1:])):
df.loc[start:end, 'sample_num'] = i
df.sample_num = df.sample_num.astype(int)
return df
class TaskDataset(Dataset):
"""Babi task loader dataset."""
def __init__(self, dataset_path, max_n_facts=None):
self.fact_dataset = get_dataset_df(dataset_path,
max_n_facts=max_n_facts)
def __getitem__(self, ind):
slc = self.fact_dataset[self.fact_dataset.sample_num == ind]
references = slc[slc.phrase_num.isin(
slc.reference_num.values[-1])].text.values
sample = {
'facts': slc.text.values[:-1],
'question': slc.text.values[-1],
'answer': slc.answer.values[-1],
'references': references,
}
return sample
def __len__(self):
return self.fact_dataset.sample_num.max()
def sum_lengths(sentences):
return sum([len(s) for s in sentences])
class SentenceSampler:
"""Sampler of background text."""
def __init__(
self,
dataset,
tokenizer,
min_sentence_len=10,
max_sentence_len=None,
shuffle=False,
random_seed=42,
):
self.sample_ind = 0
self.dataset = dataset
self.sentences = []
self.tokenizer = tokenizer
self.min_sentence_len = min_sentence_len
self.max_sentence_len = max_sentence_len
self.sentence_tokenizer = nltk.PunktSentenceTokenizer()
self.shuffle = shuffle
self.gen = np.random.default_rng(seed=random_seed)
def get_sample(self, sample_size):
sample = []
total_len = 0
while True:
sentences = list(self.sentences)
for i, sent in enumerate(
sentences
): # add new sentence until sample_size is reached
tokenized = self.tokenizer.encode(sent,
add_special_tokens=False)
if not self.length_is_ok(tokenized):
continue
total_len += len(tokenized)
sample.append(tokenized)
if total_len >= sample_size:
self.sentences = self.sentences[i + 1:]
cutoff = total_len - sample_size
if cutoff > 0:
sample[-1] = sample[-1][:-cutoff]
return sample
self.sentences = []
self.sample_sentences_(
sample_size
) # appends new sentences, can be updated to just return new sentences
def sample_sentences_(self, sample_size):
sentences = []
while len(sentences) == 0:
text = self.next_sample_()
if self.shuffle:
if len(text) == 0:
continue
text = text[self.gen.choice(len(
text)):] # start from random position in text
text = text[:sample_size *
10] # cut too long texts to speed up tokenization
sentences += self.sentence_tokenizer.tokenize(text)
if self.shuffle:
sentences = sentences[1:-1]
self.sentences += sentences
def next_sample_(self):
if self.shuffle:
self.total_tokens = 0
sample_ind = self.gen.choice(len(self.dataset))
sample = self.dataset[int(sample_ind)]['text']
else:
sample = self.dataset[int(self.sample_ind)]['text']
self.sample_ind += 1
self.sample_ind = self.sample_ind % len(self.dataset)
return sample
def length_is_ok(self, tokenized):
if (self.max_sentence_len is not None
and len(tokenized) > self.max_sentence_len):
return False
if (self.min_sentence_len is not None
and len(tokenized) < self.min_sentence_len):
return False
return True
class NoiseInjectionDataset(Dataset):
"""Combined dataset for noisy babi QA.
It's recommended to use sample_size >= 1024 and task_end_pct - task_start_pct >= 0.2
"""
def __init__(
self,
task_dataset,
noise_sampler,
tokenizer,
task_start_pct=None, # left border of facts in sample, between 0 and 1
task_end_pct=None, # right border of facts in sample, between task_start_pct and 1
sample_size=1024,
mixed_length_ratio=0.0, # used for mixed length curriculum, prob for shorter samples
random_seed=42,
):
self.task_dataset = task_dataset
self.noise_sampler = noise_sampler
self.sample_size = sample_size
self.mixed_length_ratio = mixed_length_ratio
self.tokenizer = tokenizer
self.task_start_pct = task_start_pct
self.task_end_pct = task_end_pct
if random_seed:
self.gen = np.random.default_rng(seed=random_seed)
def __getitem__(self, ind):
sample = self.task_dataset[ind]
facts_tok = self.tokenizer(list(sample['facts']))['input_ids']
question_tok = self.tokenizer(sample['question'])['input_ids']
answer_tok = self.tokenizer(sample['answer'])['input_ids']
sample_size = self.get_sample_size()
task_len = sum_lengths(facts_tok)
background_text_len = sample_size - task_len
background_text = self.noise_sampler.get_sample(background_text_len)
sample['background_text'] = background_text
if (self.task_start_pct is None
and self.task_end_pct is None): # if fact position unspecified
possible_positions = range(len(background_text) + 1)
else:
task_start_ind = int(sample_size * self.task_start_pct)
task_end_ind = int(sample_size * self.task_end_pct)
total_facts_len = sum_lengths(facts_tok)
possible_positions = [] # where can we insert facts?
current_length = 0
for i, text in enumerate(background_text):
if (current_length >= task_start_ind) and (
current_length < task_end_ind - total_facts_len):
possible_positions.append(i)
current_length += len(text)
if len(possible_positions) == 0:
raise IndexError(
f'Unable to insert facts in specified place: {self.task_start_pct, self.task_end_pct}.'
f'Total fact length: {total_facts_len}, '
f'sentences length: {[len(t) for t in background_text]}. '
f'Make the range wider or increase the sample size.')
fact_positions = self.gen.choice(possible_positions, len(facts_tok))
fact_positions.sort()
sample['fact_positions'] = (
fact_positions # positions of facts between noise sentences
)
updated_sample = [[] for _ in range(len(background_text) + 1)]
for fact, pos in zip(facts_tok, fact_positions):
updated_sample[pos].append(fact)
for i, s in enumerate(background_text):
updated_sample[i].append(s)
flat = [i for s in updated_sample for i in s]
tokens = [i for s in flat for i in s]
sample['input_tokens'] = tokens
sample['question_tokens'] = question_tok
sample['target_tokens'] = answer_tok
return sample
def __len__(self):
return len(self.task_dataset)
def get_sample_size(self):
if isinstance(self.sample_size, list):
if self.gen.random() > self.mixed_length_ratio:
return self.gen.choice(self.sample_size)
return max(self.sample_size)
else:
return self.sample_size

View File

@ -0,0 +1,516 @@
# flake8: noqa: E501
SYSTEM_TEMPLATE = '{instruction}\n\n{examples}\n\n{post_prompt}'
USER_TEMPLATE = '<context>\n{context}\n</context>\n\nQuestion: {question}'
DEFAULT_TEMPLATE = f'{SYSTEM_TEMPLATE}\n\n{USER_TEMPLATE}'
CUSTOM_SYSTEM_PROMPTS = {
# https://github.com/dvlab-research/LongLoRA/blob/2345c6d030f61ac3a031906386a103a5b05e0e6f/inference.py#L18
'LONGLORA_LLAMA2':
'You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. '
'Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. '
'Please ensure that your responses are socially unbiased and positive in nature.\n\n'
'If a question does not make any sense, or is not factually coherent, explain why instead of answering '
'something not correct. If you don\'t know the answer to a question, please don\'t share false information.'
}
def get_formatted_input(
context,
question,
examples,
instruction,
post_prompt,
template=DEFAULT_TEMPLATE,
):
# pre_prompt - general instruction
# examples - in-context examples
# post_prompt - any additional instructions after examples
# context - text to use for qa
# question - question to answer based on context
formatted_input = template.format(
instruction=instruction,
examples=examples,
post_prompt=post_prompt,
context=context.strip(),
question=question,
)
return formatted_input.strip()
DEFAULT_PROMPTS = {
'qa1': {
'instruction':
'I will give you context with the facts about positions of different persons hidden in some random text '
'and a question. You need to answer the question based only on the information from the facts. '
'If a person was in different locations, use the latest location to answer the question.',
'examples':
'<example>\n'
'Charlie went to the hallway. Judith come back to the kitchen. Charlie travelled to balcony. '
'Where is Charlie?\n'
'Answer: The most recent location of Charlie is balcony.\n'
'</example>\n\n'
'<example>\n'
'Alan moved to the garage. Charlie went to the beach. Alan went to the shop. Rouse '
'travelled to balcony. Where is Alan?\n'
'Answer: The most recent location of Alan is shop.\n'
'</example>',
'post_prompt':
'Always return your answer in the following format: '
'The most recent location of person is location. Do not write anything else after that.',
},
'qa2': {
'instruction':
'I give you context with the facts about locations and actions of different persons '
'hidden in some random text and a question.'
'You need to answer the question based only on the information from the facts.\n'
'If a person got an item in the first location and travelled to the second location '
'the item is also in the second location. '
'If a person dropped an item in the first location and moved to the second location '
'the item remains in the first location.',
'examples':
'<example>\n'
'Charlie went to the kitchen. Charlie got a bottle. Charlie moved to the balcony. '
'Where is the bottle?\n'
'Answer: The bottle is in the balcony.\n'
'</example>\n'
'<example>\n'
'Alan moved to the garage. Alan got a screw driver. Alan moved to the kitchen. Where '
'is the screw driver?\n'
'Answer: The screw driver is in the kitchen.\n'
'</example>',
'post_prompt':
'Always return your answer in the following format: The item is in location. '
'Do not write anything else after that.',
},
'qa3': {
'instruction':
'I give you context with the facts about locations and actions of different persons '
'hidden in some random text and a question. '
'You need to answer the question based only on the information from the facts.\n'
'If a person got an item in the first location and travelled to the second location '
'the item is also in the second location. '
'If a person dropped an item in the first location and moved to the second location '
'the item remains in the first location.',
'examples':
'<example>\n'
'John journeyed to the bedroom. Mary grabbed the apple. Mary went back to the bathroom. '
'Daniel journeyed to the bedroom. Daniel moved to the garden. Mary travelled to the kitchen. '
'Where was the apple before the kitchen?\n'
'Answer: Before the kitchen the apple was in the bathroom.\n'
'</example>\n'
'<example>\n'
'John went back to the bedroom. John went back to the garden. John went back to the kitchen. '
'Sandra took the football. Sandra travelled to the garden. Sandra journeyed to the bedroom. '
'Where was the football before the bedroom?\n'
'Answer: Before the bedroom the football was in the garden.\n'
'</example>',
'post_prompt':
'Always return your answer in the following format: '
'Before the $location_1$ the $item$ was in the $location_2$. Do not write anything else after that.',
},
'qa4': {
'instruction':
'I will give you context with the facts about different people, their location and actions, hidden in '
'some random text and a question. '
'You need to answer the question based only on the information from the facts.',
'examples':
'<example>\n'
'The hallway is south of the kitchen. The bedroom is north of the kitchen. '
'What is the kitchen south of?\n'
'Answer: bedroom\n'
'</example>\n'
'<example>\n'
'The garden is west of the bedroom. The bedroom is west of the kitchen. What is west of the bedroom?\n'
'Answer: garden\n'
'</example>',
'post_prompt':
'Your answer should contain only one word - location. Do not write anything else after that.',
},
'qa5': {
'instruction':
'I will give you context with the facts about locations and their relations hidden in some random text '
'and a question. You need to answer the question based only on the information from the facts.',
'examples':
'<example>\n'
'Mary picked up the apple there. Mary gave the apple to Fred. Mary moved to the bedroom. '
'Bill took the milk there. Who did Mary give the apple to?\n'
'Answer: Fred\n'
'</example>\n'
'<example>\n'
'Jeff took the football there. Jeff passed the football to Fred. Jeff got the milk there. '
'Bill travelled to the bedroom. Who gave the football?\n'
'Answer: Jeff\n'
'</example>\n'
'<example>\n'
'Fred picked up the apple there. Fred handed the apple to Bill. Bill journeyed to the bedroom. '
'Jeff went back to the garden. What did Fred give to Bill?\n'
'Answer: apple\n'
'</example>',
'post_prompt':
'Your answer should contain only one word. Do not write anything else after that. '
'Do not explain your answer.',
},
'qa6': {
'instruction':
'I will give you context with the facts about people and their locations hidden in some random text and a '
'question. You need to answer the question based only on the information from the facts. '
'If a person was in different locations, use the latest location the person was in to answer the question.',
'examples':
'<example>\n'
'John travelled to the hallway. John travelled to the garden. Is John in the garden?\n'
'Answer: yes\n'
'</example>\n'
'<example>\n'
'Mary went to the office. Daniel journeyed to the hallway. Mary went to the bedroom. '
'Sandra went to the garden. Is Mary in the office?\n'
'Answer: no\n'
'</example>\n',
'post_prompt':
'Your answer should contain only one word - $yes$ or $no$. Do not write anything else after that. '
'Do not explain your answer.',
},
'qa7': {
'instruction':
'I will give you context with the facts about people and objects they carry, hidden in some random text '
'and a question. You need to answer the question based only on the information from the facts.',
'examples':
'<example>\n'
'Daniel went to the bedroom. Daniel got the apple there. How many objects is Daniel carrying?\n'
'Answer: one\n'
'</example>\n'
'<example>\n'
'Mary grabbed the apple there. Mary gave the apple to John. How many objects is Mary carrying?\n'
'Answer: none\n'
'</example>\n'
'<example>\n'
'Sandra travelled to the hallway. Sandra picked up the milk there. Sandra took the apple there. '
'Mary travelled to the garden. How many objects is Sandra carrying?\n'
'Answer: two\n'
'</example>\n',
'post_prompt':
'Your answer should contain only one word - $none$ or $number_of_objects$. '
'Do not write anything else after that. Do not explain your answer.',
},
'qa8': {
'instruction':
'I will give you context with the facts about people and objects they carry, hidden in some random text '
'and a question. You need to answer the question based only on the information from the facts.',
'examples':
'<example>\n'
'Sandra travelled to the garden. Mary grabbed the milk there. What is Mary carrying?\n'
'Answer: milk\n'
'</example>\n'
'<example>\n'
'Mary travelled to the kitchen. Sandra travelled to the office. John travelled to the office. '
'Sandra discarded the milk there. What is Sandra carrying?\n'
'Answer: nothing\n'
'</example>\n'
'<example>\n'
'Daniel grabbed the apple there. Mary went to the office. Daniel moved to the garden. '
'Daniel grabbed the milk there. Mary went to the kitchen. What is Daniel carrying?\n'
'Answer: apple,milk\n'
'</example>\n',
'post_prompt':
'Your answer should contain only one or two words: $nothing$ or $object$ or $object_1$, $object_2$. '
'Do not write anything else. Do not explain your answer.',
},
'qa9': {
'instruction':
'I will give you context with the facts about people and their locations hidden in some random text and '
'a question. You need to answer the question based only on the information from the facts. '
'If a person was in different locations, use the latest location the person was in to answer the question.',
'examples':
'<example>\n'
'John is not in the bathroom. Sandra is not in the bedroom. Is John in the bathroom?\n'
'Answer: no\n'
'</example>\n'
'<example>\n'
'Mary journeyed to the kitchen. John is in the bedroom. Sandra is not in the garden. '
'Is Mary in the kitchen?\n'
'Answer: yes\n'
'</example>\n',
'post_prompt':
'Your answer should contain only one word - $yes$ or $no$. Do not write anything else. '
'Do not explain your answer.',
},
'qa10': {
'instruction':
'I will give you context with the facts about people and their locations hidden in some random text and a '
'question. You need to answer the question based only on the information from the facts. '
'If a person was in different locations, use the latest location the person was in to answer the question.',
'examples':
'<example>\n'
'Bill is in the kitchen. Julie is either in the school or the cinema. Is Bill in the bedroom?\n'
'Answer: no\n'
'</example>\n'
'<example>\n'
'Fred is in the bedroom. Mary is either in the school or the cinema. Is Mary in the school?\n'
'Answer: maybe\n'
'</example>\n'
'<example>\n'
'Fred is either in the kitchen or the park. Bill moved to the cinema. Is Bill in the cinema?\n'
'Answer: yes\n'
'</example>\n'
'<context>\n',
'post_prompt':
'Your answer should contain only one word - $yes$ or $no$ or $maybe$. Do not write anything else. '
'Do not explain your answer.',
},
'qa11': {
'instruction':
'I will give you context with the facts about people and their locations hidden in some random text and a '
'question. You need to answer the question based only on the information from the facts. '
'If a person was in different locations, use the latest location the person was in to answer the question.',
'examples':
'<example>\n'
'Daniel journeyed to the hallway. After that he journeyed to the garden. Where is Daniel?\n'
'Answer: garden\n'
'</example>\n'
'<example>\n'
'Mary moved to the office. Afterwards she journeyed to the kitchen. Daniel went to the hallway. '
'Then he journeyed to the garden. Where is Mary?\n'
'Answer: kitchen\n'
'</example>\n'
'<example>\n'
'Sandra moved to the kitchen. After that she went back to the hallway. Sandra moved to the bedroom. '
'Then she went to the hallway. Mary moved to the bedroom. Afterwards she travelled to the bathroom. '
'Where is Sandra?\n'
'Answer: hallway\n'
'</example>\n'
'<context>\n',
'post_prompt':
'Your answer should contain only one word - location. Do not write anything else after that. '
'Do not explain your answer.',
},
'qa12': {
'instruction':
'I will give you context with the facts about people and their locations hidden in some random text and a '
'question. You need to answer the question based only on the information from the facts. '
'If a person was in different locations, use the latest location the person was in to answer the question.',
'examples':
'<example>\n'
'Mary and Daniel travelled to the bathroom. John and Daniel travelled to the office. Where is Daniel?\n'
'Answer: office\n'
'</example>\n'
'<example>\n'
'Sandra and Mary went back to the office. Daniel and Sandra went to the bedroom. Sandra and Mary travelled to the hallway. '
'John and Mary went to the kitchen. Where is Mary?\n'
'Answer: kitchen\n'
'</example>\n'
'<example>\n'
'Daniel and Sandra went back to the hallway. Daniel and John moved to the office. Daniel and John moved to the garden. '
'Daniel and Mary went back to the bathroom. Daniel and John went back to the kitchen. Daniel and Sandra went to the bathroom. '
'Where is John?\n'
'Answer: kitchen\n'
'</example>\n'
'<context>\n',
'post_prompt':
'Your answer should contain only one word - location. Do not write anything else after that. '
'Do not explain your answer.',
},
'qa13': {
'instruction':
'I will give you context with the facts about people and their locations hidden in some random text and a '
'question. You need to answer the question based only on the information from the facts. '
'If a person was in different locations, use the latest location the person was in to answer the question.',
'examples':
'<example>\n'
'Mary and Daniel travelled to the bathroom. Then they journeyed to the hallway. Where is Daniel?\n'
'Answer: hallway\n'
'</example>\n'
'<example>\n'
'Daniel and Sandra travelled to the kitchen. After that they journeyed to the hallway. Mary and Daniel travelled to the bedroom. '
'After that they travelled to the hallway. Where is Sandra?\n'
'Answer: hallway\n'
'</example>\n'
'<example>\n'
'John and Mary moved to the bathroom. Then they travelled to the office. John and Mary went to the kitchen. '
'Afterwards they went to the bedroom. John and Sandra moved to the bathroom. Following that they went back to the kitchen. '
'Where is Mary?\n'
'Answer: bedroom\n'
'</example>\n'
'<context>\n',
'post_prompt':
'Your answer should contain only one word - location. Do not write anything else after that. '
'Do not explain your answer.',
},
'qa14': {
'instruction':
'I will give you context with the facts about people and their locations hidden in some random text and a '
'question. You need to answer the question based only on the information from the facts. '
'If a person was in different locations, use the latest location the person was in to answer the question.',
'examples':
'<example>\n'
'Bill went back to the cinema yesterday. Julie went to the school this morning. Fred went to the park yesterday. '
'Yesterday Julie went to the office. Where was Julie before the school?\n'
'Answer: office\n'
'</example>\n'
'<example>\n'
'This morning Fred went to the kitchen. Fred journeyed to the bedroom yesterday. Mary travelled to the bedroom this morning. '
'Yesterday Mary went to the cinema. Where was Mary before the bedroom?\n'
'Answer: cinema\n'
'</example>\n'
'<example>\n'
'Yesterday Julie went back to the park. Julie went to the bedroom this morning. Bill journeyed to the cinema yesterday. '
'This morning Bill went back to the park. This evening Julie went to the school. This afternoon Julie went back to the park. '
'Where was Julie before the bedroom?\n'
'Answer: park\n'
'</example>\n'
'<context>\n',
'post_prompt':
'Your answer should contain only one word - location. Do not write anything else after that. '
'Do not explain your answer.',
},
'qa15': {
'instruction':
'I will give you context with the facts about animals, their names and relations. The facts and a question '
'are hidden in some random text. You need to answer the question based only on the information from the facts.',
'examples':
'<example>\n'
'Mice are afraid of wolves. Gertrude is a mouse. Cats are afraid of sheep. '
'Winona is a mouse. Sheep are afraid of wolves. Emily is a mouse. Jessica is a wolf. '
'What is gertrude afraid of?\n'
'Answer: wolf\n'
'</example>\n'
'<example>\n'
'Mice are afraid of wolves. Gertrude is a mouse. Cats are afraid of sheep. '
'Winona is a mouse. Sheep are afraid of wolves. Emily is a mouse. Jessica is a wolf. '
'What is jessica afraid of?\n'
'Answer: cat\n'
'</example>\n'
'<example>\n'
'Mice are afraid of cats. Wolves are afraid of sheep. Emily is a wolf. '
'Cats are afraid of sheep. Gertrude is a wolf. Sheep are afraid of cats. Winona is a wolf. '
'What is emily afraid of?\n'
'Answer: sheep\n'
'</example>\n'
'<context>\n',
'post_prompt':
'Your answer should contain only one word - an animal species. Do not write anything else after that. '
'Do not explain your answer.',
},
'qa16': {
'instruction':
'I will give you context with the facts about animals, their names and colors. The facts and a question '
'are hidden in some random text. You need to answer the question based only on the information from the facts.',
'examples':
'<example>\n'
'Lily is a frog. Bernhard is a frog. Bernhard is green. Brian is a lion. Brian is white. '
'Julius is a swan. Julius is green. Lily is green. Greg is a swan. What color is Greg?\n'
'Answer: green\n'
'</example>\n'
'<example>\n'
'Julius is a lion. Lily is a rhino. Bernhard is a swan. Lily is white. Bernhard is green. '
'Greg is a rhino. Greg is gray. Julius is white. Brian is a lion. What color is Brian?\n'
'Answer: white\n'
'</example>\n'
'<example>\n'
'Brian is a rhino. Julius is a lion. Bernhard is a lion. Greg is a swan. Brian is gray. '
'Greg is white. Lily is a rhino. Bernhard is yellow. Lily is gray. What color is Julius?\n'
'Answer: yellow\n'
'</example>\n'
'<context>\n',
'post_prompt':
'Your answer should contain only one word - a color. Do not write anything else after that. '
'Do not explain your answer.',
},
'qa17': {
'instruction':
'I will give you context with the facts about different figures, their location and colors, hidden in '
'some random text and a question. '
'You need to answer the question based only on the information from the facts.',
'examples':
'<example>\n'
'The triangle is above the pink rectangle. The blue square is to the left of the triangle. '
'Is the pink rectangle to the right of the blue square?\n'
'Answer: yes\n'
'</example>\n'
'<example>\n'
'The red sphere is to the left of the yellow square. The red sphere is below the pink rectangle. '
'Is the pink rectangle to the left of the yellow square?\n'
'Answer: yes\n'
'</example>'
'<example>\n'
'The red sphere is above the pink rectangle. The red sphere is to the right of the red square. '
'Is the pink rectangle above the red square?\n'
'Answer: no\n'
'</example>',
'post_prompt':
'Your answer should contain only one word - $yes$ or $no$. Do not write anything else. '
'Do not explain your answer.',
},
'qa18': {
'instruction':
'I will give you context with the facts about different objects and their sizes, hidden in '
'some random text and a question. '
'You need to answer the question based only on the information from the facts.',
'examples':
'<example>\n'
'The box of chocolates fits inside the chest. The box is bigger than the chest. The box is bigger than the suitcase. '
'The suitcase fits inside the box. The container is bigger than the box of chocolates. Does the box fit in the box of chocolates?\n'
'Answer: no\n'
'</example>\n'
'<example>\n'
'The suitcase is bigger than the container. The container fits inside the box. The chest is bigger than the chocolate.'
'The suitcase fits inside the box. The chest fits inside the box. Does the chocolate fit in the box?\n'
'Answer: yes\n'
'</example>'
'<example>\n'
'The chocolate fits inside the box of chocolates. The suitcase fits inside the box. The chocolate fits inside the box. '
'The box is bigger than the box of chocolates. The suitcase is bigger than the box of chocolates. Is the chocolate bigger than the box?\n'
'Answer: no\n'
'</example>',
'post_prompt':
'Your answer should contain only one word - $yes$ or $no$. Do not write anything else. '
'Do not explain your answer.',
},
'qa19': {
'instruction':
'I will give you context with the facts about different places and their locations, hidden in '
'some random text and a question. '
'You need to answer the question based only on the information from the facts.',
'examples':
'<example>\n'
'The office is east of the hallway. The kitchen is north of the office. The garden is west of the bedroom. '
'The office is west of the garden. The bathroom is north of the garden. How do you go from the kitchen to the garden?\n'
'Answer: s,e\n'
'</example>\n'
'<example>\n'
'The bedroom is west of the hallway. The office is east of the garden. The garden is north of the kitchen. '
'The kitchen is north of the bathroom. The hallway is west of the garden. How do you go from the kitchen to the hallway?\n'
'Answer: n,w\n'
'</example>\n'
'<example>\n'
'The bedroom is south of the hallway. The bathroom is east of the office. The kitchen is west of the garden. '
'The garden is south of the office. The office is south of the bedroom. How do you go from the garden to the bedroom?\n'
'Answer: n,n\n'
'</example>\n',
'post_prompt':
'Your answer should contain only two letters, separated by a comma - ordinal directions. You can choose the letters from '
'$n$, $s$, $e$ and $w$. Do not write anything else after that.',
},
'qa20': {
'instruction':
'I will give you context with the facts about people, their locations and condition hidden in some random text and a '
'question. You need to answer the question based only on the information from the facts. '
'If a person was in different locations, use the latest location the person was in to answer the question.',
'examples':
'<example>\n'
'Sumit is tired. Where will sumit go?\n'
'Answer: bedroom\n'
'</example>\n'
'<example>\n'
'Yann is hungry. Yann journeyed to the kitchen. Why did yann go to the kitchen?\n'
'Answer: hungry\n'
'</example>\n'
'<example>\n'
'Antoine is thirsty. Yann is tired. Yann went back to the bedroom. Yann picked up the pajamas there.'
'Jason is thirsty. Antoine went back to the kitchen. Why did antoine go to the kitchen?\n'
'Answer: thirsty\n'
'</example>\n'
'<context>\n',
'post_prompt':
'Your answer should contain only one word - a person condition or a place. Do not write anything else after that. '
'Do not explain your answer.',
},
}

View File

@ -1,4 +1,5 @@
import json
import os
import re
from os import environ
@ -140,7 +141,7 @@ def extract_answer(response_text: str):
class MATHDataset(BaseDataset):
@staticmethod
def load(path: str):
def load(path: str, file_name: str = 'math.json'):
path = get_data_path(path)
dataset = DatasetDict()
raw_data = []
@ -155,7 +156,8 @@ class MATHDataset(BaseDataset):
extract_boxed_answer(item['solution'])
})
else:
data = json.load(open(path))
file_path = os.path.join(path, file_name)
data = json.load(open(file_path))
for i in data.keys():
raw_data.append({
'problem':

View File

@ -0,0 +1 @@
from .musr import * # noqa: F401, F403

View File

@ -0,0 +1,81 @@
# flake8: noqa: E501
story = """
In the smoke-filled haze of a thriving jazz club, Alice met her explosive end, leaving Detective Winston to sift through the suspects: Eugene, the shy pianist, and Gabrielle, the sassy club singer.
While seated at his desk at the precinct, Winston received a phone call from a certain observant local bartender, tipping off the police about a harsh row peaking in a nearby jazz club. He signaled to his partner as they promptly dispatched to the scene, already ringing with sirens and a restless crowd.
With the police line restraining the horde, the jazz club was undergoing a full round-up as Winston approached the informative bartender. The bartender was engrossed in his account to the officers about a raucous, punch throwing fight Eugene was part of, to his best recollection. Winston remembered Eugene, a jazz fanaticlurking around the jazz corners more often than anyone else could recount.
In the heart of the upheaval, lay a woman sprawled on the floor, later identified as Alice, a frequent face at the jazz scene and a financial analyst deeply engrossed in financial transactions. In public, Alice had made her concerns known about her discovery of fraudulent transactions at the bank, promising to report the same to the authorities. Eugene, remembered conspicuously for being a bank teller at the same bank Alice worked at, suddenly seemed closely linked.
Eugenes arrest was far from hushed, with the local news broadcasting the progressing drama live, catching sight of Eugene curtailed in handcuffs. Concurrently, it was ascertainedEugene was a member of the jazz club. This evidence backed by a jazz club membership card retrieved from his wallet during the arrest.
Just a few steps away, he noticed a man in a suit, the bouncer, a calm figure amid the bedlam. In their conversation, the bouncer corroborated that he had indeed seen Eugene involved in a heated scuffle, landing a few punches. The whisperings were starting to gain momentum, since Eugene was believed to be on the losing end of a lawsuita battle courtesy of Alice charging Eugene with the financial fraud she had publicly vowed to expose.
Eugene was known for his frequent presence at the jazz club and on top of that, was an actual member. Therefore, it was hardly a leap to presume Alice meeting her untimely end at the club was no mere happenstance. The jazz club, despite its dim lights and pulsating music, was a public place easily accessible to all, including potential suspects like Eugene and, sadly, the ill-starred Alice.
Det. Winston knew he was now tasked with a cryptic puzzle. A bank teller, embroiled in suspected fraud and a lawsuit, a jazz club murder scene and a local financial analystall woven into a ghastly murder mystery. He sighed in distaste as Eugene was escorted awaya man still oblivious to the chain of events waiting for him. But Winston knew, the night had only just begun for him.
Winston stared down at the crumpled microphone on the floor. He picked it up gingerly, turning it in his hand. The club was in disarray, debris scattered like confetti. The lab boys were still picking pieces of the grenade apart.
"Gabrielle's microphone," the coroner confirmed, barely looking up from his task.
"Give him the once-over for evidence," Winston said, handing the microphone to a nearby officer.
Leaving the club behind him, Winston sighed heavily. The world of jazz had taken a dark turn that night. Alice, the acclaimed critic with her sarcastic wit and keen critical eye, had been last seen alive here. Her purse lay in the club untouched, a testament to the abruptness of the event.
Gabrielle had been working as a war correspondent. Winston had read her articles. They were richly detailed, passionate, and highlighted the harsh reality of war zones. Gabrielle hadn't been shy about sharing her experiences or publicly criticizing the military in her pieces. She boldly interviewed military personnel and spent extended periods in conflict zones.
Alice, though, never missed a chance to pick apart Gabrielle's articles. The vitriolic snippets in Alices column were regular features and Gabrielle's staunch defense of her articles, her work in the jazz scene, did little against Alice's respected reputation.
The tension between them was palpable. Alice had been awarded a major journalist award that Gabrielle had desired. This only deepened their rivalry, with Gabrielle feeling overlooked for this recognition in the Jazz scene.
Winston cast his gaze over the club once morea hub of pulsating rhythms now eerily silent.
A significant part of the evening was Gabrielle's recorded interview with Alice. It played on the local radio, their professional rivalry subtly echoing under their professional demeanor.
With a deep breath, Winston knew he had a tall task ahead. The jazz club, where Alice was last seen alive was now shrouded in an eerie silence, the vibrant rhythms of what used to be a lively night echoing in the abandoned stage. It was up to him to piece together the missing notes and bring the symphony of this unsolved case to a satisfying finale.
Who is the most likely murderer?
Pick one of the following choices:
1 - Eugene
2 - Gabrielle
You must pick one option. Before selecting a choice, explain your reasoning step by step. The murderer needs to have a means (access to weapon), motive (reason to kill the victim), and opportunity (access to crime scene) in order to have killed the victim. Innocent suspects may have two of these proven, but not all three. An innocent suspect may be suspicious for some other reason, but they will not have all of motive, means, and opportunity established.
If you believe that both suspects have motive, means, and opportunity, you should make an educated guess and pick the one for whom these are best established. If you believe that neither suspect has all three established, then choose the suspect where these are most clearly established. Explain your reasoning step by step before you answer. Finally, the last thing you generate should be "ANSWER: (your answer here, including the choice number)"
""".strip()
reasoning = """
Let's break this down step-by-step by first deducing which of the two suspects has a means, motive, and opportunity.
We will start with Eugene.
Eugene was being sued by Alice for fraudulent transactions. The charge was also very public. Both of these facts point to Eugene having a strong motive.
Because Eugene has a jazz club membership, and we can deduce that the jazz club membership belongs to the same club Alice was murdered in, we can assume Eugene has an opportunity to commit the crime.
Although we know Eugene is aggressive because he was throwing punches in the story, we do not know if he has access to the murder weapon. Because he does not have access to a grenade, he does not have a means.
Let's review Gabrielle next.
Gabrielle's purse was found at the scene of the crime, and we can then assume she had the opportunity to kill Alice.
Because Gabrielle has been in conflict zones with military personnel, it's possible that she has access to a grenade. We can say that Gabrielle has a potential means to kill the victim.
Finally, it appears that Gabrielle and Alice had a rivalry over journalism, which could have boiled up into physical action. Because of this, we can say that Gabrielle has a potential motive to kill the victim.
Now, reviewing the evidence, we see that:
Eugene has a motive and opportunity but no means.
Gabrielle has a motive, means, and opportunity.
Therefore, Gabrielle is the most likely murderer.
ANSWER: 2
""".strip()
murder_mystery_solved_ex = f'{story}\n\n{reasoning}'

View File

@ -0,0 +1,309 @@
# flake8: noqa: E501
import json
import os.path as osp
from datasets import Dataset
from opencompass.datasets.base import BaseDataset
from opencompass.openicl import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils import get_data_path
from .murder_mystery_solved_ex import murder_mystery_solved_ex
from .object_placements_solved_ex import object_placements_solved_ex
from .team_allocation_solved_ex import team_allocation_solved_ex
from .tree import LogicTree
DATASET_CONFIGS = {
'murder_mysteries': {
'file_name':
'murder_mysteries.json',
'ex':
murder_mystery_solved_ex, # write user example here
'system_prompt':
'You are a helpful assistant that will answer the questions given by the user.',
'hint':
('Before selecting a choice, explain your reasoning step by step. '
'The murderer needs to have a means (access to weapon), motive (reason to kill the victim), '
'and opportunity (access to crime scene) in order to have killed the victim. '
'Innocent suspects may have two of these proven, but not all three. '
'An innocent suspect may be suspicious for some other reason, but they will not have all of motive, '
'means, and opportunity established.\n\n'
'If you believe that both suspects have motive, means, and opportunity, you should make an educated guess '
'and pick the one for whom these are best established. If you believe that neither suspect has all '
'three established, then choose the suspect where these are most clearly established.'
),
'hint_before_question':
False,
'answer_index_modifier':
1
},
'object_placements': {
'file_name':
'object_placements.json',
'ex':
object_placements_solved_ex,
'skip_ablated':
True,
'ablation_depth_modifier':
2,
'system_prompt':
'You are a helpful assistant that will answer the questions given by the user.',
'hint':
('Based on this story, we want to identify where someone believes that a certain object is at the end of '
'the story. In order to do that, you need to read the story and keep track of where they think the object '
'is at each point. When an object is moved, the person may observe its new location if they saw it move.\n\n'
'To see where an object ends up, they must be able to see the location that it moves to and not be too '
'distracted by what they are doing. If they do not observe the object moving, then they will still believe '
'it to be in the last location where they observed it.'),
'hint_before_question':
True,
'answer_index_modifier':
1
},
'team_allocation': {
'file_name':
'team_allocation.json',
'ex':
team_allocation_solved_ex,
'system_prompt':
'You are a helpful assistant that will answer the questions given by the user.',
'hint':
('The story should allow you to determine how good each person is at a skill. Roughly, each person is '
'either great, acceptable, or bad at a task. We want to find an optimal assignment of people to tasks '
'that uses their skills as well as possible. In addition, one task will have to have two people assigned '
'to it. The effectiveness of their teamwork (great team, acceptable team, or bad team) also impacts the '
'overall quality of the assignment.\n\n'
'When two people need to work on a task and one is bad at it, they don\'t necessarily benefit from the '
'other person being good, unless they work well together.\n\n'
'With different strengths, weaknesses, and interpersonal dynamics at play, you should allocate your team '
'to find the single assignment to ensure that the tasks overall are completed as effectively as possible.'
),
'hint_before_question':
False,
'answer_index_modifier':
1
}
}
@LOAD_DATASET.register_module()
class MusrDataset(BaseDataset):
"""MuSR.
Args:
path (str): path to dataset
name (str): name of dataset
self_consistency_n (int): Number of prompt copies generated per question for self-consistency
exclude_contrastive_examples (bool): Whether to exclude contrastive examples
reverse_contrastive_sample (bool): Whether to reverse the selection of contrastive samples
skip_ablated (bool): Whether to skip ablated samples
offset (int): Starting offset for the dataset
sample_size (int): Sample size, None indicates using the entire dataset.
"""
@staticmethod
def load(path,
name,
self_consistency_n=1,
exclude_contrastive_examples=False,
reverse_contrastive_sample=False,
skip_ablated=False,
randomize=False,
offset=0,
sample_size=None,
**kwargs):
"""Load the dataset and flatten fields while constructing prompts,
taking self_consistency_n and ablations into account."""
if name not in DATASET_CONFIGS:
raise ValueError(
f'Dataset name {name} not supported. Must be one of {list(DATASET_CONFIGS.keys())}'
)
config = DATASET_CONFIGS[name]
path = get_data_path(path)
file_path = osp.join(path, config['file_name'])
with open(file_path, 'r', encoding='utf-8') as f:
dataset = json.load(f)
filtered_dataset = []
hashes_done = []
for example in dataset:
if exclude_contrastive_examples and example['questions'][0].get('intermediate_data') and \
len(example['questions'][0].get('intermediate_data')) > 0 and \
example['questions'][0]['intermediate_data'][0].get('story_hash_id'):
story_hash = example['questions'][0]['intermediate_data'][0][
'story_hash_id']
if story_hash in hashes_done:
if reverse_contrastive_sample:
filtered_dataset.append(example)
else:
continue
elif not reverse_contrastive_sample:
filtered_dataset.append(example)
hashes_done.append(story_hash)
else:
filtered_dataset.append(example)
filtered_dataset = filtered_dataset[
offset:offset +
min(len(filtered_dataset), sample_size) if sample_size else None]
ablations = [
# {'prompt': 'regular', 'name': 'regular'},
# {'prompt': 'cot', 'name': 'cot'},
{
'prompt': 'cot+',
'name': 'cot+'
},
]
# create prompts
flattened_data = []
for example in filtered_dataset:
context = example['context']
questions = example['questions']
for question in questions:
choices_list = question['choices']
choices_str = '\n'.join([
f'{idx + 1} - {choice}'
for idx, choice in enumerate(choices_list)
])
gold_answer = question['answer'] + config.get(
'answer_index_modifier', 1)
for ablation in ablations:
prompt_style = ablation.get('prompt', 'cot+')
ablation_name = ablation.get('name', 'cot+')
for scidx in range(self_consistency_n):
ex_str = ''
if ablation.get('use_example') and config.get('ex'):
ex_str = (
'Here is an example of solving the task:\n\n' +
config.get('ex') +
'\n\nThis is the end of the example. The real task is below.\n\n---\n\n'
)
if prompt_style == 'regular':
prompt = f'{ex_str}{context}\n\n{question["question"]}\n\n' \
f'Pick one of the following choices:\n{choices_str}\n\n' \
'You must pick one option. Finally, the last thing you generate should be "ANSWER: (your answer here, include the choice number)"'
elif prompt_style == 'cot':
prompt = f'{ex_str}{context}\n\n{question["question"]}\n\n' \
f'Pick one of the following choices:\n{choices_str}\n\n' \
'You must pick one option. Explain your reasoning step by step before you answer. ' \
'Finally, the last thing you generate should be "ANSWER: (your answer here, include the choice number)"'
elif prompt_style == 'cot+':
if config.get('hint_before_question'):
prompt = f'{ex_str}{context}\n\n{config["hint"]}\n\n{question["question"]}\n\n' \
f'Pick one of the following choices:\n{choices_str}\n\n' \
'You must pick one option. Explain your reasoning step by step before you answer. ' \
'Finally, the last thing you generate should be "ANSWER: (your answer here, including the choice number)"'
else:
prompt = f'{ex_str}{context}\n\n{question["question"]}\n\n' \
f'Pick one of the following choices:\n{choices_str}\n\n' \
f'You must pick one option. {config["hint"]} Explain your reasoning step by step before you answer. ' \
'Finally, the last thing you generate should be "ANSWER: (your answer here, including the choice number)"'
else:
if len(question['intermediate_trees']
) == 0 or config.get('skip_ablated', False):
continue
prompt = f'{ex_str}Answer the following questions given the list of facts per answer choice.\n\n'
for c, t in zip(choices_str.split('\n'),
question['intermediate_trees']):
# extract facts from intermediate_trees
facts = list(
set([
x.value for x in
LogicTree.from_json(t).get_facts(
include_cs=ablation.get(
'include_cs', False),
include_deductions_from_level=-1,
no_facts_after_depth=ablation.get(
'no_facts_after_depth', 3) +
config.get(
'ablation_depth_modifier', 0))
]))
if config.get('allow_sorted_facts', True):
facts = sorted(facts)
facts_str = '\n'.join(
[f'- {fact}' for fact in facts])
prompt += f'Facts for Choice {c}:\n{facts_str}\n\n'
prompt += f'Given the list of facts per answer choice, answer the following question\n\n' \
f'{question["question"]}\n\n' \
f'Pick one of the following choices:\n{choices_str}\n\n' \
'You must pick one option. After you have found the answer, say it in this format "ANSWER: (your answer here, include the choice number)"'
flattened_example = {
'context':
context,
'question_text':
question['question'],
'question':
question,
'answer':
question['answer'],
'choices':
choices_list,
'choices_str':
choices_str,
'intermediate_trees':
question.get('intermediate_trees', []),
'intermediate_data':
question.get('intermediate_data', []),
'prompt':
prompt,
'system_prompt':
config.get('system_prompt', ''),
'gold_answer':
gold_answer,
'scidx':
scidx, # self-consistency index
'self_consistency_n':
self_consistency_n,
'ablation_name':
ablation_name,
}
flattened_data.append(flattened_example)
dataset = Dataset.from_list(flattened_data)
return dataset
@ICL_EVALUATORS.register_module()
class MusrEvaluator(BaseEvaluator):
def __init__(self, answer_index_modifier=1, self_consistency_n=1):
self.answer_index_modifier = answer_index_modifier
self.self_consistency_n = self_consistency_n
def score(self, predictions, references):
correct = 0
assert len(predictions) == len(
references
), 'Predictions and references must have the same length!'
total = len(predictions)
for pred, ref in zip(predictions, references):
if 'ANSWER:' in pred:
answer_line = [
line for line in pred.split('\n') if 'ANSWER:' in line
]
if answer_line:
answer = answer_line[0].split('ANSWER:')[-1].strip()
import re
match = re.search(r'\d+', answer)
if match:
pred_answer = int(match.group())
if pred_answer == ref:
correct += 1
accuracy = 100 * correct / total if total > 0 else 0
return {'accuracy': accuracy}
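# Minimal usage sketch (assumptions: references already carry the 1-based gold
# choice index, i.e. answer + answer_index_modifier, as built in the flattened
# dataset above; predictions are raw model outputs ending in an "ANSWER:" line).
if __name__ == '__main__':
    _evaluator = MusrEvaluator(answer_index_modifier=1)
    _predictions = [
        'Step-by-step reasoning...\nANSWER: 3',  # matches its reference
        'ANSWER: (2)',  # does not match its reference
    ]
    _references = [3, 1]
    print(_evaluator.score(_predictions, _references))  # {'accuracy': 50.0}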

View File

@ -0,0 +1,53 @@
# flake8: noqa: E501
story = '''
Petra, the dedicated housewife, felt a thrill at the thought of her surprise anniversary dinner for her husband, Daniel. She had been skillfully maneuvering around Daniel's eagerness to pitch in without disappointing him or giving up her surprise.
Daniel, ever-the-observant-husband, noted Petra's unusual enthusiasm about the day's menu. Despite not knowing the details, he appreciated her effort and wanted to help; silently, he decided to deploy his best skill: patiently awaiting his moment to help, maybe when Petra asked for something from the pantry. Amidst the excitement, there was Clara, their maid, ever diligent and efficient, trying to keep the surroundings perfect for this special evening.
Tucked away, under the counter, was Petra's secret recipe book, her culinary treasure. Her solace in confusing times, her secret weapon during their flavorful adventures. While toward the back of the pantry, was the glass jar of Petra's favorite spice blends, something that Daniel was well aware of, in case an opportunity arose for him to assist or distract when Petra might need it.
All three residents of the home were aware of each item's location. The secret recipe book under the counter, the glass jar in the pantry, and the anxious excitement that filled the air—a fragrance even more intoxicating than the delicious smells that would soon fill the kitchen.
With tact and secrecy, Petra relocated her cherished recipe book from its hidden spot under the counter to its temporary home on the kitchen table. The pages were swiftly opened to reveal her secret recipes which she was eager to start preparing for the long-awaited anniversary surprise. While Petra was engrossed in her preparations, Clara continued her sweeping routine in the kitchen. Clara's steady broom strokes on the wooden floor echoed a little in the otherwise busy and humming kitchen. In the background, beyond the kitchen door, Daniel could be seen in the dining room, meticulously setting the table for the anticipated special dinner.
The placement of the rooms allowed Clara to easily notice Petra's movements in her peripheral vision while she was executing her chores. Every move Petra made was observed in Clara's line of sight. Simultaneously, separated by the walls, Daniel was diligently arranging the tableware in the dining room which was separate from Petra's bustling kitchen.
Hoping to spruce up the setting, Daniel delicately relocated a glass jar filled with decorative pebbles to the center of the dining table. His subtle contribution for the evening - a perfectly presentable table for their special anniversary dinner. Amidst the flurry of the special day's preparations, Clara diligently carried on with her duties in the upstairs bathroom, unseen from the dining room. Meanwhile, Petra was wholly engrossed in the allure of a new recipe in her cherished, hidden book which lay opened on the kitchen island, away from prying eyes of the dining room.
In the middle of her usual tidying, Clara spotted Petra's treasured recipe book on the kitchen table. Ensuring it stayed clandestine, Clara carefully transferred it back to its usual hideaway spot beneath the counter. In the midst of the anniversary excitement, Clara deftly transferred Petra's secret weapon back to its hiding place when Daniel stepped out into the garage to retrieve extra utensils. Performing her duty with a sense of urgency, she made sure to move quietly to not disturb Petra, who was engrossed in the process of boiling a massive pot of pasta water on the stove.
Despite the commotion and fervor in the kitchen, the hubbub did not stretch as far as the garage, which remained undisturbed by the domestic activity occurring in the main part of the house. Meanwhile, in the kitchen, Petra was oblivious to Clara's subtle maneuver while she busied herself at the stove, focused on making sure the large pot of water reached the perfect boil.
In the end, the careful orchestration of duties by each individual within the house concluded in a harmonious anniversary celebration. The marks of a successful evening consisted of a delectable meal, a serene atmosphere, and the memory of a smooth, incident-free evening where everyone played their role to perfection.
Based on this story, we want to identify where someone believes that a certain object is at the end of the story. In order to do that, you need to read the story and keep track of where they think the object is at each point. When an object is moved, the person may observe its new location if they saw it move.
To see where an object ends up, they must be able to see the location that it moves to and not be too distracted by what they are doing. If they do not observe the object moving, then they will still believe it to be in the last location where they observed it.
Which location is the most likely place Clara would look to find the glass jar given the story?
Pick one of the following choices:
1 - dining table
2 - kitchen table
3 - pantry
4 - under counter
You must pick one option. Explain your reasoning step by step before you answer. Finally, the last thing you generate should be "ANSWER: (your answer here, including the choice number)"
'''.strip()
reasoning = '''
Let's solve this by thinking step-by-step. We want to know where Clara will check to find the glass jar, so let's track where Clara sees the glass jar throughout the story.
At the beginning of the story, it is stated that "All three residents of the home were aware of each item's location... the glass jar in the pantry." From this, we can conclude that the first place in the story where Clara sees the glass jar is in the pantry.
Throughout the story, the glass jar only moves once to the dining table. However, while Daniel was moving the glass jar, Clara was upstairs in the restroom carrying out her duties. It's highly unlikely that she saw Daniel move the glass jar, so we can assume that she still believes it to be in the pantry.
Clara does go to the kitchen in the story and moves a recipe book from the kitchen table, but because it's the kitchen table and not the dining room table, we can assume she hasn't seen the glass jar there.
Now, given the story and evidence, we can assume that Clara believes the glass jar to be in the pantry.
ANSWER: 3
'''.strip()
object_placements_solved_ex = f'{story}\n\n{reasoning}'

View File

@ -0,0 +1,72 @@
# flake8: noqa: E501
story = '''
In the quaint community of Midvale, the local school stood as a beacon of enlightenment, nurturing the minds of the next generation. The teachers, the lifeblood of this institution, were tasked with the noble duty of education, while the unsung heroes, the maintenance crew, ensured the smooth functioning of the school's infrastructure. Amidst this, three town residents, Angela, Greg, and Travis, found themselves at a juncture of life where they were presented with the opportunity to serve in one of these crucial roles. The challenge now lay in the hands of the manager, who had to assign them to either teaching or maintenance, a decision that would set the course for their contributions to the school.
Angela was a fiercely independent woman, beset with a unique set of strengths and weaknesses. She was a woman of very few words, often finding it hard to articulate her thoughts and explain things clearly. Venturing into education seemed a maze with her apathetic attitude towards learning. She was also seen to be disinterested in reading and the literary field as a whole. This was a juxtaposition to her inability to contribute to maintenance duties because of her fear of tools and machinery, a sinister remnant of a past accident that still haunted her. The basic handyman skills, which most locals learned growing up, were also absent from her repertoire.
Angela's interactions with Greg and Travis further complicated the equation. On one hand, Greg and Angela had a habit of arguing constantly over trivial matters, which once culminated in their failure to complete a shared basic training exercise adequately. On the other hand, Angela and Travis simply had nothing in common. Their conversations were often fraught with awkward silences, indicative of their lack of shared interests. This lack of coordination was epitomized during a recent team-building exercise when their team finished last.
Greg was the blue-collar type with a broad frame and muscular build. He had a work ethic that never shied away from toiling through the day to get things done. Growing up, he often helped his father with simple home repairs and minor renovations, learning the ropes of basic handiwork. Additionally, Greg had fortified his skills while refurbishing an old shed with Travis, a testament to their compatible personalities. However, his dislike for education was well known throughout town, further amplified by his lack of patience, especially with children.
Travis, the third cog in the wheel, was a man of many peculiarities. His stage fright was almost legendary and made it nearly impossible for him to stand in front of a crowd. Often, the mere thought of it could unnerve him. His physical constitution was lightweight and fragile, and long hours of manual labor made him weary. He also had a revulsion towards dirt that he complained about at every opportune moment. Like the others, studying did not appeal to him much, so much so that he had stopped reading completely after leaving school prematurely.
The manager understood well that a team's success depends heavily on the contribution and compatibility of each member. He observed, analyzed, and considered. Now, it was up to him to assign roles to Angela, Greg, and Travis. The school needed educators and maintenance staff, and each had to play their part perfectly.
Given the story, how would you uniquely allocate each person to make sure both tasks are accomplished efficiently?
Pick one of the following choices:
1 - Teaching: Travis, Maintenance: Angela and Greg
2 - Teaching: Greg, Maintenance: Angela and Travis
3 - Teaching: Angela, Maintenance: Greg and Travis
You must pick one option. The story should allow you to determine how good each person is at a skill. Roughly, each person is either great, acceptable, or bad at a task. We want to find an optimal assignment of people to tasks that uses their skills as well as possible. In addition, one task will have to have two people assigned to it. The effectiveness of their teamwork (great team, acceptable team, or bad team) also impacts the overall quality of the assignment.
When two people need to work on a task and one is bad at it, they don't necessarily benefit from the other person being good, unless they work well together.
With different strengths, weaknesses, and interpersonal dynamics at play, you should allocate your team to find the single assignment to ensure that the tasks overall are completed as effectively as possible.
Explain your reasoning step by step before you answer. Finally, the last thing you generate should be "ANSWER: (your answer here, including the choice number)"
'''.strip()
reasoning = '''
Let's solve this by thinking step-by-step. First, we will figure out each person's skill level for each task. Then, we can measure how well they all work together in pairs. From this, we can find the most efficient assignment that maximizes the scores.
Let's start with Angela. Angela can't articulate her thoughts, and she seems unprepared for teaching. So, let's assume her skill level is 1 for teaching. She also is bad at maintenance due to her fear of tools and machinery. So, let's assume her skill level is 1 for maintenance as well.
Now, let's look at Greg. Greg has a dislike for education and a lack of patience, so let's assume his skill level for teaching is 1. However, Greg has helped with home repairs and minor renovations, so let's assume his maintenance skill level is 2.
Finally, let's look at Travis. Travis has extreme stage fright, which will make it difficult to teach, so let's assume his teaching skill level is 1. He also has a lightweight and fragile frame as well as hates dirt, so let's assume his maintenance skill level is 1.
Now, let's look at the relationships and how people work together.
Angela and Greg do not get along; they are constantly arguing, so let's assume their ability to work together is 1.
Angela and Travis aren't much better. They both have nothing in common, and they couldn't do a team-building exercise previously, so let's assume their ability to work together is 1.
Finally, Greg and Travis have worked together, and their personalities seem to meld, so let's assume they work well together with a score of 3.
Let's summarize and figure out the best assignment.
Angela is bad at teaching. (1)
Angela is bad at maintenance. (1)
Angela does not work well with Greg. (1)
Angela does not work well with Travis. (1)
Greg is bad at teaching. (1)
Greg is okay with maintenance. (2)
Greg and Travis work well together. (3)
Travis is bad at teaching. (1)
Travis is bad at maintenance. (1)
Now, let's find the best assignment.
Option 1: Travis as a teacher (1) + Angela working in maintenance (1) + Greg working in maintenance (2) + Angela and Greg work badly together (1) = 5
Option 2: Greg as a teacher (1) + Angela working in maintenance (1) + Travis working in maintenance (1) + Angela and Travis work badly together (1) = 4
Option 3: Angela as a teacher (1) + Greg working in maintenance (2) + Travis working in maintenance (1) + Greg and Travis work well together (3) = 7
So, from this, we can see Option 3 has the maximum score.
ANSWER: 3
'''.strip()
team_allocation_solved_ex = f'{story}\n\n{reasoning}'
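# Minimal sketch of the scoring scheme the reasoning above walks through: each
# option sums the teacher's skill, both maintainers' skills, and the
# maintainers' teamwork (values taken from the reasoning, 1 = bad, 3 = great).
if __name__ == '__main__':
    skills = {
        'Angela': {'teach': 1, 'maintain': 1},
        'Greg': {'teach': 1, 'maintain': 2},
        'Travis': {'teach': 1, 'maintain': 1},
    }
    teamwork = {
        frozenset(['Angela', 'Greg']): 1,
        frozenset(['Angela', 'Travis']): 1,
        frozenset(['Greg', 'Travis']): 3,
    }
    options = {
        1: ('Travis', ['Angela', 'Greg']),
        2: ('Greg', ['Angela', 'Travis']),
        3: ('Angela', ['Greg', 'Travis']),
    }
    for idx, (teacher, maintainers) in options.items():
        score = (skills[teacher]['teach'] +
                 sum(skills[m]['maintain'] for m in maintainers) +
                 teamwork[frozenset(maintainers)])
        print(f'Option {idx}: {score}')  # 5, 4, 7 -> Option 3 is best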

View File

@ -0,0 +1,739 @@
# flake8: noqa: E501
"""WARNING (or more like an aggressive note).
A lot of functionality was implemented here for earlier experiments, most of which is not used. We have left it here
for backwards compatibility with the current dataset, as well as because why not.
ALSO NOTE:
This file was created to have no dependencies on anything in the repo for a reason. You can copy this file into your
own project and use the classes to parse/visualize/edit the logic trees in the dataset or create your own.
FINAL NOTE:
See examples of how to create LogicNodes and LogicTrees in the __main__ part of the file.
"""
import random
from copy import deepcopy
from enum import Enum
from typing import Any, Dict, List
import numpy as np
class LogicNodeOperatorType:
"""How should the deduction combine the nodes (choose will randomly sample
and/or when populate is called)"""
AND = 'and'
OR = 'or'
CHOOSE = 'choose'
class LogicNodeFactType:
"""Is a node explicit (mentioned in the story) or commonsense knowledge
(left unsaid)"""
EXPLICIT = 'explicit'
COMMONSENSE = 'commonsense'
class LogicNodeConstraints:
"""Useful for things like children = ['X is the murderer', 'Y is the murderer', 'Z is the murderer'], we no longer use this structure though."""
ONLY_ONE_CAN_BE_TRUE = 'Only one child can be true'
class LogicNodeDeductionType:
"""What type of deduction should be used here (not used currently)"""
SYLLOGISM = 'syllogism'
TEMPORAL = 'temporal'
SPATIAL = 'spatial'
CHOOSE = 'choose'
class LogicNode:
"""A LogicNode is a tree primitive.
It is either a deduction or a leaf fact. Leaf facts are the ones that we
use in story generation (if they are explicit facts and not commonsense).
"""
value: str
children: List['LogicNode']
fact_type: str
operator: str
constraints: List[str]
deduction_type: str
prunable: bool
can_be_leaf: bool
def __init__(
self,
value: str = '',
children: List['LogicNode'] = None,
operator: str = LogicNodeOperatorType.OR,
fact_type: str = LogicNodeFactType.EXPLICIT,
constraints: List[str] = (),
deduction_type: str = None,
prunable: bool = True,
can_be_leaf: bool = False,
frozen: bool = False,
):
"""
:param value: Content for this specific node (also the deduction of the children).
:param children: The children for this node.
:param operator: Should the children be "And"ed or "Or"ed to create the deduction (the content of this node).
:param fact_type: Explicit or commonsense
:param constraints: Not used anymore (see LogicNodeConstraints)
:param deduction_type: Not used anymore (see LogicNodeDeductionType)
:param prunable: Can this node be removed from the tree (we don't prune in our datasets)
:param can_be_leaf: Can this node be a leaf node (usually false for nodes that you are injecting manually)
:param frozen: Should we add/prune children in the populate function (if frozen, no children will be added or removed, but the children may have children appended/pruned from them).
"""
self.value = value
if children is None:
children = []
self.children = children
self.operator = operator
self.fact_type = fact_type
self.constraints = constraints
self.deduction_type = deduction_type
self.prunable = prunable
self.can_be_leaf = can_be_leaf
self.frozen = frozen
self.parent = None
@property
def children(self):
return self._children
@children.setter
def children(self, children: List['LogicNode']):
self._children = children
for c in self.children:
c.parent = self
def __str__(self):
line = []
cnsts = ', '.join([str(x.value) for x in self.constraints])
if self.value and self.value != '':
line.append(self.value)
if len(self.children) > 0:
line.append(self.operator)
else:
line.append(self.fact_type)
if self.deduction_type:
line.append(self.deduction_type)
if len(self.constraints) > 0:
line.append(cnsts)
if len(self.children) > 0:
line.append(f'children: {len(self.children)}')
return ' | '.join(line)
def __repr__(self):
return str(self)
def to_json(self):
return {
'value': self.value,
'children': [x.to_json() for x in self.children],
'fact_type': self.fact_type,
'operator': self.operator,
'constraints': self.constraints,
'deduction_type': self.deduction_type,
'prunable': self.prunable,
'can_be_leaf': self.can_be_leaf
}
@classmethod
def from_json(cls, js):
js['children'] = [LogicNode.from_json(x) for x in js['children']]
return cls(**js)
class LogicTree:
"""Main datastructure used when creating a MuSR example.
It's basically a standard tree with some parameters controlling the shape.
"""
nodes: List[LogicNode]
chance_of_or: float
chance_of_cs_fact: float
depth: int
chance_to_prune: float
chance_to_prune_all: float
bf_factor: Dict[int, float]
deduction_type_sample_rate: Dict[LogicNodeDeductionType, float]
root_structure: List[List[LogicNode]] = ()
def __init__(self,
chance_of_or: float = 0.3,
chance_of_cs_fact: float = 0.1,
depth: int = 2,
chance_to_prune: float = 0.6,
chance_to_prune_all: float = 0.2,
bf_factor: Dict[int, float] = None,
deduction_type_sample_rate: Dict[LogicNodeDeductionType,
float] = None,
enforce_cs_fact_per_level: bool = False,
root_structure: List[Any] = (),
nodes: List[LogicNode] = (),
populate: bool = True,
prune: bool = True):
"""
:param chance_of_or: (not used) how often should a node with children be an OR
:param chance_of_cs_fact: (not used) how often should there be a commonsense node
:param depth: How deep should a tree go
:param chance_to_prune: Percentage chance of pruning a node
:param chance_to_prune_all: Percentage chance of pruning all children from a node.
        :param bf_factor: Branching factor (a dictionary of percentages, e.g. {1: 0.33, 2: 0.33, 3: 0.33}).
:param deduction_type_sample_rate: (not used, see bf_factor and LogicNodeDeductionType)
:param enforce_cs_fact_per_level: Enforce 1 commonsense fact per level in the tree (we use this instead of chance_of_cs_fact)
:param root_structure: List of LogicNodes to build off of.
:param nodes: List of LogicNodes to define the LogicTree on (we will not populate/prune the tree if this is filled)
:param populate: Should we populate children for the tree according to the other parameters?
:param prune: Should we prune the children for the tree according to the other parameters?
"""
self.chance_of_or = chance_of_or
self.chance_of_cs_fact = chance_of_cs_fact
self.depth = depth
self.chance_to_prune = chance_to_prune
self.chance_to_prune_all = chance_to_prune_all
self.bf_factor = bf_factor
self.enforce_cs_fact_per_level = enforce_cs_fact_per_level
if not bf_factor:
self.bf_factor = {2: 0.8, 3: 0.2}
if not deduction_type_sample_rate:
deduction_type_sample_rate = {
LogicNodeDeductionType.SYLLOGISM: 1.0
}
self.deduction_type_sample_rate = deduction_type_sample_rate
self.root_structure = root_structure
if len(nodes) > 0:
self.nodes = nodes
else:
if root_structure is not None and len(root_structure) > 0:
self.nodes = root_structure
else:
self.nodes = [
LogicNode('root', operator=LogicNodeOperatorType.AND)
]
if populate:
[self.populate(x, 1) for x in self.nodes]
if prune:
[self.prune(x, 1) for x in self.nodes]
def __str__(self):
return self.print_tree()
def get_facts(self,
include_cs: bool = False,
include_deductions_from_level: int = -1,
no_facts_after_depth: int = -1):
"""Get a list of LogicNodes from the tree. By default, you will get the
explicit leaf nodes.
:param include_cs: Include the commonsense nodes from all levels.
:param include_deductions_from_level: Include any intermediate deduction nodes from the specified level and deeper.
        :param no_facts_after_depth: Essentially treat the deductions at the specified depth as leaf nodes.
"""
def recurse_facts(_node: LogicNode, depth: int = 0) -> List[str]:
node = deepcopy(_node)
if depth >= no_facts_after_depth and no_facts_after_depth > -1:
node.children = []
facts = []
if node.fact_type == LogicNodeFactType.EXPLICIT and len(
node.children) == 0:
facts.append(node)
if node.fact_type == LogicNodeFactType.COMMONSENSE and include_cs and len(
node.children) == 0:
facts.append(node)
if len(
node.children
) > 0 and include_deductions_from_level <= depth and include_deductions_from_level > -1:
facts.append(node)
for child in node.children:
facts.extend(recurse_facts(child, depth + 1))
return list(set(facts))
facts = []
for n in self.nodes:
facts.extend(recurse_facts(n))
return facts
def print_tree(self, node=None, level=0):
"""Deprecated (not used)"""
if node is None:
node = self.nodes[0]
line = '-' * level * 4 + str(node) + (' | ' + str(node.operator) if
len(node.children) > 0 else '')
for child in node.children:
line += '\n' + self.print_tree(child, level + 1)
return line
def print_for_gpt(self,
node=None,
level=0,
pad_char=' ',
pad_space=4,
print_forward=True,
print_conjection_types: bool = False,
print_reasoning_types: bool = False,
ignore_value_after_depth: int = -1,
print_only_nodes_with_value: bool = False):
"""Complex print function. We often use it as
print_for_gpt(pad_space=1, pad_char='> ')
However, more complex arguments can be used to control what is printed.
This returns a string that must be printed (don't be confused by the method name.)
:param node: Start at a specific node.
:param level: Controls how much tabbing is done when printing the current node.
:param pad_char: Char to use that specifies depth ('> ' at depth 3 will look like '> > > ' if you have pad_space equal to 1 for example)
:param pad_space: How many spaces to include between pad_chars
:param print_forward: Print the tree with parent nodes first.
:param print_conjection_types: Print the Ands and Ors per deduction (not used)
:param print_reasoning_types: Print the deduction types (not used)
:param ignore_value_after_depth: Ignore content of the nodes once a depth is met
:param print_only_nodes_with_value: Ignore nodes without content.
"""
line = ''
if node is None:
node = self.nodes[0]
if not print_forward:
for child in node.children:
v = self.print_for_gpt(
child,
level + 1,
pad_char=pad_char,
pad_space=pad_space,
print_forward=print_forward,
ignore_value_after_depth=ignore_value_after_depth,
print_only_nodes_with_value=print_only_nodes_with_value)
if v != '':
line += v + '\n'
ignore_val = ignore_value_after_depth > -1 and ignore_value_after_depth < level
ignore_line = print_only_nodes_with_value and node.value == ''
if ignore_line:
line_val = ''
else:
line_val = (node.value + ' | ' if node.value != '' and not ignore_val else '') + (
('Fact From Story' if node.fact_type == LogicNodeFactType.EXPLICIT else 'Commonsense Knowledge') \
if len(node.children) == 0 else 'Deduced Fact')
if level == 0:
line_val = (node.value + ' | ' if node.value != '' else
'') + 'Deduced Root Conclusion'
if len(node.children) > 0 and (print_conjection_types
or print_reasoning_types):
if print_conjection_types:
line_val += f' ({node.operator}'
else:
line_val += f'('
if node.deduction_type and print_reasoning_types:
line_val += f' | {node.deduction_type})'
else:
line_val += ')'
if len(node.constraints) > 0:
cnsts = ', '.join([str(x) for x in node.constraints])
line_val += f' constraints: [{cnsts}]'
line += pad_char * level * pad_space + line_val
if print_forward:
for child in node.children:
v = self.print_for_gpt(
child,
level + 1,
pad_char=pad_char,
pad_space=pad_space,
print_forward=print_forward,
ignore_value_after_depth=ignore_value_after_depth,
print_only_nodes_with_value=print_only_nodes_with_value)
if v != '':
line += '\n' + v
return line
def populate(self, node: LogicNode, current_depth: int = 1):
if node.operator == LogicNodeOperatorType.CHOOSE:
node.operator = LogicNodeOperatorType.OR \
if random.random() < self.chance_of_or else LogicNodeOperatorType.AND
if node.deduction_type == LogicNodeDeductionType.CHOOSE:
if node.operator != LogicNodeOperatorType.AND:
node.deduction_type = None
else:
node.deduction_type = random.choices(
list(self.deduction_type_sample_rate.keys()),
list(self.deduction_type_sample_rate.values()),
k=1)[0]
if not node.frozen:
bf = max(
0,
random.choices(list(self.bf_factor.keys()),
list(self.bf_factor.values()),
k=1)[0] - len(node.children))
if bf > 0:
new_nodes = []
one_fact_is_cs = False
for idx in range(bf):
roll_for_or = random.random()
fact_type = LogicNodeFactType.COMMONSENSE \
if random.random() < self.chance_of_cs_fact and not one_fact_is_cs else \
LogicNodeFactType.EXPLICIT
if roll_for_or > self.chance_of_or and\
current_depth < self.depth and\
not fact_type == LogicNodeFactType.COMMONSENSE:
new_nodes.append(
LogicNode(
f'',
operator=LogicNodeOperatorType.AND,
fact_type=fact_type,
deduction_type=random.choices(
list(self.deduction_type_sample_rate.keys(
)),
list(self.deduction_type_sample_rate.
values()),
k=1)[0],
prunable=True,
can_be_leaf=True,
))
else:
new_nodes.append(
LogicNode(f'',
operator=LogicNodeOperatorType.OR,
fact_type=fact_type,
prunable=True,
can_be_leaf=True))
if fact_type == LogicNodeFactType.COMMONSENSE:
node.operator = LogicNodeOperatorType.AND
if not node.deduction_type:
node.deduction_type = random.choices(
list(self.deduction_type_sample_rate.keys()),
list(self.deduction_type_sample_rate.values()),
k=1)[0]
one_fact_is_cs = True
if not one_fact_is_cs and self.enforce_cs_fact_per_level:
new_nodes.append(
LogicNode(f'',
operator=LogicNodeOperatorType.OR,
fact_type=LogicNodeFactType.COMMONSENSE,
prunable=False,
can_be_leaf=True))
node.children.extend(new_nodes)
if current_depth < self.depth:
for node in node.children:
if node.fact_type == LogicNodeFactType.COMMONSENSE:
continue
self.populate(node, current_depth + 1)
def prune(self, node: LogicNode, current_depth: int = 1):
to_prune = []
if current_depth > 1 and node.can_be_leaf:
if random.random() < self.chance_to_prune_all:
node.children = []
return
prunable = [x for x in node.children if x.prunable]
if (len(prunable) > 1 and node.operator == LogicNodeOperatorType.OR or\
len(prunable) > 2 and node.operator == LogicNodeOperatorType.AND) and\
current_depth <= self.depth:
if node.prunable:
for n in random.sample(
prunable,
len(prunable) -
(1 if node.operator == LogicNodeOperatorType.OR else 2)):
roll_to_prune = random.random()
if roll_to_prune < self.chance_to_prune:
to_prune.append(n)
node.children = [x for x in node.children if x not in to_prune]
for n in node.children:
self.prune(n, current_depth + 1)
def to_json(self):
args = {
'chance_of_or': self.chance_of_or,
'depth': self.depth,
'chance_to_prune': self.chance_to_prune,
'chance_to_prune_all': self.chance_to_prune_all,
'bf_factor': self.bf_factor,
'deduction_type_sample_rate': self.deduction_type_sample_rate,
'root_structure': [x.to_json() for x in self.root_structure],
'nodes': [x.to_json() for x in self.nodes]
}
return args
@classmethod
def from_json(cls, _js):
js = deepcopy(_js)
js['nodes'] = [LogicNode.from_json(x) for x in js['nodes']]
js['root_structure'] = [
LogicNode.from_json(x) for x in js['root_structure']
]
return cls(**js)
if __name__ == '__main__':
"""EXAMPLE USES."""
def tv_scene_ex():
root_structure = [
LogicNode('A good drama tv scene',
operator=LogicNodeOperatorType.OR,
prunable=False,
can_be_leaf=False,
frozen=True)
]
root_structure[0].children = [
LogicNode('Bob is sad.',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False),
LogicNode('John now hates Bob.',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False),
LogicNode('Bob bought a car.',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False),
LogicNode('Bob wanted to be happy.',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False),
]
tree = LogicTree(depth=4,
root_structure=root_structure,
bf_factor={
1: 0.5,
2: 0.5
},
chance_of_or=0.0,
chance_of_cs_fact=0.0,
chance_to_prune_all=0.5,
chance_to_prune=0.5,
enforce_cs_fact_per_level=True)
rep = tree.print_for_gpt(pad_space=1, pad_char='- ')
print(rep)
def eb_ex():
root_structure = [
LogicNode('',
operator=LogicNodeOperatorType.CHOOSE,
prunable=False,
can_be_leaf=False)
]
n = LogicNode('Eruptions block sunlight.',
operator=LogicNodeOperatorType.CHOOSE,
prunable=False,
can_be_leaf=False,
frozen=True)
n.children = [
LogicNode('Eruptions produce ash clouds.',
operator=LogicNodeOperatorType.CHOOSE,
prunable=False,
can_be_leaf=True,
frozen=True),
LogicNode('Ash blocks sunlight.',
operator=LogicNodeOperatorType.CHOOSE,
prunable=False,
can_be_leaf=True,
frozen=True),
]
g = LogicNode('Eruptions can cause plants to die.',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False,
frozen=True)
g.children = [
n,
LogicNode('Producers will die without sunlight.',
operator=LogicNodeOperatorType.CHOOSE,
prunable=False,
can_be_leaf=True,
frozen=True)
]
l = LogicNode('',
operator=LogicNodeOperatorType.AND,
prunable=False,
can_be_leaf=False)
l.children = [g]
root_structure[0].children = [l]
tree = LogicTree(depth=5,
root_structure=root_structure,
bf_factor={
1: 0.3,
2: 0.7
},
chance_of_or=0.0,
chance_of_cs_fact=0.0,
chance_to_prune_all=0.0,
chance_to_prune=0.0,
enforce_cs_fact_per_level=True)
rep = tree.print_for_gpt(pad_space=1, pad_char='- ')
print(rep)
def murder_mystery_ex():
root_structure = [
LogicNode('Killer',
operator=LogicNodeOperatorType.OR,
constraints=[LogicNodeConstraints.ONLY_ONE_CAN_BE_TRUE],
prunable=False,
can_be_leaf=False,
frozen=True)
]
suspect_nodes = [
LogicNode(f'Murderer Suspect {idx + 1}',
operator=LogicNodeOperatorType.AND,
prunable=False,
can_be_leaf=False,
frozen=True) for idx in range(1)
]
for s in suspect_nodes:
s.children = [
LogicNode('Suspect has means',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False),
LogicNode('Suspect has motive',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False),
LogicNode('Suspect has opportunity',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False)
]
root_structure[0].children = suspect_nodes
tree = LogicTree(depth=4,
root_structure=root_structure,
bf_factor={
1: 0.5,
2: 0.5
},
chance_of_or=0.0,
chance_of_cs_fact=0.0,
chance_to_prune_all=0.5,
chance_to_prune=0.5,
enforce_cs_fact_per_level=True)
rep = tree.print_for_gpt(pad_space=1, pad_char='> ')
print(rep)
def action_ex():
root_structure = [
LogicNode('Take an action',
operator=LogicNodeOperatorType.OR,
prunable=False,
can_be_leaf=False,
frozen=True)
]
root_structure[0].children = [
LogicNode('Run away',
operator=LogicNodeOperatorType.CHOOSE,
prunable=False,
can_be_leaf=False,
frozen=True),
LogicNode('Fight back',
operator=LogicNodeOperatorType.CHOOSE,
prunable=False,
can_be_leaf=False,
frozen=True),
LogicNode('Hide',
operator=LogicNodeOperatorType.CHOOSE,
prunable=False,
can_be_leaf=False,
frozen=True),
]
for cidx, c in enumerate(root_structure[0].children):
nfacts = random.randint(2, 4)
for n in range(nfacts):
fact = LogicNode('',
operator=LogicNodeOperatorType.CHOOSE,
prunable=False,
can_be_leaf=False,
frozen=True)
fact.children = [
LogicNode('Pro (supporting the parent action)',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False,
frozen=False),
LogicNode('Con (counters the sibling Pro only)',
operator=LogicNodeOperatorType.CHOOSE,
prunable=True,
can_be_leaf=False,
frozen=False)
]
root_structure[0].children[cidx].children.append(fact)
tree = LogicTree(depth=4,
root_structure=root_structure,
bf_factor={
1: 0.25,
2: 0.5,
3: 0.25
},
chance_of_or=0.0,
chance_of_cs_fact=0.0,
chance_to_prune_all=0.5,
chance_to_prune=0.75,
enforce_cs_fact_per_level=True)
rep = tree.print_for_gpt(pad_space=1, pad_char='- ')
print(rep)
tv_scene_ex()
eb_ex()
action_ex()
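    def json_roundtrip_ex():
        # Minimal sketch: hand-build a tiny tree, round-trip one node through
        # its JSON form, and list the leaf facts (fact values are placeholders).
        root = LogicNode('root',
                         operator=LogicNodeOperatorType.AND,
                         prunable=False,
                         can_be_leaf=False,
                         frozen=True)
        root.children = [
            LogicNode('An explicit fact from the story.',
                      fact_type=LogicNodeFactType.EXPLICIT,
                      can_be_leaf=True),
            LogicNode('A commonsense fact left unsaid.',
                      fact_type=LogicNodeFactType.COMMONSENSE,
                      can_be_leaf=True),
        ]
        restored = LogicNode.from_json(root.to_json())
        tree = LogicTree(nodes=[restored], populate=False, prune=False)
        print([f.value for f in tree.get_facts(include_cs=True)])

    json_roundtrip_ex()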

View File

@ -2,6 +2,7 @@ import concurrent
import concurrent.futures
import os
import socket
import time
import traceback
from typing import Dict, List, Optional, Union
@ -20,6 +21,8 @@ from .base_api import BaseAPIModel
PromptType = Union[PromptList, str]
BAILING_RETRY_DELAY: int = 30
class HTTPAdapterWithSocketOptions(HTTPAdapter):
@ -200,6 +203,9 @@ class BailingAPI(BaseAPIModel):
break # success
elif response.status_code == 426:
retry_num += 1 # retry
elif response.status_code in [429, 500, 504]:
time.sleep(BAILING_RETRY_DELAY)
retry_num += 1 # retry
else:
raise ValueError(f'Status code = {response.status_code}')
else:

View File

@ -526,7 +526,7 @@ class OpenAISDK(OpenAI):
def _generate(self, input: PromptList | str, max_out_len: int,
temperature: float) -> str:
from openai import BadRequestError
from openai import APIStatusError, BadRequestError
assert isinstance(input, (str, PromptList))
# max num token for gpt-3.5-turbo is 4097
@ -616,7 +616,7 @@ class OpenAISDK(OpenAI):
from the API provider.')
return responses.choices[0].message.content
except BadRequestError as e:
except (BadRequestError, APIStatusError) as e:
# Handle BadRequest status
# You can specify self.status_code_mappings to bypass \
# API sensitivity blocks
@ -625,12 +625,10 @@ class OpenAISDK(OpenAI):
status_code = e.status_code
if (status_code is not None
and status_code in self.status_code_mappings):
original_error_message = e.body.get('message')
error_message = self.status_code_mappings[status_code]
self.logger.info(
f'Status Code: {status_code}, '
f'Original Error Message: {original_error_message},'
f'Return Message: {error_message} ')
self.logger.info(f'Status Code: {status_code},\n'
f'Original Error Message: {e},\n'
f'Return Message: {error_message} ')
return error_message
else:
self.logger.error(e)

View File

@ -87,6 +87,7 @@ class TurboMindModelwithChatTemplate(BaseModel):
def generate(self,
inputs: List[str],
max_out_len: int,
min_out_len: Optional[int] = None,
stopping_criteria: List[str] = [],
do_sample: Optional[bool] = None,
temperature: float = 1.0,
@ -123,7 +124,11 @@ class TurboMindModelwithChatTemplate(BaseModel):
gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG)
gen_config.update(self.gen_config)
if do_sample or self.gen_config['do_sample']:
if max_out_len is not None:
gen_config['max_new_tokens'] = max_out_len
if min_out_len is not None:
gen_config['min_new_tokens'] = min_out_len
if do_sample or ('do_sample' in self.gen_config and self.gen_config['do_sample']):
gen_config['top_k'] = 40
gen_config['temperature'] = temperature
else:

View File

@ -101,7 +101,11 @@ class VLLM(BaseModel):
if not self.lora_path:
outputs = self.model.generate(inputs, sampling_kwargs)
else:
outputs = self.model.generate(inputs, sampling_kwargs, lora_request=LoRARequest("sql_adapter", 1, self.lora_path))
outputs = self.model.generate(inputs,
sampling_kwargs,
lora_request=LoRARequest(
'sql_adapter', 1,
self.lora_path))
prompt_list, output_strs = [], []
for output in outputs:

View File

@ -249,7 +249,7 @@ class VOLCRunner(BaseRunner):
with open(config_path) as fp:
volc_cfg = yaml.safe_load(fp)
if num_gpus <= 0:
flavor = 'ml.c1ie.2xlarge'
flavor = 'ml.c3i.2xlarge'
elif num_gpus == 1:
flavor = 'ml.pni2l.3xlarge'
elif num_gpus == 2:

View File

@ -167,7 +167,7 @@ DATASETS_MAPPING = {
"opencompass/math": {
"ms_id": "opencompass/math",
"hf_id": "opencompass/math",
"local": "./data/math/math.json",
"local": "./data/math/",
},
# MMLU
"opencompass/mmlu": {
@ -326,10 +326,24 @@ DATASETS_MAPPING = {
"ms_id": "",
"hf_id": "",
"local": "./data/mmmlu_lite",
}
},
"opencompass/musr": {
"ms_id": "",
"hf_id": "",
"local": "./data/musr",
},
"opencompass/babilong": {
"ms_id": "",
"hf_id": "",
"local": "./data/babilong/data/",
},
}
DATASETS_URL = {
"/musr": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/musr.zip",
"md5": "7447d2a5bec4586035196102135e2af9",
},
"/mmlu/": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu.zip",
"md5": "761310671509a239e41c4b717f7fab9c",
@ -360,7 +374,7 @@ DATASETS_URL = {
},
"/math/": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/math.zip",
"md5": "8b1b897259684672055e6fd4fc07c808",
"md5": "cb5b4c8378085929e20345174e731fdf",
},
"/hellaswag/": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/hellaswag.zip",
@ -426,6 +440,10 @@ DATASETS_URL = {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/fofo.zip",
"md5": "8a302712e425e27e4292a9369df5b9d3",
},
"subjective/followbench": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/followbench.zip",
"md5": "da7a831817c969da15d1e78d4a245d8a",
},
"subjective/mtbench101": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mtbench101.zip",
"md5": "5d80257bc9929ebe5cfbf6d11184b04c",
@ -496,11 +514,11 @@ DATASETS_URL = {
},
"/aime": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/aime.zip",
"md5": "fbe2d0577fc210962a549f8cea1a00c8"
"md5": "fbe2d0577fc210962a549f8cea1a00c8",
},
"/cmo": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmo.zip",
"md5": "fad52c81290506a8ca74f46b5400d8fc"
"md5": "fad52c81290506a8ca74f46b5400d8fc",
},
"/nq-open": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/nq-open.zip",
@ -521,5 +539,9 @@ DATASETS_URL = {
"/WikiBench": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/WikiBench.zip",
"md5": "6dac1d1a3133fe1effff185cbf71d928",
}
},
"/babilong": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/babilong.zip",
"md5": "e400864c31bc58d29eaa3e199751f99b",
},
}

View File

@ -128,7 +128,10 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
text = text.strip()
match = re.search(pattern, text, re.DOTALL)
if match:
outputs = match.group(0)
if match.group(1) is not None and match.group(1) != '':
outputs = match.group(1)
else:
outputs = match.group(0)
for i in options:
if i in outputs:
return i

View File

@ -9,6 +9,7 @@ fuzzywuzzy
h5py
huggingface_hub<=0.24.7
immutabledict
importlib-metadata
jieba
json5
mmengine-lite