Merge branch 'open-compass:main' into main

commit d66be56919
Author: bittersweet1999 (committed by GitHub)
Date:   2024-09-02 13:57:05 +08:00
1834 changed files with 77680 additions and 1683 deletions

@@ -2,48 +2,57 @@ from mmengine.config import read_base
with read_base():
# choose a list of datasets
from ...configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
gsm8k_datasets # noqa: F401, E501
from ...configs.datasets.race.race_ppl import \
from opencompass.configs.datasets.race.race_ppl import \
race_datasets # noqa: F401, E501
from ...configs.models.deepseek.hf_deepseek_moe_16b_base import \
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
models as hf_deepseek_moe_16b_base_model # noqa: F401, E501
# read hf models - chat models
from ...configs.models.deepseek.lmdeploy_deepseek_7b_base import \
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
from ...configs.models.deepseek.vllm_deepseek_moe_16b_base import \
from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \
models as vllm_deepseek_moe_16b_base_model # noqa: F401, E501
from ...configs.models.gemma.hf_gemma_2b import \
from opencompass.configs.models.gemma.hf_gemma_2b import \
models as hf_gemma_2b_model # noqa: F401, E501
from ...configs.models.gemma.hf_gemma_7b import \
from opencompass.configs.models.gemma.hf_gemma_7b import \
models as hf_gemma_7b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
models as hf_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
models as hf_internlm2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \
models as hf_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_7b import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
models as lmdeploy_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_7b import \
models as lmdeploy_internlm2_7b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501
from ...configs.models.hf_llama.lmdeploy_llama3_8b import \
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
models as lmdeploy_llama3_8b_model # noqa: F401, E501
from ...configs.models.mistral.hf_mistral_7b_v0_2 import \
from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \
models as hf_mistral_7b_v0_2_model # noqa: F401, E501
from ...configs.models.mistral.vllm_mistral_7b_v0_2 import \
from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \
models as vllm_mistral_7b_v0_2_model # noqa: F401, E501
from ...configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
models as hf_qwen1_5_moe_a2_7b_model # noqa: F401, E501
from ...configs.models.qwen.hf_qwen2_0_5b import \
from opencompass.configs.models.qwen.hf_qwen2_0_5b import \
models as hf_qwen2_0_5b_model # noqa: F401, E501
from ...configs.models.qwen.lmdeploy_qwen2_1_5b import \
from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b import \
models as lmdeploy_qwen2_1_5b_model # noqa: F401, E501
from ...configs.models.qwen.lmdeploy_qwen2_7b import \
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import \
models as lmdeploy_qwen2_7b_model # noqa: F401, E501
from ...configs.models.qwen.vllm_qwen1_5_0_5b import \
from opencompass.configs.models.qwen.vllm_qwen1_5_0_5b import \
models as vllm_qwen1_5_0_5b_model # noqa: F401, E501
from ...configs.models.yi.hf_yi_1_5_6b import \
from opencompass.configs.models.yi.hf_yi_1_5_6b import \
models as hf_yi_1_5_6b_model # noqa: F401, E501
from ...configs.models.yi.hf_yi_1_5_9b import \
from opencompass.configs.models.yi.hf_yi_1_5_9b import \
models as hf_yi_1_5_9b_model # noqa: F401, E501
from ...configs.summarizers.medium import summarizer # noqa: F401, E501
from opencompass.configs.summarizers.medium import \
summarizer # noqa: F401, E501
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
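An editorial note on the aggregation idiom that closes this config: every config imported under `read_base()` binds a `models` (or `*_datasets`) list into the module namespace under its alias, and the two `sum(..., [])` lines simply concatenate all of them. A self-contained sketch of the same idiom, with made-up names:

```python
# Sketch of the locals()-scanning idiom above (names are illustrative only).
model_a_model = [{'abbr': 'model-a'}]
model_b_model = [{'abbr': 'model-b'}]
gsm8k_datasets = [{'abbr': 'gsm8k'}]

# locals() here is evaluated in the enclosing module scope, so the
# comprehension sees every '*_model' binding and flattens the lists.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
print([m['abbr'] for m in models])  # ['model-a', 'model-b']
```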

@@ -1,70 +1,105 @@
from mmengine.config import read_base
from opencompass.models import OpenAISDK
with read_base():
# choose a list of datasets
from ...configs.datasets.gsm8k.gsm8k_gen import \
from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
gsm8k_datasets # noqa: F401, E501
from ...configs.datasets.race.race_gen import \
from opencompass.configs.datasets.race.race_gen import \
race_datasets # noqa: F401, E501
# read hf models - chat models
from ...configs.models.baichuan.hf_baichuan2_7b_chat import \
from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import \
models as hf_baichuan2_7b_chat_model # noqa: F401, E501
from ...configs.models.chatglm.hf_glm4_9b_chat import \
from opencompass.configs.models.chatglm.hf_glm4_9b_chat import \
models as hf_glm4_9b_chat_model # noqa: F401, E501
from ...configs.models.deepseek.hf_deepseek_7b_chat import \
from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
models as hf_deepseek_7b_chat_model # noqa: F401, E501
from ...configs.models.deepseek.hf_deepseek_moe_16b_chat import \
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
models as hf_deepseek_moe_16b_chat_model # noqa: F401, E501
from ...configs.models.deepseek.vllm_deepseek_7b_chat import \
from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
models as vllm_deepseek_7b_chat_model # noqa: F401, E501
from ...configs.models.gemma.hf_gemma_2b_it import \
from opencompass.configs.models.gemma.hf_gemma_2b_it import \
models as hf_gemma_2b_it_model # noqa: F401, E501
from ...configs.models.gemma.hf_gemma_7b_it import \
from opencompass.configs.models.gemma.hf_gemma_7b_it import \
models as hf_gemma_7b_it_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \
models as lmdeploy_internlm2_chat_1_8b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \
models as lmdeploy_internlm2_chat_1_8b_sft_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import \
models as lmdeploy_internlm2_chat_7b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
models as lmdeploy_internlm2_chat_7b_sft_model # noqa: F401, E501
from ...configs.models.hf_internlm.vllm_internlm2_chat_7b import \
from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \
models as vllm_internlm2_chat_7b_model # noqa: F401, E501
from ...configs.models.hf_llama.hf_llama3_8b_instruct import \
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
models as hf_llama3_8b_instruct_model # noqa: F401, E501
from ...configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
models as lmdeploy_llama3_8b_instruct_model # noqa: F401, E501
from ...configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
models as hf_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from ...configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from ...configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
models as hf_minicpm_2b_dpo_fp32_model # noqa: F401, E501
from ...configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
models as hf_minicpm_2b_sft_bf16_model # noqa: F401, E501
from ...configs.models.openbmb.hf_minicpm_2b_sft_fp32 import \
from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_fp32 import \
models as hf_minicpm_2b_sft_fp32_model # noqa: F401, E501
from ...configs.models.phi.hf_phi_3_mini_4k_instruct import \
from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501
from ...configs.models.phi.hf_phi_3_small_8k_instruct import \
from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \
models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501
from ...configs.models.qwen.hf_qwen1_5_0_5b_chat import \
from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \
models as hf_qwen1_5_0_5b_chat_model # noqa: F401, E501
from ...configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \
from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \
models as lmdeploy_qwen2_1_5b_instruct_model # noqa: F401, E501
from ...configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
models as lmdeploy_qwen2_7b_instruct_model # noqa: F401, E501
from ...configs.models.qwen.vllm_qwen1_5_0_5b_chat import \
from opencompass.configs.models.qwen.vllm_qwen1_5_0_5b_chat import \
models as vllm_qwen1_5_0_5b_chat_model # noqa: F401, E501
from ...configs.models.yi.hf_yi_1_5_6b_chat import \
from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import \
models as hf_yi_1_5_6b_chat_model # noqa: F401, E501
from ...configs.models.yi.hf_yi_1_5_9b_chat import \
from opencompass.configs.models.yi.hf_yi_1_5_9b_chat import \
models as hf_yi_1_5_9b_chat_model # noqa: F401, E501
from ...configs.summarizers.medium import summarizer # noqa: F401, E501
from opencompass.configs.summarizers.medium import \
summarizer # noqa: F401, E501
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
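For orientation, an editorial sketch (not part of the diff): a `meta_template` like the one above tells OpenCompass how to map its internal dialogue roles onto the role names an API model wrapper expects when it builds its message list. A minimal, hypothetical illustration:

```python
# Editorial sketch only -- roughly what a meta_template accomplishes; this is
# not OpenCompass's actual implementation.
def apply_meta_template(turns, meta_template):
    """Map internal turns like {'role': 'HUMAN', 'prompt': ...} to API roles."""
    role_map = {r['role']: r['api_role'] for r in meta_template['round']}
    for r in meta_template.get('reserved_roles', []):
        role_map.setdefault(r['role'], r['api_role'])
    return [{'role': role_map[t['role']], 'content': t['prompt']} for t in turns]

messages = apply_meta_template(
    [{'role': 'HUMAN', 'prompt': 'What is 1 + 1?'}],
    dict(round=[dict(role='HUMAN', api_role='HUMAN'),
                dict(role='BOT', api_role='BOT', generate=True)],
         reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')]))
print(messages)  # [{'role': 'HUMAN', 'content': 'What is 1 + 1?'}]
```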
model_name = ''
models.append(
dict(
abbr='lmdeploy-api-test',
type=OpenAISDK,
key='EMPTY',
openai_api_base='http://judgemodel:10001/v1',
path='compass_judger_internlm2_102b_0508',
tokenizer_path='internlm/internlm2_5-20b-chat',
rpm_verbose=True,
meta_template=api_meta_template,
query_per_second=50,
max_out_len=1024,
max_seq_len=4096,
temperature=0.01,
batch_size=128,
retry=3,
))
for d in datasets:
d['reader_cfg']['test_range'] = '[0:100]'
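An editorial note on the loop above: it caps every dataset at its first 100 test samples so the daily regression run stays fast. The `test_range` value reads like a Python slice applied to the test split; roughly (illustrative only, not the library's actual loader code):

```python
# Illustrative only: the effect of reader_cfg['test_range'] = '[0:100]'.
test_range = '[0:100]'
start, stop = (int(x) if x else None
               for x in test_range.strip('[]').split(':'))
full_split = list(range(1319))       # e.g. GSM8K has 1319 test questions
capped = full_split[start:stop]
assert len(capped) == 100
```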

@@ -8,22 +8,25 @@ output_path = 'regression_result_daily'
chat_model_list = [
'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
'gemma-2b-it-hf', 'gemma-7b-it-hf', 'internlm2-chat-1.8b-turbomind',
'gemma-2b-it-hf', 'gemma-7b-it-hf', 'internlm2_5-7b-chat-hf',
'internlm2_5-7b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-turbomind',
'internlm2-chat-7b-sft-turbomind', 'llama-3-8b-instruct-hf',
'llama-3-8b-instruct-turbomind', 'mistral-7b-instruct-v0.2-hf',
'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf',
'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-turbomind',
'qwen2-7b-instruct-turbomind', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf'
'internlm2-chat-7b-sft-turbomind', 'internlm2_5-7b-chat-turbomind',
'llama-3-8b-instruct-hf', 'llama-3-8b-instruct-turbomind',
'mistral-7b-instruct-v0.2-hf', 'minicpm-2b-dpo-fp32-hf',
'minicpm-2b-sft-bf16-hf', 'minicpm-2b-sft-fp32-hf',
'phi-3-mini-4k-instruct-hf', 'qwen1.5-0.5b-chat-hf',
'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind',
'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf', 'lmdeploy-api-test'
]
base_model_list = [
'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind', 'gemma-2b-hf',
'gemma-7b-hf', 'internlm2-1.8b-turbomind', 'internlm2-7b-turbomind',
'internlm2-base-7b-turbomind', 'llama-3-8b-turbomind',
'mistral-7b-v0.2-hf', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind', 'yi-1.5-6b-hf',
'yi-1.5-9b-hf'
'internlm2_5-7b-turbomind', 'internlm2_5-7b-hf',
'internlm2-base-7b-turbomind', 'internlm2-base-7b-hf',
'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'qwen1.5-moe-a2.7b-hf',
'qwen2-0.5b-hf', 'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind',
'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
]
dataset_list = ['gsm8k', 'race-middle', 'race-high']
@@ -77,6 +80,50 @@ class TestBase:
assert_score(result_score, base_score)
@pytest.mark.usefixtures('result_scores')
class TestCmdCase:
@pytest.mark.case1
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-hf', 'race-middle'),
('internlm2_5-7b-hf', 'race-high')])
def test_cmd_case1(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
@pytest.mark.case2
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-chat-turbomind', 'race-middle'),
('internlm2_5-7b-chat-turbomind', 'race-high')])
def test_cmd_case2(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
@pytest.mark.case3
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b_hf', 'race-middle'),
('internlm2_5-7b_hf', 'race-high')])
def test_cmd_case3(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
@pytest.mark.case4
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-chat_hf', 'race-middle'),
('internlm2_5-7b-chat_hf', 'race-high')])
def test_cmd_case4(self, result_scores, model, dataset):
if len(result_scores.keys()) != 1:
assert False, 'result is none'
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 91)
def assert_score(score, baseline):
if score is None or score == '-':
assert False, 'value is none'
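The hunk above cuts `assert_score` off after its None-guard. Judging from the call sites, the remainder presumably compares the score against the stored baseline within some tolerance; a purely hypothetical completion for orientation (this is a guess, not the repository's actual code):

```python
# Hypothetical continuation of the truncated helper above -- a guess at its
# shape from how it is called, NOT the actual implementation.
def assert_score(score, baseline):
    if score is None or score == '-':
        assert False, 'value is none'
    # e.g. accept scores within a small band around the stored baseline
    assert baseline - 5 <= float(score) <= baseline + 5, \
        f'score {score} deviates from baseline {baseline}'
```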

@@ -3,6 +3,11 @@ baichuan2-7b-chat-hf:
race-middle: 74
race-high: 79
glm-4-9b-chat-hf:
gsm8k: 75
race-middle: 88
race-high: 88
deepseek-7b-chat-hf:
gsm8k: 60
race-middle: 74
@@ -23,6 +28,16 @@ gemma-7b-it-hf:
race-middle: 74
race-high: 71
internlm2_5-7b-chat-hf:
gsm8k: 86
race-middle: 92
race-high: 93
internlm2_5-7b-chat-turbomind:
gsm8k: 87
race-middle: 92
race-high: 93
internlm2-chat-1.8b-turbomind:
gsm8k: 40
race-middle: 82
@@ -108,6 +123,10 @@ deepseek-moe-16b-base-hf:
race-middle: 35
race-high: 23
lmdeploy-api-test:
gsm8k: 90
race-middle: 95
race-high: 96
deepseek-7b-base-turbomind:
gsm8k: 21
@@ -124,8 +143,18 @@ gemma-7b-hf:
race-middle: 59
race-high: 66
internlm2_5-7b-hf:
gsm8k: 46
race-middle: 92
race-high: 91
internlm2_5-7b-turbomind:
gsm8k: 73
race-middle: 90
race-high: 91
internlm2-1.8b-turbomind:
gsm8k: 27
gsm8k: 25
race-middle: 75
race-high: 72
@@ -134,6 +163,11 @@ internlm2-7b-turbomind:
race-middle: 78
race-high: 76
internlm2-base-7b-hf:
gsm8k: 2
race-middle: 71
race-high: 74
internlm2-base-7b-turbomind:
gsm8k: 39
race-middle: 75

@@ -14,6 +14,7 @@ env:
PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
DATEASET_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/llm-evaluation-datasets
HF_DATASETS_OFFLINE: 1
TRANSFORMERS_OFFLINE: 1
HF_HUB_OFFLINE: 1
@@ -28,54 +29,69 @@ jobs:
uses: actions/checkout@v2
- name: Prepare - create conda env and install torch
run: |
eval "$(conda shell.bash hook)"
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}} python=3.10
conda activate ${{env.CONDA_ENV}}
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.5.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.0.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.5.8+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl --index-url https://download.pytorch.org/whl/cu118
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.5.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.2+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.5.8+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install bitsandbytes
pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
pip install xformers==0.0.25.post1 --cache-dir ${{env.PIP_CACHE_PATH}} --extra-index-url https://download.pytorch.org/whl/cu118
pip install xformers==0.0.25.post1 --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
- name: Prepare - Pip install code
run: |
eval "$(conda shell.bash hook)"
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}
pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
pip install human_eval transformers protobuf pytest --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
- name: Prepare - prepare data and hf model
run: |
cp -r ${{env.USERSPACE_PREFIX}}/data .
ln -s ${{env.DATEASET_CACHE_PATH}} data
rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
- name: Run chat model test
run: |
eval "$(conda shell.bash hook)"
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
rm -rf regression_result_daily
export from_tf=TRUE
rm -rf /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/*/summary
python3 run.py .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }} --reuse
cp -r /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/*/summary regression_result_daily
sed -i 's/judgemodel/'$(tail -n 1 /cpfs01/shared/public/llmeval/share_info/compassjuder_ip.txt)'/g' .github/scripts/eval_regression_chat.py
python3 run.py .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat/*/summary regression_result_daily
python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run base model test
run: |
eval "$(conda shell.bash hook)"
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
rm -rf regression_result_daily
export from_tf=TRUE
rm -rf /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/*/summary
python3 run.py .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }} --reuse
cp -r /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/*/summary regression_result_daily
python3 run.py .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base/*/summary regression_result_daily
python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run command testcase
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
export from_tf=TRUE
python tools/list_configs.py internlm2_5 mmlu
python run.py --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1 --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1/*/summary regression_result_daily
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
python run.py --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2 --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2/*/summary regression_result_daily
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
python run.py --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3 --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3/*/summary regression_result_daily
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
python run.py --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4 --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4/*/summary regression_result_daily
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Remove Conda Env
if: always()
run: |
cp -r regression_result_daily/* /cpfs01/user/qa-llm-cicd/report
eval "$(conda shell.bash hook)"
rm -rf regression_result_daily
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda env remove -y --name ${{env.CONDA_ENV}}
conda info --envs

@@ -35,7 +35,7 @@ jobs:
uses: actions/checkout@v2
- name: Prepare - Install opencompass
run: |
eval "$(conda shell.bash hook)"
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}
python3 -m pip uninstall opencompass -y
python3 -m pip install -e . --cache-dir ${{env.USERSPACE_PREFIX}}/.cache/pip
@@ -47,7 +47,7 @@ jobs:
ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
- name: Run test
run: |
eval "$(conda shell.bash hook)"
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
rm -rf regression_result
@@ -65,7 +65,7 @@ jobs:
- name: Uninstall opencompass
if: always()
run: |
eval "$(conda shell.bash hook)"
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}
python3 -m pip uninstall opencompass -y
conda info --envs

@@ -10,25 +10,43 @@ exclude: |
opencompass/datasets/teval/|
opencompass/datasets/NPHardEval/|
opencompass/datasets/TheoremQA|
docs/zh_cn/advanced_guides/compassbench_intro.md|
docs/zh_cn/advanced_guides/compassbench_v2_0.md
docs/zh_cn/advanced_guides/compassbench_intro.md |
docs/zh_cn/advanced_guides/compassbench_v2_0.md |
opencompass/configs/datasets/ |
opencompass/configs/models/|
opencompass/configs/summarizers/|
opencompass/configs/dataset_collections/ |
opencompass/utils/datasets.py |
opencompass/utils/datasets_info.py
)
repos:
- repo: https://gitee.com/openmmlab/mirrors-flake8
rev: 5.0.4
hooks:
- id: flake8
exclude: configs/
exclude: |
(?x)^(
configs/ |
example_scripts/
)
- repo: https://gitee.com/openmmlab/mirrors-isort
rev: 5.11.5
hooks:
- id: isort
exclude: configs/
exclude: |
(?x)^(
configs/ |
example_scripts/
)
- repo: https://gitee.com/openmmlab/mirrors-yapf
rev: v0.32.0
hooks:
- id: yapf
exclude: configs/
exclude: |
(?x)^(
configs/ |
example_scripts/
)
- repo: https://gitee.com/openmmlab/mirrors-codespell
rev: v2.2.1
hooks:
@@ -36,8 +54,10 @@ repos:
exclude: |
(?x)^(
.*\.jsonl|
opencompass/datasets/subjective/mtbench101.py|
configs/
.*\.md.template|
configs/ |
opencompass/configs/ |
example_scripts/
)
- repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks
rev: v4.3.0
@@ -88,6 +108,53 @@ repos:
pass_filenames: true
require_serial: true
files: ^configs/datasets
- repo: local
hooks:
- id: update-dataset-suffix-pacakge
name: dataset suffix updater(package)
entry: ./tools/update_dataset_suffix.py
language: script
pass_filenames: false
# require_serial: true
# files: ^opencompass/configs/datasets
args:
- --root_folder
- opencompass/configs/datasets
- repo: local
hooks:
- id: compare-configs-datasets
name: compare configs datasets
entry: ./tools/compare_configs.py
language: script
pass_filenames: false
# require_serial: true
args:
- configs/datasets
- opencompass/configs/datasets
- repo: local
hooks:
- id: compare-configs-models
name: compare configs models
entry: ./tools/compare_configs.py
language: script
pass_filenames: false
# require_serial: true
args:
- configs/models
- opencompass/configs/models
- --ignore
- llama
- repo: local
hooks:
- id: compare-configs-summarizers
name: compare configs summarizers
entry: ./tools/compare_configs.py
language: script
pass_filenames: false
# require_serial: true
args:
- configs/summarizers
- opencompass/configs/summarizers
# - repo: https://github.com/open-mmlab/pre-commit-hooks
# rev: v0.2.0 # Use the ref you want to point at
# hooks:
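An editorial sketch of what a comparison hook of this kind plausibly checks: that every config under `configs/` has an identical counterpart under the packaged `opencompass/configs/` tree. Hypothetical logic only; the real `tools/compare_configs.py` may differ:

```python
# Hypothetical sketch of a compare-configs pre-commit check; the actual
# tools/compare_configs.py may work differently.
import filecmp
import sys
from pathlib import Path

def compare_trees(src_dir, dst_dir):
    out_of_sync = []
    for src in Path(src_dir).rglob('*.py'):
        dst = Path(dst_dir) / src.relative_to(src_dir)
        if not dst.exists() or not filecmp.cmp(src, dst, shallow=False):
            out_of_sync.append(str(src))
    for path in out_of_sync:
        print(f'out of sync: {path}')
    return 1 if out_of_sync else 0

if __name__ == '__main__':
    sys.exit(compare_trees(sys.argv[1], sys.argv[2]))
```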

@@ -13,24 +13,42 @@ exclude: |
opencompass/datasets/TheoremQA|
opencompass/datasets/subjective/mtbench101.py|
docs/zh_cn/advanced_guides/compassbench_intro.md |
docs/zh_cn/advanced_guides/compassbench_v2_0.md
docs/zh_cn/advanced_guides/compassbench_v2_0.md |
opencompass/configs/datasets/ |
opencompass/configs/models/|
opencompass/configs/summarizers/ |
opencompass/configs/dataset_collections/ |
opencompass/utils/datasets.py |
opencompass/utils/datasets_info.py
)
repos:
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
hooks:
- id: flake8
exclude: configs/
exclude: |
(?x)^(
configs/ |
example_scripts/
)
- repo: https://github.com/PyCQA/isort
rev: 5.11.5
hooks:
- id: isort
exclude: configs/
exclude: |
(?x)^(
configs/ |
example_scripts/
)
- repo: https://github.com/pre-commit/mirrors-yapf
rev: v0.32.0
hooks:
- id: yapf
exclude: configs/
exclude: |
(?x)^(
configs/ |
example_scripts/
)
- repo: https://github.com/codespell-project/codespell
rev: v2.2.1
hooks:
@@ -39,7 +57,9 @@ repos:
(?x)^(
.*\.jsonl|
.*\.md.template|
configs/
configs/ |
opencompass/configs/ |
example_scripts/
)
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
@@ -90,6 +110,54 @@ repos:
pass_filenames: true
require_serial: true
files: ^configs/datasets
- repo: local
hooks:
- id: update-dataset-suffix-pacakge
name: dataset suffix updater(package)
entry: ./tools/update_dataset_suffix.py
language: script
pass_filenames: false
# require_serial: true
# files: ^opencompass/configs/datasets
args:
- --root_folder
- opencompass/configs/datasets
- repo: local
hooks:
- id: compare-configs-datasets
name: compare configs datasets
entry: ./tools/compare_configs.py
language: script
pass_filenames: false
# require_serial: true
args:
- configs/datasets
- opencompass/configs/datasets
- repo: local
hooks:
- id: compare-configs-models
name: compare configs models
entry: ./tools/compare_configs.py
language: script
pass_filenames: false
# require_serial: true
args:
- configs/models
- opencompass/configs/models
- --ignore
- llama
- repo: local
hooks:
- id: compare-configs-summarizers
name: compare configs summarizers
entry: ./tools/compare_configs.py
language: script
pass_filenames: false
# require_serial: true
args:
- configs/summarizers
- opencompass/configs/summarizers
# - repo: https://github.com/open-mmlab/pre-commit-hooks
# rev: v0.2.0 # Use the ref you want to point at
# hooks:

MANIFEST.in (new file)

@@ -0,0 +1,2 @@
recursive-include opencompass/configs *.py *.yml *.json *.txt *.md
recursive-include opencompass/openicl/icl_evaluator/hf_metrics *.py
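These two lines make setuptools include the config tree and the HF metric files in built distributions, which is what allows the imports elsewhere in this commit to move from relative `...configs` paths to the packaged `opencompass.configs` namespace. A quick post-install sanity check one could run (editorial suggestion, not from the commit):

```python
# Editorial sanity check: confirm the configs actually ship with an installed
# opencompass distribution (assumes `pip install opencompass` succeeded).
from importlib import resources

cfg_root = resources.files('opencompass') / 'configs'
print(cfg_root)  # should resolve inside the installed package
print(sorted(p.name for p in (cfg_root / 'datasets').iterdir())[:5])
```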

README.md

@@ -70,8 +70,11 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2024.08.20\]** OpenCompass now supports the [SciCode](https://github.com/scicode-bench/SciCode): A Research Coding Benchmark Curated by Scientists. 🔥🔥🔥
- **\[2024.08.16\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [RULER](https://arxiv.org/pdf/2404.06654). RULER provides an evaluation of long-context including retrieval, multi-hop tracing, aggregation, and question answering through flexible configurations. Check out the [RULER](configs/datasets/ruler/README.md) evaluation config now! 🔥🔥🔥
- **\[2024.08.09\]** We have released the example data and configuration for the CompassBench-202408, welcome to [CompassBench](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/compassbench_intro.html) for more details. 🔥🔥🔥
- **\[2024.08.01\]** We supported the [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315) models. Welcome to try! 🔥🔥🔥
- **\[2024.07.23\]** We now support [ModelScope](www.modelscope.cn) datasets; you can load them on demand without downloading all the data to your local disk. Welcome to try! 🔥🔥🔥
- **\[2024.07.17\]** We have released the example data and configuration for the CompassBench-202408, welcome to [CompassBench](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/compassbench_intro.html) for more details. 🔥🔥🔥
- **\[2024.07.17\]** We are excited to announce the release of NeedleBench's [technical report](http://arxiv.org/abs/2407.11963). We invite you to visit our [support documentation](https://opencompass.readthedocs.io/en/latest/advanced_guides/needleinahaystack_eval.html) for detailed evaluation guidelines. 🔥🔥🔥
- **\[2024.07.04\]** OpenCompass now supports InternLM2.5, which has **outstanding reasoning capability**, a **1M context window**, and **stronger tool use**; you can try the models in [OpenCompass Config](https://github.com/open-compass/opencompass/tree/main/configs/models/hf_internlm) and [InternLM](https://github.com/InternLM/InternLM). 🔥🔥🔥
- **\[2024.06.20\]** OpenCompass now supports one-click switching between inference acceleration backends, enhancing the efficiency of the evaluation process. In addition to the default HuggingFace inference backend, it now also supports popular backends [LMDeploy](https://github.com/InternLM/lmdeploy) and [vLLM](https://github.com/vllm-project/vllm). This feature is available via a simple command-line switch and through deployment APIs. For detailed usage, see the [documentation](docs/en/advanced_guides/accelerator_intro.md).🔥🔥🔥.
@@ -114,29 +117,51 @@ Below are the steps for quick installation and datasets preparation.
### 💻 Environment Setup
#### Open-source Models with GPU
We highly recommend using conda to manage your python environment.
```bash
conda create --name opencompass python=3.10 pytorch torchvision pytorch-cuda -c nvidia -c pytorch -y
conda activate opencompass
git clone https://github.com/open-compass/opencompass opencompass
cd opencompass
pip install -e .
```
- #### Create your virtual environment
#### API Models with CPU-only
```bash
conda create --name opencompass python=3.10 -y
conda activate opencompass
```
```bash
conda create -n opencompass python=3.10 pytorch torchvision torchaudio cpuonly -c pytorch -y
conda activate opencompass
git clone https://github.com/open-compass/opencompass opencompass
cd opencompass
pip install -e .
# If needed, also install the requirements for API models via `pip install -r requirements/api.txt`.
```
- #### Install OpenCompass via pip
```bash
pip install -U opencompass
## Full installation (with support for more datasets)
# pip install "opencompass[full]"
## Environment with model acceleration frameworks
## Manage different acceleration frameworks using virtual environments
## since they usually have dependency conflicts with each other.
# pip install "opencompass[lmdeploy]"
# pip install "opencompass[vllm]"
## API evaluation (e.g. OpenAI, Qwen)
# pip install "opencompass[api]"
```
- #### Install OpenCompass from source
If you want to use opencompass's latest features, or develop new features, you can also build it from source
```bash
git clone https://github.com/open-compass/opencompass opencompass
cd opencompass
pip install -e .
# pip install -e ".[full]"
# pip install -e ".[vllm]"
```
### 📂 Data Preparation
You can choose one of the following methods to prepare datasets.
#### Offline Preparation
You can download and extract the datasets with the following commands:
```bash
@@ -145,12 +170,19 @@ wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/Ope
unzip OpenCompassData-core-20240207.zip
```
Also, use the [ModelScope](www.modelscope.cn) to load the datasets on demand.
#### Automatic Download from OpenCompass
We now support automatic download of datasets from the OpenCompass storage server. You can run the evaluation with the extra `--dry-run` argument to download these datasets.
Currently supported datasets are listed [here](https://github.com/open-compass/opencompass/blob/main/opencompass/utils/datasets_info.py#L259). More datasets will be uploaded soon.
#### (Optional) Automatic Download with ModelScope
You can also use [ModelScope](www.modelscope.cn) to load the datasets on demand.
Installation:
```bash
pip install modelscope
pip install modelscope[framework]
export DATASET_SOURCE=ModelScope
```
@@ -166,32 +198,63 @@ Some third-party features, like Humaneval and Llama, may require additional step
## 🏗️ Evaluation
After ensuring that OpenCompass is installed correctly according to the above steps and the datasets are prepared, you can evaluate the performance of the LLaMA-7b model on the MMLU and C-Eval datasets using the following command:
After ensuring that OpenCompass is installed correctly according to the above steps and the datasets are prepared, you can start your first evaluation using OpenCompass!
```bash
python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl
```
- Your first evaluation with OpenCompass!
Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
OpenCompass supports setting configs via the CLI or a Python script. For simple evaluation settings we recommend the CLI; for more complex evaluations, the script approach is suggested. You can find more example scripts under the configs folder.
```bash
python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl -a lmdeploy
```
```bash
# CLI
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
# Python scripts
opencompass ./configs/eval_chat_demo.py
```
```bash
# List all configurations
python tools/list_configs.py
# List all configurations related to llama and mmlu
python tools/list_configs.py llama mmlu
```
You can find more script examples under the [configs](./configs) folder.
You can also evaluate other HuggingFace models via command line. Taking LLaMA-7b as an example:
- API evaluation
```bash
python run.py --datasets ceval_ppl mmlu_ppl --hf-type base --hf-path huggyllama/llama-7b
```
OpenCompass, by design, does not really discriminate between open-source models and API models. You can evaluate both model types in the same way, or even in one setting.
```bash
export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
# CLI
opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen
# Python scripts
opencompass ./configs/eval_api_demo.py
```
- Accelerated Evaluation
Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
```bash
# CLI
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
# Python scripts
opencompass ./configs/eval_lmdeploy_demo.py
```
- Supported Models
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
```bash
# List all configurations
python tools/list_configs.py
# List all configurations related to llama and mmlu
python tools/list_configs.py llama mmlu
```
If a model is not on the list but is supported by the Huggingface AutoModel class, you can still evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass-supported model and dataset lists.
```bash
opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
```
> \[!TIP\]
>

@@ -69,8 +69,11 @@
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2024.08.20\]** OpenCompass now supports [SciCode](https://github.com/scicode-bench/SciCode): A Research Coding Benchmark Curated by Scientists. 🔥🔥🔥
- **\[2024.08.16\]** OpenCompass now supports the brand-new long-context language model evaluation benchmark [RULER](https://arxiv.org/pdf/2404.06654). Through flexible configurations, RULER evaluates long-context task types including retrieval, multi-hop tracing, aggregation, and question answering; see [RULER](configs/datasets/ruler/README.md). 🔥🔥🔥
- **\[2024.07.23\]** We now support the [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315) models. Welcome to try! 🔥🔥🔥
- **\[2024.07.23\]** We now support [ModelScope](www.modelscope.cn) datasets; you can load them on demand without first downloading all the data locally. Welcome to try! 🔥🔥🔥
- **\[2024.07.17\]** We have released the example data and evaluation rules for the CompassBench-202408 leaderboard; please visit [CompassBench](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/compassbench_intro.html) for more information. 🔥🔥🔥
- **\[2024.07.17\]** We have released the example data and evaluation rules for the CompassBench-202407 leaderboard; please visit [CompassBench](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/compassbench_intro.html) for more information. 🔥🔥🔥
- **\[2024.07.17\]** We have officially released the NeedleBench [technical report](http://arxiv.org/abs/2407.11963). You are cordially invited to visit our [documentation](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/needleinahaystack_eval.html) for evaluation guidelines. 🔥🔥🔥
- **\[2024.07.04\]** OpenCompass now supports InternLM2.5, which has outstanding reasoning performance, effective support for million-character ultra-long context, and an overall upgrade in tool-calling capability; welcome to visit [OpenCompass Config](https://github.com/open-compass/opencompass/tree/main/configs/models/hf_internlm) and [InternLM](https://github.com/InternLM/InternLM). 🔥🔥🔥
- **\[2024.06.20\]** OpenCompass now supports one-click switching of inference acceleration backends, making evaluation more efficient. Besides the default HuggingFace inference backend, the popular [LMDeploy](https://github.com/InternLM/lmdeploy) and [vLLM](https://github.com/vllm-project/vllm) are now supported, via either a one-click command-line switch or a deployed API acceleration service; see the [documentation](docs/zh_cn/advanced_guides/accelerator_intro.md) for detailed usage.
@@ -110,35 +113,54 @@ OpenCompass is a one-stop platform for large model evaluation. Its main features are as follows:
<p align="right"><a href="#top">🔝 Back to top</a></p>
## 🛠️ Installation
## 🛠️ Installation Guide
The following shows the steps for quick installation and dataset preparation.
The following provides the steps for quick installation and dataset preparation.
### 💻 Environment Configuration
### 💻 Environment Setup
#### GPU Environment for Open-source Models
We strongly recommend using `conda` to manage your Python environment.
```bash
conda create --name opencompass python=3.10 pytorch torchvision pytorch-cuda -c nvidia -c pytorch -y
conda activate opencompass
git clone https://github.com/open-compass/opencompass opencompass
cd opencompass
pip install -e .
```
- #### Create your virtual environment
#### CPU-only Environment for API Model Testing
```bash
conda create --name opencompass python=3.10 -y
conda activate opencompass
```
```bash
conda create -n opencompass python=3.10 pytorch torchvision torchaudio cpuonly -c pytorch -y
conda activate opencompass
git clone https://github.com/open-compass/opencompass opencompass
cd opencompass
pip install -e .
# If you need to use API models, install the API-related dependencies via `pip install -r requirements/api.txt`.
```
- #### Install OpenCompass via pip
```bash
# Supports the vast majority of datasets and models
pip install -U opencompass
# Full installation (with support for more datasets)
# pip install "opencompass[full]"
# Model inference backends. Since these backends often conflict in their
# dependencies, we recommend managing them with separate virtual environments.
# pip install "opencompass[lmdeploy]"
# pip install "opencompass[vllm]"
# API testing (e.g. OpenAI, Qwen)
# pip install "opencompass[api]"
```
- #### Install OpenCompass from source
If you want to use the latest features of OpenCompass, you can also build it from source:
```bash
git clone https://github.com/open-compass/opencompass opencompass
cd opencompass
pip install -e .
# pip install -e ".[full]"
# pip install -e ".[vllm]"
```
### 📂 Data Preparation
#### Offline Download in Advance
OpenCompass supports evaluation with local datasets. The datasets can be downloaded and extracted with the following commands:
```bash
@@ -147,6 +169,13 @@ wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/Ope
unzip OpenCompassData-core-20240207.zip
```
#### Automatic Download from OpenCompass
We now support automatic download of datasets from the OpenCompass storage server. You can run the evaluation with the extra `--dry-run` argument to download these datasets.
Currently supported datasets are listed [here](https://github.com/open-compass/opencompass/blob/main/opencompass/utils/datasets_info.py#L259). More datasets will be uploaded soon.
#### (Optional) Automatic Download with ModelScope
You can also use [ModelScope](www.modelscope.cn) to load the datasets:
Environment preparation:
@@ -167,32 +196,59 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce
## 🏗️ Evaluation
After correctly installing OpenCompass per the steps above and preparing the datasets, you can evaluate the performance of the LLaMA-7b model on the MMLU and C-Eval datasets with the following command:
After correctly installing OpenCompass per the steps above and preparing the datasets, you can now start your first evaluation with OpenCompass!
```bash
python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl
```
- ### Your First Evaluation
Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can use the following command. Before doing so, please make sure you have installed the corresponding backend's packages and that your model supports accelerated inference with that backend; see the inference acceleration backend [documentation](docs/zh_cn/advanced_guides/accelerator_intro.md) for more. Below is an example using LMDeploy:
OpenCompass supports setting configs via the command-line interface (CLI) or a Python script. For simple evaluation settings we recommend the CLI; for more complex evaluations, the script approach is suggested. You can find more example scripts under the configs folder.
```bash
python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl -a lmdeploy
```
```bash
# Command-line interface (CLI)
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations with the [tool](./docs/zh_cn/tools.md#ListConfigs).
# Python script
opencompass ./configs/eval_chat_demo.py
```
```bash
# List all configurations
python tools/list_configs.py
# List all configurations related to llama and mmlu
python tools/list_configs.py llama mmlu
```
You can find more script examples under the [configs](./configs) folder.
You can also evaluate other HuggingFace models via the command line. Again taking LLaMA-7b as an example:
- ### API Evaluation
```bash
python run.py --datasets ceval_ppl mmlu_ppl --hf-type base --hf-path huggyllama/llama-7b
```
OpenCompass, by design, does not distinguish between open-source models and API models. You can evaluate both model types in the same way, or even in one setting.
```bash
export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
# Command-line interface (CLI)
opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen
# Python script
opencompass ./configs/eval_api_demo.py
```
- ### Inference Backends
Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the following command. Please make sure you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/zh_cn/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
```bash
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
```
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations with the [tool](./docs/zh_cn/tools.md#ListConfigs).
- ### Supported Models
```bash
# List all configurations
python tools/list_configs.py
# List all configurations related to llama and mmlu
python tools/list_configs.py llama mmlu
```
If a model is not on the list but is supported by the Huggingface AutoModel class, you can still evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass-supported model and dataset lists.
```bash
opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
```
Through the command line or configuration files, OpenCompass also supports evaluating APIs or custom models, as well as more diverse evaluation strategies. Please read the [Quick Start](https://opencompass.readthedocs.io/zh_CN/latest/get_started/quick_start.html) to learn how to run an evaluation task.

@@ -5,8 +5,8 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,8 +5,8 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,8 +5,8 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,9 +5,9 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# from .datasets.collections.chat_medium import datasets
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
# from opencompass.configs.datasets.collections.chat_medium import datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,9 +5,9 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# from .datasets.collections.chat_medium import datasets
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
# from opencompass.configs.datasets.collections.chat_medium import datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,8 +5,8 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,8 +5,8 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -6,8 +6,8 @@ from opencompass.tasks import OpenICLInferTask
with read_base():
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,8 +5,8 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,8 +5,8 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,8 +5,8 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,9 +5,9 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# from .datasets.collections.chat_medium import datasets
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
# from opencompass.configs.datasets.collections.chat_medium import datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,9 +5,9 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# from .datasets.collections.chat_medium import datasets
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
# from opencompass.configs.datasets.collections.chat_medium import datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -5,9 +5,9 @@ from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# from .datasets.collections.chat_medium import datasets
from ..summarizers.medium import summarizer
from ..datasets.ceval.ceval_gen import ceval_datasets
# from opencompass.configs.datasets.collections.chat_medium import datasets
from opencompass.configs.summarizers.medium import summarizer
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
datasets = [
*ceval_datasets,

@@ -1,22 +1,22 @@
from mmengine.config import read_base
with read_base():
from ..datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
from ..datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
from ..datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from ..datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets
from ..datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import triviaqa_datasets
from ..datasets.nq.nq_open_1shot_gen_2e45e5 import nq_datasets
from ..datasets.race.race_gen_69ee4f import race_datasets
from ..datasets.winogrande.winogrande_5shot_gen_b36770 import winogrande_datasets
from ..datasets.hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets
from ..datasets.bbh.bbh_gen_2879b0 import bbh_datasets
from ..datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ..datasets.math.math_0shot_gen_393424 import math_datasets
from ..datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets
from ..datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..datasets.mbpp.sanitized_mbpp_gen_830460 import sanitized_mbpp_datasets
from ..datasets.gpqa.gpqa_gen_4baadb import gpqa_datasets
from ..datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import triviaqa_datasets
from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import nq_datasets
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import winogrande_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets
from opencompass.configs.datasets.bbh.bbh_gen_2879b0 import bbh_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_830460 import sanitized_mbpp_datasets
from opencompass.configs.datasets.gpqa.gpqa_gen_4baadb import gpqa_datasets
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

@@ -0,0 +1,53 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
from opencompass.utils.text_postprocessors import first_option_postprocess, match_answer_pattern
QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
{question}
A. {textA}
B. {textB}
C. {textC}
D. {textD}
""".strip()
ARC_c_reader_cfg = dict(
input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
output_column='answerKey')
ARC_c_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=QUERY_TEMPLATE)
], ),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
ARC_c_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)
ARC_c_datasets = [
dict(
abbr='ARC-c',
type=ARCDataset,
path='opencompass/ai2_arc-dev',
name='ARC-Challenge',
reader_cfg=ARC_c_reader_cfg,
infer_cfg=ARC_c_infer_cfg,
eval_cfg=ARC_c_eval_cfg,
)
]
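To make the flow of this config concrete, an editorial sketch (simplified; the real `first_option_postprocess` in OpenCompass is more elaborate): the reader fills the template's placeholders from the dataset columns, and the postprocessor then recovers the predicted letter from the model's free-form output.

```python
# Simplified fill-then-extract illustration; reuses QUERY_TEMPLATE from the
# config above. The real first_option_postprocess handles many more formats.
import re

row = dict(question='Which gas do plants absorb?', textA='Oxygen',
           textB='Carbon dioxide', textC='Nitrogen', textD='Helium')
prompt = QUERY_TEMPLATE.format(**row)      # placeholders -> dataset columns

model_output = 'Plants take in carbon dioxide.\nANSWER: B'
match = re.search(r'ANSWER:\s*([ABCD])', model_output)
pred = match.group(1) if match else ''
assert pred == 'B'                         # compared against answerKey
```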

@@ -0,0 +1,48 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess
ARC_c_reader_cfg = dict(
input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
output_column='answerKey',
)
ARC_c_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt='Question: {question}\nA. {textA}\nB. {textB}\nC. {textC}\nD. {textD}\nAnswer:',
),
dict(role='BOT', prompt='{answerKey}'),
],
),
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
inferencer=dict(type=GenInferencer, max_out_len=50),
)
ARC_c_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=first_capital_postprocess),
)
ARC_c_datasets = [
dict(
abbr='ARC-c',
type=ARCDataset,
path='opencompass/ai2_arc-dev',
name='ARC-Challenge',
reader_cfg=ARC_c_reader_cfg,
infer_cfg=ARC_c_infer_cfg,
eval_cfg=ARC_c_eval_cfg,
)
]
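An editorial note on the retriever above: `FixKRetriever` with `fix_id_list=[0, 2, 4, 6, 8]` prepends the same five fixed in-context examples to every test prompt, making this a 5-shot config, with `</E>` marking where the examples are spliced in. A toy illustration (not the OpenCompass class itself):

```python
# Toy fixed-k retrieval: the same k train items become the in-context
# examples for every query (illustrative; not opencompass code).
fix_id_list = [0, 2, 4, 6, 8]
train_pool = [f'Q{i} -> A{i}' for i in range(10)]
ice = [train_pool[i] for i in fix_id_list]  # identical 5-shot prefix each time
print('\n'.join(ice))
```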

@@ -0,0 +1,66 @@
# LCBench2023
LCBench2023 collects questions from LeetCode weekly contests held between 2022 and 2023. It contains Chinese and English versions, each with 581 questions.
## Base Models
| model | lcbench/pass@1 | en/pass@1 | cn/pass@1 | lcbench/pass | lcbench/timeout | lcbench/failed | lcbench/wrong_answer | en/pass | en/timeout | en/failed | en/wrong_answer | cn/pass | cn/timeout | cn/failed | cn/wrong_answer |
|:------------------------:|-----------------:|------------:|------------:|---------------:|------------------:|-----------------:|-----------------------:|----------:|-------------:|------------:|------------------:|----------:|-------------:|------------:|------------------:|
| llama-7b-turbomind | 1.30 | 2.61 | 0.00 | 15 | 28 | 843 | 266 | 15 | 14 | 290 | 257 | 0 | 14 | 553 | 9 |
| llama-13b-turbomind | 2.09 | 4.17 | 0.00 | 24 | 31 | 823 | 274 | 24 | 16 | 270 | 266 | 0 | 15 | 553 | 8 |
| llama-30b-turbomind | 3.48 | 6.78 | 0.17 | 40 | 41 | 780 | 291 | 39 | 25 | 226 | 286 | 1 | 16 | 554 | 5 |
| llama-65b-turbomind | 4.00 | 7.83 | 0.17 | 46 | 22 | 755 | 329 | 45 | 10 | 205 | 316 | 1 | 12 | 550 | 13 |
| llama-2-7b-turbomind | 0.78 | 1.57 | 0.00 | 9 | 28 | 825 | 290 | 9 | 16 | 274 | 277 | 0 | 12 | 551 | 13 |
| llama-2-13b-turbomind | 2.52 | 5.04 | 0.00 | 29 | 29 | 761 | 333 | 29 | 17 | 207 | 323 | 0 | 12 | 554 | 10 |
| llama-2-70b-turbomind | 5.04 | 9.57 | 0.52 | 58 | 47 | 684 | 363 | 55 | 28 | 140 | 353 | 3 | 19 | 544 | 10 |
| llama-3-8b-turbomind | 16.59 | 16.70 | 16.49 | 191 | 30 | 236 | 695 | 96 | 13 | 119 | 348 | 95 | 17 | 117 | 347 |
| llama-3-70b-turbomind | 38.49 | 38.43 | 38.54 | 443 | 2 | 120 | 587 | 221 | 2 | 58 | 295 | 222 | 0 | 62 | 292 |
| internlm2-1.8b-turbomind | 4.34 | 5.04 | 3.65 | 50 | 33 | 333 | 736 | 29 | 18 | 177 | 352 | 21 | 15 | 156 | 384 |
| internlm2-7b-turbomind | 12.16 | 12.52 | 11.81 | 140 | 41 | 166 | 805 | 72 | 23 | 92 | 389 | 68 | 18 | 74 | 416 |
| internlm2-20b-turbomind | 18.46 | 20.96 | 15.97 | 213 | 54 | 134 | 751 | 121 | 24 | 57 | 374 | 92 | 30 | 77 | 377 |
| qwen-1.8b-turbomind | 1.82 | 1.91 | 1.74 | 21 | 31 | 449 | 651 | 11 | 17 | 208 | 340 | 10 | 14 | 241 | 311 |
| qwen-7b-turbomind | 4.95 | 5.39 | 4.51 | 57 | 37 | 388 | 670 | 31 | 15 | 197 | 333 | 26 | 22 | 191 | 337 |
| qwen-14b-turbomind | 8.86 | 9.74 | 7.99 | 102 | 2 | 245 | 803 | 56 | 0 | 120 | 400 | 46 | 2 | 125 | 403 |
| qwen-72b-turbomind | 16.86 | 19.48 | 14.24 | 194 | 12 | 229 | 717 | 112 | 4 | 112 | 348 | 82 | 8 | 117 | 369 |
| qwen1.5-0.5b-hf | 0.87 | 0.52 | 1.22 | 10 | 29 | 499 | 614 | 3 | 10 | 259 | 304 | 7 | 19 | 240 | 310 |
| qwen1.5-1.8b-hf | 2.00 | 2.26 | 1.74 | 23 | 26 | 434 | 669 | 13 | 10 | 220 | 333 | 10 | 16 | 214 | 336 |
| qwen1.5-4b-hf | 5.65 | 6.96 | 4.34 | 65 | 37 | 349 | 701 | 40 | 19 | 161 | 356 | 25 | 18 | 188 | 345 |
| qwen1.5-7b-hf | 6.69 | 8.00 | 5.38 | 77 | 30 | 283 | 762 | 46 | 12 | 124 | 394 | 31 | 18 | 159 | 368 |
| qwen1.5-14b-hf | 12.69 | 13.74 | 11.63 | 146 | 43 | 232 | 731 | 79 | 22 | 122 | 353 | 67 | 21 | 110 | 378 |
| qwen1.5-32b-hf | 14.34 | 16.70 | 11.98 | 165 | 45 | 191 | 751 | 96 | 18 | 88 | 374 | 69 | 27 | 103 | 377 |
| qwen1.5-72b-hf | 15.29 | 15.65 | 14.93 | 176 | 11 | 242 | 723 | 90 | 7 | 118 | 361 | 86 | 4 | 124 | 362 |
| qwen1.5-moe-a2-7b-hf | 9.56 | 10.09 | 9.03 | 110 | 10 | 272 | 760 | 58 | 5 | 129 | 384 | 52 | 5 | 143 | 376 |
| mistral-7b-v0.1-hf | 11.38 | 11.83 | 10.94 | 131 | 30 | 221 | 770 | 68 | 11 | 100 | 397 | 63 | 19 | 121 | 373 |
| mistral-7b-v0.2-hf | 11.38 | 11.13 | 11.63 | 131 | 2 | 259 | 760 | 64 | 2 | 124 | 386 | 67 | 0 | 135 | 374 |
| mixtral-8x7b-v0.1-hf | 21.11 | 21.39 | 20.83 | 243 | 7 | 165 | 737 | 123 | 4 | 76 | 373 | 120 | 3 | 89 | 364 |
| mixtral-8x22b-v0.1-hf | 30.97 | 31.22 | 30.73 | 357 | 6 | 131 | 658 | 180 | 3 | 66 | 327 | 177 | 3 | 65 | 331 |
| yi-6b-hf | 2.43 | 2.78 | 2.08 | 28 | 7 | 456 | 661 | 16 | 2 | 214 | 344 | 12 | 5 | 242 | 317 |
| yi-34b-hf | 8.25 | 8.35 | 8.16 | 95 | 8 | 319 | 730 | 48 | 5 | 163 | 360 | 47 | 3 | 156 | 370 |
| deepseek-7b-base-hf | 5.30 | 5.22 | 5.38 | 61 | 7 | 325 | 759 | 30 | 4 | 165 | 377 | 31 | 3 | 160 | 382 |
| deepseek-67b-base-hf | 26.50 | 26.96 | 26.04 | 305 | 9 | 202 | 636 | 155 | 4 | 105 | 312 | 150 | 5 | 97 | 324 |
## Chat Models
| model | lcbench/pass@1 | en/pass@1 | cn/pass@1 | lcbench/pass | lcbench/timeout | lcbench/failed | lcbench/wrong_answer | en/pass | en/timeout | en/failed | en/wrong_answer | cn/pass | cn/timeout | cn/failed | cn/wrong_answer |
|:-----------------------------:|-----------------:|------------:|------------:|---------------:|------------------:|-----------------:|-----------------------:|----------:|-------------:|------------:|------------------:|----------:|-------------:|------------:|------------------:|
| qwen1.5-0.5b-chat-hf | 0.00 | 0.00 | 0.00 | 0 | 0 | 1152 | 0 | 0 | 0 | 576 | 0 | 0 | 0 | 576 | 0 |
| qwen1.5-1.8b-chat-hf | 1.65 | 1.57 | 1.74 | 19 | 5 | 603 | 525 | 9 | 2 | 298 | 267 | 10 | 3 | 305 | 258 |
| qwen1.5-4b-chat-hf | 5.56 | 5.22 | 5.90 | 64 | 17 | 484 | 587 | 30 | 8 | 242 | 296 | 34 | 9 | 242 | 291 |
| qwen1.5-7b-chat-hf | 8.78 | 9.57 | 7.99 | 101 | 25 | 333 | 693 | 55 | 12 | 151 | 358 | 46 | 13 | 182 | 335 |
| qwen1.5-14b-chat-hf | 14.42 | 16.52 | 12.33 | 166 | 18 | 222 | 746 | 95 | 10 | 110 | 361 | 71 | 8 | 112 | 385 |
| qwen1.5-32b-chat-hf | 10.78 | 13.04 | 8.51 | 124 | 15 | 516 | 497 | 75 | 10 | 195 | 296 | 49 | 5 | 321 | 201 |
| qwen1.5-72b-chat-hf | 18.77 | 18.78 | 18.75 | 216 | 23 | 164 | 749 | 108 | 12 | 89 | 367 | 108 | 11 | 75 | 382 |
| qwen1.5-110b-chat-hf | 34.58 | 34.43 | 34.72 | 399 | 20 | 176 | 557 | 199 | 12 | 85 | 280 | 200 | 8 | 91 | 277 |
| internlm2-chat-1.8b-hf | 4.52 | 5.04 | 3.99 | 52 | 10 | 364 | 726 | 29 | 4 | 172 | 371 | 23 | 6 | 192 | 355 |
| internlm2-chat-1.8b-sft-hf | 3.56 | 3.83 | 3.30 | 41 | 12 | 403 | 696 | 22 | 6 | 211 | 337 | 19 | 6 | 192 | 359 |
| internlm2-chat-7b-hf | 14.60 | 13.74 | 15.45 | 168 | 12 | 238 | 734 | 79 | 7 | 142 | 348 | 89 | 5 | 96 | 386 |
| internlm2-chat-7b-sft-hf | 14.34 | 14.61 | 14.06 | 165 | 9 | 275 | 703 | 84 | 3 | 174 | 315 | 81 | 6 | 101 | 388 |
| internlm2-chat-20b-hf | 19.64 | 20.00 | 19.27 | 226 | 11 | 191 | 724 | 115 | 7 | 83 | 371 | 111 | 4 | 108 | 353 |
| internlm2-chat-20b-sft-hf | 20.55 | 19.91 | 21.18 | 237 | 11 | 195 | 709 | 115 | 6 | 94 | 361 | 122 | 5 | 101 | 348 |
| llama-3-8b-instruct-hf | 28.50 | 29.04 | 27.95 | 328 | 17 | 95 | 712 | 167 | 7 | 44 | 358 | 161 | 10 | 51 | 354 |
| llama-3-70b-instruct-hf | 45.44 | 46.09 | 44.79 | 523 | 8 | 52 | 569 | 265 | 2 | 25 | 284 | 258 | 6 | 27 | 285 |
| llama-3-8b-instruct-lmdeploy | 29.02 | 29.39 | 28.65 | 334 | 19 | 94 | 705 | 169 | 11 | 42 | 354 | 165 | 8 | 52 | 351 |
| llama-3-70b-instruct-lmdeploy | 44.66 | 46.78 | 42.53 | 514 | 11 | 44 | 583 | 269 | 5 | 19 | 283 | 245 | 6 | 25 | 300 |
| mistral-7b-instruct-v0.1-hf | 9.82 | 10.78 | 8.85 | 113 | 17 | 316 | 706 | 62 | 9 | 152 | 353 | 51 | 8 | 164 | 353 |
| mistral-7b-instruct-v0.2-hf | 7.90 | 6.26 | 9.55 | 91 | 8 | 572 | 481 | 36 | 4 | 345 | 191 | 55 | 4 | 227 | 290 |
| mixtral-8x7b-instruct-v0.1-hf | 16.29 | 15.91 | 16.67 | 188 | 13 | 370 | 581 | 92 | 6 | 241 | 237 | 96 | 7 | 129 | 344 |

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .lcbench_gen_5ff288 import LCBench_datasets # noqa: F401, F403

View File

@ -0,0 +1,107 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import LCDataset, LCPassKEvaluator
LC_reader_cfg = dict(
input_columns=['text', 'test_list'], output_column='test_column')
LC_en_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='You are an expert Python programmer, and here is your task: You are given three positive integers n, x, and y.\nIn a city, there exist houses numbered 1 to n connected by n streets. There is a street connecting the house numbered i with the house numbered i + 1 for all 1 <= i <= n - 1 . An additional street connects the house numbered x with the house numbered y.\nFor each k, such that 1 <= k <= n, you need to find the number of pairs of houses (house1, house2) such that the minimum number of streets that need to be traveled to reach house2 from house1 is k.\nReturn a 1-indexed array result of length n where result[k] represents the total number of pairs of houses such that the minimum streets required to reach one house from the other is k.\nNote that x and y can be equal. Your code should pass these tests:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'
),
dict(
role='BOT',
prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt="You are an expert Python programmer, and here is your task: You are given a string word containing lowercase English letters.\nTelephone keypads have keys mapped with distinct collections of lowercase English letters, which can be used to form words by pushing them. For example, the key 2 is mapped with [\"a\",\"b\",\"c\"], we need to push the key one time to type \"a\", two times to type \"b\", and three times to type \"c\" .\nIt is allowed to remap the keys numbered 2 to 9 to distinct collections of letters. The keys can be remapped to any amount of letters, but each letter must be mapped to exactly one key. You need to find the minimum number of times the keys will be pushed to type the string word.\nReturn the minimum number of pushes needed to type word after remapping the keys.\nAn example mapping of letters to keys on a telephone keypad is given below. Note that 1, *, #, and 0 do not map to any letters. Your code should pass these tests:\n\n assert minimumPushes(\"abcde\") == 5 \n assert minimumPushes(\"xyzxyzxyzxyz\") == 12 \n assert minimumPushes(\"aabbccddeeffgghhiiiiii\") == 24 \n"
),
dict(
role='BOT',
prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt="You are an expert Python programmer, and here is your task: You are given an integer k and an integer x.\nConsider s is the 1-indexed binary representation of an integer num. The price of a number num is the number of i's such that i % x == 0 and s[i] is a set bit.\nReturn the greatest integer num such that the sum of prices of all numbers from 1 to num is less than or equal to k.\nNote:\nIn the binary representation of a number set bit is a bit of value 1.\nThe binary representation of a number will be indexed from right to left. For example, if s == 11100, s[4] == 1 and s[2] == 0. Your code should pass these tests:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n"
),
dict(
role='BOT',
prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'
),
dict(role='BOT', prompt='[BEGIN]\n'),
], )),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
LC_cn_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你三个 正整数 n 、x 和 y 。\n在城市中,存在编号从 1 到 n 的房屋,由 n 条街道相连。对所有 1 <= i < n ,都存在一条街道连接编号为 i 的房屋与编号为 i + 1 的房屋。另存在一条街道连接编号为 x 的房屋与编号为 y 的房屋。\n对于每个 k1 <= k <= n你需要找出所有满足要求的 房屋对 [house1, house2] ,即从 house1 到 house2 需要经过的 最少 街道数为 k 。\n返回一个下标从 1 开始且长度为 n 的数组 result ,其中 result[k] 表示所有满足要求的房屋对的数量,即从一个房屋到另一个房屋需要经过的 最少 街道数为 k 。\n注意x 与 y 可以 相等。你的代码需要通过以下测试:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'
),
dict(
role='BOT',
prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt="你是一名专业的 Python 程序员,下面是你的任务: 给你一个字符串 word由 不同 小写英文字母组成。\n电话键盘上的按键与 不同 小写英文字母集合相映射,可以通过按压按键来组成单词。例如,按键 2 对应 [\"a\",\"b\",\"c\"],我们需要按一次键来输入 \"a\",按两次键来输入 \"b\",按三次键来输入 \"c\"\n现在允许你将编号为 2 到 9 的按键重新映射到 不同 字母集合。每个按键可以映射到 任意数量 的字母,但每个字母 必须 恰好 映射到 一个 按键上。你需要找到输入字符串 word 所需的 最少 按键次数。\n返回重新映射按键后输入 word 所需的 最少 按键次数。\n下面给出了一种电话键盘上字母到按键的映射作为示例。注意 1*# 和 0 不 对应任何字母。你的代码需要通过以下测试:\n\n assert minimumPushes(\"abcde\") == 5 \n assert minimumPushes(\"xyzxyzxyzxyz\") == 12 \n assert minimumPushes(\"aabbccddeeffgghhiiiiii\") == 24 \n"
),
dict(
role='BOT',
prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个整数 k 和一个整数 x 。\n令 s 为整数 num 的下标从 1 开始的二进制表示。我们说一个整数 num 的 价值 是满足 i % x == 0 且 s[i] 是 设置位 的 i 的数目。\n请你返回 最大 整数 num ,满足从 1 到 num 的所有整数的 价值 和小于等于 k 。\n注意:\n一个整数二进制表示下 设置位 是值为 1 的数位。\n一个整数的二进制表示下标从右到左编号,比方说如果 s == 11100 ,那么 s[4] == 1 且 s[2] == 0。你的代码需要通过以下测试:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n'
),
dict(
role='BOT',
prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt='你是一名专业的 Python 程序员,下面是你的任务: {text} 你的代码需要通过以下测试:\n\n {test_list} \n'
),
dict(role='BOT', prompt='[BEGIN]\n'),
], )),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
LC_eval_cfg = dict(evaluator=dict(type=LCPassKEvaluator), pred_role='BOT')
LCBench_datasets = [
dict(
type=LCDataset,
abbr='lcbench_en',
path='./data/LCBench2023/LCBench2023.jsonl',
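        # one completion per problem (pass@1 setting); a separate repeat10 config samples 10 for pass@k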
num_repeats=1,
reader_cfg=LC_reader_cfg,
infer_cfg=LC_en_infer_cfg,
eval_cfg=LC_eval_cfg),
dict(
type=LCDataset,
abbr='lcbench_cn',
path='./data/LCBench2023/LCBench2023_cn.jsonl',
num_repeats=1,
reader_cfg=LC_reader_cfg,
infer_cfg=LC_cn_infer_cfg,
eval_cfg=LC_eval_cfg)
]

View File

@ -0,0 +1,77 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import LCDataset, LCPassKEvaluator
LC_difficulties_list = ['EASY', 'MEDIUM', 'HARD']
LC_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column')
LC_en_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: You are given three positive integers n, x, and y.\nIn a city, there exist houses numbered 1 to n connected by n streets. There is a street connecting the house numbered i with the house numbered i + 1 for all 1 <= i <= n - 1 . An additional street connects the house numbered x with the house numbered y.\nFor each k, such that 1 <= k <= n, you need to find the number of pairs of houses (house1, house2) such that the minimum number of streets that need to be traveled to reach house2 from house1 is k.\nReturn a 1-indexed array result of length n where result[k] represents the total number of pairs of houses such that the minimum streets required to reach one house from the other is k.\nNote that x and y can be equal. Your code should pass these tests:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'),
dict(role='BOT', prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: You are given a string word containing lowercase English letters.\nTelephone keypads have keys mapped with distinct collections of lowercase English letters, which can be used to form words by pushing them. For example, the key 2 is mapped with ["a","b","c"], we need to push the key one time to type "a", two times to type "b", and three times to type "c" .\nIt is allowed to remap the keys numbered 2 to 9 to distinct collections of letters. The keys can be remapped to any amount of letters, but each letter must be mapped to exactly one key. You need to find the minimum number of times the keys will be pushed to type the string word.\nReturn the minimum number of pushes needed to type word after remapping the keys.\nAn example mapping of letters to keys on a telephone keypad is given below. Note that 1, *, #, and 0 do not map to any letters. Your code should pass these tests:\n\n assert minimumPushes("abcde") == 5 \n assert minimumPushes("xyzxyzxyzxyz") == 12 \n assert minimumPushes("aabbccddeeffgghhiiiiii") == 24 \n'),
dict(role='BOT', prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "),
dict(role='HUMAN', prompt="You are an expert Python programmer, and here is your task: You are given an integer k and an integer x.\nConsider s is the 1-indexed binary representation of an integer num. The price of a number num is the number of i's such that i % x == 0 and s[i] is a set bit.\nReturn the greatest integer num such that the sum of prices of all numbers from 1 to num is less than or equal to k.\nNote:\nIn the binary representation of a number set bit is a bit of value 1.\nThe binary representation of a number will be indexed from right to left. For example, if s == 11100, s[4] == 1 and s[2] == 0. Your code should pass these tests:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n"),
dict(role='BOT', prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'),
dict(role='BOT', prompt='[BEGIN]\n'),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
LC_cn_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你三个 正整数 n 、x 和 y 。\n在城市中,存在编号从 1 到 n 的房屋,由 n 条街道相连。对所有 1 <= i < n ,都存在一条街道连接编号为 i 的房屋与编号为 i + 1 的房屋。另存在一条街道连接编号为 x 的房屋与编号为 y 的房屋。\n对于每个 k1 <= k <= n你需要找出所有满足要求的 房屋对 [house1, house2] ,即从 house1 到 house2 需要经过的 最少 街道数为 k 。\n返回一个下标从 1 开始且长度为 n 的数组 result ,其中 result[k] 表示所有满足要求的房屋对的数量,即从一个房屋到另一个房屋需要经过的 最少 街道数为 k 。\n注意x 与 y 可以 相等。你的代码需要通过以下测试:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'),
dict(role='BOT', prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个字符串 word由 不同 小写英文字母组成。\n电话键盘上的按键与 不同 小写英文字母集合相映射,可以通过按压按键来组成单词。例如,按键 2 对应 ["a","b","c"],我们需要按一次键来输入 "a",按两次键来输入 "b",按三次键来输入 "c"\n现在允许你将编号为 2 到 9 的按键重新映射到 不同 字母集合。每个按键可以映射到 任意数量 的字母,但每个字母 必须 恰好 映射到 一个 按键上。你需要找到输入字符串 word 所需的 最少 按键次数。\n返回重新映射按键后输入 word 所需的 最少 按键次数。\n下面给出了一种电话键盘上字母到按键的映射作为示例。注意 1*# 和 0 不 对应任何字母。你的代码需要通过以下测试:\n\n assert minimumPushes("abcde") == 5 \n assert minimumPushes("xyzxyzxyzxyz") == 12 \n assert minimumPushes("aabbccddeeffgghhiiiiii") == 24 \n'),
dict(role='BOT', prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个整数 k 和一个整数 x 。\n令 s 为整数 num 的下标从 1 开始的二进制表示。我们说一个整数 num 的 价值 是满足 i % x == 0 且 s[i] 是 设置位 的 i 的数目。\n请你返回 最大 整数 num ,满足从 1 到 num 的所有整数的 价值 和小于等于 k 。\n注意:\n一个整数二进制表示下 设置位 是值为 1 的数位。\n一个整数的二进制表示下标从右到左编号,比方说如果 s == 11100 ,那么 s[4] == 1 且 s[2] == 0。你的代码需要通过以下测试:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n'),
dict(role='BOT', prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: {text} 你的代码需要通过以下测试:\n\n {test_list} \n'),
dict(role='BOT', prompt='[BEGIN]\n'),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
LC_eval_cfg = dict(evaluator=dict(type=LCPassKEvaluator), pred_role='BOT')
LCBench_datasets = []
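# one English and one Chinese dataset entry per difficulty split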
for difficulty in LC_difficulties_list:
LCBench_datasets.append(
dict(
type=LCDataset,
abbr='lcbench_en-' + difficulty,
path='data/LCBench2023/LCBench2023.jsonl',
difficulty=difficulty,
reader_cfg=LC_reader_cfg,
infer_cfg=LC_en_infer_cfg,
eval_cfg=LC_eval_cfg,
)
)
LCBench_datasets.append(
dict(
type=LCDataset,
abbr='lcbench_cn-' + difficulty,
path='data/LCBench2023/LCBench2023_cn.jsonl',
difficulty=difficulty,
reader_cfg=LC_reader_cfg,
infer_cfg=LC_cn_infer_cfg,
eval_cfg=LC_eval_cfg,
)
)

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .lcbench_repeat10_gen_5ff288 import LCBench_datasets_repeat10 # noqa: F401, F403

View File

@ -0,0 +1,106 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import LCDataset, LCPassKEvaluator
LC_reader_cfg = dict(
input_columns=['text', 'test_list'], output_column='test_column')
LC_en_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='You are an expert Python programmer, and here is your task: You are given three positive integers n, x, and y.\nIn a city, there exist houses numbered 1 to n connected by n streets. There is a street connecting the house numbered i with the house numbered i + 1 for all 1 <= i <= n - 1 . An additional street connects the house numbered x with the house numbered y.\nFor each k, such that 1 <= k <= n, you need to find the number of pairs of houses (house1, house2) such that the minimum number of streets that need to be traveled to reach house2 from house1 is k.\nReturn a 1-indexed array result of length n where result[k] represents the total number of pairs of houses such that the minimum streets required to reach one house from the other is k.\nNote that x and y can be equal. Your code should pass these tests:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'
),
dict(
role='BOT',
prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt="You are an expert Python programmer, and here is your task: You are given a string word containing lowercase English letters.\nTelephone keypads have keys mapped with distinct collections of lowercase English letters, which can be used to form words by pushing them. For example, the key 2 is mapped with [\"a\",\"b\",\"c\"], we need to push the key one time to type \"a\", two times to type \"b\", and three times to type \"c\" .\nIt is allowed to remap the keys numbered 2 to 9 to distinct collections of letters. The keys can be remapped to any amount of letters, but each letter must be mapped to exactly one key. You need to find the minimum number of times the keys will be pushed to type the string word.\nReturn the minimum number of pushes needed to type word after remapping the keys.\nAn example mapping of letters to keys on a telephone keypad is given below. Note that 1, *, #, and 0 do not map to any letters. Your code should pass these tests:\n\n assert minimumPushes(\"abcde\") == 5 \n assert minimumPushes(\"xyzxyzxyzxyz\") == 12 \n assert minimumPushes(\"aabbccddeeffgghhiiiiii\") == 24 \n"
),
dict(
role='BOT',
prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt="You are an expert Python programmer, and here is your task: You are given an integer k and an integer x.\nConsider s is the 1-indexed binary representation of an integer num. The price of a number num is the number of i's such that i % x == 0 and s[i] is a set bit.\nReturn the greatest integer num such that the sum of prices of all numbers from 1 to num is less than or equal to k.\nNote:\nIn the binary representation of a number set bit is a bit of value 1.\nThe binary representation of a number will be indexed from right to left. For example, if s == 11100, s[4] == 1 and s[2] == 0. Your code should pass these tests:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n"
),
dict(
role='BOT',
prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'
),
dict(role='BOT', prompt='[BEGIN]\n'),
], )),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
LC_cn_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你三个 正整数 n 、x 和 y 。\n在城市中,存在编号从 1 到 n 的房屋,由 n 条街道相连。对所有 1 <= i < n ,都存在一条街道连接编号为 i 的房屋与编号为 i + 1 的房屋。另存在一条街道连接编号为 x 的房屋与编号为 y 的房屋。\n对于每个 k1 <= k <= n你需要找出所有满足要求的 房屋对 [house1, house2] ,即从 house1 到 house2 需要经过的 最少 街道数为 k 。\n返回一个下标从 1 开始且长度为 n 的数组 result ,其中 result[k] 表示所有满足要求的房屋对的数量,即从一个房屋到另一个房屋需要经过的 最少 街道数为 k 。\n注意x 与 y 可以 相等。你的代码需要通过以下测试:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'
),
dict(
role='BOT',
prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt="你是一名专业的 Python 程序员,下面是你的任务: 给你一个字符串 word由 不同 小写英文字母组成。\n电话键盘上的按键与 不同 小写英文字母集合相映射,可以通过按压按键来组成单词。例如,按键 2 对应 [\"a\",\"b\",\"c\"],我们需要按一次键来输入 \"a\",按两次键来输入 \"b\",按三次键来输入 \"c\"\n现在允许你将编号为 2 到 9 的按键重新映射到 不同 字母集合。每个按键可以映射到 任意数量 的字母,但每个字母 必须 恰好 映射到 一个 按键上。你需要找到输入字符串 word 所需的 最少 按键次数。\n返回重新映射按键后输入 word 所需的 最少 按键次数。\n下面给出了一种电话键盘上字母到按键的映射作为示例。注意 1*# 和 0 不 对应任何字母。你的代码需要通过以下测试:\n\n assert minimumPushes(\"abcde\") == 5 \n assert minimumPushes(\"xyzxyzxyzxyz\") == 12 \n assert minimumPushes(\"aabbccddeeffgghhiiiiii\") == 24 \n"
),
dict(
role='BOT',
prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个整数 k 和一个整数 x 。\n令 s 为整数 num 的下标从 1 开始的二进制表示。我们说一个整数 num 的 价值 是满足 i % x == 0 且 s[i] 是 设置位 的 i 的数目。\n请你返回 最大 整数 num ,满足从 1 到 num 的所有整数的 价值 和小于等于 k 。\n注意:\n一个整数二进制表示下 设置位 是值为 1 的数位。\n一个整数的二进制表示下标从右到左编号,比方说如果 s == 11100 ,那么 s[4] == 1 且 s[2] == 0。你的代码需要通过以下测试:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n'
),
dict(
role='BOT',
prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "
),
dict(
role='HUMAN',
prompt='你是一名专业的 Python 程序员,下面是你的任务: {text} 你的代码需要通过以下测试:\n\n {test_list} \n'
),
dict(role='BOT', prompt='[BEGIN]\n'),
], )),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
LC_eval_cfg = dict(evaluator=dict(type=LCPassKEvaluator), pred_role='BOT')
LCBench_datasets_repeat10 = [
dict(
type=LCDataset,
abbr='lcbench_en_repeat10',
path='./data/LCBench2023/LCBench2023.jsonl',
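        # sample 10 completions per problem so LCPassKEvaluator can estimate pass@k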
num_repeats=10,
reader_cfg=LC_reader_cfg,
infer_cfg=LC_en_infer_cfg,
eval_cfg=LC_eval_cfg),
dict(
type=LCDataset,
abbr='lcbench_cn_repeat10',
path='./data/LCBench2023/LCBench2023_cn.jsonl',
num_repeats=10,
reader_cfg=LC_reader_cfg,
infer_cfg=LC_cn_infer_cfg,
eval_cfg=LC_eval_cfg)
]

View File

@ -0,0 +1,55 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BoolQDatasetV2
from opencompass.utils.text_postprocessors import (
first_option_postprocess,
)
QUERY_TEMPLATE = """
Answer the following question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of AB. Think step by step before answering.
Passage: {passage}
Question: {question}
A. Yes
B. No
""".strip()
BoolQ_reader_cfg = dict(
input_columns=['question', 'passage'],
output_column='label',
)
BoolQ_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
BoolQ_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
)
BoolQ_datasets = [
dict(
abbr='BoolQ',
type=BoolQDatasetV2,
path='opencompass/boolq',
reader_cfg=BoolQ_reader_cfg,
infer_cfg=BoolQ_infer_cfg,
eval_cfg=BoolQ_eval_cfg,
)
]

View File

@ -0,0 +1,47 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BoolQDatasetV2
from opencompass.utils.text_postprocessors import first_capital_postprocess
BoolQ_reader_cfg = dict(
input_columns=['question', 'passage'],
output_column='label',
)
BoolQ_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt='{passage}\nQuestion: {question}\nA. Yes\nB. No\nAnswer:',
),
dict(role='BOT', prompt='{label}'),
],
),
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
inferencer=dict(type=GenInferencer, max_out_len=50),
)
BoolQ_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=first_capital_postprocess),
)
BoolQ_datasets = [
dict(
abbr='BoolQ',
type=BoolQDatasetV2,
path='opencompass/boolq',
reader_cfg=BoolQ_reader_cfg,
infer_cfg=BoolQ_infer_cfg,
eval_cfg=BoolQ_eval_cfg,
)
]

View File

@ -33,7 +33,7 @@ BoolQ_datasets = [
dict(
abbr='BoolQ',
type=BoolQDatasetV2,
path='./data/SuperGLUE/BoolQ/val.jsonl',
path='opencompass/boolq',
reader_cfg=BoolQ_reader_cfg,
infer_cfg=BoolQ_infer_cfg,
eval_cfg=BoolQ_eval_cfg,

View File

@ -0,0 +1,43 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BoolQDatasetV2
BoolQ_reader_cfg = dict(
input_columns=['question', 'passage'],
output_column='label',
)
BoolQ_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
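            # PPLInferencer scores each candidate continuation ('Yes' vs 'No') and predicts the lower-perplexity label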
'A':
dict(round=[
dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
dict(role='BOT', prompt='Yes'),
]),
'B':
dict(round=[
dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
dict(role='BOT', prompt='No'),
]),
},
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
BoolQ_datasets = [
dict(
abbr='BoolQ',
type=BoolQDatasetV2,
path='opencompass/boolq',
reader_cfg=BoolQ_reader_cfg,
infer_cfg=BoolQ_infer_cfg,
eval_cfg=BoolQ_eval_cfg,
)
]

View File

@ -35,7 +35,7 @@ BoolQ_datasets = [
dict(
abbr='BoolQ',
type=BoolQDatasetV3,
path='./data/SuperGLUE/BoolQ/val.jsonl',
path='opencompass/boolq',
reader_cfg=BoolQ_reader_cfg,
infer_cfg=BoolQ_infer_cfg,
eval_cfg=BoolQ_eval_cfg,

View File

@ -36,7 +36,7 @@ BoolQ_datasets = [
type=BoolQDataset,
abbr='BoolQ',
path='json',
data_files='./data/SuperGLUE/BoolQ/val.jsonl',
data_files='opencompass/boolq',
split='train',
reader_cfg=BoolQ_reader_cfg,
infer_cfg=BoolQ_infer_cfg,

View File

@ -36,7 +36,7 @@ BoolQ_datasets = [
type=BoolQDataset,
abbr='BoolQ',
path='json',
data_files='./data/SuperGLUE/BoolQ/val.jsonl',
data_files='opencompass/boolq',
split='train',
reader_cfg=BoolQ_reader_cfg,
infer_cfg=BoolQ_infer_cfg,

View File

@ -26,7 +26,7 @@ BoolQ_datasets = [
type=BoolQDataset,
abbr='BoolQ',
path='json',
data_files='./data/SuperGLUE/BoolQ/val.jsonl',
data_files='opencompass/boolq',
split='train',
reader_cfg=BoolQ_reader_cfg,
infer_cfg=BoolQ_infer_cfg,

View File

@ -0,0 +1,99 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
bbh_multiple_choice_sets = [
'temporal_sequences',
'disambiguation_qa',
'date_understanding',
'tracking_shuffled_objects_three_objects',
'penguins_in_a_table',
'geometric_shapes',
'snarks',
'ruin_names',
'tracking_shuffled_objects_seven_objects',
'tracking_shuffled_objects_five_objects',
'logical_deduction_three_objects',
'hyperbaton',
'logical_deduction_five_objects',
'logical_deduction_seven_objects',
'movie_recommendation',
'salient_translation_error_detection',
'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
'multistep_arithmetic_two',
'navigate',
'dyck_languages',
'word_sorting',
'sports_understanding',
'boolean_expressions',
'object_counting',
'formal_fallacies',
'causal_judgement',
'web_of_lies',
]
bbh_datasets = []
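# multiple-choice tasks get option-letter postprocessing; free-form tasks are scored by the generic BBHEvaluator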
for _name in bbh_multiple_choice_sets:
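    # each task's few-shot chain-of-thought exemplars are stored in lib_prompt/<task>.txt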
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step. And you must give your final answer by starting with 'So the answer is' "
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(
evaluator=dict(type=BBHEvaluator_mcq),
pred_role='BOT',
pred_postprocessor=dict(type=bbh_mcq_postprocess),
dataset_postprocessor=dict(type=bbh_mcq_postprocess))
bbh_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))
for _name in bbh_free_form_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step. And you must give your final answer by starting with 'So the answer is' "
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')
bbh_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))

View File

@ -0,0 +1,117 @@
# CaLM Lite
**CaLM Lite** is a lightweight version of CaLM.
**Ca**usal evaluation of **L**anguage **M**odels (CaLM) is, to the best of our knowledge, the first comprehensive benchmark for evaluating the causal reasoning capabilities of language models. The CaLM framework establishes a foundational taxonomy consisting of four modules: causal target (i.e., what to evaluate), adaptation (i.e., how to obtain the results), metric (i.e., how to measure the results), and error (i.e., how to analyze the bad results).
<div align="center">
[🌐 Website](https://opencausalab.github.io/CaLM) |
[📃 Report](https://arxiv.org/abs/2405.00622) | [🎆 GitHub](https://github.com/OpenCausaLab/CaLM) | 📧 Welcome to join us by email at causalai@pjlab.org.cn
</div>
## Quick Start
### Data Preparation
Download the dataset to the `data/` folder.
```
wget https://github.com/OpenCausaLab/CaLM/releases/download/v1.0.0.lite/calm.zip
unzip calm.zip
```
### Run Model and Infer
To obtain a concise output with only the averaged results across all tasks, use:
```
python run.py --models YOUR_MODEL --datasets calm --summarizer calm
```
If you want detailed information for each task, use:
```
python run.py --models YOUR_MODEL --datasets calm
```
The `--summarizer calm` flag in the first command generates a summarized output; omitting it, as in the second command, yields task-specific details.
## Available Causal Tasks
We provide 92 tasks for causal evaluation, stored in the `data/calm` folder. For more information about our causal tasks, refer to [tasks](https://github.com/OpenCausaLab/CaLM/blob/main/documents/tasks.md).
The directory structure is:
```
├── calm
| ├── association
| ├── causal_discovery # Rung of the causal ladder
| │ ├── abstract_reasoning # Causal scenario
| │ │ ├── AR-B_CaLM-AR_CN.json # Causal task
| │ | └── AR-B_CaLM-AR_EN.json # Causal task
| │ └── ...
| └── ...
└── ...
```
## Dataset
- **Dataset size**: CaLM Lite uses a light dataset of **9,200** samples, while CaLM uses a significantly larger dataset of 126,334 samples. The table below details the English dataset composition; the Chinese version is structured identically.
- **Dataset configuration**: We prioritize balance in our dataset for **binary classification** and **choice selection** questions. By ensuring an equal number of each ground-truth label, we minimize the risk of introducing bias into the model's testing. For **probability calculation**, CaLM Lite takes extra care to balance the number of problems across different causal reasoning processes. (For more details on how the causal reasoning process is defined, please refer to Section 9.1.6 of the [paper](https://arxiv.org/abs/2405.00622).)
- **Efficient evaluation**: For enhanced evaluation efficiency, OpenCompass offers customizable methods. Refer to the [documentation](https://opencompass.org.cn/doc) for guidance on tailoring these methods to your needs.
| Causal ladder | Causal scenario | Subset | Question type | Mode | CaLM Lite | CaLM |
|---------------|-----------------|--------|---------------|------|-----------|------|
| Causal discovery | PCD | E-CARE | Binary classification | Natural | 100 | 2000 |
| Causal discovery | PCD | E-CARE | Choice selection | Natural | 100 | 1000 |
| Causal discovery | PCD | COPA | Binary classification | Natural | 100 | 2000 |
| Causal discovery | PCD | COPA | Choice selection | Natural | 100 | 1000 |
| Causal discovery | ECI | CTB | Binary classification | Natural | 100 | 596 |
| Causal discovery | ECI | ESC | Binary classification | Natural | 100 | 1000 |
| Causal discovery | ECI | MAVEN-ERE | Binary classification | Natural | 100 | 1000 |
| Causal discovery | AR | CaLM-AR | Binary classification | Symbolic | 100 | 1600 |
| Causal discovery | CA | FP | Binary classification | Symbolic | 100 | 1600 |
| Causal discovery | CA | FA | Binary classification | Symbolic | 100 | 1600 |
| Association | CORR | correlation | Binary classification | Natural | 100 | 1476 |
| Association | EAE | exp-away | Binary classification | Natural | 100 | 168 |
| Intervention | CB | collider-bias | Binary classification | Natural | 100 | 163 |
| Intervention | ATE | ATE-natural | Binary classification | Natural | 100 | 1600 |
| Intervention | ATE | ATE-basic | Probability calculation | Mathematical | 100 | 1600 |
| Intervention | ATE | ATE-hard | Probability calculation | Mathematical | 100 | 1600 |
| Intervention | CDE | CDE-natural | Binary classification | Natural | 100 | 1600 |
| Intervention | CDE | CDE-basic | Probability calculation | Mathematical | 100 | 1600 |
| Intervention | CDE | CDE-hard | Probability calculation | Mathematical | 100 | 1600 |
| Intervention | BAS | backadj | Binary classification | Natural | 100 | 227 |
| Intervention | BAS | max-BAS | Choice selection | Symbolic | 100 | 1600 |
| Intervention | BAS | min-BAS | Choice selection | Symbolic | 100 | 1600 |
| Intervention | BAS | mix-BAS | Choice selection | Symbolic | 100 | 1600 |
| Intervention | FAS | FAS | Choice selection | Symbolic | 100 | 1600 |
| Intervention | IV | CaLM-IV | Choice selection | Symbolic | 100 | 1600 |
| Intervention | CEI | 0.2-UC | Binary classification | Symbolic | 100 | 1600 |
| Intervention | CEI | 0.4-UC | Binary classification | Symbolic | 100 | 1600 |
| Intervention | CEI | 0.6-UC | Binary classification | Symbolic | 100 | 1600 |
| Intervention | CEI | 0.8-UC | Binary classification | Symbolic | 100 | 1600 |
| Counterfactuals | ETT | ETT-natural | Binary classification | Natural | 100 | 1600 |
| Counterfactuals | ETT | ETT-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | ETT | ETT-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | NDE | NDE-natural | Binary classification | Natural | 100 | 1600 |
| Counterfactuals | NDE | NDE-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | NDE | NDE-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | NIE | NIE-natural | Binary classification | Natural | 100 | 1600 |
| Counterfactuals | NIE | NIE-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | NIE | NIE-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | PN | PN-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | PN | PN-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | PS | PS-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | PS | PS-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | AC | causal judgement | Binary classification | Natural | 100 | 187 |
| Counterfactuals | CR | CRASS | Choice selection | Natural | 100 | 274 |
| Counterfactuals | CR | det-counterfactual | Binary classification | Natural | 100 | 1476 |
| Counterfactuals | CEG | E-CARE | Open-ended generation | Natural | 100 | 1000 |
| **Total** | | | | | 4600 | 63167 |
## Available Prompt Styles (Adaptation)
Basic Prompt is our default setting for efficient evaluation of CaLM Lite, but we provide flexibility for exploring additional prompts through CaLM. If you'd like to explore and compare a wider range of prompts, we encourage you to use CaLM. We provide a comprehensive and easy-to-follow guide to assist you in our [repository](https://github.com/OpenCausaLab/CaLM).
## Citation
```
@misc{chen2024causal,
title={Causal Evaluation of Language Models},
author={Sirui Chen and Bo Peng and Meiqi Chen and Ruiqi Wang and Mengying Xu and Xingyu Zeng and Rui Zhao and Shengjie Zhao and Yu Qiao and Chaochao Lu},
year={2024},
eprint={2405.00622},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```

View File

@ -0,0 +1,160 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CaLMDataset, CaLMEvaluator
task_hierarchy_dict = {
# association/
# correlation/
'CORR-B_correlation_CN':'association/correlation/',
'CORR-B_correlation_EN':'association/correlation/',
# explaining_away_effect/
'EAE-B_exp-away_CN':'association/explaining_away_effect/',
'EAE-B_exp-away_EN':'association/explaining_away_effect/',
# causal_discovery/
# abstract_reasoning/
'AR-B_CaLM-AR_CN':'causal_discovery/abstract_reasoning/',
'AR-B_CaLM-AR_EN':'causal_discovery/abstract_reasoning/',
# causal_attribution/
'CA-B_FA_CN':'causal_discovery/causal_attribution/',
'CA-B_FA_EN':'causal_discovery/causal_attribution/',
'CA-B_FP_CN':'causal_discovery/causal_attribution/',
'CA-B_FP_EN':'causal_discovery/causal_attribution/',
# event_causality_identification/
'ECI-B_CTB_CN':'causal_discovery/event_causality_identification/',
'ECI-B_CTB_EN':'causal_discovery/event_causality_identification/',
'ECI-B_ESC_CN':'causal_discovery/event_causality_identification/',
'ECI-B_ESC_EN':'causal_discovery/event_causality_identification/',
'ECI-B_MAVEN-ERE_CN':'causal_discovery/event_causality_identification/',
'ECI-B_MAVEN-ERE_EN':'causal_discovery/event_causality_identification/',
# pairwise_causal_discovery/
'PCD-B_COPA_CN':'causal_discovery/pairwise_causal_discovery/',
'PCD-B_COPA_EN':'causal_discovery/pairwise_causal_discovery/',
'PCD-B_E-CARE_CN':'causal_discovery/pairwise_causal_discovery/',
'PCD-B_E-CARE_EN':'causal_discovery/pairwise_causal_discovery/',
'PCD-C_COPA_CN':'causal_discovery/pairwise_causal_discovery/',
'PCD-C_COPA_EN':'causal_discovery/pairwise_causal_discovery/',
'PCD-C_E-CARE_CN':'causal_discovery/pairwise_causal_discovery/',
'PCD-C_E-CARE_EN':'causal_discovery/pairwise_causal_discovery/',
# counterfactual/
# actual_causality/
'AC-B_causal_judgement_CN':'counterfactual/actual_causality/',
'AC-B_causal_judgement_EN':'counterfactual/actual_causality/',
# causal_explanation_generation/
'CEG-O_E-CARE_CN':'counterfactual/causal_explanation_generation/',
'CEG-O_E-CARE_EN':'counterfactual/causal_explanation_generation/',
# counterfactual_reasoning/
'CR-B_det-counterfactual_CN':'counterfactual/counterfactual_reasoning/',
'CR-B_det-counterfactual_EN':'counterfactual/counterfactual_reasoning/',
'CR-C_CRASS_CN':'counterfactual/counterfactual_reasoning/',
'CR-C_CRASS_EN':'counterfactual/counterfactual_reasoning/',
# effect_of_the_treatment_on_the_treated/
'ETT-B_ETT-natural_CN':'counterfactual/effect_of_the_treatment_on_the_treated/',
'ETT-B_ETT-natural_EN':'counterfactual/effect_of_the_treatment_on_the_treated/',
'ETT-P_ETT-basic_CN':'counterfactual/effect_of_the_treatment_on_the_treated/',
'ETT-P_ETT-basic_EN':'counterfactual/effect_of_the_treatment_on_the_treated/',
'ETT-P_ETT-hard_CN':'counterfactual/effect_of_the_treatment_on_the_treated/',
'ETT-P_ETT-hard_EN':'counterfactual/effect_of_the_treatment_on_the_treated/',
# natural_direct_effect/
'NDE-B_NDE-natural_CN':'counterfactual/natural_direct_effect/',
'NDE-B_NDE-natural_EN':'counterfactual/natural_direct_effect/',
'NDE-P_NDE-basic_CN':'counterfactual/natural_direct_effect/',
'NDE-P_NDE-basic_EN':'counterfactual/natural_direct_effect/',
'NDE-P_NDE-hard_CN':'counterfactual/natural_direct_effect/',
'NDE-P_NDE-hard_EN':'counterfactual/natural_direct_effect/',
# natural_indirect_effect/
'NIE-B_NIE-natural_CN':'counterfactual/natural_indirect_effect/',
'NIE-B_NIE-natural_EN':'counterfactual/natural_indirect_effect/',
'NIE-P_NIE-basic_CN':'counterfactual/natural_indirect_effect/',
'NIE-P_NIE-basic_EN':'counterfactual/natural_indirect_effect/',
'NIE-P_NIE-hard_CN':'counterfactual/natural_indirect_effect/',
'NIE-P_NIE-hard_EN':'counterfactual/natural_indirect_effect/',
# probability_of_necessity/
'PN-P_PN-basic_CN':'counterfactual/probability_of_necessity/',
'PN-P_PN-basic_EN':'counterfactual/probability_of_necessity/',
'PN-P_PN-hard_CN':'counterfactual/probability_of_necessity/',
'PN-P_PN-hard_EN':'counterfactual/probability_of_necessity/',
# probability_of_sufficiency/
'PS-P_PS-basic_CN':'counterfactual/probability_of_sufficiency/',
'PS-P_PS-basic_EN':'counterfactual/probability_of_sufficiency/',
'PS-P_PS-hard_CN':'counterfactual/probability_of_sufficiency/',
'PS-P_PS-hard_EN':'counterfactual/probability_of_sufficiency/',
# intervention/
# average_treatment_effect/
'ATE-B_ATE-natural_CN':'intervention/average_treatment_effect/',
'ATE-B_ATE-natural_EN':'intervention/average_treatment_effect/',
'ATE-P_ATE-basic_CN':'intervention/average_treatment_effect/',
'ATE-P_ATE-basic_EN':'intervention/average_treatment_effect/',
'ATE-P_ATE-hard_CN':'intervention/average_treatment_effect/',
'ATE-P_ATE-hard_EN':'intervention/average_treatment_effect/',
# backdoor_adjustment_set/
'BAS-B_backadj_CN':'intervention/backdoor_adjustment_set/',
'BAS-B_backadj_EN':'intervention/backdoor_adjustment_set/',
'BAS-C_max-BAS_CN':'intervention/backdoor_adjustment_set/',
'BAS-C_max-BAS_EN':'intervention/backdoor_adjustment_set/',
'BAS-C_min-BAS_CN':'intervention/backdoor_adjustment_set/',
'BAS-C_min-BAS_EN':'intervention/backdoor_adjustment_set/',
'BAS-C_mix-BAS_CN':'intervention/backdoor_adjustment_set/',
'BAS-C_mix-BAS_EN':'intervention/backdoor_adjustment_set/',
# causal_effect_identification/
'CEI-B_0.2-UC_CN':'intervention/causal_effect_identification/',
'CEI-B_0.2-UC_EN':'intervention/causal_effect_identification/',
'CEI-B_0.4-UC_CN':'intervention/causal_effect_identification/',
'CEI-B_0.4-UC_EN':'intervention/causal_effect_identification/',
'CEI-B_0.6-UC_CN':'intervention/causal_effect_identification/',
'CEI-B_0.6-UC_EN':'intervention/causal_effect_identification/',
'CEI-B_0.8-UC_CN':'intervention/causal_effect_identification/',
'CEI-B_0.8-UC_EN':'intervention/causal_effect_identification/',
# collider_bias/
'CB-B_collider-bias_CN':'intervention/collider_bias/',
'CB-B_collider-bias_EN':'intervention/collider_bias/',
# controlled_direct_effect/
'CDE-B_CDE-natural_CN':'intervention/controlled_direct_effect/',
'CDE-B_CDE-natural_EN':'intervention/controlled_direct_effect/',
'CDE-P_CDE-basic_CN':'intervention/controlled_direct_effect/',
'CDE-P_CDE-basic_EN':'intervention/controlled_direct_effect/',
'CDE-P_CDE-hard_CN':'intervention/controlled_direct_effect/',
'CDE-P_CDE-hard_EN':'intervention/controlled_direct_effect/',
# frontdoor_adjustment_set/
'FAS-C_FAS_CN':'intervention/frontdoor_adjustment_set/',
'FAS-C_FAS_EN':'intervention/frontdoor_adjustment_set/',
# instrumental_variable/
'IV-C_CaLM-IV_CN':'intervention/instrumental_variable/',
'IV-C_CaLM-IV_EN':'intervention/instrumental_variable/',}
calm_reader_cfg = dict(
input_columns=['question'],
output_column='gt_item')
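# drop the trailing '_CN'/'_EN' suffix so each task contributes one base name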
calm_all_sets = list(set(key[:-3] for key in task_hierarchy_dict.keys()))
calm_datasets = []
for _name in calm_all_sets:
for _prompt_style in ['basic','basic-CN']:
_task_name = _name + ('_CN' if _prompt_style.endswith('-CN') else '_EN')
        _path = f'./data/calm/{task_hierarchy_dict[_task_name]}{_task_name}.json'
calm_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='{question}'),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=500))
calm_eval_cfg = dict(evaluator=dict(
type=CaLMEvaluator,
core_metrics=True,
error_analysis=True,
prompt_style=_prompt_style,
task=_task_name))
calm_datasets.append(
dict(
abbr=f'calm_{_task_name}',
type=CaLMDataset,
path=_path,
prompt_style=_prompt_style,
reader_cfg=calm_reader_cfg,
infer_cfg=calm_infer_cfg,
eval_cfg=calm_eval_cfg)
)
del _prompt_style, _task_name, _path, _name

View File

@ -0,0 +1,108 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import CEvalDataset
ceval_subject_mapping = {
'computer_network': ['Computer Network', '计算机网络', 'STEM'],
'operating_system': ['Operating System', '操作系统', 'STEM'],
'computer_architecture': ['Computer Architecture', '计算机组成', 'STEM'],
'college_programming': ['College Programming', '大学编程', 'STEM'],
'college_physics': ['College Physics', '大学物理', 'STEM'],
'college_chemistry': ['College Chemistry', '大学化学', 'STEM'],
'advanced_mathematics': ['Advanced Mathematics', '高等数学', 'STEM'],
'probability_and_statistics': ['Probability and Statistics', '概率统计', 'STEM'],
'discrete_mathematics': ['Discrete Mathematics', '离散数学', 'STEM'],
'electrical_engineer': ['Electrical Engineer', '注册电气工程师', 'STEM'],
'metrology_engineer': ['Metrology Engineer', '注册计量师', 'STEM'],
'high_school_mathematics': ['High School Mathematics', '高中数学', 'STEM'],
'high_school_physics': ['High School Physics', '高中物理', 'STEM'],
'high_school_chemistry': ['High School Chemistry', '高中化学', 'STEM'],
'high_school_biology': ['High School Biology', '高中生物', 'STEM'],
'middle_school_mathematics': ['Middle School Mathematics', '初中数学', 'STEM'],
'middle_school_biology': ['Middle School Biology', '初中生物', 'STEM'],
'middle_school_physics': ['Middle School Physics', '初中物理', 'STEM'],
'middle_school_chemistry': ['Middle School Chemistry', '初中化学', 'STEM'],
'veterinary_medicine': ['Veterinary Medicine', '兽医学', 'STEM'],
'college_economics': ['College Economics', '大学经济学', 'Social Science'],
'business_administration': ['Business Administration', '工商管理', 'Social Science'],
'marxism': ['Marxism', '马克思主义基本原理', 'Social Science'],
'mao_zedong_thought': ['Mao Zedong Thought', '毛泽东思想和中国特色社会主义理论体系概论', 'Social Science'],
'education_science': ['Education Science', '教育学', 'Social Science'],
'teacher_qualification': ['Teacher Qualification', '教师资格', 'Social Science'],
'high_school_politics': ['High School Politics', '高中政治', 'Social Science'],
'high_school_geography': ['High School Geography', '高中地理', 'Social Science'],
'middle_school_politics': ['Middle School Politics', '初中政治', 'Social Science'],
'middle_school_geography': ['Middle School Geography', '初中地理', 'Social Science'],
'modern_chinese_history': ['Modern Chinese History', '近代史纲要', 'Humanities'],
'ideological_and_moral_cultivation': ['Ideological and Moral Cultivation', '思想道德修养与法律基础', 'Humanities'],
'logic': ['Logic', '逻辑学', 'Humanities'],
'law': ['Law', '法学', 'Humanities'],
'chinese_language_and_literature': ['Chinese Language and Literature', '中国语言文学', 'Humanities'],
'art_studies': ['Art Studies', '艺术学', 'Humanities'],
'professional_tour_guide': ['Professional Tour Guide', '导游资格', 'Humanities'],
'legal_professional': ['Legal Professional', '法律职业资格', 'Humanities'],
'high_school_chinese': ['High School Chinese', '高中语文', 'Humanities'],
'high_school_history': ['High School History', '高中历史', 'Humanities'],
'middle_school_history': ['Middle School History', '初中历史', 'Humanities'],
'civil_servant': ['Civil Servant', '公务员', 'Other'],
'sports_science': ['Sports Science', '体育学', 'Other'],
'plant_protection': ['Plant Protection', '植物保护', 'Other'],
'basic_medicine': ['Basic Medicine', '基础医学', 'Other'],
'clinical_medicine': ['Clinical Medicine', '临床医学', 'Other'],
'urban_and_rural_planner': ['Urban and Rural Planner', '注册城乡规划师', 'Other'],
'accountant': ['Accountant', '注册会计师', 'Other'],
'fire_engineer': ['Fire Engineer', '注册消防工程师', 'Other'],
'environmental_impact_assessment_engineer': ['Environmental Impact Assessment Engineer', '环境影响评价工程师', 'Other'],
'tax_accountant': ['Tax Accountant', '税务师', 'Other'],
'physician': ['Physician', '医师资格', 'Other'],
}
ceval_all_sets = list(ceval_subject_mapping.keys())
ceval_datasets = []
for _split in ['val', 'test']:
for _name in ceval_all_sets:
_ch_name = ceval_subject_mapping[_name][1]
ceval_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template={
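# One prompt per candidate option; PPLInferencer scores each continuation and picks the lowest-perplexity one.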
answer: dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt=
f'以下是中国关于{_ch_name}考试的单项选择题,请选出其中的正确答案。\n{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案: '
),
dict(role='BOT', prompt=answer),
])
for answer in ['A', 'B', 'C', 'D']
},
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=PPLInferencer),
)
ceval_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
ceval_datasets.append(
dict(
type=CEvalDataset,
path='./data/ceval_internal/formal_ceval',
local_mode=True,
name=_name,
abbr='ceval-' + _name if _split == 'val' else 'ceval-test-' +
_name,
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split=_split),
infer_cfg=ceval_infer_cfg,
eval_cfg=ceval_eval_cfg,
))
del _split, _name, _ch_name

View File

@ -0,0 +1,130 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
cmmlu_subject_mapping = {
'agronomy': '农学',
'anatomy': '解剖学',
'ancient_chinese': '古汉语',
'arts': '艺术学',
'astronomy': '天文学',
'business_ethics': '商业伦理',
'chinese_civil_service_exam': '中国公务员考试',
'chinese_driving_rule': '中国驾驶规则',
'chinese_food_culture': '中国饮食文化',
'chinese_foreign_policy': '中国外交政策',
'chinese_history': '中国历史',
'chinese_literature': '中国文学',
'chinese_teacher_qualification': '中国教师资格',
'clinical_knowledge': '临床知识',
'college_actuarial_science': '大学精算学',
'college_education': '大学教育学',
'college_engineering_hydrology': '大学工程水文学',
'college_law': '大学法律',
'college_mathematics': '大学数学',
'college_medical_statistics': '大学医学统计',
'college_medicine': '大学医学',
'computer_science': '计算机科学',
'computer_security': '计算机安全',
'conceptual_physics': '概念物理学',
'construction_project_management': '建设工程管理',
'economics': '经济学',
'education': '教育学',
'electrical_engineering': '电气工程',
'elementary_chinese': '小学语文',
'elementary_commonsense': '小学常识',
'elementary_information_and_technology': '小学信息技术',
'elementary_mathematics': '初等数学',
'ethnology': '民族学',
'food_science': '食品科学',
'genetics': '遗传学',
'global_facts': '全球事实',
'high_school_biology': '高中生物',
'high_school_chemistry': '高中化学',
'high_school_geography': '高中地理',
'high_school_mathematics': '高中数学',
'high_school_physics': '高中物理学',
'high_school_politics': '高中政治',
'human_sexuality': '人类性行为',
'international_law': '国际法学',
'journalism': '新闻学',
'jurisprudence': '法理学',
'legal_and_moral_basis': '法律与道德基础',
'logical': '逻辑学',
'machine_learning': '机器学习',
'management': '管理学',
'marketing': '市场营销',
'marxist_theory': '马克思主义理论',
'modern_chinese': '现代汉语',
'nutrition': '营养学',
'philosophy': '哲学',
'professional_accounting': '专业会计',
'professional_law': '专业法学',
'professional_medicine': '专业医学',
'professional_psychology': '专业心理学',
'public_relations': '公共关系',
'security_study': '安全研究',
'sociology': '社会学',
'sports_science': '体育学',
'traditional_chinese_medicine': '中医中药',
'virology': '病毒学',
'world_history': '世界历史',
'world_religions': '世界宗教'
}
QUERY_TEMPLATE = """
你回答的最后一行**必须**是以下格式 '答案: $选项' (不带引号), 其中选项是ABCD之一. 请在回答之前一步步思考.
{question}
A) {A}
B) {B}
C) {C}
D) {D}
""".strip()
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
cmmlu_datasets = []
for _name in cmmlu_all_sets:
_ch_name = cmmlu_subject_mapping[_name]
prompt_prefix = f'请回答以下关于{_ch_name}的单项选择题, '
cmmlu_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=prompt_prefix+QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
cmmlu_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(
type=match_answer_pattern,
# answer_pattern=r'(?i)答案\s*:\s*([A-D])'
answer_pattern=r'(?i)答案\s*:\s*[\W]*([A-D])[\W]*',
)
)
cmmlu_datasets.append(
dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
abbr=f'cmmlu-{_name}',
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test'),
infer_cfg=cmmlu_infer_cfg,
eval_cfg=cmmlu_eval_cfg,
))
del _name, _ch_name

View File

@ -0,0 +1,115 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import commonsenseqaDataset
from opencompass.utils.text_postprocessors import (
match_answer_pattern,
)
commonsenseqa_reader_cfg = dict(
input_columns=['question', 'A', 'B', 'C', 'D', 'E'],
output_column='answerKey',
test_split='validation',
)
_ice_template = dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt='Q: What do people use to absorb extra ink from a fountain pen? Answer Choices: A.shirt pocket B.calligraphers hand C.inkwell D.desk drawer E.blotter',
),
dict(
role='BOT',
prompt='A: The answer must be an item that can absorb ink. Of the above choices, only blotters are used to absorb ink. So the answer is E.',
),
dict(
role='HUMAN',
prompt='Q: What home entertainment equipment requires cable? Answer Choices: A.radio shack B.substation C.television D.cabinet',
),
dict(
role='BOT',
prompt='A: The answer must require cable. Of the above choices, only television requires cable. So the answer is C.',
),
dict(
role='HUMAN',
prompt='Q: The fox walked from the city into the forest, what was it looking for? Answer Choices: A.pretty flowers B.hen house C.natural habitat D.storybook',
),
dict(
role='BOT',
prompt='A: The answer must be something in the forest. Of the above choices, only natural habitat is in the forest. So the answer is C.',
),
dict(
role='HUMAN',
prompt='Q: Sammy wanted to go to where the people were. Where might he go? Answer Choices: A.populated areas B.race track C.desert D.apartment E.roadblock',
),
dict(
role='BOT',
prompt='A: The answer must be a place with a lot of people. Of the above choices, only populated areas have a lot of people. So the answer is A.',
),
dict(
role='HUMAN',
prompt='Q: Where do you put your grapes just before checking out? Answer Choices: A.mouth B.grocery cart C.super market D.fruit basket E.fruit market',
),
dict(
role='BOT',
prompt='A: The answer should be the place where grocery items are placed before checking out. Of the above choices, grocery cart makes the most sense for holding grocery items. So the answer is B.',
),
dict(
role='HUMAN',
prompt='Q: Google Maps and other highway and street GPS services have replaced what? Answer Choices: A.united states B.mexico C.countryside D.atlas',
),
dict(
role='BOT',
prompt='A: The answer must be something that used to do what Google Maps and GPS services do, which is to give directions. Of the above choices, only atlases are used to give directions. So the answer is D.',
),
dict(
role='HUMAN',
prompt='Q: Before getting a divorce, what did the wife feel who was doing all the work? Answer Choices: A.harder B.anguish C.bitterness D.tears E.sadness',
),
dict(
role='BOT',
prompt='A: The answer should be the feeling of someone getting divorced who was doing all the work. Of the above choices, the closest feeling is bitterness. So the answer is C.',
),
dict(
role='HUMAN',
prompt='Q:{question} Answer Choices: A. {A}\nB. {B}\nC. {C}\nD. {D}\nE. {E}\nA:',
),
dict(
role='BOT',
prompt='{answerKey}',
),
],
),
ice_token='</E>',
)
commonsenseqa_infer_cfg = dict(
ice_template=_ice_template,
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
commonsenseqa_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(
type=match_answer_pattern, answer_pattern=r'(?i)so the answer is\s*([A-P])'
),
)
commonsenseqa_datasets = [
dict(
abbr='commonsense_qa',
type=commonsenseqaDataset,
path='opencompass/commonsense_qa',
reader_cfg=commonsenseqa_reader_cfg,
infer_cfg=commonsenseqa_infer_cfg,
eval_cfg=commonsenseqa_eval_cfg,
)
]
del _ice_template

View File

@ -0,0 +1,181 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, HumanEvalPlusEvaluator, humaneval_postprocess_v2
from opencompass.datasets import LCDataset, LCPassKEvaluator
from opencompass.datasets import TACODataset, TACOEvaluator
compassbench_v1_3_code_datasets = []
# --------------------------------------------------------------- HumanEval CN ---------------------------------------------------------------
humaneval_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test')
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='完成以下Python代码任务:\n{prompt}'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
compassbench_v1_3_code_datasets.append(
dict(
abbr='compass_bench_cdoe_completion_zh',
type=HumanevalDataset,
path='./data/compassbench_v1_3/coding/compass_bench_cdoe_completion/compass_bench_cdoe_completion_zh.jsonl',
# local_mode=True,
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg,
)
)
# --------------------------------------------------------------- HumanEval EN ---------------------------------------------------------------
humaneval_plus_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='Complete the following python code:\n{prompt}'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
humaneval_plus_eval_cfg = dict(
evaluator=dict(type=HumanEvalPlusEvaluator),
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
compassbench_v1_3_code_datasets.append(
dict(
abbr='compass_bench_cdoe_completion_en',
type=HumanevalDataset,
path='./data/compassbench_v1_3/coding/compass_bench_cdoe_completion/compass_bench_cdoe_completion_en.jsonl',
# local_mode=True,
reader_cfg=humaneval_plus_reader_cfg,
infer_cfg=humaneval_plus_infer_cfg,
eval_cfg=humaneval_plus_eval_cfg,
)
)
# ------------------------------------- Code Interview (LCBench) --------------------------------------
LC_difficulties_list = ['EASY', 'MEDIUM', 'HARD']
LC_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column')
LC_en_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: You are given three positive integers n, x, and y.\nIn a city, there exist houses numbered 1 to n connected by n streets. There is a street connecting the house numbered i with the house numbered i + 1 for all 1 <= i <= n - 1 . An additional street connects the house numbered x with the house numbered y.\nFor each k, such that 1 <= k <= n, you need to find the number of pairs of houses (house1, house2) such that the minimum number of streets that need to be traveled to reach house2 from house1 is k.\nReturn a 1-indexed array result of length n where result[k] represents the total number of pairs of houses such that the minimum streets required to reach one house from the other is k.\nNote that x and y can be equal. Your code should pass these tests:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'),
dict(role='BOT', prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: You are given a string word containing lowercase English letters.\nTelephone keypads have keys mapped with distinct collections of lowercase English letters, which can be used to form words by pushing them. For example, the key 2 is mapped with ["a","b","c"], we need to push the key one time to type "a", two times to type "b", and three times to type "c" .\nIt is allowed to remap the keys numbered 2 to 9 to distinct collections of letters. The keys can be remapped to any amount of letters, but each letter must be mapped to exactly one key. You need to find the minimum number of times the keys will be pushed to type the string word.\nReturn the minimum number of pushes needed to type word after remapping the keys.\nAn example mapping of letters to keys on a telephone keypad is given below. Note that 1, *, #, and 0 do not map to any letters. Your code should pass these tests:\n\n assert minimumPushes("abcde") == 5 \n assert minimumPushes("xyzxyzxyzxyz") == 12 \n assert minimumPushes("aabbccddeeffgghhiiiiii") == 24 \n'),
dict(role='BOT', prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "),
dict(role='HUMAN', prompt="You are an expert Python programmer, and here is your task: You are given an integer k and an integer x.\nConsider s is the 1-indexed binary representation of an integer num. The price of a number num is the number of i's such that i % x == 0 and s[i] is a set bit.\nReturn the greatest integer num such that the sum of prices of all numbers from 1 to num is less than or equal to k.\nNote:\nIn the binary representation of a number set bit is a bit of value 1.\nThe binary representation of a number will be indexed from right to left. For example, if s == 11100, s[4] == 1 and s[2] == 0. Your code should pass these tests:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n"),
dict(role='BOT', prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'),
dict(role='BOT', prompt='[BEGIN]\n'),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
LC_cn_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你三个 正整数 n 、x 和 y 。\n在城市中,存在编号从 1 到 n 的房屋,由 n 条街道相连。对所有 1 <= i < n ,都存在一条街道连接编号为 i 的房屋与编号为 i + 1 的房屋。另存在一条街道连接编号为 x 的房屋与编号为 y 的房屋。\n对于每个 k1 <= k <= n你需要找出所有满足要求的 房屋对 [house1, house2] ,即从 house1 到 house2 需要经过的 最少 街道数为 k 。\n返回一个下标从 1 开始且长度为 n 的数组 result ,其中 result[k] 表示所有满足要求的房屋对的数量,即从一个房屋到另一个房屋需要经过的 最少 街道数为 k 。\n注意x 与 y 可以 相等。你的代码需要通过以下测试:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'),
dict(role='BOT', prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个字符串 word由 不同 小写英文字母组成。\n电话键盘上的按键与 不同 小写英文字母集合相映射,可以通过按压按键来组成单词。例如,按键 2 对应 ["a","b","c"],我们需要按一次键来输入 "a",按两次键来输入 "b",按三次键来输入 "c"\n现在允许你将编号为 2 到 9 的按键重新映射到 不同 字母集合。每个按键可以映射到 任意数量 的字母,但每个字母 必须 恰好 映射到 一个 按键上。你需要找到输入字符串 word 所需的 最少 按键次数。\n返回重新映射按键后输入 word 所需的 最少 按键次数。\n下面给出了一种电话键盘上字母到按键的映射作为示例。注意 1*# 和 0 不 对应任何字母。你的代码需要通过以下测试:\n\n assert minimumPushes("abcde") == 5 \n assert minimumPushes("xyzxyzxyzxyz") == 12 \n assert minimumPushes("aabbccddeeffgghhiiiiii") == 24 \n'),
dict(role='BOT', prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个整数 k 和一个整数 x 。\n令 s 为整数 num 的下标从 1 开始的二进制表示。我们说一个整数 num 的 价值 是满足 i % x == 0 且 s[i] 是 设置位 的 i 的数目。\n请你返回 最大 整数 num ,满足从 1 到 num 的所有整数的 价值 和小于等于 k 。\n注意:\n一个整数二进制表示下 设置位 是值为 1 的数位。\n一个整数的二进制表示下标从右到左编号,比方说如果 s == 11100 ,那么 s[4] == 1 且 s[2] == 0。你的代码需要通过以下测试:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n'),
dict(role='BOT', prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "),
dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: {text} 你的代码需要通过以下测试:\n\n {test_list} \n'),
dict(role='BOT', prompt='[BEGIN]\n'),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
LC_eval_cfg = dict(evaluator=dict(type=LCPassKEvaluator), pred_role='BOT')
for difficulty in LC_difficulties_list:
compassbench_v1_3_code_datasets.append(
dict(
type=LCDataset,
abbr='compass_bench_code_interview_en-' + difficulty,
path='./data/compassbench_v1_3/coding/compass_bench_code_interview/compass_bench_code_interview_en.jsonl',
difficulty=difficulty,
reader_cfg=LC_reader_cfg,
infer_cfg=LC_en_infer_cfg,
eval_cfg=LC_eval_cfg,
)
)
compassbench_v1_3_code_datasets.append(
dict(
type=LCDataset,
abbr='compass_bench_code_interview_zh-' + difficulty,
path='./data/compassbench_v1_3/coding/compass_bench_code_interview/compass_bench_code_interview_zh.jsonl',
difficulty=difficulty,
reader_cfg=LC_reader_cfg,
infer_cfg=LC_cn_infer_cfg,
eval_cfg=LC_eval_cfg,
)
)
# -------------------------------------------- Code Competition (TACO) ---------------------------------------------------------------
TACO_difficulties_list = ['EASY', 'MEDIUM', 'MEDIUM_HARD', 'HARD', 'VERY_HARD']
TACO_reader_cfg = dict(input_columns=['question', 'starter'], output_column='problem_id', train_split='test')
TACO_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. \nQUESTION:\n{question} {starter}\nANSWER:\n'),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024),
)
TACO_eval_cfg = dict(evaluator=dict(type=TACOEvaluator), pred_role='BOT')
for difficulty in TACO_difficulties_list:
compassbench_v1_3_code_datasets.append(
dict(
type=TACODataset,
abbr='TACO-' + difficulty,
path='./data/compassbench_v1_3/coding/compass_bench_code_competition',
difficulty=difficulty,
reader_cfg=TACO_reader_cfg,
infer_cfg=TACO_infer_cfg,
eval_cfg=TACO_eval_cfg,
)
)

View File

@ -0,0 +1,94 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets.compassbench_obj import (
CompassBenchObjectiveV1_3,
compassbench_objective_v1_3_postprocess,
)
from opencompass.utils.text_postprocessors import first_option_postprocess
prompt_cn = {
'single_choice_cn': '以下是一道单项选择题请你根据你了解的知识给出正确的答案选项。请你一步步推理并在最后用“答案选项为X”来回答其中X是ABCD中你认为正确的选项序号\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
'cloze_cn': '以下是一道填空题,请你根据你了解的知识一步步思考后把你的最终答案放到\\boxed{}中。\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
}
prompt_en = {
'single_choice_en': "Here is a single-choice question. Please give the correct answer based on your knowledge. Please reason step by step and answer with 'The answer is X' at the end, where X is the option letter you think is correct.\nHere is the question you need to answer:\n{question}\nLet's solve this problem step by step:",
'cloze_en': "Here is a fill-in-the-blank question. Please think step by step based on your knowledge and put your final answer in \\boxed{}. Here is the question you need to answer:\n{question}\nLet's solve this problem step by step:",
}
douknow_sets = {
'wiki_en_sub_500_人文科学':['single_choice_en'],
'wiki_en_sub_500_社会科学':['single_choice_en'],
'wiki_en_sub_500_生活常识':['single_choice_en'],
'wiki_en_sub_500_自然科学-工科':['single_choice_en'],
'wiki_en_sub_500_自然科学-理科':['single_choice_en'],
'wiki_zh_sub_500_人文科学': ['single_choice_cn'],
'wiki_zh_sub_500_社会科学': ['single_choice_cn'],
'wiki_zh_sub_500_生活常识': ['single_choice_cn'],
'wiki_zh_sub_500_自然科学-工科':['single_choice_cn'],
'wiki_zh_sub_500_自然科学-理科':['single_choice_cn'],
}
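# Keys are JSONL file stems under data_path; each value lists the prompt styles applied to that split.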
data_path = './data/compassbench_v1_3/knowledge'
# Set up the prompts
CircularEval = True
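# CircularEvaluator evaluates each single-choice question under rotated answer options to reduce position bias.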
compassbench_knowledge_datasets = []
for _split in list(douknow_sets.keys()):
for _name in douknow_sets[_split]:
if 'cn' in _name:
single_choice_prompts = prompt_cn
cloze_prompts = prompt_cn
else:
single_choice_prompts = prompt_en
cloze_prompts = prompt_en
if 'single_choice' in _name:
template_round = [dict(role='HUMAN', prompt=single_choice_prompts[_name])]
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
evaluator = dict(type=CircularEvaluator if CircularEval else AccEvaluator)
dataset_name = _name + '_circular' if CircularEval else _name
dataset_abbr = (
'compassbench-' + _split + '_circular'
if CircularEval
else 'compassbench-' + _split
)
else:
template_round = [dict(role='HUMAN', prompt=cloze_prompts[_name])]
pred_postprocessor = dict(
type=compassbench_objective_v1_3_postprocess, name=_name
)
evaluator = dict(type=AccEvaluator)
dataset_name = _name
dataset_abbr = 'compassbench-' + _split
douknow_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate, template=dict(round=template_round)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
douknow_eval_cfg = dict(
evaluator=evaluator,
pred_postprocessor=pred_postprocessor,
)
compassbench_knowledge_datasets.append(
dict(
type=CompassBenchObjectiveV1_3,
path=f'{data_path}/{_split}.jsonl',
name=dataset_name,
abbr=dataset_abbr,
reader_cfg=dict(input_columns=['question'], output_column='answer'),
infer_cfg=douknow_infer_cfg,
eval_cfg=douknow_eval_cfg,
)
)
del _split, _name

View File

@ -0,0 +1,86 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets.compassbench_obj import CompassBenchObjectiveV1_3
from opencompass.datasets import MATHEvaluator, math_postprocess_v2
from opencompass.utils.text_postprocessors import first_option_postprocess
prompt_cn = {
'single_choice_cn': '以下是一道单项选择题请你根据你了解的知识给出正确的答案选项。请你一步步推理并在最后用“答案选项为X”来回答其中X是ABCD中你认为正确的选项序号\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
'cloze_cn': '以下是一道数学计算题,请你一步一步计算,并在最后用\\boxed{}包裹并返回你计算的最终答案。\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
}
prompt_en = {
'single_choice_en': "Here is a single-choice question. Please give the correct answer based on your knowledge. Please reason step by step and answer with 'The answer is X' at the end, where X is the option letter you think is correct.\nHere is the question you need to answer:\n{question}\nLet's solve this problem step by step:",
'cloze_en': 'Here is an arithmetic problem. Please reason step by step, and put your final answer within \\boxed{}. Here is the question you need to answer:\n{question}\nLet\'s solve this problem step by step:',
}
douknow_sets = {
'arithmetic_cloze_en': ['cloze_en'],
'college_single_choice_en': ['single_choice_en'],
'college_single_choice_cn': ['single_choice_cn'],
}
data_path = './data/compassbench_v1_3/math'
# Set up the prompts
CircularEval = True
compassbench_math_datasets = []
for _split in list(douknow_sets.keys()):
for _name in douknow_sets[_split]:
if 'cn' in _name:
single_choice_prompts = prompt_cn
cloze_prompts = prompt_cn
else:
single_choice_prompts = prompt_en
cloze_prompts = prompt_en
if 'single_choice' in _name:
template_round = [dict(role='HUMAN', prompt=single_choice_prompts[_name])]
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
evaluator = dict(type=CircularEvaluator if CircularEval else AccEvaluator)
dataset_name = _name + '_circular' if CircularEval else _name
dataset_abbr = (
'compassbench-' + _split + '_circular'
if CircularEval
else 'compassbench-' + _split
)
else:
template_round = [dict(role='HUMAN', prompt=cloze_prompts[_name])]
pred_postprocessor = dict(
type=math_postprocess_v2,
)
evaluator = dict(type=MATHEvaluator)
dataset_name = _name
dataset_abbr = 'compassbench-' + _split
douknow_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate, template=dict(round=template_round)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
douknow_eval_cfg = dict(
evaluator=evaluator,
pred_postprocessor=pred_postprocessor,
)
compassbench_math_datasets.append(
dict(
type=CompassBenchObjectiveV1_3,
path=f'{data_path}/{_split}.jsonl',
name=dataset_name,
abbr=dataset_abbr,
reader_cfg=dict(input_columns=['question'], output_column='answer'),
infer_cfg=douknow_infer_cfg,
eval_cfg=douknow_eval_cfg,
)
)
del _split, _name

View File

@ -0,0 +1,44 @@
FORCE_STOP_PROMPT_EN = (
"""You should directly give results based on history information."""
)
FEWSHOT_INSTRUCTION = """\
You are an assistant who can utilize external tools.
{tool_description}
To use a tool, please respond with the following format:
```
{thought} Think about what you need to solve and whether you need to use tools.
{action} The tool name; it should be one of [{action_names}].
{action_input} The input to the tool that you want to use.
```
After your response, the tool will reply using the following format:
```
{response} The results after calling the tool.
```
Therefore DO NOT generate the tool's response yourself.
Also please follow the guidelines:
1. Always use the code interpreter to solve the problem.
2. The generated code should always be in a markdown code block.
3. The generated code will be executed in an IPython session and the results will be cached.
4. Your code should always be simple and should only solve the problem in the current step.
For example:
File url: `xxxx`
### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
{thought} We should use `pandas` to solve this step.
{action} IPythonInterpreter
{action_input} ```python
import pandas as pd
url = "xxxx"
df = pd.read_csv(url)
```
{response} The code succeeded without any output.
Let us begin from here!
"""
IPYTHON_INTERPRETER_DESCRIPTION = """\
It can run Python code as in a Jupyter notebook. The code must be valid Python code."""

View File

@ -1,7 +1,7 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GPQASimpleEvalDataset, GPQA_Simple_Eval_postprocess, GPQAEvaluator
from opencompass.datasets import GPQADataset, GPQA_Simple_Eval_postprocess, GPQAEvaluator
# openai_simple_eval prompt
align_prompt = """
@ -43,7 +43,7 @@ for split in list(gpqa_subsets.keys()):
gpqa_datasets.append(
dict(
abbr='GPQA_' + split,
type=GPQASimpleEvalDataset,
type=GPQADataset,
path='./data/gpqa/',
name=gpqa_subsets[split],
reader_cfg=gpqa_reader_cfg,

View File

@ -0,0 +1,64 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUProDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
with read_base():
from .mmlu_pro_categories import categories
QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of the option letters (e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
Question:\n
{question}
Options:\n
{options_str}
""".strip()
mmlu_pro_datasets = []
for category in categories:
mmlu_pro_reader_cfg = dict(
input_columns=['question', 'cot_content', 'options_str'],
output_column='answer',
train_split='validation',
test_split='test',
)
mmlu_pro_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN',
prompt=QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
mmlu_pro_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(
type=match_answer_pattern,
answer_pattern=r'(?i)ANSWER\s*:\s*([A-P])')
)
mmlu_pro_datasets.append(
dict(
abbr=f'mmlu_pro_{category.replace(" ", "_")}',
type=MMLUProDataset,
path='opencompass/mmlu_pro',
category=category,
reader_cfg=mmlu_pro_reader_cfg,
infer_cfg=mmlu_pro_infer_cfg,
eval_cfg=mmlu_pro_eval_cfg,
))

View File

@ -51,6 +51,7 @@ for category in categories:
dict(
abbr=f'mmlu_pro_{category.replace(" ", "_")}',
type=MMLUProDataset,
path='opencompass/mmlu_pro',
category=category,
reader_cfg=mmlu_pro_reader_cfg,
infer_cfg=mmlu_pro_infer_cfg,

View File

@ -0,0 +1,68 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import RaceDataset
from opencompass.utils.text_postprocessors import (
first_option_postprocess,
)
QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
Article: {article}
Q: {question}
A. {A}
B. {B}
C. {C}
D. {D}
""".strip()
race_reader_cfg = dict(
input_columns=['article', 'question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='validation',
test_split='test',
)
race_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
race_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
pred_role='BOT',
)
race_datasets = [
dict(
abbr='race-middle',
type=RaceDataset,
path='opencompass/race',
name='middle',
reader_cfg=race_reader_cfg,
infer_cfg=race_infer_cfg,
eval_cfg=race_eval_cfg,
),
dict(
abbr='race-high',
type=RaceDataset,
path='opencompass/race',
name='high',
reader_cfg=race_reader_cfg,
infer_cfg=race_infer_cfg,
eval_cfg=race_eval_cfg,
),
]

View File

@ -0,0 +1,53 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import RaceDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess
race_reader_cfg = dict(
input_columns=['article', 'question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='validation',
test_split='test'
)
race_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(role='HUMAN', prompt='Read the article, and answer the question by replying A, B, C or D.\n\nArticle:\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer:'),
dict(role='BOT', prompt='{answer}'),
]
),
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4]),
inferencer=dict(type=GenInferencer, max_out_len=50),
)
race_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type=first_capital_postprocess),
pred_role='BOT')
race_datasets = [
dict(
abbr='race-middle',
type=RaceDataset,
path='opencompass/race',
name='middle',
reader_cfg=race_reader_cfg,
infer_cfg=race_infer_cfg,
eval_cfg=race_eval_cfg),
dict(
abbr='race-high',
type=RaceDataset,
path='opencompass/race',
name='high',
reader_cfg=race_reader_cfg,
infer_cfg=race_infer_cfg,
eval_cfg=race_eval_cfg)
]

View File

@ -33,6 +33,7 @@ instruction_generalization_eng_eval_cfg = dict(
instruction_generalization_eng_datasets = [
dict(
abbr='RoleBench_instruct_eng',
type=InstructionGeneralizationEnglishDataset,
path='ZenMoore/RoleBench',
reader_cfg=instruction_generalization_eng_reader_cfg,

View File

@ -33,6 +33,7 @@ instruction_generalization_zh_eval_cfg = dict(
instruction_generalization_zh_datasets = [
dict(
abbr='RoleBench_instruct_zh',
type=InstructionGeneralizationChineseDataset,
path='ZenMoore/RoleBench',
reader_cfg=instruction_generalization_zh_reader_cfg,

View File

@ -33,6 +33,7 @@ role_generalization_eng_eval_cfg = dict(
role_generalization_eng_datasets = [
dict(
abbr='RoleBench_role_eng',
type=RoleGeneralizationEnglishDataset,
path='ZenMoore/RoleBench',
reader_cfg=role_generalization_eng_reader_cfg,

View File

@ -0,0 +1,14 @@
# Ruler
OpenCompass now supports [RULER](https://arxiv.org/pdf/2404.06654), a brand-new benchmark for evaluating long-context language models. RULER covers retrieval, multi-hop tracing, aggregation, and question answering through flexible configurations.
OpenCompass provides two evaluation demos that differ in which tokenizer is used.
To evaluate with a single fixed tokenizer (typically GPT-4's), follow the demo in `configs/eval_ruler_fix_tokenizer.py`, where most of the settings are already defined.
To evaluate with each model's own tokenizer, you have to build the settings yourself (we do not know in advance which model you are evaluating): create a new evaluation script following the example in `configs/eval_ruler.py`, then change the context window sizes or add models according to your setup.
```bash
python run.py configs/eval_ruler_fix_tokenizer.py # For evaluation with GPT-4 tokenizer
python run.py configs/eval_ruler.py # For evaluation with model's tokenizer
```
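For reference, a custom script in the second mode can be as small as the sketch below; the import paths are illustrative and should be adapted to wherever the Ruler dataset and model configs live in your checkout:
```python
from mmengine.config import read_base

with read_base():
    # Illustrative paths: point these at your local Ruler and model configs.
    from .datasets.ruler.ruler_combined_gen import ruler_combined_datasets
    from .models.hf_internlm.lmdeploy_internlm2_7b import models

# OpenCompass picks up the `datasets` and `models` variables from the config.
datasets = ruler_combined_datasets
```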

View File

@ -0,0 +1,28 @@
from mmengine.config import read_base
with read_base():
from .ruler_niah_gen import niah_datasets # Niah
from .ruler_vt_gen import vt_datasets # VT
from .ruler_fwe_gen import fwe_datasets # FWE
from .ruler_cwe_gen import cwe_datasets # CWE
from .ruler_qa_gen import qa_datasets # QA
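# Gather every imported *_datasets list from the local namespace into one flat list.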
import_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
# Change the context lengths to be tested
max_seq_lens = [1024 * 128]
abbr_suffixs = ['128k']
ruler_datasets = []
# Different seq length
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
for dataset in import_datasets:
tmp_dataset = dataset.deepcopy()
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
ruler_datasets.append(tmp_dataset)

View File

@ -0,0 +1,29 @@
from mmengine.config import read_base
with read_base():
from .ruler_niah_gen import niah_datasets # Niah
from .ruler_vt_gen import vt_datasets # VT
from .ruler_fwe_gen import fwe_datasets # FWE
from .ruler_cwe_gen import cwe_datasets # CWE
from .ruler_qa_gen import qa_datasets # QA
import_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
# Change the context lengths to be tested
max_seq_lens = [1024 * 16]
abbr_suffixs = ['16k']
ruler_datasets = []
# Different seq length
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
for dataset in import_datasets:
tmp_dataset = dataset.deepcopy()
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
ruler_datasets.append(tmp_dataset)

View File

@ -0,0 +1,29 @@
from mmengine.config import read_base
with read_base():
from .ruler_niah_gen import niah_datasets # Niah
from .ruler_vt_gen import vt_datasets # VT
from .ruler_fwe_gen import fwe_datasets # FWE
from .ruler_cwe_gen import cwe_datasets # CWE
from .ruler_qa_gen import qa_datasets # QA
import_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
# Change the context lengths to be tested
max_seq_lens = [1024 * 1024]
abbr_suffixs = ['1m']
ruler_datasets = []
# Different seq length
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
for dataset in import_datasets:
tmp_dataset = dataset.deepcopy()
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
ruler_datasets.append(tmp_dataset)

View File

@ -0,0 +1,29 @@
from mmengine.config import read_base
with read_base():
from .ruler_niah_gen import niah_datasets # Niah
from .ruler_vt_gen import vt_datasets # VT
from .ruler_fwe_gen import fwe_datasets # FWE
from .ruler_cwe_gen import cwe_datasets # CWE
from .ruler_qa_gen import qa_datasets # QA
import_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
# Change the context lengths to be tested
max_seq_lens = [1024 * 32]
abbr_suffixs = ['32k']
ruler_datasets = []
# Different seq length
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
for dataset in import_datasets:
tmp_dataset = dataset.deepcopy()
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
ruler_datasets.append(tmp_dataset)

View File

@ -0,0 +1,28 @@
from mmengine.config import read_base
with read_base():
from .ruler_niah_gen import niah_datasets # Niah
from .ruler_vt_gen import vt_datasets # VT
from .ruler_fwe_gen import fwe_datasets # FWE
from .ruler_cwe_gen import cwe_datasets # CWE
from .ruler_qa_gen import qa_datasets # QA
import_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
# Change the context lengths to be tested
max_seq_lens = [1024 * 4]
abbr_suffixs = ['4k']
ruler_datasets = []
# Different seq length
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
for dataset in import_datasets:
tmp_dataset = dataset.deepcopy()
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
ruler_datasets.append(tmp_dataset)

View File

@ -0,0 +1,29 @@
from mmengine.config import read_base
with read_base():
from .ruler_niah_gen import niah_datasets # Niah
from .ruler_vt_gen import vt_datasets # VT
from .ruler_fwe_gen import fwe_datasets # FWE
from .ruler_cwe_gen import cwe_datasets # CWE
from .ruler_qa_gen import qa_datasets # QA
import_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
# Change the context lengths to be tested
max_seq_lens = [1024 * 8]
abbr_suffixs = ['8k']
ruler_datasets = []
# Different seq length
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
for dataset in import_datasets:
tmp_dataset = dataset.deepcopy()
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
ruler_datasets.append(tmp_dataset)

View File

@ -0,0 +1,13 @@
from mmengine.config import read_base
with read_base():
from .ruler_4k_gen import ruler_datasets as ruler_4k_datasets
from .ruler_8k_gen import ruler_datasets as ruler_8k_datasets
from .ruler_16k_gen import ruler_datasets as ruler_16k_datasets
from .ruler_32k_gen import ruler_datasets as ruler_32k_datasets
from .ruler_128k_gen import ruler_datasets as ruler_128k_datasets
from .ruler_1m_gen import ruler_datasets as ruler_1m_datasets
ruler_combined_datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')), []
)

View File

@ -0,0 +1,34 @@
from opencompass.datasets.ruler.ruler_cwe import RulerCweDataset
from opencompass.datasets.ruler.ruler_cwe import RulerCweEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
# CWE Dataset
cwe_datasets = [
{
'abbr': 'ruler_cwe',
'type': RulerCweDataset,
'freq_cw': 30,
'freq_ucw': 3,
'num_cw': 10,
'tokens_to_generate': 120,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
),
'eval_cfg': dict(
evaluator=dict(type=RulerCweEvaluator),
),
}
]

View File

@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.ruler.ruler_fwe import RulerFweDataset
from opencompass.datasets.ruler.ruler_fwe import RulerFweEvaluator
# FWE Dataset
fwe_datasets = [
{
'abbr': 'ruler_fwe',
'type': RulerFweDataset,
'tokens_to_generate': 50,
'alpha': 2.0,
'coded_wordlen': 6,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
),
'eval_cfg': dict(
evaluator=dict(type=RulerFweEvaluator),
),
}
]

View File

@ -0,0 +1,123 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset
from opencompass.datasets.ruler.ruler_niah import RulerNiahEvaluator
# Ruler Dataset settings
niah_configurations = [
{
'abbr': 'single_1',
'type_haystack': 'repeat',
'type_needle_k': 'words',
'type_needle_v': 'numbers',
'num_needle_k': 1,
'num_needle_v': 1,
'num_needle_q': 1,
},
{
'abbr': 'single_2',
'type_haystack': 'essay',
'type_needle_k': 'words',
'type_needle_v': 'numbers',
'num_needle_k': 1,
'num_needle_v': 1,
'num_needle_q': 1,
},
{
'abbr': 'single_3',
'type_haystack': 'essay',
'type_needle_k': 'words',
'type_needle_v': 'uuids',
'num_needle_k': 1,
'num_needle_v': 1,
'num_needle_q': 1,
},
{
'abbr': 'multikey_1',
'type_haystack': 'essay',
'type_needle_k': 'words',
'type_needle_v': 'numbers',
'num_needle_k': 4,
'num_needle_v': 1,
'num_needle_q': 1,
},
{
'abbr': 'multikey_2',
'type_haystack': 'needle',
'type_needle_k': 'words',
'type_needle_v': 'numbers',
'num_needle_k': 1,
'num_needle_v': 1,
'num_needle_q': 1,
},
{
'abbr': 'multikey_3',
'type_haystack': 'needle',
'type_needle_k': 'uuids',
'type_needle_v': 'uuids',
'num_needle_k': 1,
'num_needle_v': 1,
'num_needle_q': 1,
},
{
'abbr': 'multivalue',
'type_haystack': 'essay',
'type_needle_k': 'words',
'type_needle_v': 'numbers',
'num_needle_k': 1,
'num_needle_v': 4,
'num_needle_q': 1,
},
{
'abbr': 'multiquery',
'type_haystack': 'essay',
'type_needle_k': 'words',
'type_needle_v': 'numbers',
'num_needle_k': 1,
'num_needle_v': 1,
'num_needle_q': 4,
},
]
niah_datasets = []
# NIAH Dataset
base_path = './data/ruler'
file_path = 'PaulGrahamEssays.jsonl'
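# PaulGrahamEssays.jsonl supplies the haystack text used by the essay-type configurations above.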
for index, config in enumerate(niah_configurations):
dataset_dict = {
'abbr': f'ruler_niah_{config["abbr"]}',
'type': RulerNiahDataset,
'base_path': base_path,
'file_path': file_path,
# 'tokenizer_model': model_path,
'tokens_to_generate': 128,
# 'max_seq_length': max_seq_len,
# 'num_samples': NUM_SAMPLES,
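# max_seq_length and num_samples are injected by the per-length wrapper configs (e.g. ruler_4k_gen.py); tokenizer_model is typically set by the top-level eval script.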
'type_haystack': config['type_haystack'],
'type_needle_k': config['type_needle_k'],
'type_needle_v': config['type_needle_v'],
'num_needle_k': config['num_needle_k'],
'num_needle_v': config['num_needle_v'],
'num_needle_q': config['num_needle_q'],
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
),
'eval_cfg': dict(
evaluator=dict(type=RulerNiahEvaluator),
),
}
niah_datasets.append(dataset_dict)

View File

@ -0,0 +1,38 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.ruler.ruler_qa import RulerQaDataset
from opencompass.datasets.ruler.ruler_qa import RulerQaEvaluator
qa_configurations = [
{'dataset': 'squad', 'path': './data/ruler/dev-v2.0.json'},
{'dataset': 'hotpotqa', 'path': './data/ruler/hotpotqa.json'},
]
qa_datasets = []
for index, config in enumerate(qa_configurations):
dataset_dict = {
'abbr': f'ruler_qa_{config["dataset"]}',
'dataset': config['dataset'],
'path': config['path'],
'type': RulerQaDataset,
'tokens_to_generate': 50,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
),
'eval_cfg': dict(
evaluator=dict(type=RulerQaEvaluator),
),
}
qa_datasets.append(dataset_dict)

View File

@ -0,0 +1,32 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.ruler.ruler_vt import RulerVtDataset
from opencompass.datasets.ruler.ruler_vt import RulerVtEvaluator
# VT Dataset
vt_datasets = [
{
'abbr': 'ruler_vt',
'type': RulerVtDataset,
'num_chains': 1,
'num_hops': 4,
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
),
'eval_cfg': dict(
evaluator=dict(type=RulerVtEvaluator),
),
}
]

View File

@ -0,0 +1,31 @@
# SciCode: A Research Coding Benchmark Curated by Scientists
## Introduction
SciCode is a challenging benchmark designed to evaluate the capabilities of language models (LMs) in generating code to solve realistic scientific research problems. It covers 16 subdomains from 6 domains, including Physics, Math, Material Science, Biology, and Chemistry. Unlike previous benchmarks that consist of exam-like question-answer pairs, SciCode is converted from real research problems. SciCode problems naturally factorize into multiple subproblems, each involving knowledge recall, reasoning, and code synthesis. In total, SciCode contains 338 subproblems decomposed from 80 challenging main problems, and it offers optional descriptions of useful scientific background information as well as scientist-annotated gold-standard solutions and test cases for evaluation. Claude 3.5 Sonnet, the best-performing model among those tested, can solve only 4.6% of the problems in the most realistic setting. Broadly, SciCode reflects scientists' everyday workflow of identifying critical science concepts and facts and then transforming them into computation and simulation code. We believe SciCode not only demonstrates contemporary LLMs' progress towards becoming helpful assistants for scientists but also sheds light on the future building and evaluation of scientific AI. For more detailed information, please refer to https://scicode-bench.github.io/.
## How to Use
By modifying the `with_bg` parameter in the configuration file, you can enable the w/ background evaluation setting.
```bash
python run.py --datasets scicode_gen --hf-num-gpus 1 --hf-type chat --hf-path meta-llama/Meta-Llama-3-8B-Instruct --debug --model-kwargs device_map='auto' trust_remote_code=True --batch-size 1
```
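As a concrete example, switching to the w/ background setting amounts to flipping `with_bg` in both the dataset and evaluator entries (field names follow the shipped `scicode_gen` config; the abbr below is illustrative):
```python
from opencompass.datasets import SciCodeDataset, SciCodeEvaluator

# Sketch: the w/ background variant; reader/infer cfgs stay as in the shipped config.
SciCode_eval_cfg = dict(
    evaluator=dict(type=SciCodeEvaluator, dataset_path='./data/scicode', with_bg=True))

SciCode_datasets = [
    dict(
        abbr='SciCode_with_background',  # illustrative name
        type=SciCodeDataset,
        path='./data/scicode',
        with_bg=True,
        eval_cfg=SciCode_eval_cfg,
    )
]
```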
## Reference Performance
| Model | Condition | Subproblem Accuracy | Main Problem Accuracy |
|---------------------------|--------------|---------------------|-----------------------|
| Llama-3-70B-Instruct | w/o Background | 21.53% | 4.62% |
| Llama-3-70B-Instruct | w/ Background | 24.31% | 7.69% |
| Qwen2-72B-Instruct | w/o Background | 16.67% | 1.54% |
| Qwen2-72B-Instruct | w/ Background | 19.79% | 1.54% |
## Citation
```
@misc{tian2024scicode,
title={SciCode: A Research Coding Benchmark Curated by Scientists},
author={Minyang Tian and Luyu Gao and Shizhuo Dylan Zhang and Xinan Chen and Cunwei Fan and Xuefei Guo and Roland Haas and Pan Ji and Kittithat Krongchon and Yao Li and Shengyan Liu and Di Luo and Yutao Ma and Hao Tong and Kha Trinh and Chenyu Tian and Zihan Wang and Bohao Wu and Yanyu Xiong and Shengzhu Yin and Minhui Zhu and Kilian Lieret and Yanxin Lu and Genglin Liu and Yufeng Du and Tianhua Tao and Ofir Press and Jamie Callan and Eliu Huerta and Hao Peng},
year={2024},
eprint={2407.13168},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
```

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .scicode_gen_085b98 import SciCode_datasets # noqa: F401, F403

View File

@ -0,0 +1,29 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer
from opencompass.datasets import SciCodeDataset, SciCodeEvaluator
SciCode_reader_cfg = dict(input_columns=['prompt'], output_column=None)
SciCode_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template='',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, infer_mode='every', max_out_len=4096))
SciCode_eval_cfg = dict(evaluator=dict(type=SciCodeEvaluator, dataset_path='./data/scicode', with_bg=False))
SciCode_datasets = [
dict(
abbr='SciCode',
type=SciCodeDataset,
path='./data/scicode',
with_bg=False,
reader_cfg=SciCode_reader_cfg,
infer_cfg=SciCode_infer_cfg,
eval_cfg=SciCode_eval_cfg)
]

View File

@ -150,5 +150,5 @@ for _name, _prompt in sub_map.items():
infer_order='double',
base_models=gpt4,
summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/compass_arena/gpt4-turbo'}]
))

View File

@ -23,7 +23,6 @@ subjective_all_sets = {
'coding/compass_bench_coding_cn_val',
],
}
data_path = './data/compassbench_v1_3/'
pair_prompt_en = """# Instruction
@ -184,7 +183,7 @@ pair_prompt_cn = """# 指令
checklist_datasets = []
gpt4 = [
dict(
abbr='gpt4o',
abbr='gpt4-1106',
)
]
for lan, data_name_list in subjective_all_sets.items():

View File

@ -0,0 +1,99 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset
from opencompass.summarizers import FofoSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'fofo_test_prompts', 'fofo_test_prompts_cn',
]
base_prompt = """
I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose.
Here is the prompt:
{
"instruction": "{question}",
}
Here are the outputs of the models:
[
{
"model": "model",
"answer": "{prediction}"
},
]
Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output:
```json
[
{
'model': <model-name>,
'format_correctness': <correctness>,
'reasons': <reasons-of-format-correctness>
}
]
```
Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
"""
fofo_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = base_prompt
),
]),
),
),
pred_role='BOT',
)
fofo_datasets.append(
dict(
abbr=f'{_name}',
type=FofoDataset,
path='./data/subjective/fofo',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=FofoSummarizer, judge_type='general')
))

View File

@ -12,7 +12,7 @@ subjective_reader_cfg = dict(
)
subjective_all_sets = [
'fofo_test_prompts', 'fofo_test_prompts_cn',
'fofo_test_prompts'
]
base_prompt = """

View File

@ -11,7 +11,7 @@ subjective_reader_cfg = dict(
)
data_path ='./data/WildBench/wildbench.jsonl'
data_path ='./data/subjective/WildBench/wildbench.jsonl'
wildbench_datasets = []
subjective_infer_cfg = dict(
@ -54,11 +54,11 @@ wildbench_datasets.append(
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/WildBench/gpt4'},
{'abbr': 'llama-2-70b-chat-hf', 'path':'./data/WildBench/llama2-70b'},
{'abbr': 'HaiKu', 'path':'./data/WildBench/claude'},
{'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/WildBench/llama2-70b'},
{'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/WildBench/llama2-70b'}],
given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/subjective/WildBench/gpt4'},
{'abbr': 'llama-2-70b-chat-hf', 'path':'./data/subjective/WildBench/llama2-70b'},
{'abbr': 'HaiKu', 'path':'./data/subjective/WildBench/claude'},
{'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/subjective/WildBench/llama2-70b'},
{'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}],
mode='m2n', # m models compete against n models
infer_order='random',
base_models = [llama_2_70b, gpt4, claude]

View File

@ -18,19 +18,16 @@ truthfulqa_infer_cfg = dict(
inferencer=dict(type=GenInferencer))
# Metrics such as 'truth' and 'info' need
# OPENAI_API_KEY with finetuned models in it.
# Please use your own finetuned openai model with keys and refers to
# extra judge models.
# Please use your own finetuned model and refer to
# the source code of `TruthfulQAEvaluator` for more details.
#
# If you cannot provide available models for 'truth' and 'info',
# and want to perform basic metric eval, please set
# `metrics=('bleurt', 'rouge', 'bleu')`
# When key is set to "ENV", the key will be fetched from the environment
# variable $OPENAI_API_KEY. Otherwise, set key in here directly.
truthfulqa_eval_cfg = dict(
evaluator=dict(
type=TruthfulQAEvaluator, metrics=('truth', 'info'), key='ENV'), )
type=TruthfulQAEvaluator, metrics=('bleu',), key='ENV'), )
truthfulqa_datasets = [
dict(

View File

@ -29,6 +29,7 @@ winograd_datasets = [
abbr='winograd',
type=WinogradDataset,
path='winograd_wsc',
trust_remote_code=True,
name='wsc285',
reader_cfg=winograd_reader_cfg,
infer_cfg=winograd_infer_cfg,

View File

@ -33,6 +33,7 @@ winograd_datasets = [
abbr='winograd',
type=WinogradDataset,
path='winograd_wsc',
trust_remote_code=True,
name='wsc285',
reader_cfg=winograd_reader_cfg,
infer_cfg=winograd_infer_cfg,

View File

@ -1,12 +1,12 @@
from mmengine.config import read_base
with read_base():
from .models.mistral.hf_mistral_7b_v0_1 import models as hf_mistral_7b_v0_1_model
from .models.mistral.hf_mistral_7b_v0_2 import models as hf_mistral_7b_v0_2_model
from .models.hf_internlm.hf_internlm2_20b import models as hf_internlm2_20b_model
from .models.hf_internlm.hf_internlm2_math_20b import models as hf_internlm2_math_20b_model
from opencompass.configs.models.mistral.hf_mistral_7b_v0_1 import models as hf_mistral_7b_v0_1_model
from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import models as hf_mistral_7b_v0_2_model
from opencompass.configs.models.hf_internlm.hf_internlm2_20b import models as hf_internlm2_20b_model
from opencompass.configs.models.hf_internlm.hf_internlm2_math_20b import models as hf_internlm2_math_20b_model
from .datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets as datasets
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets as datasets
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

View File

@ -0,0 +1,196 @@
from mmengine.config import read_base
import os.path as osp
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Datasets Part
## Core Set
# ## Examination
from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import mmlu_datasets
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import mmlu_pro_datasets
from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import cmmlu_datasets
# ## Reasoning
from opencompass.configs.datasets.bbh.bbh_gen_4a31fa import bbh_datasets
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets
# ## Math
from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets
# ## Coding
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
# ## Instruction Following
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
# Summarizer
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups
from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
# Model List
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
# from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
# from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
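# Every `*_datasets` list imported in the read_base() block above is
# concatenated into one flat list; add or comment out an import to change
# the evaluated set.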
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
# with read_base():
core_summary_groups = [
{
'name': 'core_average',
'subsets': [
['mmlu', 'accuracy'],
['mmlu_pro', 'accuracy'],
# ['cmmlu', 'naive_average'],
['cmmlu', 'accuracy'],
['bbh', 'score'],
['math', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
],
},
]
summarizer = dict(
dataset_abbrs=[
['core_average', 'naive_average'],
['mmlu', 'accuracy'],
['mmlu_pro', 'accuracy'],
['cmmlu', 'accuracy'],
['bbh', 'score'],
['math', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
'',
['mmlu', 'accuracy'],
['mmlu-stem', 'accuracy'],
['mmlu-social-science', 'accuracy'],
['mmlu-humanities', 'accuracy'],
['mmlu-other', 'accuracy'],
'',
['mmlu_pro', 'accuracy'],
['mmlu_pro_math','accuracy'],
['mmlu_pro_physics', 'accuracy'],
['mmlu_pro_chemistry', 'accuracy'],
['mmlu_pro_law', 'accuracy'],
['mmlu_pro_engineering', 'accuracy'],
['mmlu_pro_other', 'accuracy'],
['mmlu_pro_economics', 'accuracy'],
['mmlu_pro_health', 'accuracy'],
['mmlu_pro_psychology', 'accuracy'],
['mmlu_pro_business', 'accuracy'],
['mmlu_pro_biology', 'accuracy'],
['mmlu_pro_philosophy', 'accuracy'],
['mmlu_pro_computer_science','accuracy'],
['mmlu_pro_history', 'accuracy'],
'',
['cmmlu', 'accuracy'],
['cmmlu-stem', 'accuracy'],
['cmmlu-social-science', 'accuracy'],
['cmmlu-humanities', 'accuracy'],
['cmmlu-other', 'accuracy'],
['cmmlu-china-specific', 'accuracy'],
'',
['bbh', 'extract_rate'],
['math', 'extract_rate'],
# ['openai_humaneval', 'extract_rate'],
['GPQA_diamond', 'extract_rate'],
# ['IFEval', 'extract_rate'],
'',
['mmlu', 'extract_rate'],
['mmlu-stem', 'extract_rate'],
['mmlu-social-science', 'extract_rate'],
['mmlu-humanities', 'extract_rate'],
['mmlu-other', 'extract_rate'],
'',
['mmlu_pro', 'extract_rate'],
['mmlu_pro_math', 'extract_rate'],
['mmlu_pro_physics', 'extract_rate'],
['mmlu_pro_chemistry', 'extract_rate'],
['mmlu_pro_law', 'extract_rate'],
['mmlu_pro_engineering', 'extract_rate'],
['mmlu_pro_other', 'extract_rate'],
['mmlu_pro_economics', 'extract_rate'],
['mmlu_pro_health', 'extract_rate'],
['mmlu_pro_psychology', 'extract_rate'],
['mmlu_pro_business', 'extract_rate'],
['mmlu_pro_biology', 'extract_rate'],
['mmlu_pro_philosophy', 'extract_rate'],
['mmlu_pro_computer_science', 'extract_rate'],
['mmlu_pro_history', 'extract_rate'],
'',
['cmmlu', 'extract_rate'],
['cmmlu-stem', 'extract_rate'],
['cmmlu-social-science', 'extract_rate'],
['cmmlu-humanities', 'extract_rate'],
['cmmlu-other', 'extract_rate'],
['cmmlu-china-specific', 'extract_rate'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
#######################################################################
# PART 3 Models List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
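# NumWorkerPartitioner shards each inference job into `num_worker` pieces so
# the local runner can execute them in parallel (up to max_num_workers at a time).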
infer = dict(
partitioner=dict(
type=NumWorkerPartitioner,
num_worker=8
),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask)
),
)
# eval with local runner
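# NaivePartitioner creates one task per model-dataset pair; n=10 bundles every
# ten of them into a single task to cut scheduling overhead.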
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(
type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
base_exp_dir = 'outputs/corebench_v1_9/'
work_dir = osp.join(base_exp_dir, 'chat_objective')

View File

@ -1,11 +1,11 @@
from mmengine.config import read_base
with read_base():
from .datasets.ceval.ceval_gen import ceval_datasets
from .datasets.cmmlu.cmmlu_gen import cmmlu_datasets
from .datasets.agieval.agieval_gen import agieval_datasets
from .datasets.bbh.bbh_gen import bbh_datasets
from .datasets.mmlu.mmlu_gen import mmlu_datasets
from .models.alaya.alaya import models
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.datasets.cmmlu.cmmlu_gen import cmmlu_datasets
from opencompass.configs.datasets.agieval.agieval_gen import agieval_datasets
from opencompass.configs.datasets.bbh.bbh_gen import bbh_datasets
from opencompass.configs.datasets.mmlu.mmlu_gen import mmlu_datasets
from opencompass.configs.models.alaya.alaya import models
datasets = [*bbh_datasets, *ceval_datasets, *cmmlu_datasets, *agieval_datasets, *mmlu_datasets]

9
configs/eval_api_demo.py Normal file
View File

@ -0,0 +1,9 @@
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen import gsm8k_datasets
from opencompass.configs.datasets.demo.demo_math_chat_gen import math_datasets
from opencompass.configs.models.openai.gpt_4o_2024_05_13 import models as gpt4
datasets = gsm8k_datasets + math_datasets
models = gpt4
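# Note: when an OpenAI model config sets key='ENV', the API key is fetched
# from the OPENAI_API_KEY environment variable, so export it before running.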

View File

@ -5,8 +5,8 @@ from opencompass.tasks import OpenICLAttackTask
with read_base():
# choose a list of datasets
from .datasets.promptbench.promptbench_wnli_gen_50662f import wnli_datasets
from .models.hf_vicuna_7b import models
from opencompass.configs.datasets.promptbench.promptbench_wnli_gen_50662f import wnli_datasets
from opencompass.configs.models.qwen.hf_qwen2_1_5b import models
datasets = wnli_datasets

View File

@ -1,10 +1,10 @@
from mmengine.config import read_base
with read_base():
from .datasets.demo.demo_gsm8k_base_gen import gsm8k_datasets
from .datasets.demo.demo_math_base_gen import math_datasets
from .models.qwen.hf_qwen2_1_5b import models as hf_qwen2_1_5b_models
from .models.hf_internlm.hf_internlm2_1_8b import models as hf_internlm2_1_8b_models
from opencompass.configs.datasets.demo.demo_gsm8k_base_gen import gsm8k_datasets
from opencompass.configs.datasets.demo.demo_math_base_gen import math_datasets
from opencompass.configs.models.qwen.hf_qwen2_1_5b import models as hf_qwen2_1_5b_models
from opencompass.configs.models.hf_internlm.hf_internlm2_1_8b import models as hf_internlm2_1_8b_models
datasets = gsm8k_datasets + math_datasets
models = hf_qwen2_1_5b_models + hf_internlm2_1_8b_models

View File

@ -1,9 +1,9 @@
from mmengine.config import read_base
with read_base():
from .datasets.lveval.lveval import LVEval_datasets as datasets
from .models.bluelm.hf_bluelm_7b_chat_32k import models
from .summarizers.lveval import summarizer
from opencompass.configs.datasets.lveval.lveval import LVEval_datasets as datasets
from opencompass.configs.models.bluelm.hf_bluelm_7b_chat_32k import models
from opencompass.configs.summarizers.lveval import summarizer
models[0][
'path'

View File

@ -1,57 +1,57 @@
from mmengine.config import read_base
from opencompass.models import OpenAI
from opencompass.models import OpenAI
from opencompass.runners import LocalRunner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import CharmMemSummarizer
with read_base():
from .datasets.CHARM.charm_memory_gen_bbbd53 import charm_memory_datasets as datasets
from opencompass.configs.datasets.CHARM.charm_memory_gen_bbbd53 import charm_memory_datasets as datasets
# ------>>>>>> https://arxiv.org/abs/2403.14112
# from .models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model
# from .models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model
# from .models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model
# from .models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model
# from .models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model
# from .models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model
# from .models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model
# from .models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
# from .models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1
# from .models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1
# from .models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
# from .models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model
# from .models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
# from .models.yi.hf_yi_34b_chat import models as yi_34b_chat_model
# from .models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model
# from .models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model
# from .models.qwen.hf_qwen_7b_chat import models as qwen_7b_chat_model
# from .models.qwen.hf_qwen_14b_chat import models as qwen_14b_chat_model
# from .models.qwen.hf_qwen_72b_chat import models as qwen_72b_chat_model
# from opencompass.configs.models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model
# from opencompass.configs.models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model
# from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model
# from opencompass.configs.models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model
# from opencompass.configs.models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model
# from opencompass.configs.models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model
# from opencompass.configs.models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model
# from opencompass.configs.models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
# from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1
# from opencompass.configs.models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model
# from opencompass.configs.models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
# from opencompass.configs.models.yi.hf_yi_34b_chat import models as yi_34b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_7b_chat import models as qwen_7b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_14b_chat import models as qwen_14b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_72b_chat import models as qwen_72b_chat_model
# <<<<<<------ https://arxiv.org/abs/2403.14112
# from .models.openai.gpt_3_5_turbo_0125 import models as gpt_3_5_turbo_0125_model
# from .models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model
# from .models.gemini.gemini_1_5_flash import models as gemini_1_5_flash_model
# from .models.gemini.gemini_1_5_pro import models as gemini_1_5_pro_model
# from opencompass.configs.models.openai.gpt_3_5_turbo_0125 import models as gpt_3_5_turbo_0125_model
# from opencompass.configs.models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model
# from opencompass.configs.models.gemini.gemini_1_5_flash import models as gemini_1_5_flash_model
# from opencompass.configs.models.gemini.gemini_1_5_pro import models as gemini_1_5_pro_model
# from .models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
# from .models.hf_llama.lmdeploy_llama3_70b_instruct import models as lmdeploy_llama3_70b_instruct_model
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b_instruct import models as lmdeploy_llama3_70b_instruct_model
# from .models.hf_internlm.lmdeploy_internlm2_chat_1_8b import models as lmdeploy_internlm2_chat_1_8b_model
# from .models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_internlm2_chat_7b_model
# from .models.hf_internlm.lmdeploy_internlm2_chat_20b import models as lmdeploy_internlm2_chat_20b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import models as lmdeploy_internlm2_chat_1_8b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_internlm2_chat_7b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_20b import models as lmdeploy_internlm2_chat_20b_model
# from .models.yi.hf_yi_1_5_6b_chat import models as yi_1_5_6b_chat_model
# from .models.yi.hf_yi_1_5_34b_chat import models as yi_1_5_34b_chat_model
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as yi_1_5_6b_chat_model
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as yi_1_5_34b_chat_model
# from .models.deepseek.hf_deepseek_v2_chat import models as deepseek_v2_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_v2_chat import models as deepseek_v2_chat_model
# from .models.qwen.hf_qwen1_5_1_8b_chat import models as qwen1_5_1_8b_chat_model
# from .models.qwen.hf_qwen1_5_7b_chat import models as qwen1_5_7b_chat_model
# from .models.qwen.hf_qwen1_5_14b_chat import models as qwen1_5_14b_chat_model
# from .models.qwen.hf_qwen1_5_72b_chat import models as qwen1_5_72b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_1_8b_chat import models as qwen1_5_1_8b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_7b_chat import models as qwen1_5_7b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_14b_chat import models as qwen1_5_14b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_72b_chat import models as qwen1_5_72b_chat_model
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

View File

@ -1,51 +1,51 @@
from mmengine.config import read_base
with read_base():
from .datasets.CHARM.charm_reason_gen_f8fca2 import charm_reason_datasets as datasets
from opencompass.configs.datasets.CHARM.charm_reason_gen_f8fca2 import charm_reason_datasets as datasets
# ------>>>>>> https://arxiv.org/abs/2403.14112
# from .models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model
# from .models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model
# from .models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model
# from .models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model
# from .models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model
# from .models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model
# from .models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model
# from .models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
# from .models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1
# from .models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1
# from .models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
# from .models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model
# from .models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
# from .models.yi.hf_yi_34b_chat import models as yi_34b_chat_model
# from .models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model
# from .models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model
# from .models.qwen.hf_qwen_7b_chat import models as qwen_7b_chat_model
# from .models.qwen.hf_qwen_14b_chat import models as qwen_14b_chat_model
# from .models.qwen.hf_qwen_72b_chat import models as qwen_72b_chat_model
# from opencompass.configs.models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model
# from opencompass.configs.models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model
# from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model
# from opencompass.configs.models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model
# from opencompass.configs.models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model
# from opencompass.configs.models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model
# from opencompass.configs.models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model
# from opencompass.configs.models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
# from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1
# from opencompass.configs.models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model
# from opencompass.configs.models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
# from opencompass.configs.models.yi.hf_yi_34b_chat import models as yi_34b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_7b_chat import models as qwen_7b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_14b_chat import models as qwen_14b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_72b_chat import models as qwen_72b_chat_model
# <<<<<<------ https://arxiv.org/abs/2403.14112
# from .models.openai.gpt_3_5_turbo_0125 import models as gpt_3_5_turbo_0125_model
# from .models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model
# from .models.gemini.gemini_1_5_flash import models as gemini_1_5_flash_model
# from .models.gemini.gemini_1_5_pro import models as gemini_1_5_pro_model
# from opencompass.configs.models.openai.gpt_3_5_turbo_0125 import models as gpt_3_5_turbo_0125_model
# from opencompass.configs.models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model
# from opencompass.configs.models.gemini.gemini_1_5_flash import models as gemini_1_5_flash_model
# from opencompass.configs.models.gemini.gemini_1_5_pro import models as gemini_1_5_pro_model
# from .models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
# from .models.hf_llama.lmdeploy_llama3_70b_instruct import models as lmdeploy_llama3_70b_instruct_model
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b_instruct import models as lmdeploy_llama3_70b_instruct_model
# from .models.hf_internlm.lmdeploy_internlm2_chat_1_8b import models as lmdeploy_internlm2_chat_1_8b_model
# from .models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_internlm2_chat_7b_model
# from .models.hf_internlm.lmdeploy_internlm2_chat_20b import models as lmdeploy_internlm2_chat_20b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import models as lmdeploy_internlm2_chat_1_8b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_internlm2_chat_7b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_20b import models as lmdeploy_internlm2_chat_20b_model
# from .models.yi.hf_yi_1_5_6b_chat import models as yi_1_5_6b_chat_model
# from .models.yi.hf_yi_1_5_34b_chat import models as yi_1_5_34b_chat_model
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as yi_1_5_6b_chat_model
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as yi_1_5_34b_chat_model
# from .models.deepseek.hf_deepseek_v2_chat import models as deepseek_v2_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_v2_chat import models as deepseek_v2_chat_model
# from .models.qwen.hf_qwen1_5_1_8b_chat import models as qwen1_5_1_8b_chat_model
# from .models.qwen.hf_qwen1_5_7b_chat import models as qwen1_5_7b_chat_model
# from .models.qwen.hf_qwen1_5_14b_chat import models as qwen1_5_14b_chat_model
# from .models.qwen.hf_qwen1_5_72b_chat import models as qwen1_5_72b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_1_8b_chat import models as qwen1_5_1_8b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_7b_chat import models as qwen1_5_7b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_14b_chat import models as qwen1_5_14b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_72b_chat import models as qwen1_5_72b_chat_model
from .summarizers.charm_reason import summarizer

View File

@ -9,10 +9,10 @@ from lagent import ReAct
from lagent.agents.react import ReActProtocol
with read_base():
from .datasets.gsm8k.gsm8k_agent_gen_be1606 import gsm8k_datasets
from .datasets.math.math_agent_gen_af2293 import math_datasets
from .datasets.MathBench.mathbench_agent_gen_568903 import mathbench_agent_datasets
from .summarizers.math_agent import summarizer
from opencompass.configs.datasets.gsm8k.gsm8k_agent_gen_be1606 import gsm8k_datasets
from opencompass.configs.datasets.math.math_agent_gen_af2293 import math_datasets
from opencompass.configs.datasets.MathBench.mathbench_agent_gen_568903 import mathbench_agent_datasets
from opencompass.configs.summarizers.math_agent import summarizer
datasets = []
datasets += gsm8k_datasets

View File

@ -5,10 +5,10 @@ from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from .datasets.gsm8k.gsm8k_gen_d6de81 import gsm8k_datasets
from .datasets.math.math_gen_1ed9c2 import math_datasets
from .datasets.MathBench.mathbench_gen import mathbench_datasets
from .summarizers.math_baseline import summarizer
from opencompass.configs.datasets.gsm8k.gsm8k_gen_d6de81 import gsm8k_datasets
from opencompass.configs.datasets.math.math_gen_1ed9c2 import math_datasets
from opencompass.configs.datasets.MathBench.mathbench_gen import mathbench_datasets
from opencompass.configs.summarizers.math_baseline import summarizer
datasets = []
datasets += gsm8k_datasets

View File

@ -1,96 +0,0 @@
from mmengine.config import read_base
from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
from opencompass.lagent.agents.react import CIReAct, ReActProtocol
from opencompass.models.lagent import CodeAgent
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from .datasets.CIBench.CIBench_template_gen_e6b12a import \
cibench_datasets as datasets
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
FEWSHOT_INSTRUCTION = """\
You are an assistant who can utilize external tools.
{tool_description}
To use a tool, please response with the following format:
```
{thought} Think what you need to solve, do you need to use tools?
{action} The tool name, should be one of [{action_names}].
{action_input} The input to the tool that you want to use.
```
The tool will give you response after your response using the following format:
```
{response} the results after call the tool.
```
Therefore DO NOT generate tool response by yourself.
Also please follow the guidelines:
1. Always use code interpreter to solve the problem.
2. The generated codes should always in a markdown code block format.
3. The generated codes will be executed in an ipython manner and the results will be cached.
4. Your responded code should always be simple and only solves the problem in current step.
For example:
File url: `xxxx`
### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
{thought} We should use `pandas` to solve this step.
{action} IPythonInterpreter
{action_input} ```python
import pandas as pd
url = "xxxx"
data = pd.read_csv(url)
```
{response} The code is succeed without any outputs.
Let us begin from here!
"""
IPYTHON_INTERPRETER_DESCRIPTION = '''\
It can run Python code in a manner as jupyter notebook. The code must be a valid code that contains only python method.'''
models = [
dict(
abbr='gpt-3.5-code',
type=CodeAgent,
agent_type=CIReAct,
max_turn=3,
llm=dict(
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
),
actions=[
dict(type=IPythonInterpreter,
description=IPYTHON_INTERPRETER_DESCRIPTION,
user_data_dir='./data/cibench_dataset/datasources')
],
protocol=dict(
type=ReActProtocol,
call_protocol=FEWSHOT_INSTRUCTION,
force_stop=FORCE_STOP_PROMPT_EN,
finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
),
batch_size=1,
use_system_role=False, # use `user` role instead of system role
first_system_role=False, # use `user` role of the first instruction prompt
merge_adjacent_role=True, # merge adjacent same user content
),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000),
runner=dict(
type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)

View File

@ -1,10 +1,10 @@
from mmengine.config import read_base
with read_base():
from .datasets.demo.demo_gsm8k_chat_gen import gsm8k_datasets
from .datasets.demo.demo_math_chat_gen import math_datasets
from .models.qwen.hf_qwen2_1_5b_instruct import models as hf_qwen2_1_5b_instruct_models
from .models.hf_internlm.hf_internlm2_chat_1_8b import models as hf_internlm2_chat_1_8b_models
from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen import gsm8k_datasets
from opencompass.configs.datasets.demo.demo_math_chat_gen import math_datasets
from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import models as hf_qwen2_1_5b_instruct_models
from opencompass.configs.models.hf_internlm.hf_internlm2_chat_1_8b import models as hf_internlm2_chat_1_8b_models
datasets = gsm8k_datasets + math_datasets
models = hf_qwen2_1_5b_instruct_models + hf_internlm2_chat_1_8b_models

View File

@ -7,7 +7,7 @@ from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets as datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets as datasets
models = [
dict(

View File

@ -1,8 +1,8 @@
from mmengine.config import read_base
with read_base():
from .datasets.ChemBench.ChemBench_gen import chembench_datasets
from .models.mistral.hf_mistral_7b_instruct_v0_2 import models
from opencompass.configs.datasets.ChemBench.ChemBench_gen import chembench_datasets
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import models
datasets = [*chembench_datasets]
models = [*models]

View File

@ -1,15 +1,37 @@
from copy import deepcopy
from mmengine.config import read_base
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner, SlurmRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.models import OpenAI
from opencompass.models.lagent import LagentAgent
from lagent import ReAct
from lagent.agents.react import ReActProtocol
from opencompass.models.lagent import CodeAgent
from opencompass.lagent.actions.python_interpreter import PythonInterpreter
from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
from opencompass.lagent.agents.react import CIReAct
from opencompass.models.lagent import CodeAgent
from lagent.agents.react import ReActProtocol
from opencompass.models import HuggingFaceCausalLM
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.partitioners import NaivePartitioner
with read_base():
from .datasets.CIBench.CIBench_gen_eb42f9 import cibench_datasets as datasets
# Note that CUDA OOM errors may occur for HF models
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
from opencompass.configs.summarizers.cibench import summarizer
from opencompass.configs.datasets.CIBench.CIBench_template_gen_e6b12a import cibench_datasets as cibench_datasets_template
from opencompass.configs.datasets.CIBench.CIBench_generation_gen_8ab0dc import cibench_datasets as cibench_datasets_generation
# Oracle mode for analysis
# from opencompass.configs.datasets.CIBench.CIBench_template_oracle_gen_fecda1 import cibench_datasets as cibench_datasets_template_oracle
# from opencompass.configs.datasets.CIBench.CIBench_generation_oracle_gen_c4a7c1 import cibench_datasets as cibench_datasets_generation_oracle
datasets = []
datasets += cibench_datasets_template
datasets += cibench_datasets_generation
# datasets += cibench_datasets_template_oracle
# datasets += cibench_datasets_generation_oracle
_origin_models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
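# Gather every `*_model` list imported via read_base(); these base chat
# models are wrapped into CIBench code agents below.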
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
@ -34,47 +56,87 @@ Also please follow the guidelines:
3. The generated codes will be executed in an ipython manner and the results will be cached.
4. Your responded code should always be simple and only solves the problem in current step.
Begin!
For example:
File url: `xxxx`
### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
{thought} We should use `pandas` to solve this step.
{action} IPythonInterpreter
{action_input} ```python
import pandas as pd
url = "xxxx"
data = pd.read_csv(url)
```
{response} The code succeeded without any outputs.
Let us begin from here!
"""
models = [
dict(
abbr='gpt-3.5-turbo',
type=CodeAgent,
agent_type=CIReAct,
mutli_rounds=True,
max_turn=3,
llm=dict(
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
),
actions=[
dict(
type=IPythonInterpreter,
description=
'''It can run Python code in a manner as jupyter notebook. The code must be a valid code that contains only python method.
'''),
],
protocol=dict(
IPYTHON_INTERPRETER_DESCRIPTION = '''\
It can run Python code in the manner of a Jupyter notebook. The code must be valid Python that contains only Python methods.'''
actions=[dict(type=IPythonInterpreter, user_data_dir='./data/cibench_dataset/datasources',
description=IPYTHON_INTERPRETER_DESCRIPTION)]
protocol=dict(
type=ReActProtocol,
call_protocol=FEWSHOT_INSTRUCTION,
force_stop=FORCE_STOP_PROMPT_EN,
action=dict(role='ACTION', begin='Tool:', end='\n'),
action_input=dict(role='ARGS', begin='Tool Input:', end='\n'),
response=dict(role='RESPONSE', begin='Tool Response:', end='\n'),
finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
),
batch_size=8,
),
]
)
work_dir = './outputs/cibench/'
_agent_models = []
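# The ReAct protocol returns tool output through a SYSTEM round; patch any
# chat meta_template that lacks one so tool responses can be injected.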
for m in _origin_models:
m = deepcopy(m)
if 'meta_template' in m and 'round' in m['meta_template']:
round = m['meta_template']['round']
if all(r['role'].upper() != 'SYSTEM' for r in round): # no system round
if not any('api_role' in r for r in round):
m['meta_template']['round'].append(dict(role='system', begin='System response:', end='\n'))
else:
m['meta_template']['round'].append(dict(role='system', api_role='SYSTEM'))
print(f'WARNING: adding SYSTEM round in meta_template for {m.get("abbr", None)}')
_agent_models.append(m)
protocol=dict(
type=ReActProtocol,
call_protocol=FEWSHOT_INSTRUCTION,
force_stop=FORCE_STOP_PROMPT_EN,
finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
)
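# Wrap each base chat model into a CodeAgent. Generation-level kwargs
# (batch_size, max_out_len, max_seq_len) are popped from the inner llm
# config because the agent wrapper manages them itself.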
models = []
for m in _agent_models:
m = deepcopy(m)
origin_abbr = m.pop('abbr')
abbr = origin_abbr
m.pop('batch_size', None)
m.pop('max_out_len', None)
m.pop('max_seq_len', None)
run_cfg = m.pop('run_cfg', {})
agent_model = dict(
abbr=abbr,
summarizer_abbr=origin_abbr,
type=CodeAgent,
agent_type=CIReAct,
max_turn=3,
llm=m,
actions=[dict(type=IPythonInterpreter, user_data_dir='./data/cibench_dataset/datasources', description=IPYTHON_INTERPRETER_DESCRIPTION)],
protocol=protocol,
batch_size=1,
run_cfg=run_cfg,
)
models.append(agent_model)
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=50, gen_task_coef=1),
partitioner=dict(type=NaivePartitioner),
runner=dict(
type=SlurmRunner, max_num_workers=8, retry=2,
type=LocalRunner,
max_num_workers=4,
task=dict(type=OpenICLInferTask)),
)

Some files were not shown because too many files have changed in this diff.