From 95aad6c28263a347a8873244eafcdb238f669025 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Tue, 3 Sep 2024 18:50:40 +0800 Subject: [PATCH 1/9] [Fix] Requirements update --- requirements/lmdeploy.txt | 1 + requirements/vllm.txt | 1 + 2 files changed, 2 insertions(+) create mode 100644 requirements/lmdeploy.txt create mode 100644 requirements/vllm.txt diff --git a/requirements/lmdeploy.txt b/requirements/lmdeploy.txt new file mode 100644 index 00000000..c237c383 --- /dev/null +++ b/requirements/lmdeploy.txt @@ -0,0 +1 @@ +lmdeploy diff --git a/requirements/vllm.txt b/requirements/vllm.txt new file mode 100644 index 00000000..e7a6c778 --- /dev/null +++ b/requirements/vllm.txt @@ -0,0 +1 @@ +vllm From da74cbfa3969a5e22407e637dec4b6a7d90bd769 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Wed, 4 Sep 2024 18:57:10 +0800 Subject: [PATCH 2/9] [Fix] Model configs update --- configs/models/hf_llama/vllm_llama_series.py | 6 +++--- opencompass/configs/models/hf_llama/vllm_llama_series.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/configs/models/hf_llama/vllm_llama_series.py b/configs/models/hf_llama/vllm_llama_series.py index c9aaa607..5f777abf 100644 --- a/configs/models/hf_llama/vllm_llama_series.py +++ b/configs/models/hf_llama/vllm_llama_series.py @@ -10,9 +10,9 @@ settings = [ ('llama-2-70b-vllm', 'meta-llama/Llama-2-70b-hf', 4), ('llama-3-8b-vllm', 'meta-llama/Meta-Llama-3-8B', 1), ('llama-3-70b-vllm', 'meta-llama/Meta-Llama-3-70B', 4), - ('llama-3.1-8b-instruct-vllm', 'meta-llama/Meta-Llama-3.1-8B-Instruct', 1) - ('llama-3.1-70b-instruct-vllm', 'meta-llama/Meta-Llama-3.1-70B-Instruct', 4) - ('llama-3.1-405b-fp8-instruct-vllm', 'meta-llama/Meta-Llama-3.1-405B-Instruct-FP8', 8) + ('llama-3.1-8b-instruct-vllm', 'meta-llama/Meta-Llama-3.1-8B-Instruct', 1), + ('llama-3.1-70b-instruct-vllm', 'meta-llama/Meta-Llama-3.1-70B-Instruct', 4), + ('llama-3.1-405b-fp8-instruct-vllm', 'meta-llama/Meta-Llama-3.1-405B-Instruct-FP8', 8), ] models = [] 
diff --git a/opencompass/configs/models/hf_llama/vllm_llama_series.py b/opencompass/configs/models/hf_llama/vllm_llama_series.py index c9aaa607..5f777abf 100644 --- a/opencompass/configs/models/hf_llama/vllm_llama_series.py +++ b/opencompass/configs/models/hf_llama/vllm_llama_series.py @@ -10,9 +10,9 @@ settings = [ ('llama-2-70b-vllm', 'meta-llama/Llama-2-70b-hf', 4), ('llama-3-8b-vllm', 'meta-llama/Meta-Llama-3-8B', 1), ('llama-3-70b-vllm', 'meta-llama/Meta-Llama-3-70B', 4), - ('llama-3.1-8b-instruct-vllm', 'meta-llama/Meta-Llama-3.1-8B-Instruct', 1) - ('llama-3.1-70b-instruct-vllm', 'meta-llama/Meta-Llama-3.1-70B-Instruct', 4) - ('llama-3.1-405b-fp8-instruct-vllm', 'meta-llama/Meta-Llama-3.1-405B-Instruct-FP8', 8) + ('llama-3.1-8b-instruct-vllm', 'meta-llama/Meta-Llama-3.1-8B-Instruct', 1), + ('llama-3.1-70b-instruct-vllm', 'meta-llama/Meta-Llama-3.1-70B-Instruct', 4), + ('llama-3.1-405b-fp8-instruct-vllm', 'meta-llama/Meta-Llama-3.1-405B-Instruct-FP8', 8), ] models = [] From fb6a0df652ebc76b1a2c61f3f4a3ec8fd865dadf Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 4 Sep 2024 19:24:09 +0800 Subject: [PATCH 3/9] [ci] fix test env for vllm and add vllm baselines (#1481) * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update --------- Co-authored-by: zhulin1 --- .github/scripts/oc_score_assert.py | 34 ++++++---- .github/scripts/oc_score_baseline.yaml | 65 ++++++++++++++++--- .github/workflows/daily-run-test.yml | 48 ++++++++++---- .../advanced_guides/evaluation_turbomind.md | 2 +- .../advanced_guides/evaluation_turbomind.md | 2 +- opencompass/models/lmdeploy_pytorch.py | 22 ++++++- opencompass/models/turbomind.py | 6 +- .../models/turbomind_with_tf_above_v4_33.py | 12 ++-- 8 files changed, 142 insertions(+), 49 deletions(-) 
diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index c454b772..1397db88 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -8,25 +8,29 @@ output_path = 'regression_result_daily' chat_model_list = [ 'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf', - 'gemma-2b-it-hf', 'gemma-7b-it-hf', 'internlm2_5-7b-chat-hf', - 'internlm2_5-7b-chat-turbomind', 'internlm2-chat-1.8b-turbomind', - 'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-turbomind', - 'internlm2-chat-7b-sft-turbomind', 'internlm2_5-7b-chat-turbomind', - 'llama-3-8b-instruct-hf', 'llama-3-8b-instruct-turbomind', - 'mistral-7b-instruct-v0.2-hf', 'minicpm-2b-dpo-fp32-hf', + 'deepseek-7b-chat-vllm', 'gemma-2b-it-hf', 'gemma-7b-it-hf', + 'internlm2_5-7b-chat-hf', 'internlm2_5-7b-chat-turbomind', + 'internlm2-chat-1.8b-turbomind', 'internlm2-chat-1.8b-sft-turbomind', + 'internlm2-chat-7b-turbomind', 'internlm2-chat-7b-sft-turbomind', + 'internlm2-chat-7b-vllm', 'llama-3-8b-instruct-hf', + 'llama-3-8b-instruct-turbomind', 'mistral-7b-instruct-v0.2-hf', + 'mistral-7b-instruct-v0.2-vllm', 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf', 'minicpm-2b-sft-fp32-hf', - 'phi-3-mini-4k-instruct-hf', 'qwen1.5-0.5b-chat-hf', - 'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind', + 'phi-3-mini-4k-instruct-hf', 'phi-3-small-8k-instruct-hf', + 'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-turbomind', + 'qwen2-7b-instruct-turbomind', 'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf', 'lmdeploy-api-test' ] base_model_list = [ - 'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind', 'gemma-2b-hf', - 'gemma-7b-hf', 'internlm2-1.8b-turbomind', 'internlm2-7b-turbomind', - 'internlm2_5-7b-turbomind', 'internlm2_5-7b-hf', - 'internlm2-base-7b-turbomind', 'internlm2-base-7b-hf', - 'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'qwen1.5-moe-a2.7b-hf', + 'deepseek-moe-16b-base-hf', 
'deepseek-7b-base-turbomind', + 'deepseek-moe-16b-base-vllm', 'gemma-2b-hf', 'gemma-7b-hf', + 'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf', + 'internlm2_5-7b-turbomind', 'internlm2-1.8b-turbomind', + 'internlm2-7b-turbomind', 'internlm2-base-7b-hf', + 'internlm2-base-7b-turbomind', 'llama-3-8b-turbomind', + 'mistral-7b-v0.2-hf', 'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf', 'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind', - 'yi-1.5-6b-hf', 'yi-1.5-9b-hf' + 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf' ] dataset_list = ['gsm8k', 'race-middle', 'race-high'] @@ -75,6 +79,8 @@ class TestBase: for p2 in dataset_list]) def test_model_dataset_score(self, baseline_scores, result_scores, model, dataset): + if model == 'mistral-7b-v0.2-vllm' and dataset == 'race-high': + return base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) assert_score(result_score, base_score) diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml index 8ec8a5f7..d7e765be 100644 --- a/.github/scripts/oc_score_baseline.yaml +++ b/.github/scripts/oc_score_baseline.yaml @@ -18,6 +18,11 @@ deepseek-moe-16b-chat-hf: race-middle: 62 race-high: 70 +deepseek-7b-chat-vllm: + gsm8k: 63 + race-middle: 74 + race-high: 79 + gemma-2b-it-hf: gsm8k: 14 race-middle: 62 @@ -58,6 +63,11 @@ internlm2-chat-7b-sft-turbomind: race-middle: 91 race-high: 92 +internlm2-chat-7b-vllm: + gsm8k: 63 + race-middle: 90 + race-high: 91 + llama-3-8b-instruct-hf: gsm8k: 77 race-middle: 85 @@ -73,6 +83,11 @@ mistral-7b-instruct-v0.2-hf: race-middle: 82 race-high: 78 +mistral-7b-instruct-v0.2-vllm: + gsm8k: 49 + race-middle: 81 + race-high: 77 + minicpm-2b-dpo-fp32-hf: gsm8k: 58 race-middle: 66 @@ -93,6 +108,11 @@ phi-3-mini-4k-instruct-hf: race-middle: 81 race-high: 84 +phi-3-small-8k-instruct-hf: + gsm8k: 88 + race-middle: 89 + race-high: 88 + qwen1.5-0.5b-chat-hf: gsm8k: 5 race-middle: 55 @@ -108,6 +128,11 
@@ qwen2-7b-instruct-turbomind: race-middle: 87 race-high: 89 +qwen1.5-0.5b-chat-vllm: + gsm8k: 5 + race-middle: 57 + race-high: 51 + yi-1.5-6b-chat-hf: gsm8k: 72 race-middle: 88 @@ -118,21 +143,26 @@ yi-1.5-9b-chat-hf: race-middle: 89 race-high: 91 -deepseek-moe-16b-base-hf: - gsm8k: 25 - race-middle: 35 - race-high: 23 - lmdeploy-api-test: gsm8k: 90 race-middle: 95 race-high: 96 +deepseek-moe-16b-base-hf: + gsm8k: 25 + race-middle: 35 + race-high: 23 + deepseek-7b-base-turbomind: gsm8k: 21 race-middle: 42 race-high: 42 +deepseek-moe-16b-base-vllm: + gsm8k: 22 + race-middle: 35 + race-high: 20 + gemma-2b-hf: gsm8k: 19 race-middle: 33 @@ -148,6 +178,16 @@ internlm2_5-7b-hf: race-middle: 92 race-high: 91 +internlm2-7b-hf: + gsm8k: 65 + race-middle: 77 + race-high: 72 + +internlm2-base-7b-hf: + gsm8k: 5 + race-middle: 71 + race-high: 74 + internlm2_5-7b-turbomind: gsm8k: 73 race-middle: 90 @@ -163,11 +203,6 @@ internlm2-7b-turbomind: race-middle: 78 race-high: 76 -internlm2-base-7b-hf: - gsm8k: 2 - race-middle: 71 - race-high: 74 - internlm2-base-7b-turbomind: gsm8k: 39 race-middle: 75 @@ -183,6 +218,11 @@ mistral-7b-v0.2-hf: race-middle: 42 race-high: 60 +mistral-7b-v0.2-vllm: + gsm8k: 45 + race-middle: 42 + race-high: 58 + qwen1.5-moe-a2.7b-hf: gsm8k: 64 race-middle: 78 @@ -203,6 +243,11 @@ qwen2-7b-turbomind: race-middle: 88 race-high: 88 +qwen1.5-0.5b-vllm: + gsm8k: 12 + race-middle: 54 + race-high: 59 + yi-1.5-6b-hf: gsm8k: 59 race-middle: 81 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index ebe66523..189d6e04 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -18,33 +18,55 @@ env: HF_DATASETS_OFFLINE: 1 TRANSFORMERS_OFFLINE: 1 HF_HUB_OFFLINE: 1 + TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas jobs: + build-pypi: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.7 + uses: actions/setup-python@v1 + with: + python-version: 3.7 + - name: 
Build lagent + run: | + pip install wheel + python setup.py sdist bdist_wheel + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: dist/* + retention-days: 1 + name: my-artifact-${{ github.run_id }} + daily_run_test: + needs: build-pypi runs-on: self-hosted environment: 'prod' - timeout-minutes: 240 #4hours + timeout-minutes: 420 #7hours steps: - name: Clone repository uses: actions/checkout@v2 + - name: Download Artifacts + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }} - name: Prepare - create conda env and install torch run: | . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate conda create -y --name ${{env.CONDA_ENV}} python=3.10 conda activate ${{env.CONDA_ENV}} + pip install opencompass*.whl pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.5.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} - pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.2+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} - FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.5.8+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - pip install bitsandbytes - pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118 - pip install xformers==0.0.25.post1 --cache-dir ${{env.PIP_CACHE_PATH}} - conda info --envs - - name: Prepare - Pip install code - run: | - . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}} - pip install -e . 
--cache-dir ${{env.PIP_CACHE_PATH}} - pip install human_eval transformers protobuf pytest --cache-dir ${{env.PIP_CACHE_PATH}} + pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.5+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} + + pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes --cache-dir ${{env.PIP_CACHE_PATH}} + pip uninstall torch torchvision torchaudio -y + pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118 + FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} conda info --envs - name: Prepare - prepare data and hf model run: | diff --git a/docs/en/advanced_guides/evaluation_turbomind.md b/docs/en/advanced_guides/evaluation_turbomind.md index 0f08ffec..c1299f0b 100644 --- a/docs/en/advanced_guides/evaluation_turbomind.md +++ b/docs/en/advanced_guides/evaluation_turbomind.md @@ -73,6 +73,6 @@ You are expected to get the evaluation results after the inference and evaluatio **Note**: - If you want to pass more arguments for `engine_config`和`gen_config` in the evaluation config file, please refer to [TurbomindEngineConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#turbomindengineconfig) - and [EngineGenerationConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#generationconfig) + and [GenerationConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#generationconfig) - If you evaluate the InternLM Chat model, please use configuration file `eval_internlm_chat_turbomind.py` - If you evaluate the InternLM 7B model, please modify `eval_internlm_turbomind.py` or 
`eval_internlm_chat_turbomind.py` by changing to the setting `models = [internlm_7b]` in the last line. diff --git a/docs/zh_cn/advanced_guides/evaluation_turbomind.md b/docs/zh_cn/advanced_guides/evaluation_turbomind.md index 3d1414d8..a7c37b75 100644 --- a/docs/zh_cn/advanced_guides/evaluation_turbomind.md +++ b/docs/zh_cn/advanced_guides/evaluation_turbomind.md @@ -70,6 +70,6 @@ python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-2 **注:** -- 如果想在测评配置文件中`engine_config`和`gen_config`字段传递更多参数,请参考[TurbomindEngineConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#turbomindengineconfig) 和 [EngineGenerationConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#generationconfig) +- 如果想在测评配置文件中`engine_config`和`gen_config`字段传递更多参数,请参考[TurbomindEngineConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#turbomindengineconfig) 和 [GenerationConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#generationconfig) - 如果评测 InternLM Chat 模型,请使用配置文件 `eval_internlm_chat_turbomind.py` - 如果评测 InternLM 7B 模型,请修改 `eval_internlm_turbomind.py` 或者 `eval_internlm_chat_turbomind.py`。将`models`字段配置为`models = [internlm_7b]` 。 diff --git a/opencompass/models/lmdeploy_pytorch.py b/opencompass/models/lmdeploy_pytorch.py index f9d67da4..92118066 100644 --- a/opencompass/models/lmdeploy_pytorch.py +++ b/opencompass/models/lmdeploy_pytorch.py @@ -60,8 +60,8 @@ class LmdeployPytorchModel(BaseModel): engine_config.thread_safe = True if gen_config is not None: - from lmdeploy.messages import EngineGenerationConfig - gen_config = EngineGenerationConfig(**gen_config) + from lmdeploy.messages import GenerationConfig + gen_config = GenerationConfig(**gen_config) self.logger = get_logger() tm_model = tm.Engine(path, engine_config) @@ -70,6 +70,22 @@ class LmdeployPytorchModel(BaseModel): tm_model.create_instance() for i in range(concurrency) ] self.generator_ids = [i + 1 for i in 
range(concurrency)] + + from transformers import GenerationConfig + try: + generation_config = GenerationConfig.from_pretrained(path) + except Exception: + generation_config = None + if generation_config and hasattr(generation_config, 'eos_token_id'): + if gen_config.stop_words is None: + stop_words = [] + if isinstance(generation_config.eos_token_id, int): + stop_words.append(generation_config.eos_token_id) + else: + assert isinstance(generation_config.eos_token_id, list) + for token_id in generation_config.eos_token_id: + stop_words.append(token_id) + gen_config.stop_words = stop_words self.gen_config = gen_config self.end_str = end_str self.major_version, self.minor_version, _ = version_info @@ -135,7 +151,7 @@ class LmdeployPytorchModel(BaseModel): prompt (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. - gen_config (EngineGenerationConfig, optional): Generation + gen_config (GenerationConfig, optional): Generation config to set arguments like top_k, top_p, temperature. end_str (str, optional): Whether to trim generated strings with end_str if the model has special ending strings diff --git a/opencompass/models/turbomind.py b/opencompass/models/turbomind.py index af769c28..e6cfebd2 100644 --- a/opencompass/models/turbomind.py +++ b/opencompass/models/turbomind.py @@ -113,8 +113,8 @@ class TurboMindModel(BaseModel): gen_config['stop_words'] = list(set(stop_words)) gen_config.setdefault('min_new_tokens', 1) - from lmdeploy.messages import EngineGenerationConfig - gen_config = EngineGenerationConfig(**gen_config) + from lmdeploy.messages import GenerationConfig + gen_config = GenerationConfig(**gen_config) results = [] for batch_input in batch_inputs: @@ -160,7 +160,7 @@ class TurboMindModel(BaseModel): The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. 
- gen_config (EngineGenerationConfig, optional): Generation + gen_config (GenerationConfig, optional): Generation config to set arguments like top_k, top_p, temperature. end_str (str, optional): Whether to trim generated strings with end_str if the model has special ending strings diff --git a/opencompass/models/turbomind_with_tf_above_v4_33.py b/opencompass/models/turbomind_with_tf_above_v4_33.py index 5ea020a5..ccda31f6 100644 --- a/opencompass/models/turbomind_with_tf_above_v4_33.py +++ b/opencompass/models/turbomind_with_tf_above_v4_33.py @@ -115,11 +115,16 @@ class TurboMindModelwithChatTemplate(BaseModel): batch_messages = [messages[i:i + self.concurrency] for i in range(0, len(messages), self.concurrency)] stop_words = list(set(self.stop_words + stopping_criteria)) + encode_stop_words = [] + if stop_words is not None and len(stop_words) > 0: + for words in stop_words: + encode_stop_words += self.tokenizer.encode(words, add_bos=False) + DEFAULT_GEN_CONFIG = { 'max_new_tokens': max_out_len, 'min_new_tokens': 1, 'top_k': 1, - 'stop_words': stop_words, + 'stop_words': encode_stop_words, } gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG) gen_config.update(self.gen_config) @@ -127,9 +132,8 @@ class TurboMindModelwithChatTemplate(BaseModel): gen_config['top_k'] = 1000 gen_config['temperature'] = temperature - from lmdeploy.messages import EngineGenerationConfig, GenerationConfig + from lmdeploy.messages import GenerationConfig gen_config = GenerationConfig(**gen_config) - gen_config = EngineGenerationConfig.From(gen_config, self.tokenizer) results = [] for batch_message in batch_messages: @@ -160,7 +164,7 @@ class TurboMindModelwithChatTemplate(BaseModel): prompt (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. - gen_config (EngineGenerationConfig, optional): Generation + gen_config (GenerationConfig, optional): Generation config to set arguments like top_k, top_p, temperature. Returns: str: The generated string. 
From 716d46e1f527759a3e1728be25ad603c26053a9f Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Thu, 5 Sep 2024 16:43:45 +0800 Subject: [PATCH 4/9] [ci] fix badcase and add env info (#1491) * update * update --------- Co-authored-by: zhulin1 --- .github/scripts/oc_score_assert.py | 8 ++++---- .github/workflows/daily-run-test.yml | 1 + opencompass/models/lmdeploy_pytorch.py | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index 1397db88..f869b157 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -16,10 +16,10 @@ chat_model_list = [ 'llama-3-8b-instruct-turbomind', 'mistral-7b-instruct-v0.2-hf', 'mistral-7b-instruct-v0.2-vllm', 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf', 'minicpm-2b-sft-fp32-hf', - 'phi-3-mini-4k-instruct-hf', 'phi-3-small-8k-instruct-hf', - 'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-turbomind', - 'qwen2-7b-instruct-turbomind', 'qwen1.5-0.5b-chat-vllm', - 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf', 'lmdeploy-api-test' + 'phi-3-mini-4k-instruct-hf', 'qwen1.5-0.5b-chat-hf', + 'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind', + 'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf', + 'lmdeploy-api-test' ] base_model_list = [ 'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind', diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 189d6e04..7d7affaf 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -68,6 +68,7 @@ jobs: FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} conda info --envs + pip list - 
name: Prepare - prepare data and hf model run: | ln -s ${{env.DATEASET_CACHE_PATH}} data diff --git a/opencompass/models/lmdeploy_pytorch.py b/opencompass/models/lmdeploy_pytorch.py index 92118066..bb195b64 100644 --- a/opencompass/models/lmdeploy_pytorch.py +++ b/opencompass/models/lmdeploy_pytorch.py @@ -88,7 +88,7 @@ class LmdeployPytorchModel(BaseModel): gen_config.stop_words = stop_words self.gen_config = gen_config self.end_str = end_str - self.major_version, self.minor_version, _ = version_info + self.major_version, self.minor_version = version_info[:2] def generate( self, From 6c9cd9a26012c6d4da1d135c8248cc0b212692a6 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Thu, 5 Sep 2024 17:22:42 +0800 Subject: [PATCH 5/9] [Feature] Needlebench auto-download update (#1480) * update * update * update --- configs/datasets/needlebench/atc/atc.py | 16 ++- .../datasets/needlebench/atc/atc_choice.py | 6 +- .../datasets/needlebench/atc/atc_choice_20.py | 7 +- .../datasets/needlebench/atc/atc_choice_50.py | 3 +- .../atc/atc_choice_50_en_reasoning.py | 73 +++++++++---- .../datasets/needlebench/atc/atc_choice_80.py | 3 +- .../atc/atc_choice_80_en_reasoning.py | 7 +- .../needlebench_multi_reasoning_1000k.py | 44 ++++---- .../needlebench_multi_retrieval_1000k.py | 20 ++-- .../needlebench_single_1000k.py | 18 +-- .../needlebench_multi_reasoning_128k.py | 44 ++++---- .../needlebench_multi_retrieval_128k.py | 20 ++-- .../needlebench_single_128k.py | 18 +-- .../needlebench_multi_reasoning_200k.py | 44 ++++---- .../needlebench_multi_retrieval_200k.py | 20 ++-- .../needlebench_single_200k.py | 18 +-- .../needlebench_multi_reasoning_256k.py | 44 ++++---- .../needlebench_multi_retrieval_256k.py | 20 ++-- .../needlebench_single_256k.py | 18 +-- .../needlebench_multi_reasoning_32k.py | 44 ++++---- .../needlebench_multi_retrieval_32k.py | 20 ++-- .../needlebench_32k/needlebench_single_32k.py | 18 +-- .../needlebench_multi_reasoning_4k.py | 76 ++++++------- 
.../needlebench_multi_retrieval_4k.py | 24 ++-- .../needlebench_4k/needlebench_single_4k.py | 26 +++-- .../needlebench_multi_reasoning_8k.py | 76 ++++++------- .../needlebench_multi_retrieval_8k.py | 24 ++-- ...ebench_multi_retrieval_compare_batch_8k.py | 26 +++-- .../needlebench_8k/needlebench_single_8k.py | 26 +++-- configs/summarizers/needlebench.py | 52 ++++----- .../configs/datasets/needlebench/atc/atc.py | 16 ++- .../datasets/needlebench/atc/atc_choice.py | 6 +- .../datasets/needlebench/atc/atc_choice_20.py | 7 +- .../datasets/needlebench/atc/atc_choice_50.py | 3 +- .../atc/atc_choice_50_en_reasoning.py | 73 +++++++++---- .../datasets/needlebench/atc/atc_choice_80.py | 3 +- .../atc/atc_choice_80_en_reasoning.py | 7 +- .../needlebench_multi_reasoning_1000k.py | 44 ++++---- .../needlebench_multi_retrieval_1000k.py | 20 ++-- .../needlebench_single_1000k.py | 18 +-- .../needlebench_multi_reasoning_128k.py | 44 ++++---- .../needlebench_multi_retrieval_128k.py | 20 ++-- .../needlebench_single_128k.py | 18 +-- .../needlebench_multi_reasoning_200k.py | 44 ++++---- .../needlebench_multi_retrieval_200k.py | 20 ++-- .../needlebench_single_200k.py | 18 +-- .../needlebench_multi_reasoning_256k.py | 44 ++++---- .../needlebench_multi_retrieval_256k.py | 20 ++-- .../needlebench_single_256k.py | 18 +-- .../needlebench_multi_reasoning_32k.py | 44 ++++---- .../needlebench_multi_retrieval_32k.py | 20 ++-- .../needlebench_32k/needlebench_single_32k.py | 18 +-- .../needlebench_multi_reasoning_4k.py | 76 ++++++------- .../needlebench_multi_retrieval_4k.py | 24 ++-- .../needlebench_4k/needlebench_single_4k.py | 26 +++-- .../needlebench_multi_reasoning_8k.py | 76 ++++++------- .../needlebench_multi_retrieval_8k.py | 24 ++-- ...ebench_multi_retrieval_compare_batch_8k.py | 26 +++-- .../needlebench_8k/needlebench_single_8k.py | 26 +++-- .../configs/summarizers/needlebench.py | 52 ++++----- opencompass/datasets/needlebench/atc.py | 84 ++++++++++---- .../datasets/needlebench/atc_choice.py 
| 58 +++++++--- opencompass/datasets/needlebench/multi.py | 24 ++-- opencompass/datasets/needlebench/origin.py | 28 ++--- opencompass/datasets/needlebench/parallel.py | 103 ++++++++++-------- opencompass/utils/datasets_info.py | 10 ++ 66 files changed, 1121 insertions(+), 896 deletions(-) diff --git a/configs/datasets/needlebench/atc/atc.py b/configs/datasets/needlebench/atc/atc.py index b2eeebea..af49b8c9 100644 --- a/configs/datasets/needlebench/atc/atc.py +++ b/configs/datasets/needlebench/atc/atc.py @@ -31,7 +31,9 @@ needlebench_eval_cfg = dict( needle_num_list = list(range(2, 100, 3)) document_depth_percent_intervals = 20 repeats = 30 -names_path = './data/needlebench/names.json' + +path = 'opencompass/needlebench' +file_name = 'names.json' needlebench_atc_datasets_zh = [] needlebench_atc_datasets_en = [] @@ -44,7 +46,8 @@ for num_needles in needle_num_list: 'abbr': f'needlebench_atc_challenge' f'needle_{num_needles}_en_ordered', 'type': NeedleBenchATCOrderedDataset, - 'path': names_path, + 'path': path, + 'file_name': file_name, 'num_needles': num_needles, 'language': 'English', 'repeats': repeats, @@ -61,7 +64,8 @@ for num_needles in needle_num_list: 'abbr': f'needlebench_atc_challenge' f'needle_{num_needles}_zh_ordered', 'type': NeedleBenchATCOrderedDataset, - 'path': names_path, + 'path': path, + 'file_name': file_name, 'num_needles': num_needles, 'language': 'Chinese', 'repeats': repeats, @@ -77,7 +81,8 @@ for num_needles in needle_num_list: 'abbr': f'needlebench_atc_challenge' f'needle_{num_needles}_en', 'type': NeedleBenchATCDataset, - 'path': names_path, + 'path': path, + 'file_name': file_name, 'num_needles': num_needles, 'language': 'English', 'repeats': repeats, @@ -93,7 +98,8 @@ for num_needles in needle_num_list: 'abbr': f'needlebench_atc_challenge' f'needle_{num_needles}_zh', 'type': NeedleBenchATCDataset, - 'path': names_path, + 'path': path, + 'file_name': file_name, 'num_needles': num_needles, 'language': 'Chinese', 'repeats': repeats, diff 
--git a/configs/datasets/needlebench/atc/atc_choice.py b/configs/datasets/needlebench/atc/atc_choice.py index 10b0b5d7..42cb510b 100644 --- a/configs/datasets/needlebench/atc/atc_choice.py +++ b/configs/datasets/needlebench/atc/atc_choice.py @@ -61,7 +61,8 @@ few_shot_prompts = { # ----------------------- Prompt Settings ----------------------- # needle_num_list = list(range(2, 20, 1)) -names_path = './data/needlebench/names.json' +path = 'opencompass/needlebench' +file_name = 'names.json' repeats = 10 @@ -122,7 +123,8 @@ for _name in list(single_choice_prompts.keys()): dataset_dict = { 'abbr': abbr, 'type': NeedleBenchATCDataset, - 'path': names_path, + 'path': path, + 'file_name': file_name, 'num_needles': num_needles, 'language': language, 'repeats': repeats, diff --git a/configs/datasets/needlebench/atc/atc_choice_20.py b/configs/datasets/needlebench/atc/atc_choice_20.py index 95e88f60..3c003292 100644 --- a/configs/datasets/needlebench/atc/atc_choice_20.py +++ b/configs/datasets/needlebench/atc/atc_choice_20.py @@ -61,8 +61,8 @@ few_shot_prompts = { # ----------------------- Prompt Settings ----------------------- # needle_num_list = list(range(2, 20, 1)) -names_path = './data/needlebench/names.json' - +path = 'opencompass/needlebench' +file_name = 'names.json' repeats = 10 # Use Zero-Shot or not @@ -120,7 +120,8 @@ for _name in list(single_choice_prompts.keys()): dataset_dict = { 'abbr': abbr, 'type': NeedleBenchATCDataset, - 'path': names_path, + 'path': path, + 'file_name': file_name, 'num_needles': num_needles, 'language': language, 'repeats': repeats, diff --git a/configs/datasets/needlebench/atc/atc_choice_50.py b/configs/datasets/needlebench/atc/atc_choice_50.py index 03703a99..8ad6f9f6 100644 --- a/configs/datasets/needlebench/atc/atc_choice_50.py +++ b/configs/datasets/needlebench/atc/atc_choice_50.py @@ -30,7 +30,8 @@ for _name in list(single_choice_prompts.keys()): dataset_dict = { 'abbr': abbr, 'type': NeedleBenchATCDataset, - 'path': names_path, + 
'path': path, + 'file_name': file_name, 'num_needles': num_needles, 'language': language, 'repeats': repeats, diff --git a/configs/datasets/needlebench/atc/atc_choice_50_en_reasoning.py b/configs/datasets/needlebench/atc/atc_choice_50_en_reasoning.py index c515a84a..828a51da 100644 --- a/configs/datasets/needlebench/atc/atc_choice_50_en_reasoning.py +++ b/configs/datasets/needlebench/atc/atc_choice_50_en_reasoning.py @@ -10,14 +10,38 @@ from opencompass.utils.text_postprocessors import first_option_postprocess few_shot_prompts = { 'single_choice_prompts': { 'single_choice_en_reasoning': [ - dict(role='HUMAN', prompt="Question: Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jessica Stewart' can trace back to in the context?\nA. Jack Burch\nB. Jessica Stewart\nC. Sharon House\nD. Carolyn Jackson\n"), - dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing.\n\nTherefore, the eldest relative that 'Jessica Stewart' can trace back to in the context is Sharon House. The answer is: C"), - dict(role='HUMAN', prompt="Question: For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.Jacob Oconnor's paternal grandmother is Robert Hill. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jacob Oconnor' can trace back to in the context?\nA. Laura Holland\nB. Robert Hill\nC. Jacob Oconnor\nD. Mikayla Scott\n"), - dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n-Jacob Oconnor's paternal grandmother is Robert Hill. 
\n- For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.\n\nTherefore, the eldest relative that 'Jacob Oconnor' can trace back to in the context is Mikayla Scott. The answer is: D"), - dict(role='HUMAN', prompt="Question: Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life.Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.Michael Martinez is not only Misty Moore's father but also Misty Moore's role model. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Barbara Fuentes' can trace back to in the context?\nA. Michael Martinez\nB. Jennifer Garcia\nC. Misty Moore\nD. Barbara Fuentes\n"), - dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n- Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life. \n- Michael Martinez is not only Misty Moore's father but also Misty Moore's role model.\n- Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.\n\nTherefore, the eldest relative that 'Barbara Fuentes' can trace back to in the context is Jennifer Garcia. The answer is: B"), - dict(role='HUMAN', prompt="Question: Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.Victor Dean considers Mary Gay as their grandfather.Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.David Hernandez is Jennifer Williams's mom. 
\nGiven the scrambled family relationships described above, who is the eldest relative that 'Jennifer Williams' can trace back to in the context?\nA. Marcus Miller\nB. Carlos Smith\nC. Mary Gay\nD. Victor Dean\n"), - dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- David Hernandez is Jennifer Williams's mom.\n- Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.\n- Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.\n- Victor Dean considers Mary Gay as their grandfather. \n- Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.\n- Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.\n\nTherefore, the eldest relative that 'Jennifer Williams' can trace back to in the context is Marcus Miller. The answer is: A"), + dict( + role='HUMAN', + prompt="Question: Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jessica Stewart' can trace back to in the context?\nA. Jack Burch\nB. Jessica Stewart\nC. Sharon House\nD. Carolyn Jackson\n", + ), + dict( + role='BOT', + prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing.\n\nTherefore, the eldest relative that 'Jessica Stewart' can trace back to in the context is Sharon House. The answer is: C", + ), + dict( + role='HUMAN', + prompt="Question: For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.Jacob Oconnor's paternal grandmother is Robert Hill. 
\nGiven the scrambled family relationships described above, who is the eldest relative that 'Jacob Oconnor' can trace back to in the context?\nA. Laura Holland\nB. Robert Hill\nC. Jacob Oconnor\nD. Mikayla Scott\n", + ), + dict( + role='BOT', + prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n-Jacob Oconnor's paternal grandmother is Robert Hill. \n- For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.\n\nTherefore, the eldest relative that 'Jacob Oconnor' can trace back to in the context is Mikayla Scott. The answer is: D", + ), + dict( + role='HUMAN', + prompt="Question: Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life.Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.Michael Martinez is not only Misty Moore's father but also Misty Moore's role model. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Barbara Fuentes' can trace back to in the context?\nA. Michael Martinez\nB. Jennifer Garcia\nC. Misty Moore\nD. Barbara Fuentes\n", + ), + dict( + role='BOT', + prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n- Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life. \n- Michael Martinez is not only Misty Moore's father but also Misty Moore's role model.\n- Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.\n\nTherefore, the eldest relative that 'Barbara Fuentes' can trace back to in the context is Jennifer Garcia. 
The answer is: B", + ), + dict( + role='HUMAN', + prompt="Question: Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.Victor Dean considers Mary Gay as their grandfather.Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.David Hernandez is Jennifer Williams's mom. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jennifer Williams' can trace back to in the context?\nA. Marcus Miller\nB. Carlos Smith\nC. Mary Gay\nD. Victor Dean\n", + ), + dict( + role='BOT', + prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- David Hernandez is Jennifer Williams's mom.\n- Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.\n- Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.\n- Victor Dean considers Mary Gay as their grandfather. \n- Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.\n- Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.\n\nTherefore, the eldest relative that 'Jennifer Williams' can trace back to in the context is Marcus Miller. 
The answer is: A", + ), dict(role='HUMAN', prompt='Question: {question}'), ], }, @@ -25,8 +49,8 @@ few_shot_prompts = { # ----------------------- Prompt Settings ----------------------- # needle_num_list = list(range(2, 50, 1)) -names_path = './data/needlebench/names.json' - +path = 'opencompass/needlebench' +file_name = 'names.json' repeats = 10 # Use Zero-Shot or not @@ -48,49 +72,54 @@ single_choice_prompts = needlebench_prompts['single_choice_prompts'] for _name in list(single_choice_prompts.keys()): if with_few_shot: assert few_shot_samples > 0 and few_shot_samples <= 4 - single_choice_prompts[_name] = \ - single_choice_prompts[_name][- few_shot_samples * 2 - 1:] + single_choice_prompts[_name] = single_choice_prompts[_name][ + -few_shot_samples * 2 - 1 : + ] # ----------------------- Dataset Settings ----------------------- # needlebench_datasets = [] -needlebench_atc_reader_cfg = dict(input_columns=['question'], - output_column='answer') +needlebench_atc_reader_cfg = dict(input_columns=['question'], output_column='answer') for _name in list(single_choice_prompts.keys()): needlebench_atc_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, - template=dict( - round=(single_choice_prompts[_name])), + template=dict(round=(single_choice_prompts[_name])), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer,), + inferencer=dict( + type=GenInferencer, + ), ) needlebench_atc_eval_cfg = dict( evaluator=dict(type=CircularEvaluator), - pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'), + ) for num_needles in needle_num_list: - abbr = (f'NeedleBenchATCDataset-' - f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}') + abbr = ( + f'NeedleBenchATCDataset-' + f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}' + ) language = 'English' if 'en' in _name else 'Chinese' if 'reasoning' in _name: abbr += '-Reasoning' dataset_dict = { 'abbr': abbr, 
'type': NeedleBenchATCDataset, - 'path': names_path, + 'path': path, + 'file_name': file_name, 'num_needles': num_needles, 'language': language, 'repeats': repeats, 'with_circular': with_circular_eval, 'reader_cfg': needlebench_atc_reader_cfg, 'infer_cfg': needlebench_atc_infer_cfg, - 'eval_cfg': needlebench_atc_eval_cfg + 'eval_cfg': needlebench_atc_eval_cfg, } needlebench_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/atc/atc_choice_80.py b/configs/datasets/needlebench/atc/atc_choice_80.py index b4a935cb..c3ade6e0 100644 --- a/configs/datasets/needlebench/atc/atc_choice_80.py +++ b/configs/datasets/needlebench/atc/atc_choice_80.py @@ -30,7 +30,8 @@ for _name in list(single_choice_prompts.keys()): dataset_dict = { 'abbr': abbr, 'type': NeedleBenchATCDataset, - 'path': names_path, + 'path': path, + 'file_name': file_name, 'num_needles': num_needles, 'language': language, 'repeats': repeats, diff --git a/configs/datasets/needlebench/atc/atc_choice_80_en_reasoning.py b/configs/datasets/needlebench/atc/atc_choice_80_en_reasoning.py index f05a4bdd..a4906b1f 100644 --- a/configs/datasets/needlebench/atc/atc_choice_80_en_reasoning.py +++ b/configs/datasets/needlebench/atc/atc_choice_80_en_reasoning.py @@ -25,8 +25,8 @@ few_shot_prompts = { # ----------------------- Prompt Settings ----------------------- # needle_num_list = list(range(2, 80, 1)) -names_path = './data/needlebench/names.json' - +path = 'opencompass/needlebench' +file_name = 'names.json' repeats = 10 # Use Zero-Shot or not @@ -84,7 +84,8 @@ for _name in list(single_choice_prompts.keys()): dataset_dict = { 'abbr': abbr, 'type': NeedleBenchATCDataset, - 'path': names_path, + 'path': path, + 'file_name': file_name, 'num_needles': num_needles, 'language': language, 'repeats': repeats, diff --git a/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_reasoning_1000k.py b/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_reasoning_1000k.py index 
e31aa7ab..9b4c782b 100644 --- a/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_reasoning_1000k.py +++ b/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_reasoning_1000k.py @@ -41,22 +41,24 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchMultiEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000] depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] # ----------English Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needle_file_name = 'multi_needle_reasoning_en.json' @@ -69,7 +71,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -85,7 +87,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_en_datasets.append(dataset_dict) @@ -96,7 +98,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', 'type': 
NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -112,7 +114,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_en_datasets.append(dataset_dict) @@ -123,7 +125,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -139,7 +141,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_en_datasets.append(dataset_dict) @@ -150,7 +152,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -166,12 +168,12 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_en_datasets.append(dataset_dict) # ----------Chinese Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['zh_finance.jsonl'] needle_file_name = 'multi_needle_reasoning_zh.json' @@ -184,7 +186,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': 
f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -200,7 +202,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_zh_datasets.append(dataset_dict) @@ -211,7 +213,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -227,7 +229,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_zh_datasets.append(dataset_dict) @@ -238,7 +240,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -254,7 +256,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_zh_datasets.append(dataset_dict) @@ -265,7 +267,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - 
f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -281,6 +283,6 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_retrieval_1000k.py b/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_retrieval_1000k.py index 775e825f..63320d8f 100644 --- a/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_retrieval_1000k.py +++ b/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_retrieval_1000k.py @@ -41,22 +41,24 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchParallelEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]) document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ -64,8 +66,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_en_1000k', + 'abbr': f'Length{original_context_length}' 
f'_parallel_en_1000k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -79,7 +80,7 @@ for original_context_length in context_lengths: 'language': 'English', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -88,8 +89,7 @@ needlebench_zh_datasets = [] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_zh_1000k', + 'abbr': f'Length{original_context_length}' f'_parallel_zh_1000k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -103,6 +103,6 @@ for original_context_length in context_lengths: 'language': 'Chinese', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_1000k/needlebench_single_1000k.py b/configs/datasets/needlebench/needlebench_1000k/needlebench_single_1000k.py index 49a1a40b..29d435bb 100644 --- a/configs/datasets/needlebench/needlebench_1000k/needlebench_single_1000k.py +++ b/configs/datasets/needlebench/needlebench_1000k/needlebench_single_1000k.py @@ -41,21 +41,23 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchOriginEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000] depths_list = [0, 10, 21, 31, 42, 
52, 63, 73, 84, 94, 100] -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ -64,7 +66,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_en_1000k', + f'Depth{int(depth_percent)}_origin_en_1000k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -78,7 +80,7 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -90,7 +92,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_zh_1000k', + f'Depth{int(depth_percent)}_origin_zh_1000k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -104,6 +106,6 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_128k/needlebench_multi_reasoning_128k.py b/configs/datasets/needlebench/needlebench_128k/needlebench_multi_reasoning_128k.py index 44c47731..ef817f24 100644 --- a/configs/datasets/needlebench/needlebench_128k/needlebench_multi_reasoning_128k.py +++ b/configs/datasets/needlebench/needlebench_128k/needlebench_multi_reasoning_128k.py @@ -41,16 +41,18 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] 
- ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchMultiEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000]) depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] @@ -58,7 +60,7 @@ document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' # ----------English Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needle_file_name = 'multi_needle_reasoning_en.json' @@ -71,7 +73,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -87,7 +89,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_en_datasets.append(dataset_dict) @@ -98,7 +100,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -114,7 +116,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 
'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_en_datasets.append(dataset_dict) @@ -125,7 +127,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -141,7 +143,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_en_datasets.append(dataset_dict) @@ -152,7 +154,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -168,12 +170,12 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_en_datasets.append(dataset_dict) # ----------Chinese Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['zh_finance.jsonl'] needle_file_name = 'multi_needle_reasoning_zh.json' @@ -186,7 +188,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ 
-202,7 +204,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_zh_datasets.append(dataset_dict) @@ -213,7 +215,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -229,7 +231,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_zh_datasets.append(dataset_dict) @@ -240,7 +242,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -256,7 +258,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_zh_datasets.append(dataset_dict) @@ -267,7 +269,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -283,6 +285,6 @@ for original_context_length 
in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_128k/needlebench_multi_retrieval_128k.py b/configs/datasets/needlebench/needlebench_128k/needlebench_multi_retrieval_128k.py index b8d2f25f..c682441d 100644 --- a/configs/datasets/needlebench/needlebench_128k/needlebench_multi_retrieval_128k.py +++ b/configs/datasets/needlebench/needlebench_128k/needlebench_multi_retrieval_128k.py @@ -41,22 +41,24 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchParallelEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000]) document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ -64,8 +66,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_en_128k', + 'abbr': f'Length{original_context_length}' f'_parallel_en_128k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -79,7 +80,7 @@ for original_context_length in context_lengths: 'language': 'English', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 
'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -88,8 +89,7 @@ needlebench_zh_datasets = [] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_zh_128k', + 'abbr': f'Length{original_context_length}' f'_parallel_zh_128k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -103,6 +103,6 @@ for original_context_length in context_lengths: 'language': 'Chinese', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_128k/needlebench_single_128k.py b/configs/datasets/needlebench/needlebench_128k/needlebench_single_128k.py index 3a6a7836..15b4746b 100644 --- a/configs/datasets/needlebench/needlebench_128k/needlebench_single_128k.py +++ b/configs/datasets/needlebench/needlebench_128k/needlebench_single_128k.py @@ -41,23 +41,25 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchOriginEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000]) depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ 
-66,7 +68,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_en_128k', + f'Depth{int(depth_percent)}_origin_en_128k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -80,7 +82,7 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -92,7 +94,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_zh_128k', + f'Depth{int(depth_percent)}_origin_zh_128k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -106,6 +108,6 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_200k/needlebench_multi_reasoning_200k.py b/configs/datasets/needlebench/needlebench_200k/needlebench_multi_reasoning_200k.py index ba5741fe..a5504481 100644 --- a/configs/datasets/needlebench/needlebench_200k/needlebench_multi_reasoning_200k.py +++ b/configs/datasets/needlebench/needlebench_200k/needlebench_multi_reasoning_200k.py @@ -41,23 +41,25 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchMultiEvaluator), 
pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000]) context_lengths = [16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000] depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] # ----------English Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needle_file_name = 'multi_needle_reasoning_en.json' @@ -70,7 +72,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -86,7 +88,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_en_datasets.append(dataset_dict) @@ -97,7 +99,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -113,7 +115,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_en_datasets.append(dataset_dict) @@ -124,7 +126,7 @@ for original_context_length in context_lengths: for 
depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -140,7 +142,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_en_datasets.append(dataset_dict) @@ -151,7 +153,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -167,12 +169,12 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_en_datasets.append(dataset_dict) # ----------Chinese Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['zh_finance.jsonl'] needle_file_name = 'multi_needle_reasoning_zh.json' @@ -185,7 +187,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -201,7 +203,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 
'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_zh_datasets.append(dataset_dict) @@ -212,7 +214,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -228,7 +230,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_zh_datasets.append(dataset_dict) @@ -239,7 +241,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -255,7 +257,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_zh_datasets.append(dataset_dict) @@ -266,7 +268,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -282,6 +284,6 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } 
needlebench_5needle_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_200k/needlebench_multi_retrieval_200k.py b/configs/datasets/needlebench/needlebench_200k/needlebench_multi_retrieval_200k.py index 68e87ae9..9c7dadae 100644 --- a/configs/datasets/needlebench/needlebench_200k/needlebench_multi_retrieval_200k.py +++ b/configs/datasets/needlebench/needlebench_200k/needlebench_multi_retrieval_200k.py @@ -41,23 +41,25 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchParallelEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000]) context_lengths = list([16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000]) document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ -65,8 +67,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_en_200k', + 'abbr': f'Length{original_context_length}' f'_parallel_en_200k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -80,7 +81,7 @@ for original_context_length in context_lengths: 'language': 'English', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': 
needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -89,8 +90,7 @@ needlebench_zh_datasets = [] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_zh_200k', + 'abbr': f'Length{original_context_length}' f'_parallel_zh_200k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -104,6 +104,6 @@ for original_context_length in context_lengths: 'language': 'Chinese', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_200k/needlebench_single_200k.py b/configs/datasets/needlebench/needlebench_200k/needlebench_single_200k.py index d3badb6b..ca99ea5a 100644 --- a/configs/datasets/needlebench/needlebench_200k/needlebench_single_200k.py +++ b/configs/datasets/needlebench/needlebench_200k/needlebench_single_200k.py @@ -41,22 +41,24 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchOriginEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000]) context_lengths = [16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000] depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ -65,7 +67,7 
@@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_en_200k', + f'Depth{int(depth_percent)}_origin_en_200k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -79,7 +81,7 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -91,7 +93,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_zh_200k', + f'Depth{int(depth_percent)}_origin_zh_200k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -105,6 +107,6 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_256k/needlebench_multi_reasoning_256k.py b/configs/datasets/needlebench/needlebench_256k/needlebench_multi_reasoning_256k.py index 0a7a0dd6..47208d82 100644 --- a/configs/datasets/needlebench/needlebench_256k/needlebench_multi_reasoning_256k.py +++ b/configs/datasets/needlebench/needlebench_256k/needlebench_multi_reasoning_256k.py @@ -41,23 +41,25 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchMultiEvaluator), 
pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000]) context_lengths = [32000, 128000, 256000] depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] # ----------English Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needle_file_name = 'multi_needle_reasoning_en.json' @@ -70,7 +72,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -86,7 +88,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_en_datasets.append(dataset_dict) @@ -97,7 +99,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -113,7 +115,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_en_datasets.append(dataset_dict) @@ -124,7 +126,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: 
dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -140,7 +142,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_en_datasets.append(dataset_dict) @@ -151,7 +153,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -167,12 +169,12 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_en_datasets.append(dataset_dict) # ----------Chinese Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['zh_finance.jsonl'] needle_file_name = 'multi_needle_reasoning_zh.json' @@ -185,7 +187,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -201,7 +203,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } 
needlebench_2needle_zh_datasets.append(dataset_dict) @@ -212,7 +214,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -228,7 +230,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_zh_datasets.append(dataset_dict) @@ -239,7 +241,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -255,7 +257,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_zh_datasets.append(dataset_dict) @@ -266,7 +268,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -282,6 +284,6 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } 
needlebench_5needle_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_256k/needlebench_multi_retrieval_256k.py b/configs/datasets/needlebench/needlebench_256k/needlebench_multi_retrieval_256k.py index 8078d596..d8b4153e 100644 --- a/configs/datasets/needlebench/needlebench_256k/needlebench_multi_retrieval_256k.py +++ b/configs/datasets/needlebench/needlebench_256k/needlebench_multi_retrieval_256k.py @@ -41,23 +41,25 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchParallelEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000]) context_lengths = [32000, 128000, 256000] document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ -65,8 +67,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_en_256k', + 'abbr': f'Length{original_context_length}' f'_parallel_en_256k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -80,7 +81,7 @@ for original_context_length in context_lengths: 'language': 'English', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } 
needlebench_en_datasets.append(dataset_dict) @@ -89,8 +90,7 @@ needlebench_zh_datasets = [] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_zh_256k', + 'abbr': f'Length{original_context_length}' f'_parallel_zh_256k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -104,6 +104,6 @@ for original_context_length in context_lengths: 'language': 'Chinese', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_256k/needlebench_single_256k.py b/configs/datasets/needlebench/needlebench_256k/needlebench_single_256k.py index 88652fbf..4c8ec0e0 100644 --- a/configs/datasets/needlebench/needlebench_256k/needlebench_single_256k.py +++ b/configs/datasets/needlebench/needlebench_256k/needlebench_single_256k.py @@ -41,22 +41,24 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchOriginEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000]) context_lengths = [32000, 128000, 256000] depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ -65,7 +67,7 @@ for original_context_length in context_lengths: for 
depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_en_256k', + f'Depth{int(depth_percent)}_origin_en_256k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -79,7 +81,7 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -91,7 +93,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_zh_256k', + f'Depth{int(depth_percent)}_origin_zh_256k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -105,6 +107,6 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_32k/needlebench_multi_reasoning_32k.py b/configs/datasets/needlebench/needlebench_32k/needlebench_multi_reasoning_32k.py index 143b1cc0..ea8428fb 100644 --- a/configs/datasets/needlebench/needlebench_32k/needlebench_multi_reasoning_32k.py +++ b/configs/datasets/needlebench/needlebench_32k/needlebench_multi_reasoning_32k.py @@ -41,16 +41,18 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchMultiEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), 
dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list([9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000]) depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] @@ -58,7 +60,7 @@ document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' # ----------English Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needle_file_name = 'multi_needle_reasoning_en.json' @@ -71,7 +73,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -87,7 +89,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_en_datasets.append(dataset_dict) @@ -98,7 +100,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -114,7 +116,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_en_datasets.append(dataset_dict) @@ -125,7 +127,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': 
f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -141,7 +143,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_en_datasets.append(dataset_dict) @@ -152,7 +154,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -168,12 +170,12 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_en_datasets.append(dataset_dict) # ----------Chinese Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['zh_finance.jsonl'] needle_file_name = 'multi_needle_reasoning_zh.json' @@ -186,7 +188,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -202,7 +204,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } 
needlebench_2needle_zh_datasets.append(dataset_dict) @@ -213,7 +215,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -229,7 +231,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_zh_datasets.append(dataset_dict) @@ -240,7 +242,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -256,7 +258,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_zh_datasets.append(dataset_dict) @@ -267,7 +269,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -283,6 +285,6 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_zh_datasets.append(dataset_dict) 
diff --git a/configs/datasets/needlebench/needlebench_32k/needlebench_multi_retrieval_32k.py b/configs/datasets/needlebench/needlebench_32k/needlebench_multi_retrieval_32k.py index ea5122f5..19d90b6e 100644 --- a/configs/datasets/needlebench/needlebench_32k/needlebench_multi_retrieval_32k.py +++ b/configs/datasets/needlebench/needlebench_32k/needlebench_multi_retrieval_32k.py @@ -41,22 +41,24 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchParallelEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list([9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000]) document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ -64,8 +66,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_en_32k', + 'abbr': f'Length{original_context_length}' f'_parallel_en_32k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -79,7 +80,7 @@ for original_context_length in context_lengths: 'language': 'English', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -88,8 +89,7 @@ needlebench_zh_datasets = [] for original_context_length in context_lengths: dataset_dict = { - 'abbr': 
f'Length{original_context_length}' - f'_parallel_zh_32k', + 'abbr': f'Length{original_context_length}' f'_parallel_zh_32k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -103,6 +103,6 @@ for original_context_length in context_lengths: 'language': 'Chinese', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_32k/needlebench_single_32k.py b/configs/datasets/needlebench/needlebench_32k/needlebench_single_32k.py index 1a6be8c0..d5440e7a 100644 --- a/configs/datasets/needlebench/needlebench_32k/needlebench_single_32k.py +++ b/configs/datasets/needlebench/needlebench_32k/needlebench_single_32k.py @@ -41,23 +41,25 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchOriginEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list([9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000]) depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ -66,7 +68,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_en_32k', + f'Depth{int(depth_percent)}_origin_en_32k', 
'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -80,7 +82,7 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -92,7 +94,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_zh_32k', + f'Depth{int(depth_percent)}_origin_zh_32k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -106,6 +108,6 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_4k/needlebench_multi_reasoning_4k.py b/configs/datasets/needlebench/needlebench_4k/needlebench_multi_reasoning_4k.py index bb813f15..0912d1c0 100644 --- a/configs/datasets/needlebench/needlebench_4k/needlebench_multi_reasoning_4k.py +++ b/configs/datasets/needlebench/needlebench_4k/needlebench_multi_reasoning_4k.py @@ -41,23 +41,25 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchMultiEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list(range(1000, 5000, 1000)) document_depth_percent_intervals = 20 
document_depth_percent_interval_type = 'linear' # ----------English Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needle_file_name = 'multi_needle_reasoning_en.json' @@ -68,11 +70,11 @@ language = 'English' for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -88,7 +90,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_en_datasets.append(dataset_dict) @@ -97,11 +99,11 @@ needlebench_3needle_en_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -117,7 +119,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_en_datasets.append(dataset_dict) @@ -126,11 +128,11 @@ needlebench_4needle_en_datasets = [] for 
original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -146,7 +148,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_en_datasets.append(dataset_dict) @@ -155,11 +157,11 @@ needlebench_5needle_en_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -175,12 +177,12 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_en_datasets.append(dataset_dict) # ----------Chinese Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['zh_finance.jsonl'] needle_file_name = 'multi_needle_reasoning_zh.json' @@ -191,11 +193,11 @@ language = 'Chinese' for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - 
document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -211,7 +213,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_zh_datasets.append(dataset_dict) @@ -220,11 +222,11 @@ needlebench_3needle_zh_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -240,7 +242,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_zh_datasets.append(dataset_dict) @@ -249,11 +251,11 @@ needlebench_4needle_zh_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', + 
f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -269,7 +271,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_zh_datasets.append(dataset_dict) @@ -278,11 +280,11 @@ needlebench_5needle_zh_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -298,6 +300,6 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_4k/needlebench_multi_retrieval_4k.py b/configs/datasets/needlebench/needlebench_4k/needlebench_multi_retrieval_4k.py index d29aa40e..b3c85b8b 100644 --- a/configs/datasets/needlebench/needlebench_4k/needlebench_multi_retrieval_4k.py +++ b/configs/datasets/needlebench/needlebench_4k/needlebench_multi_retrieval_4k.py @@ -41,34 +41,35 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchParallelEvaluator), 
pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list(range(1000, 5000, 1000)) document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' depths_float = generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type) + document_depth_percent_intervals, document_depth_percent_interval_type +) depths = [int(depth) for depth in depths_float] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_en_4k', + 'abbr': f'Length{original_context_length}' f'_parallel_en_4k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -82,7 +83,7 @@ for original_context_length in context_lengths: 'language': 'English', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -91,8 +92,7 @@ needlebench_zh_datasets = [] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_zh_4k', + 'abbr': f'Length{original_context_length}' f'_parallel_zh_4k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -106,6 +106,6 @@ for original_context_length in context_lengths: 'language': 'Chinese', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_4k/needlebench_single_4k.py 
b/configs/datasets/needlebench/needlebench_4k/needlebench_single_4k.py index f8e03b52..095e1b33 100644 --- a/configs/datasets/needlebench/needlebench_4k/needlebench_single_4k.py +++ b/configs/datasets/needlebench/needlebench_4k/needlebench_single_4k.py @@ -41,33 +41,35 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchOriginEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list(range(1000, 5000, 1000)) document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_en_4k', + f'Depth{int(depth_percent)}_origin_en_4k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -81,7 +83,7 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -91,11 +93,11 @@ needle_file_name = 'needles.jsonl' for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - 
document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_zh_4k', + f'Depth{int(depth_percent)}_origin_zh_4k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -109,6 +111,6 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_8k/needlebench_multi_reasoning_8k.py b/configs/datasets/needlebench/needlebench_8k/needlebench_multi_reasoning_8k.py index f43bbb06..3b58b86f 100644 --- a/configs/datasets/needlebench/needlebench_8k/needlebench_multi_reasoning_8k.py +++ b/configs/datasets/needlebench/needlebench_8k/needlebench_multi_reasoning_8k.py @@ -41,23 +41,25 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchMultiEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list(range(5000, 9000, 1000)) document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' # ----------English Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needle_file_name = 'multi_needle_reasoning_en.json' @@ -68,11 +70,11 @@ language = 'English' for original_context_length in context_lengths: for 
depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -88,7 +90,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_en_datasets.append(dataset_dict) @@ -97,11 +99,11 @@ needlebench_3needle_en_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -117,7 +119,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_en_datasets.append(dataset_dict) @@ -126,11 +128,11 @@ needlebench_4needle_en_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - 
f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -146,7 +148,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_en_datasets.append(dataset_dict) @@ -155,11 +157,11 @@ needlebench_5needle_en_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -175,12 +177,12 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_en_datasets.append(dataset_dict) # ----------Chinese Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['zh_finance.jsonl'] needle_file_name = 'multi_needle_reasoning_zh.json' @@ -191,11 +193,11 @@ language = 'Chinese' for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', + 
f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -211,7 +213,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_zh_datasets.append(dataset_dict) @@ -220,11 +222,11 @@ needlebench_3needle_zh_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -240,7 +242,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_zh_datasets.append(dataset_dict) @@ -249,11 +251,11 @@ needlebench_4needle_zh_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -269,7 +271,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': 
needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_zh_datasets.append(dataset_dict) @@ -278,11 +280,11 @@ needlebench_5needle_zh_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -298,6 +300,6 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_8k.py b/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_8k.py index 5cbea895..a7bf0e28 100644 --- a/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_8k.py +++ b/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_8k.py @@ -41,34 +41,35 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchParallelEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list(range(5000, 9000, 1000)) document_depth_percent_intervals = 20 document_depth_percent_interval_type = 
'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' depths_float = generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type) + document_depth_percent_intervals, document_depth_percent_interval_type +) depths = [int(depth) for depth in depths_float] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_en_8k', + 'abbr': f'Length{original_context_length}' f'_parallel_en_8k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -82,7 +83,7 @@ for original_context_length in context_lengths: 'language': 'English', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -91,8 +92,7 @@ needlebench_zh_datasets = [] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_zh_8k', + 'abbr': f'Length{original_context_length}' f'_parallel_zh_8k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -106,6 +106,6 @@ for original_context_length in context_lengths: 'language': 'Chinese', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_compare_batch_8k.py b/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_compare_batch_8k.py index 1bc4442f..67c2cf8c 100644 --- a/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_compare_batch_8k.py +++ 
b/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_compare_batch_8k.py @@ -41,36 +41,38 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchParallelEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list(range(5000, 9000, 1000)) document_depth_percent_intervals_list = [1, 5, 10, 15, 20] document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' for document_depth_percent_intervals in document_depth_percent_intervals_list: depths_float = generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type) + document_depth_percent_intervals, document_depth_percent_interval_type + ) depths = [int(depth) for depth in depths_float] for original_context_length in context_lengths: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'_parallel_en_8k_batch{document_depth_percent_intervals}', + f'_parallel_en_8k_batch{document_depth_percent_intervals}', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -84,7 +86,7 @@ for document_depth_percent_intervals in document_depth_percent_intervals_list: 'language': 'English', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -94,14 +96,14 @@ needle_file_name = 'needles.jsonl' for document_depth_percent_intervals in 
document_depth_percent_intervals_list: depths_float = generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type) + document_depth_percent_intervals, document_depth_percent_interval_type + ) depths = [int(depth) for depth in depths_float] for original_context_length in context_lengths: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'_parallel_zh_8k_batch{document_depth_percent_intervals}', + f'_parallel_zh_8k_batch{document_depth_percent_intervals}', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -115,6 +117,6 @@ for document_depth_percent_intervals in document_depth_percent_intervals_list: 'language': 'Chinese', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_8k/needlebench_single_8k.py b/configs/datasets/needlebench/needlebench_8k/needlebench_single_8k.py index 368a0f54..9564a3cc 100644 --- a/configs/datasets/needlebench/needlebench_8k/needlebench_single_8k.py +++ b/configs/datasets/needlebench/needlebench_8k/needlebench_single_8k.py @@ -41,33 +41,35 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchOriginEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list(range(5000, 9000, 1000)) document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] 
needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_en_8k', + f'Depth{int(depth_percent)}_origin_en_8k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -81,7 +83,7 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -91,11 +93,11 @@ needle_file_name = 'needles.jsonl' for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_zh_8k', + f'Depth{int(depth_percent)}_origin_zh_8k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -109,6 +111,6 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/configs/summarizers/needlebench.py b/configs/summarizers/needlebench.py index 69671db0..d1aeb984 100644 --- a/configs/summarizers/needlebench.py +++ b/configs/summarizers/needlebench.py @@ -23,9 +23,9 @@ def create_m_rs_names_list(context_lengths, depths, needle_counts, 
multi_needle_en_list.extend(names_list) elif language == 'zh': multi_needle_zh_list.extend(names_list) - names_dict['Multi-Needle-Reasoning(M-RS)'] = multi_needle_list - names_dict['Multi-Needle-Reasoning-EN'] = multi_needle_en_list - names_dict['Multi-Needle-Reasoning-ZH'] = multi_needle_zh_list + names_dict[f'Multi-Needle-Reasoning(M-RS)-{dataset_size.upper()}'] = multi_needle_list + names_dict[f'Multi-Needle-Reasoning-EN-{dataset_size.upper()}'] = multi_needle_en_list + names_dict[f'Multi-Needle-Reasoning-ZH-{dataset_size.upper()}'] = multi_needle_zh_list return names_dict @@ -56,9 +56,9 @@ def create_summarizer(context_lengths, depths, dataset_size, single_needle_en_list.extend(names_list) elif language == 'zh': single_needle_zh_list.extend(names_list) - names_dict['Single-Needle-Retrieval(S-RT)'] = single_needle_list - names_dict['Single-Needle-Retrieval-EN'] = single_needle_en_list - names_dict['Single-Needle-Retrieval-ZH'] = single_needle_zh_list + names_dict[f'Single-Needle-Retrieval(S-RT)-{dataset_size.upper()}'] = single_needle_list + names_dict[f'Single-Needle-Retrieval-EN-{dataset_size.upper()}'] = single_needle_en_list + names_dict[f'Single-Needle-Retrieval-ZH-{dataset_size.upper()}'] = single_needle_zh_list parallel_list = [] parallel_en_list = [] @@ -74,39 +74,39 @@ def create_summarizer(context_lengths, depths, dataset_size, parallel_en_list.extend(names_list) elif language == 'zh': parallel_zh_list.extend(names_list) - names_dict['Multi-Needle-Retrieval(M-RT)'] = parallel_list - names_dict['Multi-Needle-Retrieval-EN'] = parallel_en_list - names_dict['Multi-Needle-Retrieval-ZH'] = parallel_zh_list + names_dict[f'Multi-Needle-Retrieval(M-RT)-{dataset_size.upper()}'] = parallel_list + names_dict[f'Multi-Needle-Retrieval-EN-{dataset_size.upper()}'] = parallel_en_list + names_dict[f'Multi-Needle-Retrieval-ZH-{dataset_size.upper()}'] = parallel_zh_list summary_groups = [ {'name': key, 'subsets': value} for key, value in names_dict.items() ] 
summary_groups.append({ - 'name': 'NeedleBench-Overall-Score', - 'subsets': [['Single-Needle-Retrieval(S-RT)', 'naive_average'], - ['Multi-Needle-Reasoning(M-RS)', 'naive_average'], - ['Multi-Needle-Retrieval(M-RT)', 'average_score']], - 'weights': {'Single-Needle-Retrieval(S-RT)': 0.4, - 'Multi-Needle-Reasoning(M-RS)': 0.3, - 'Multi-Needle-Retrieval(M-RT)': 0.3}}) + 'name': f'NeedleBench-Overall-Score-{dataset_size.upper()}', + 'subsets': [[f'Single-Needle-Retrieval(S-RT)-{dataset_size.upper()}', 'naive_average'], + [f'Multi-Needle-Reasoning(M-RS)-{dataset_size.upper()}', 'naive_average'], + [f'Multi-Needle-Retrieval(M-RT)-{dataset_size.upper()}', 'average_score']], + 'weights': {f'Single-Needle-Retrieval(S-RT)-{dataset_size.upper()}': 0.4, + f'Multi-Needle-Reasoning(M-RS)-{dataset_size.upper()}': 0.3, + f'Multi-Needle-Retrieval(M-RT)-{dataset_size.upper()}': 0.3}}) summarizer_config = { 'type': NeedleBenchSummarizer, 'summary_groups': summary_groups, 'dataset_abbrs': [ - 'NeedleBench-Overall-Score', + f'NeedleBench-Overall-Score-{dataset_size.upper()}', f'--------- NeedleBench-{dataset_size.upper()}-Single-Needle-Retrieval ---------', - 'Single-Needle-Retrieval(S-RT)', - 'Single-Needle-Retrieval-EN', - 'Single-Needle-Retrieval-ZH', + f'Single-Needle-Retrieval(S-RT)-{dataset_size.upper()}', + f'Single-Needle-Retrieval-EN-{dataset_size.upper()}', + f'Single-Needle-Retrieval-ZH-{dataset_size.upper()}', f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Retrieval ---------', - 'Multi-Needle-Retrieval(M-RT)', - 'Multi-Needle-Retrieval-EN', - 'Multi-Needle-Retrieval-ZH', + f'Multi-Needle-Retrieval(M-RT)-{dataset_size.upper()}', + f'Multi-Needle-Retrieval-EN-{dataset_size.upper()}', + f'Multi-Needle-Retrieval-ZH-{dataset_size.upper()}', f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Reasoning ---------', - 'Multi-Needle-Reasoning(M-RS)', - 'Multi-Needle-Reasoning-EN', - 'Multi-Needle-Reasoning-ZH', + 
f'Multi-Needle-Reasoning(M-RS)-{dataset_size.upper()}', + f'Multi-Needle-Reasoning-EN-{dataset_size.upper()}', + f'Multi-Needle-Reasoning-ZH-{dataset_size.upper()}', f'2-Needle-EN-{dataset_size.upper()}', f'2-Needle-ZH-{dataset_size.upper()}', f'3-Needle-EN-{dataset_size.upper()}', diff --git a/opencompass/configs/datasets/needlebench/atc/atc.py b/opencompass/configs/datasets/needlebench/atc/atc.py index b2eeebea..af49b8c9 100644 --- a/opencompass/configs/datasets/needlebench/atc/atc.py +++ b/opencompass/configs/datasets/needlebench/atc/atc.py @@ -31,7 +31,9 @@ needlebench_eval_cfg = dict( needle_num_list = list(range(2, 100, 3)) document_depth_percent_intervals = 20 repeats = 30 -names_path = './data/needlebench/names.json' + +path = 'opencompass/needlebench' +file_name = 'names.json' needlebench_atc_datasets_zh = [] needlebench_atc_datasets_en = [] @@ -44,7 +46,8 @@ for num_needles in needle_num_list: 'abbr': f'needlebench_atc_challenge' f'needle_{num_needles}_en_ordered', 'type': NeedleBenchATCOrderedDataset, - 'path': names_path, + 'path': path, + 'file_name': file_name, 'num_needles': num_needles, 'language': 'English', 'repeats': repeats, @@ -61,7 +64,8 @@ for num_needles in needle_num_list: 'abbr': f'needlebench_atc_challenge' f'needle_{num_needles}_zh_ordered', 'type': NeedleBenchATCOrderedDataset, - 'path': names_path, + 'path': path, + 'file_name': file_name, 'num_needles': num_needles, 'language': 'Chinese', 'repeats': repeats, @@ -77,7 +81,8 @@ for num_needles in needle_num_list: 'abbr': f'needlebench_atc_challenge' f'needle_{num_needles}_en', 'type': NeedleBenchATCDataset, - 'path': names_path, + 'path': path, + 'file_name': file_name, 'num_needles': num_needles, 'language': 'English', 'repeats': repeats, @@ -93,7 +98,8 @@ for num_needles in needle_num_list: 'abbr': f'needlebench_atc_challenge' f'needle_{num_needles}_zh', 'type': NeedleBenchATCDataset, - 'path': names_path, + 'path': path, + 'file_name': file_name, 'num_needles': num_needles, 
'language': 'Chinese', 'repeats': repeats, diff --git a/opencompass/configs/datasets/needlebench/atc/atc_choice.py b/opencompass/configs/datasets/needlebench/atc/atc_choice.py index 10b0b5d7..42cb510b 100644 --- a/opencompass/configs/datasets/needlebench/atc/atc_choice.py +++ b/opencompass/configs/datasets/needlebench/atc/atc_choice.py @@ -61,7 +61,8 @@ few_shot_prompts = { # ----------------------- Prompt Settings ----------------------- # needle_num_list = list(range(2, 20, 1)) -names_path = './data/needlebench/names.json' +path = 'opencompass/needlebench' +file_name = 'names.json' repeats = 10 @@ -122,7 +123,8 @@ for _name in list(single_choice_prompts.keys()): dataset_dict = { 'abbr': abbr, 'type': NeedleBenchATCDataset, - 'path': names_path, + 'path': path, + 'file_name': file_name, 'num_needles': num_needles, 'language': language, 'repeats': repeats, diff --git a/opencompass/configs/datasets/needlebench/atc/atc_choice_20.py b/opencompass/configs/datasets/needlebench/atc/atc_choice_20.py index 95e88f60..3c003292 100644 --- a/opencompass/configs/datasets/needlebench/atc/atc_choice_20.py +++ b/opencompass/configs/datasets/needlebench/atc/atc_choice_20.py @@ -61,8 +61,8 @@ few_shot_prompts = { # ----------------------- Prompt Settings ----------------------- # needle_num_list = list(range(2, 20, 1)) -names_path = './data/needlebench/names.json' - +path = 'opencompass/needlebench' +file_name = 'names.json' repeats = 10 # Use Zero-Shot or not @@ -120,7 +120,8 @@ for _name in list(single_choice_prompts.keys()): dataset_dict = { 'abbr': abbr, 'type': NeedleBenchATCDataset, - 'path': names_path, + 'path': path, + 'file_name': file_name, 'num_needles': num_needles, 'language': language, 'repeats': repeats, diff --git a/opencompass/configs/datasets/needlebench/atc/atc_choice_50.py b/opencompass/configs/datasets/needlebench/atc/atc_choice_50.py index 03703a99..8ad6f9f6 100644 --- a/opencompass/configs/datasets/needlebench/atc/atc_choice_50.py +++ 
b/opencompass/configs/datasets/needlebench/atc/atc_choice_50.py @@ -30,7 +30,8 @@ for _name in list(single_choice_prompts.keys()): dataset_dict = { 'abbr': abbr, 'type': NeedleBenchATCDataset, - 'path': names_path, + 'path': path, + 'file_name': file_name, 'num_needles': num_needles, 'language': language, 'repeats': repeats, diff --git a/opencompass/configs/datasets/needlebench/atc/atc_choice_50_en_reasoning.py b/opencompass/configs/datasets/needlebench/atc/atc_choice_50_en_reasoning.py index c515a84a..828a51da 100644 --- a/opencompass/configs/datasets/needlebench/atc/atc_choice_50_en_reasoning.py +++ b/opencompass/configs/datasets/needlebench/atc/atc_choice_50_en_reasoning.py @@ -10,14 +10,38 @@ from opencompass.utils.text_postprocessors import first_option_postprocess few_shot_prompts = { 'single_choice_prompts': { 'single_choice_en_reasoning': [ - dict(role='HUMAN', prompt="Question: Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jessica Stewart' can trace back to in the context?\nA. Jack Burch\nB. Jessica Stewart\nC. Sharon House\nD. Carolyn Jackson\n"), - dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing.\n\nTherefore, the eldest relative that 'Jessica Stewart' can trace back to in the context is Sharon House. The answer is: C"), - dict(role='HUMAN', prompt="Question: For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.Jacob Oconnor's paternal grandmother is Robert Hill. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jacob Oconnor' can trace back to in the context?\nA. Laura Holland\nB. Robert Hill\nC. Jacob Oconnor\nD. 
Mikayla Scott\n"), - dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n-Jacob Oconnor's paternal grandmother is Robert Hill. \n- For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.\n\nTherefore, the eldest relative that 'Jacob Oconnor' can trace back to in the context is Mikayla Scott. The answer is: D"), - dict(role='HUMAN', prompt="Question: Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life.Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.Michael Martinez is not only Misty Moore's father but also Misty Moore's role model. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Barbara Fuentes' can trace back to in the context?\nA. Michael Martinez\nB. Jennifer Garcia\nC. Misty Moore\nD. Barbara Fuentes\n"), - dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n- Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life. \n- Michael Martinez is not only Misty Moore's father but also Misty Moore's role model.\n- Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.\n\nTherefore, the eldest relative that 'Barbara Fuentes' can trace back to in the context is Jennifer Garcia. 
The answer is: B"), - dict(role='HUMAN', prompt="Question: Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.Victor Dean considers Mary Gay as their grandfather.Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.David Hernandez is Jennifer Williams's mom. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jennifer Williams' can trace back to in the context?\nA. Marcus Miller\nB. Carlos Smith\nC. Mary Gay\nD. Victor Dean\n"), - dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- David Hernandez is Jennifer Williams's mom.\n- Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.\n- Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.\n- Victor Dean considers Mary Gay as their grandfather. \n- Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.\n- Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.\n\nTherefore, the eldest relative that 'Jennifer Williams' can trace back to in the context is Marcus Miller. The answer is: A"), + dict( + role='HUMAN', + prompt="Question: Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jessica Stewart' can trace back to in the context?\nA. Jack Burch\nB. Jessica Stewart\nC. Sharon House\nD. 
Carolyn Jackson\n", + ), + dict( + role='BOT', + prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing.\n\nTherefore, the eldest relative that 'Jessica Stewart' can trace back to in the context is Sharon House. The answer is: C", + ), + dict( + role='HUMAN', + prompt="Question: For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.Jacob Oconnor's paternal grandmother is Robert Hill. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jacob Oconnor' can trace back to in the context?\nA. Laura Holland\nB. Robert Hill\nC. Jacob Oconnor\nD. Mikayla Scott\n", + ), + dict( + role='BOT', + prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n-Jacob Oconnor's paternal grandmother is Robert Hill. \n- For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.\n\nTherefore, the eldest relative that 'Jacob Oconnor' can trace back to in the context is Mikayla Scott. The answer is: D", + ), + dict( + role='HUMAN', + prompt="Question: Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life.Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.Michael Martinez is not only Misty Moore's father but also Misty Moore's role model. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Barbara Fuentes' can trace back to in the context?\nA. Michael Martinez\nB. Jennifer Garcia\nC. Misty Moore\nD. 
Barbara Fuentes\n", + ), + dict( + role='BOT', + prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n- Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life. \n- Michael Martinez is not only Misty Moore's father but also Misty Moore's role model.\n- Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.\n\nTherefore, the eldest relative that 'Barbara Fuentes' can trace back to in the context is Jennifer Garcia. The answer is: B", + ), + dict( + role='HUMAN', + prompt="Question: Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.Victor Dean considers Mary Gay as their grandfather.Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.David Hernandez is Jennifer Williams's mom. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jennifer Williams' can trace back to in the context?\nA. Marcus Miller\nB. Carlos Smith\nC. Mary Gay\nD. Victor Dean\n", + ), + dict( + role='BOT', + prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- David Hernandez is Jennifer Williams's mom.\n- Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.\n- Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.\n- Victor Dean considers Mary Gay as their grandfather. 
\n- Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.\n- Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.\n\nTherefore, the eldest relative that 'Jennifer Williams' can trace back to in the context is Marcus Miller. The answer is: A", + ), dict(role='HUMAN', prompt='Question: {question}'), ], }, @@ -25,8 +49,8 @@ few_shot_prompts = { # ----------------------- Prompt Settings ----------------------- # needle_num_list = list(range(2, 50, 1)) -names_path = './data/needlebench/names.json' - +path = 'opencompass/needlebench' +file_name = 'names.json' repeats = 10 # Use Zero-Shot or not @@ -48,49 +72,54 @@ single_choice_prompts = needlebench_prompts['single_choice_prompts'] for _name in list(single_choice_prompts.keys()): if with_few_shot: assert few_shot_samples > 0 and few_shot_samples <= 4 - single_choice_prompts[_name] = \ - single_choice_prompts[_name][- few_shot_samples * 2 - 1:] + single_choice_prompts[_name] = single_choice_prompts[_name][ + -few_shot_samples * 2 - 1 : + ] # ----------------------- Dataset Settings ----------------------- # needlebench_datasets = [] -needlebench_atc_reader_cfg = dict(input_columns=['question'], - output_column='answer') +needlebench_atc_reader_cfg = dict(input_columns=['question'], output_column='answer') for _name in list(single_choice_prompts.keys()): needlebench_atc_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, - template=dict( - round=(single_choice_prompts[_name])), + template=dict(round=(single_choice_prompts[_name])), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer,), + inferencer=dict( + type=GenInferencer, + ), ) needlebench_atc_eval_cfg = dict( evaluator=dict(type=CircularEvaluator), - pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'), + ) for num_needles in needle_num_list: - abbr = 
(f'NeedleBenchATCDataset-' - f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}') + abbr = ( + f'NeedleBenchATCDataset-' + f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}' + ) language = 'English' if 'en' in _name else 'Chinese' if 'reasoning' in _name: abbr += '-Reasoning' dataset_dict = { 'abbr': abbr, 'type': NeedleBenchATCDataset, - 'path': names_path, + 'path': path, + 'file_name':file_name, 'num_needles': num_needles, 'language': language, 'repeats': repeats, 'with_circular': with_circular_eval, 'reader_cfg': needlebench_atc_reader_cfg, 'infer_cfg': needlebench_atc_infer_cfg, - 'eval_cfg': needlebench_atc_eval_cfg + 'eval_cfg': needlebench_atc_eval_cfg, } needlebench_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/atc/atc_choice_80.py b/opencompass/configs/datasets/needlebench/atc/atc_choice_80.py index b4a935cb..c3ade6e0 100644 --- a/opencompass/configs/datasets/needlebench/atc/atc_choice_80.py +++ b/opencompass/configs/datasets/needlebench/atc/atc_choice_80.py @@ -30,7 +30,8 @@ for _name in list(single_choice_prompts.keys()): dataset_dict = { 'abbr': abbr, 'type': NeedleBenchATCDataset, - 'path': names_path, + 'path': path, + 'file_name': file_name, 'num_needles': num_needles, 'language': language, 'repeats': repeats, diff --git a/opencompass/configs/datasets/needlebench/atc/atc_choice_80_en_reasoning.py b/opencompass/configs/datasets/needlebench/atc/atc_choice_80_en_reasoning.py index f05a4bdd..a4906b1f 100644 --- a/opencompass/configs/datasets/needlebench/atc/atc_choice_80_en_reasoning.py +++ b/opencompass/configs/datasets/needlebench/atc/atc_choice_80_en_reasoning.py @@ -25,8 +25,8 @@ few_shot_prompts = { # ----------------------- Prompt Settings ----------------------- # needle_num_list = list(range(2, 80, 1)) -names_path = './data/needlebench/names.json' - +path = 'opencompass/needlebench' +file_name = 'names.json' repeats = 10 # Use Zero-Shot or not @@ -84,7 +84,8 @@ for _name in 
list(single_choice_prompts.keys()): dataset_dict = { 'abbr': abbr, 'type': NeedleBenchATCDataset, - 'path': names_path, + 'path': path, + 'file_name': file_name, 'num_needles': num_needles, 'language': language, 'repeats': repeats, diff --git a/opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_reasoning_1000k.py b/opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_reasoning_1000k.py index e31aa7ab..9b4c782b 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_reasoning_1000k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_reasoning_1000k.py @@ -41,22 +41,24 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchMultiEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000] depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] # ----------English Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needle_file_name = 'multi_needle_reasoning_en.json' @@ -69,7 +71,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -85,7 +87,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': 
needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_en_datasets.append(dataset_dict) @@ -96,7 +98,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -112,7 +114,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_en_datasets.append(dataset_dict) @@ -123,7 +125,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -139,7 +141,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_en_datasets.append(dataset_dict) @@ -150,7 +152,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -166,12 +168,12 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': 
needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_en_datasets.append(dataset_dict) # ----------Chinese Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['zh_finance.jsonl'] needle_file_name = 'multi_needle_reasoning_zh.json' @@ -184,7 +186,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -200,7 +202,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_zh_datasets.append(dataset_dict) @@ -211,7 +213,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -227,7 +229,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_zh_datasets.append(dataset_dict) @@ -238,7 +240,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': 
original_context_length, @@ -254,7 +256,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_zh_datasets.append(dataset_dict) @@ -265,7 +267,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -281,6 +283,6 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_retrieval_1000k.py b/opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_retrieval_1000k.py index 775e825f..63320d8f 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_retrieval_1000k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_retrieval_1000k.py @@ -41,22 +41,24 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchParallelEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]) 
document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ -64,8 +66,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_en_1000k', + 'abbr': f'Length{original_context_length}' f'_parallel_en_1000k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -79,7 +80,7 @@ for original_context_length in context_lengths: 'language': 'English', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -88,8 +89,7 @@ needlebench_zh_datasets = [] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_zh_1000k', + 'abbr': f'Length{original_context_length}' f'_parallel_zh_1000k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -103,6 +103,6 @@ for original_context_length in context_lengths: 'language': 'Chinese', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_single_1000k.py b/opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_single_1000k.py index 49a1a40b..29d435bb 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_single_1000k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_single_1000k.py @@ -41,21 +41,23 @@ needlebench_infer_cfg = dict( 
dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchOriginEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000] depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ -64,7 +66,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_en_1000k', + f'Depth{int(depth_percent)}_origin_en_1000k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -78,7 +80,7 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -90,7 +92,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_zh_1000k', + f'Depth{int(depth_percent)}_origin_zh_1000k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -104,6 +106,6 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': 
needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_128k/needlebench_multi_reasoning_128k.py b/opencompass/configs/datasets/needlebench/needlebench_128k/needlebench_multi_reasoning_128k.py index 44c47731..ef817f24 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_128k/needlebench_multi_reasoning_128k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_128k/needlebench_multi_reasoning_128k.py @@ -41,16 +41,18 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchMultiEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000]) depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] @@ -58,7 +60,7 @@ document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' # ----------English Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needle_file_name = 'multi_needle_reasoning_en.json' @@ -71,7 +73,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -87,7 +89,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': 
needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_en_datasets.append(dataset_dict) @@ -98,7 +100,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -114,7 +116,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_en_datasets.append(dataset_dict) @@ -125,7 +127,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -141,7 +143,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_en_datasets.append(dataset_dict) @@ -152,7 +154,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -168,12 +170,12 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': 
needlebench_eval_cfg, } needlebench_5needle_en_datasets.append(dataset_dict) # ----------Chinese Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['zh_finance.jsonl'] needle_file_name = 'multi_needle_reasoning_zh.json' @@ -186,7 +188,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -202,7 +204,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_zh_datasets.append(dataset_dict) @@ -213,7 +215,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -229,7 +231,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_zh_datasets.append(dataset_dict) @@ -240,7 +242,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -256,7 +258,7 @@ for original_context_length in 
context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_zh_datasets.append(dataset_dict) @@ -267,7 +269,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -283,6 +285,6 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_128k/needlebench_multi_retrieval_128k.py b/opencompass/configs/datasets/needlebench/needlebench_128k/needlebench_multi_retrieval_128k.py index b8d2f25f..c682441d 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_128k/needlebench_multi_retrieval_128k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_128k/needlebench_multi_retrieval_128k.py @@ -41,22 +41,24 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchParallelEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000]) document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' -base_path = 
'./data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ -64,8 +66,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_en_128k', + 'abbr': f'Length{original_context_length}' f'_parallel_en_128k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -79,7 +80,7 @@ for original_context_length in context_lengths: 'language': 'English', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -88,8 +89,7 @@ needlebench_zh_datasets = [] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_zh_128k', + 'abbr': f'Length{original_context_length}' f'_parallel_zh_128k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -103,6 +103,6 @@ for original_context_length in context_lengths: 'language': 'Chinese', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_128k/needlebench_single_128k.py b/opencompass/configs/datasets/needlebench/needlebench_128k/needlebench_single_128k.py index 3a6a7836..15b4746b 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_128k/needlebench_single_128k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_128k/needlebench_single_128k.py @@ -41,23 +41,25 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), 
retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchOriginEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000]) depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ -66,7 +68,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_en_128k', + f'Depth{int(depth_percent)}_origin_en_128k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -80,7 +82,7 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -92,7 +94,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_zh_128k', + f'Depth{int(depth_percent)}_origin_zh_128k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -106,6 +108,6 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } 
needlebench_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_200k/needlebench_multi_reasoning_200k.py b/opencompass/configs/datasets/needlebench/needlebench_200k/needlebench_multi_reasoning_200k.py index ba5741fe..a5504481 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_200k/needlebench_multi_reasoning_200k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_200k/needlebench_multi_reasoning_200k.py @@ -41,23 +41,25 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchMultiEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000]) context_lengths = [16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000] depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] # ----------English Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needle_file_name = 'multi_needle_reasoning_en.json' @@ -70,7 +72,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -86,7 +88,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg 
+ 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_en_datasets.append(dataset_dict) @@ -97,7 +99,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -113,7 +115,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_en_datasets.append(dataset_dict) @@ -124,7 +126,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -140,7 +142,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_en_datasets.append(dataset_dict) @@ -151,7 +153,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -167,12 +169,12 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } 
needlebench_5needle_en_datasets.append(dataset_dict) # ----------Chinese Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['zh_finance.jsonl'] needle_file_name = 'multi_needle_reasoning_zh.json' @@ -185,7 +187,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -201,7 +203,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_zh_datasets.append(dataset_dict) @@ -212,7 +214,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -228,7 +230,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_zh_datasets.append(dataset_dict) @@ -239,7 +241,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -255,7 +257,7 @@ for original_context_length in context_lengths: 'diff': 
diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_zh_datasets.append(dataset_dict) @@ -266,7 +268,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -282,6 +284,6 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_200k/needlebench_multi_retrieval_200k.py b/opencompass/configs/datasets/needlebench/needlebench_200k/needlebench_multi_retrieval_200k.py index 68e87ae9..9c7dadae 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_200k/needlebench_multi_retrieval_200k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_200k/needlebench_multi_retrieval_200k.py @@ -41,23 +41,25 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchParallelEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000]) context_lengths = list([16000, 48000, 80000, 112000, 128000, 144000, 176000, 
200000]) document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ -65,8 +67,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_en_200k', + 'abbr': f'Length{original_context_length}' f'_parallel_en_200k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -80,7 +81,7 @@ for original_context_length in context_lengths: 'language': 'English', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -89,8 +90,7 @@ needlebench_zh_datasets = [] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_zh_200k', + 'abbr': f'Length{original_context_length}' f'_parallel_zh_200k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -104,6 +104,6 @@ for original_context_length in context_lengths: 'language': 'Chinese', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_200k/needlebench_single_200k.py b/opencompass/configs/datasets/needlebench/needlebench_200k/needlebench_single_200k.py index d3badb6b..ca99ea5a 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_200k/needlebench_single_200k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_200k/needlebench_single_200k.py @@ -41,22 +41,24 @@ needlebench_infer_cfg = dict( 
dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchOriginEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000]) context_lengths = [16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000] depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ -65,7 +67,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_en_200k', + f'Depth{int(depth_percent)}_origin_en_200k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -79,7 +81,7 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -91,7 +93,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_zh_200k', + f'Depth{int(depth_percent)}_origin_zh_200k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -105,6 +107,6 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': 
needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_256k/needlebench_multi_reasoning_256k.py b/opencompass/configs/datasets/needlebench/needlebench_256k/needlebench_multi_reasoning_256k.py index 0a7a0dd6..47208d82 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_256k/needlebench_multi_reasoning_256k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_256k/needlebench_multi_reasoning_256k.py @@ -41,23 +41,25 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchMultiEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000]) context_lengths = [32000, 128000, 256000] depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] # ----------English Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needle_file_name = 'multi_needle_reasoning_en.json' @@ -70,7 +72,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -86,7 +88,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': 
needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_en_datasets.append(dataset_dict) @@ -97,7 +99,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -113,7 +115,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_en_datasets.append(dataset_dict) @@ -124,7 +126,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -140,7 +142,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_en_datasets.append(dataset_dict) @@ -151,7 +153,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -167,12 +169,12 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': 
needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_en_datasets.append(dataset_dict) # ----------Chinese Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['zh_finance.jsonl'] needle_file_name = 'multi_needle_reasoning_zh.json' @@ -185,7 +187,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -201,7 +203,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_zh_datasets.append(dataset_dict) @@ -212,7 +214,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -228,7 +230,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_zh_datasets.append(dataset_dict) @@ -239,7 +241,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': 
original_context_length, @@ -255,7 +257,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_zh_datasets.append(dataset_dict) @@ -266,7 +268,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -282,6 +284,6 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_256k/needlebench_multi_retrieval_256k.py b/opencompass/configs/datasets/needlebench/needlebench_256k/needlebench_multi_retrieval_256k.py index 8078d596..d8b4153e 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_256k/needlebench_multi_retrieval_256k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_256k/needlebench_multi_retrieval_256k.py @@ -41,23 +41,25 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchParallelEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 
176000, 192000, 200000]) context_lengths = [32000, 128000, 256000] document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ -65,8 +67,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_en_256k', + 'abbr': f'Length{original_context_length}' f'_parallel_en_256k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -80,7 +81,7 @@ for original_context_length in context_lengths: 'language': 'English', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -89,8 +90,7 @@ needlebench_zh_datasets = [] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_zh_256k', + 'abbr': f'Length{original_context_length}' f'_parallel_zh_256k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -104,6 +104,6 @@ for original_context_length in context_lengths: 'language': 'Chinese', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_256k/needlebench_single_256k.py b/opencompass/configs/datasets/needlebench/needlebench_256k/needlebench_single_256k.py index 88652fbf..4c8ec0e0 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_256k/needlebench_single_256k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_256k/needlebench_single_256k.py 
@@ -41,22 +41,24 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchOriginEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000]) context_lengths = [32000, 128000, 256000] depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ -65,7 +67,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_en_256k', + f'Depth{int(depth_percent)}_origin_en_256k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -79,7 +81,7 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -91,7 +93,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_zh_256k', + f'Depth{int(depth_percent)}_origin_zh_256k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -105,6 +107,6 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 
'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_32k/needlebench_multi_reasoning_32k.py b/opencompass/configs/datasets/needlebench/needlebench_32k/needlebench_multi_reasoning_32k.py index 143b1cc0..ea8428fb 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_32k/needlebench_multi_reasoning_32k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_32k/needlebench_multi_reasoning_32k.py @@ -41,16 +41,18 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchMultiEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list([9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000]) depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] @@ -58,7 +60,7 @@ document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' # ----------English Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needle_file_name = 'multi_needle_reasoning_en.json' @@ -71,7 +73,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -87,7 +89,7 @@ for original_context_length in context_lengths: 
'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_en_datasets.append(dataset_dict) @@ -98,7 +100,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -114,7 +116,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_en_datasets.append(dataset_dict) @@ -125,7 +127,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -141,7 +143,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_en_datasets.append(dataset_dict) @@ -152,7 +154,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -168,12 +170,12 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 
'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_en_datasets.append(dataset_dict) # ----------Chinese Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['zh_finance.jsonl'] needle_file_name = 'multi_needle_reasoning_zh.json' @@ -186,7 +188,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -202,7 +204,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_zh_datasets.append(dataset_dict) @@ -213,7 +215,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -229,7 +231,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_zh_datasets.append(dataset_dict) @@ -240,7 +242,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': 
original_context_length, @@ -256,7 +258,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_zh_datasets.append(dataset_dict) @@ -267,7 +269,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -283,6 +285,6 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_32k/needlebench_multi_retrieval_32k.py b/opencompass/configs/datasets/needlebench/needlebench_32k/needlebench_multi_retrieval_32k.py index ea5122f5..19d90b6e 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_32k/needlebench_multi_retrieval_32k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_32k/needlebench_multi_retrieval_32k.py @@ -41,22 +41,24 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchParallelEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list([9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000]) document_depth_percent_intervals = 
20 document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ -64,8 +66,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_en_32k', + 'abbr': f'Length{original_context_length}' f'_parallel_en_32k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -79,7 +80,7 @@ for original_context_length in context_lengths: 'language': 'English', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -88,8 +89,7 @@ needlebench_zh_datasets = [] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_zh_32k', + 'abbr': f'Length{original_context_length}' f'_parallel_zh_32k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -103,6 +103,6 @@ for original_context_length in context_lengths: 'language': 'Chinese', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_32k/needlebench_single_32k.py b/opencompass/configs/datasets/needlebench/needlebench_32k/needlebench_single_32k.py index 1a6be8c0..d5440e7a 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_32k/needlebench_single_32k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_32k/needlebench_single_32k.py @@ -41,23 +41,25 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', 
prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchOriginEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list([9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000]) depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' @@ -66,7 +68,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_en_32k', + f'Depth{int(depth_percent)}_origin_en_32k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -80,7 +82,7 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -92,7 +94,7 @@ for original_context_length in context_lengths: for depth_percent in depths_list: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_zh_32k', + f'Depth{int(depth_percent)}_origin_zh_32k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -106,6 +108,6 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': 
needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_4k/needlebench_multi_reasoning_4k.py b/opencompass/configs/datasets/needlebench/needlebench_4k/needlebench_multi_reasoning_4k.py index bb813f15..0912d1c0 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_4k/needlebench_multi_reasoning_4k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_4k/needlebench_multi_reasoning_4k.py @@ -41,23 +41,25 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchMultiEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list(range(1000, 5000, 1000)) document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' # ----------English Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needle_file_name = 'multi_needle_reasoning_en.json' @@ -68,11 +70,11 @@ language = 'English' for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -88,7 +90,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 
'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_en_datasets.append(dataset_dict) @@ -97,11 +99,11 @@ needlebench_3needle_en_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -117,7 +119,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_en_datasets.append(dataset_dict) @@ -126,11 +128,11 @@ needlebench_4needle_en_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -146,7 +148,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_en_datasets.append(dataset_dict) @@ -155,11 +157,11 @@ needlebench_5needle_en_datasets = [] for original_context_length in context_lengths: for depth_percent in 
generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -175,12 +177,12 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_en_datasets.append(dataset_dict) # ----------Chinese Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['zh_finance.jsonl'] needle_file_name = 'multi_needle_reasoning_zh.json' @@ -191,11 +193,11 @@ language = 'Chinese' for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -211,7 +213,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_zh_datasets.append(dataset_dict) @@ -220,11 +222,11 @@ needlebench_3needle_zh_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + 
document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -240,7 +242,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_zh_datasets.append(dataset_dict) @@ -249,11 +251,11 @@ needlebench_4needle_zh_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -269,7 +271,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_zh_datasets.append(dataset_dict) @@ -278,11 +280,11 @@ needlebench_5needle_zh_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', 'type': NeedleBenchMultiDataset, 'path': 
base_path, 'length': original_context_length, @@ -298,6 +300,6 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_4k/needlebench_multi_retrieval_4k.py b/opencompass/configs/datasets/needlebench/needlebench_4k/needlebench_multi_retrieval_4k.py index d29aa40e..b3c85b8b 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_4k/needlebench_multi_retrieval_4k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_4k/needlebench_multi_retrieval_4k.py @@ -41,34 +41,35 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchParallelEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list(range(1000, 5000, 1000)) document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' depths_float = generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type) + document_depth_percent_intervals, document_depth_percent_interval_type +) depths = [int(depth) for depth in depths_float] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_en_4k', + 'abbr': f'Length{original_context_length}' f'_parallel_en_4k', 
'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -82,7 +83,7 @@ for original_context_length in context_lengths: 'language': 'English', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -91,8 +92,7 @@ needlebench_zh_datasets = [] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_zh_4k', + 'abbr': f'Length{original_context_length}' f'_parallel_zh_4k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -106,6 +106,6 @@ for original_context_length in context_lengths: 'language': 'Chinese', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_4k/needlebench_single_4k.py b/opencompass/configs/datasets/needlebench/needlebench_4k/needlebench_single_4k.py index f8e03b52..095e1b33 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_4k/needlebench_single_4k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_4k/needlebench_single_4k.py @@ -41,33 +41,35 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchOriginEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list(range(1000, 5000, 1000)) document_depth_percent_intervals = 20 document_depth_percent_interval_type 
= 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_en_4k', + f'Depth{int(depth_percent)}_origin_en_4k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -81,7 +83,7 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -91,11 +93,11 @@ needle_file_name = 'needles.jsonl' for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_zh_4k', + f'Depth{int(depth_percent)}_origin_zh_4k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -109,6 +111,6 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_multi_reasoning_8k.py 
b/opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_multi_reasoning_8k.py index f43bbb06..3b58b86f 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_multi_reasoning_8k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_multi_reasoning_8k.py @@ -41,23 +41,25 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchMultiEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list(range(5000, 9000, 1000)) document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' # ----------English Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needle_file_name = 'multi_needle_reasoning_en.json' @@ -68,11 +70,11 @@ language = 'English' for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -88,7 +90,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_en_datasets.append(dataset_dict) @@ -97,11 +99,11 
@@ needlebench_3needle_en_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -117,7 +119,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_en_datasets.append(dataset_dict) @@ -126,11 +128,11 @@ needlebench_4needle_en_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -146,7 +148,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_en_datasets.append(dataset_dict) @@ -155,11 +157,11 @@ needlebench_5needle_en_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): 
dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', + f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -175,12 +177,12 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_en_datasets.append(dataset_dict) # ----------Chinese Version---------- -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['zh_finance.jsonl'] needle_file_name = 'multi_needle_reasoning_zh.json' @@ -191,11 +193,11 @@ language = 'Chinese' for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -211,7 +213,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_2needle_zh_datasets.append(dataset_dict) @@ -220,11 +222,11 @@ needlebench_3needle_zh_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - 
f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -240,7 +242,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_3needle_zh_datasets.append(dataset_dict) @@ -249,11 +251,11 @@ needlebench_4needle_zh_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -269,7 +271,7 @@ for original_context_length in context_lengths: 'diff': diff, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_4needle_zh_datasets.append(dataset_dict) @@ -278,11 +280,11 @@ needlebench_5needle_zh_datasets = [] for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', + f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', 'type': NeedleBenchMultiDataset, 'path': base_path, 'length': original_context_length, @@ -298,6 +300,6 @@ for original_context_length in context_lengths: 'diff': diff, 
'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_5needle_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_8k.py b/opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_8k.py index 5cbea895..a7bf0e28 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_8k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_8k.py @@ -41,34 +41,35 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchParallelEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list(range(5000, 9000, 1000)) document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' depths_float = generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type) + document_depth_percent_intervals, document_depth_percent_interval_type +) depths = [int(depth) for depth in depths_float] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_en_8k', + 'abbr': f'Length{original_context_length}' f'_parallel_en_8k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -82,7 +83,7 @@ for 
original_context_length in context_lengths: 'language': 'English', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -91,8 +92,7 @@ needlebench_zh_datasets = [] for original_context_length in context_lengths: dataset_dict = { - 'abbr': f'Length{original_context_length}' - f'_parallel_zh_8k', + 'abbr': f'Length{original_context_length}' f'_parallel_zh_8k', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -106,6 +106,6 @@ for original_context_length in context_lengths: 'language': 'Chinese', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_compare_batch_8k.py b/opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_compare_batch_8k.py index 1bc4442f..67c2cf8c 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_compare_batch_8k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_compare_batch_8k.py @@ -41,36 +41,38 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchParallelEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list(range(5000, 9000, 1000)) document_depth_percent_intervals_list = [1, 5, 10, 15, 20] document_depth_percent_interval_type 
= 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' for document_depth_percent_intervals in document_depth_percent_intervals_list: depths_float = generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type) + document_depth_percent_intervals, document_depth_percent_interval_type + ) depths = [int(depth) for depth in depths_float] for original_context_length in context_lengths: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'_parallel_en_8k_batch{document_depth_percent_intervals}', + f'_parallel_en_8k_batch{document_depth_percent_intervals}', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -84,7 +86,7 @@ for document_depth_percent_intervals in document_depth_percent_intervals_list: 'language': 'English', 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -94,14 +96,14 @@ needle_file_name = 'needles.jsonl' for document_depth_percent_intervals in document_depth_percent_intervals_list: depths_float = generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type) + document_depth_percent_intervals, document_depth_percent_interval_type + ) depths = [int(depth) for depth in depths_float] for original_context_length in context_lengths: dataset_dict = { 'abbr': f'Length{original_context_length}' - f'_parallel_zh_8k_batch{document_depth_percent_intervals}', + f'_parallel_zh_8k_batch{document_depth_percent_intervals}', 'type': NeedleBenchParallelDataset, 'path': base_path, 'needle_file_name': needle_file_name, @@ -115,6 +117,6 @@ for document_depth_percent_intervals in document_depth_percent_intervals_list: 'language': 'Chinese', 'reader_cfg': 
needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_single_8k.py b/opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_single_8k.py index 368a0f54..9564a3cc 100644 --- a/opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_single_8k.py +++ b/opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_single_8k.py @@ -41,33 +41,35 @@ needlebench_infer_cfg = dict( dict(role='HUMAN', prompt='{prompt}'), dict(role='BOT', prompt='{answer}\n'), ] - ) ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) needlebench_eval_cfg = dict( evaluator=dict(type=NeedleBenchOriginEvaluator), pred_postprocessor=dict(type=needlebench_postprocess), dataset_postprocessor=dict(type=needlebench_dataset_postprocess), - pred_role='BOT') + pred_role='BOT', +) context_lengths = list(range(5000, 9000, 1000)) document_depth_percent_intervals = 20 document_depth_percent_interval_type = 'linear' -base_path = './data/needlebench' +base_path = 'opencompass/needlebench' file_list = ['PaulGrahamEssays.jsonl'] needlebench_en_datasets = [] needle_file_name = 'needles.jsonl' for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_en_8k', + f'Depth{int(depth_percent)}_origin_en_8k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -81,7 +83,7 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 
'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_en_datasets.append(dataset_dict) @@ -91,11 +93,11 @@ needle_file_name = 'needles.jsonl' for original_context_length in context_lengths: for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): + document_depth_percent_intervals, document_depth_percent_interval_type + ): dataset_dict = { 'abbr': f'Length{original_context_length}' - f'Depth{int(depth_percent)}_origin_zh_8k', + f'Depth{int(depth_percent)}_origin_zh_8k', 'type': NeedleBenchOriginDataset, 'path': base_path, 'length': original_context_length, @@ -109,6 +111,6 @@ for original_context_length in context_lengths: 'needle_file_name': needle_file_name, 'reader_cfg': needlebench_reader_cfg, 'infer_cfg': needlebench_infer_cfg, - 'eval_cfg': needlebench_eval_cfg + 'eval_cfg': needlebench_eval_cfg, } needlebench_zh_datasets.append(dataset_dict) diff --git a/opencompass/configs/summarizers/needlebench.py b/opencompass/configs/summarizers/needlebench.py index 69671db0..d1aeb984 100644 --- a/opencompass/configs/summarizers/needlebench.py +++ b/opencompass/configs/summarizers/needlebench.py @@ -23,9 +23,9 @@ def create_m_rs_names_list(context_lengths, depths, needle_counts, multi_needle_en_list.extend(names_list) elif language == 'zh': multi_needle_zh_list.extend(names_list) - names_dict['Multi-Needle-Reasoning(M-RS)'] = multi_needle_list - names_dict['Multi-Needle-Reasoning-EN'] = multi_needle_en_list - names_dict['Multi-Needle-Reasoning-ZH'] = multi_needle_zh_list + names_dict[f'Multi-Needle-Reasoning(M-RS)-{dataset_size.upper()}'] = multi_needle_list + names_dict[f'Multi-Needle-Reasoning-EN-{dataset_size.upper()}'] = multi_needle_en_list + names_dict[f'Multi-Needle-Reasoning-ZH-{dataset_size.upper()}'] = multi_needle_zh_list return names_dict @@ -56,9 +56,9 @@ def create_summarizer(context_lengths, depths, dataset_size, 
single_needle_en_list.extend(names_list) elif language == 'zh': single_needle_zh_list.extend(names_list) - names_dict['Single-Needle-Retrieval(S-RT)'] = single_needle_list - names_dict['Single-Needle-Retrieval-EN'] = single_needle_en_list - names_dict['Single-Needle-Retrieval-ZH'] = single_needle_zh_list + names_dict[f'Single-Needle-Retrieval(S-RT)-{dataset_size.upper()}'] = single_needle_list + names_dict[f'Single-Needle-Retrieval-EN-{dataset_size.upper()}'] = single_needle_en_list + names_dict[f'Single-Needle-Retrieval-ZH-{dataset_size.upper()}'] = single_needle_zh_list parallel_list = [] parallel_en_list = [] @@ -74,39 +74,39 @@ def create_summarizer(context_lengths, depths, dataset_size, parallel_en_list.extend(names_list) elif language == 'zh': parallel_zh_list.extend(names_list) - names_dict['Multi-Needle-Retrieval(M-RT)'] = parallel_list - names_dict['Multi-Needle-Retrieval-EN'] = parallel_en_list - names_dict['Multi-Needle-Retrieval-ZH'] = parallel_zh_list + names_dict[f'Multi-Needle-Retrieval(M-RT)-{dataset_size.upper()}'] = parallel_list + names_dict[f'Multi-Needle-Retrieval-EN-{dataset_size.upper()}'] = parallel_en_list + names_dict[f'Multi-Needle-Retrieval-ZH-{dataset_size.upper()}'] = parallel_zh_list summary_groups = [ {'name': key, 'subsets': value} for key, value in names_dict.items() ] summary_groups.append({ - 'name': 'NeedleBench-Overall-Score', - 'subsets': [['Single-Needle-Retrieval(S-RT)', 'naive_average'], - ['Multi-Needle-Reasoning(M-RS)', 'naive_average'], - ['Multi-Needle-Retrieval(M-RT)', 'average_score']], - 'weights': {'Single-Needle-Retrieval(S-RT)': 0.4, - 'Multi-Needle-Reasoning(M-RS)': 0.3, - 'Multi-Needle-Retrieval(M-RT)': 0.3}}) + 'name': f'NeedleBench-Overall-Score-{dataset_size.upper()}', + 'subsets': [[f'Single-Needle-Retrieval(S-RT)-{dataset_size.upper()}', 'naive_average'], + [f'Multi-Needle-Reasoning(M-RS)-{dataset_size.upper()}', 'naive_average'], + [f'Multi-Needle-Retrieval(M-RT)-{dataset_size.upper()}', 'average_score']], 
+ 'weights': {f'Single-Needle-Retrieval(S-RT)-{dataset_size.upper()}': 0.4, + f'Multi-Needle-Reasoning(M-RS)-{dataset_size.upper()}': 0.3, + f'Multi-Needle-Retrieval(M-RT)-{dataset_size.upper()}': 0.3}}) summarizer_config = { 'type': NeedleBenchSummarizer, 'summary_groups': summary_groups, 'dataset_abbrs': [ - 'NeedleBench-Overall-Score', + f'NeedleBench-Overall-Score-{dataset_size.upper()}', f'--------- NeedleBench-{dataset_size.upper()}-Single-Needle-Retrieval ---------', - 'Single-Needle-Retrieval(S-RT)', - 'Single-Needle-Retrieval-EN', - 'Single-Needle-Retrieval-ZH', + f'Single-Needle-Retrieval(S-RT)-{dataset_size.upper()}', + f'Single-Needle-Retrieval-EN-{dataset_size.upper()}', + f'Single-Needle-Retrieval-ZH-{dataset_size.upper()}', f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Retrieval ---------', - 'Multi-Needle-Retrieval(M-RT)', - 'Multi-Needle-Retrieval-EN', - 'Multi-Needle-Retrieval-ZH', + f'Multi-Needle-Retrieval(M-RT)-{dataset_size.upper()}', + f'Multi-Needle-Retrieval-EN-{dataset_size.upper()}', + f'Multi-Needle-Retrieval-ZH-{dataset_size.upper()}', f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Reasoning ---------', - 'Multi-Needle-Reasoning(M-RS)', - 'Multi-Needle-Reasoning-EN', - 'Multi-Needle-Reasoning-ZH', + f'Multi-Needle-Reasoning(M-RS)-{dataset_size.upper()}', + f'Multi-Needle-Reasoning-EN-{dataset_size.upper()}', + f'Multi-Needle-Reasoning-ZH-{dataset_size.upper()}', f'2-Needle-EN-{dataset_size.upper()}', f'2-Needle-ZH-{dataset_size.upper()}', f'3-Needle-EN-{dataset_size.upper()}', diff --git a/opencompass/datasets/needlebench/atc.py b/opencompass/datasets/needlebench/atc.py index e7a95747..c715b19b 100644 --- a/opencompass/datasets/needlebench/atc.py +++ b/opencompass/datasets/needlebench/atc.py @@ -1,11 +1,13 @@ # flake8: noqa import json +import os import random from datasets import Dataset from opencompass.datasets.base import BaseDataset from opencompass.registry import LOAD_DATASET +from opencompass.utils 
import get_data_path @LOAD_DATASET.register_module() @@ -14,13 +16,20 @@ class NeedleBenchATCDataset(BaseDataset): @staticmethod def load( path, + file_name: str, num_needles: int, language: str, repeats: int, ): data = {'prompt': [], 'answer': []} + path = get_data_path(path) + if os.environ.get('DATASET_SOURCE') == 'HF': + from huggingface_hub import snapshot_download - with open(path, 'r', encoding='utf-8') as file: + path = snapshot_download(repo_id=path, repo_type='dataset') + file_path = os.path.join(path, file_name) + + with open(file_path, 'r', encoding='utf-8') as file: names_data = json.load(file) all_names = names_data[language].split(',') @@ -30,7 +39,16 @@ class NeedleBenchATCDataset(BaseDataset): if language == 'Chinese': relationship_terms = [ - '父亲', '母亲', '爸爸', '妈妈', '爷爷', '奶奶', '姥姥', '姥爷', '外公', '外婆' + '父亲', + '母亲', + '爸爸', + '妈妈', + '爷爷', + '奶奶', + '姥姥', + '姥爷', + '外公', + '外婆', ] relationship_templates = [ @@ -46,10 +64,16 @@ class NeedleBenchATCDataset(BaseDataset): elif language == 'English': relationship_terms = [ - 'father', 'mother', 'dad', 'mom', 'grandfather', - 'grandmother', 'maternal grandmother', - 'maternal grandfather', 'paternal grandfather', - 'paternal grandmother' + 'father', + 'mother', + 'dad', + 'mom', + 'grandfather', + 'grandmother', + 'maternal grandmother', + 'maternal grandfather', + 'paternal grandfather', + 'paternal grandmother', ] relationship_templates = [ @@ -96,21 +120,20 @@ class NeedleBenchATCDataset(BaseDataset): # Generating the prompt based on the language if language == 'Chinese': - prompt = (f""" + prompt = f""" 在上面提供的打乱的家族关系文本中,'{last_person}'的能够向上追溯到的最年长的亲人是谁? 
例如: 例子1.如果张强的父亲是马克,除此以外提供的文本中没有更多关于亲属关系的信息,那么在提供的文本中张强能够向上追溯到的最年长的亲人就是马克。 例子2.如果李明的姥姥是张红,而张红的父亲是张强,除此以外提供的文本中没有更多关于亲属关系的信息,那么在提供的文本中李明能够向上追溯到的最年长的亲人就是张强。 例子3.如果小明是张红的曾孙女,张红的祖母是王华,王华的父亲是王刚,除此以外提供的文本中没有更多关于亲属关系的信息,那么小明能够向上追溯到的最年长的亲人就是王刚。 -""") +""" elif language == 'English': - prompt = (f""" + prompt = f""" Given the scrambled family relationships described above, who is the eldest relative that '{last_person}' can trace back to in the context? For example: Example 1: If Zhang Qiang's father is Mark, and no further information about familial relationships is provided in the text, then the oldest relative Zhang Qiang can trace back to in the provided text is Mark. Example 2: If Li Ming's grandmother is Zhang Hong, and Zhang Hong's father is Zhang Qiang, and no further information about familial relationships is provided in the text, then the oldest relative Li Ming can trace back to in the provided text is Zhang Qiang. Example 3: If Xiao Ming is Zhang Hong's great-granddaughter, Zhang Hong's grandmother is Wang Hua, and Wang Hua's father is Wang Gang, and no further information about familial relationships is provided in the text, then the oldest relative Xiao Ming can trace back to in the provided text is Wang Gang.""" - ) else: prompt = 'Language not supported.' raise Exception('Unsupported language specified. 
' @@ -135,13 +158,20 @@ class NeedleBenchATCOrderedDataset(BaseDataset): @staticmethod def load( path, + file_name, num_needles: int, language: str, repeats: int, ): data = {'prompt': [], 'answer': []} + path = get_data_path(path) + if os.environ.get('DATASET_SOURCE') == 'HF': + from huggingface_hub import snapshot_download - with open(path, 'r', encoding='utf-8') as file: + path = snapshot_download(repo_id=path, repo_type='dataset') + file_path = os.path.join(path, file_name) + + with open(file_path, 'r', encoding='utf-8') as file: names_data = json.load(file) all_names = names_data[language].split(',') @@ -151,7 +181,16 @@ class NeedleBenchATCOrderedDataset(BaseDataset): if language == 'Chinese': relationship_terms = [ - '父亲', '母亲', '爸爸', '妈妈', '爷爷', '奶奶', '姥姥', '姥爷', '外公', '外婆' + '父亲', + '母亲', + '爸爸', + '妈妈', + '爷爷', + '奶奶', + '姥姥', + '姥爷', + '外公', + '外婆', ] relationship_templates = [ @@ -167,10 +206,16 @@ class NeedleBenchATCOrderedDataset(BaseDataset): elif language == 'English': relationship_terms = [ - 'father', 'mother', 'dad', 'mom', 'grandfather', - 'grandmother', 'maternal grandmother', - 'maternal grandfather', 'paternal grandfather', - 'paternal grandmother' + 'father', + 'mother', + 'dad', + 'mom', + 'grandfather', + 'grandmother', + 'maternal grandmother', + 'maternal grandfather', + 'paternal grandfather', + 'paternal grandmother', ] relationship_templates = [ @@ -214,21 +259,20 @@ class NeedleBenchATCOrderedDataset(BaseDataset): # Generating the prompt based on the language if language == 'Chinese': - prompt = (f""" + prompt = f""" 在上面提供的打乱的家族关系文本中,'{last_person}'的能够向上追溯到的最年长的亲人是谁? 
例如: 例子1.如果张强的父亲是马克,除此以外提供的文本中没有更多关于亲属关系的信息,那么在提供的文本中张强能够向上追溯到的最年长的亲人就是马克。 例子2.如果李明的姥姥是张红,而张红的父亲是张强,除此以外提供的文本中没有更多关于亲属关系的信息,那么在提供的文本中李明能够向上追溯到的最年长的亲人就是张强。 例子3.如果小明是张红的曾孙女,张红的祖母是王华,王华的父亲是王刚,除此以外提供的文本中没有更多关于亲属关系的信息,那么小明能够向上追溯到的最年长的亲人就是王刚。 -""") +""" elif language == 'English': - prompt = (f""" + prompt = f""" Given the scrambled family relationships described above, who is the eldest relative that '{last_person}' can trace back to in the context? For example: Example 1: If Zhang Qiang's father is Mark, and no further information about familial relationships is provided in the text, then the oldest relative Zhang Qiang can trace back to in the provided text is Mark. Example 2: If Li Ming's grandmother is Zhang Hong, and Zhang Hong's father is Zhang Qiang, and no further information about familial relationships is provided in the text, then the oldest relative Li Ming can trace back to in the provided text is Zhang Qiang. Example 3: If Xiao Ming is Zhang Hong's great-granddaughter, Zhang Hong's grandmother is Wang Hua, and Wang Hua's father is Wang Gang, and no further information about familial relationships is provided in the text, then the oldest relative Xiao Ming can trace back to in the provided text is Wang Gang.""" - ) else: prompt = 'Language not supported.' raise Exception('Unsupported language specified. 
' diff --git a/opencompass/datasets/needlebench/atc_choice.py b/opencompass/datasets/needlebench/atc_choice.py index af1d668e..1f586035 100644 --- a/opencompass/datasets/needlebench/atc_choice.py +++ b/opencompass/datasets/needlebench/atc_choice.py @@ -1,11 +1,13 @@ # flake8: noqa import copy import json +import os import random from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -46,11 +48,14 @@ def get_circular_example(entry, id): class NeedleBenchATCDataset(BaseDataset): @staticmethod - def load(path: str, - num_needles: int, - language: str, - repeats: int, - with_circular: bool = True): + def load( + path: str, + file_name: str, + num_needles: int, + language: str, + repeats: int, + with_circular: bool = True, + ): """NeedleBenthATC Dataset. Args: @@ -61,8 +66,14 @@ class NeedleBenchATCDataset(BaseDataset): """ data = [] entry = {} + path = get_data_path(path) + if os.environ.get('DATASET_SOURCE') == 'HF': + from huggingface_hub import snapshot_download - with open(path, 'r', encoding='utf-8') as file: + path = snapshot_download(repo_id=path, repo_type='dataset') + file_path = os.path.join(path, file_name) + + with open(file_path, 'r', encoding='utf-8') as file: names_data = json.load(file) all_names = names_data[language].split(',') @@ -73,7 +84,16 @@ class NeedleBenchATCDataset(BaseDataset): if language == 'Chinese': relationship_terms = [ - '父亲', '母亲', '爸爸', '妈妈', '爷爷', '奶奶', '姥姥', '姥爷', '外公', '外婆' + '父亲', + '母亲', + '爸爸', + '妈妈', + '爷爷', + '奶奶', + '姥姥', + '姥爷', + '外公', + '外婆', ] relationship_templates = [ @@ -89,10 +109,16 @@ class NeedleBenchATCDataset(BaseDataset): elif language == 'English': relationship_terms = [ - 'father', 'mother', 'dad', 'mom', 'grandfather', - 'grandmother', 'maternal grandmother', - 'maternal grandfather', 'paternal grandfather', - 'paternal grandmother' + 'father', + 'mother', + 'dad', + 'mom', + 'grandfather', + 'grandmother', + 
'maternal grandmother', + 'maternal grandfather', + 'paternal grandfather', + 'paternal grandmother', ] relationship_templates = [ @@ -139,12 +165,11 @@ class NeedleBenchATCDataset(BaseDataset): # Generating the prompt based on the language if language == 'Chinese': - prompt = (f""" -在上面提供的打乱的家族关系文本中,'{last_person}'的能够向上追溯到的最年长的亲人是谁?""") + prompt = f""" +在上面提供的打乱的家族关系文本中,'{last_person}'的能够向上追溯到的最年长的亲人是谁?""" elif language == 'English': - prompt = (f""" + prompt = f""" Given the scrambled family relationships described above, who is the eldest relative that '{last_person}' can trace back to in the context?""" - ) else: prompt = 'Language not supported.' raise Exception('Unsupported language specified. ' @@ -158,7 +183,8 @@ Given the scrambled family relationships described above, who is the eldest rela additional_names_needed = max(4 - len(names), 0) additional_names = random.sample( [name for name in all_names if name not in names], - additional_names_needed) + additional_names_needed, + ) names.extend(additional_names) entry['options'] = names[0:4] diff --git a/opencompass/datasets/needlebench/multi.py b/opencompass/datasets/needlebench/multi.py index cbb96b1c..223fc216 100644 --- a/opencompass/datasets/needlebench/multi.py +++ b/opencompass/datasets/needlebench/multi.py @@ -4,11 +4,11 @@ import random import tiktoken from datasets import Dataset -from huggingface_hub import hf_hub_download from opencompass.datasets.base import BaseDataset from opencompass.openicl import BaseEvaluator from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path def get_random_needles(counter, file_path, needle_count): @@ -37,7 +37,7 @@ class NeedleBenchMultiDataset(BaseDataset): @staticmethod def load( - path: str, # depreciated + path: str, length: int, depth: int, tokenizer_model: str, @@ -152,25 +152,21 @@ class NeedleBenchMultiDataset(BaseDataset): return prompt - repo_id = 'opencompass/NeedleBench' file_names = [ 'PaulGrahamEssays.jsonl', 
'multi_needle_reasoning_en.json', 'multi_needle_reasoning_zh.json', 'zh_finance.jsonl', 'zh_game.jsonl', 'zh_general.jsonl', 'zh_government.jsonl', 'zh_movie.jsonl', 'zh_tech.jsonl' ] - downloaded_files = [] - base_file_path = '' - for file_name in file_names: - file_path = hf_hub_download(repo_id=repo_id, - filename=file_name, - repo_type='dataset') - downloaded_files.append(file_path) - base_file_path = '/'.join(file_path.split('/')[:-1]) + path = get_data_path(path) + if os.environ.get('DATASET_SOURCE') == 'HF': + from huggingface_hub import snapshot_download + path = snapshot_download(repo_id=path, repo_type='dataset') + needle_file_path = os.path.join(path, needle_file_name) - needle_file_path = os.path.join(base_file_path, needle_file_name) - for file_path in downloaded_files: - if file_path.split('/')[-1] not in file_list: + for file_name in file_names: + file_path = os.path.join(path, file_name) + if file_name not in file_list: continue with open(file_path, 'r', encoding='utf-8') as f: diff --git a/opencompass/datasets/needlebench/origin.py b/opencompass/datasets/needlebench/origin.py index da3e6cc8..3359eae7 100644 --- a/opencompass/datasets/needlebench/origin.py +++ b/opencompass/datasets/needlebench/origin.py @@ -5,11 +5,11 @@ import re import tiktoken from datasets import Dataset -from huggingface_hub import hf_hub_download from opencompass.datasets.base import BaseDataset from opencompass.openicl import BaseEvaluator from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path def get_random_line_by_language(counter, file_path, language): @@ -36,7 +36,7 @@ class NeedleBenchOriginDataset(BaseDataset): @staticmethod def load( - path: str, # depreciated + path: str, length: int, depth: int, tokenizer_model: str, @@ -128,33 +128,29 @@ class NeedleBenchOriginDataset(BaseDataset): return prompt - repo_id = 'opencompass/NeedleBench' file_names = [ - 'PaulGrahamEssays.jsonl', 'needles.jsonl', 
'zh_finance.jsonl', + 'PaulGrahamEssays.jsonl', 'multi_needle_reasoning_en.json', + 'multi_needle_reasoning_zh.json', 'zh_finance.jsonl', 'zh_game.jsonl', 'zh_general.jsonl', 'zh_government.jsonl', 'zh_movie.jsonl', 'zh_tech.jsonl' ] + path = get_data_path(path) + if os.environ.get('DATASET_SOURCE') == 'HF': + from huggingface_hub import snapshot_download + path = snapshot_download(repo_id=path, repo_type='dataset') + needle_file_path = os.path.join(path, needle_file_name) - downloaded_files = [] - base_file_path = '' for file_name in file_names: - file_path = hf_hub_download(repo_id=repo_id, - filename=file_name, - repo_type='dataset') - downloaded_files.append(file_path) - base_file_path = '/'.join(file_path.split('/')[:-1]) - - for file_path in downloaded_files: - if file_path.split('/')[-1] not in file_list: + file_path = os.path.join(path, file_name) + if file_name not in file_list: continue + with open(file_path, 'r', encoding='utf-8') as f: lines_bak = [json.loads(line.strip()) for line in f] lines = lines_bak.copy() for counter in range(num_repeats_per_file): random.seed(counter) random.shuffle(lines) - needle_file_path = os.path.join(base_file_path, - needle_file_name) random_needle = get_random_line_by_language( counter, needle_file_path, language) needle = '\n' + random_needle['needle'] + '\n' diff --git a/opencompass/datasets/needlebench/parallel.py b/opencompass/datasets/needlebench/parallel.py index 59c971f1..8158afbb 100644 --- a/opencompass/datasets/needlebench/parallel.py +++ b/opencompass/datasets/needlebench/parallel.py @@ -1,21 +1,24 @@ import json +import os import random import tiktoken from datasets import Dataset -from huggingface_hub import hf_hub_download from opencompass.datasets.base import BaseDataset from opencompass.openicl import BaseEvaluator from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path -def get_unique_entries(file_path, - n, - language, - unique_arg1=False, - unique_arg2=False, - 
unique_combination=False): +def get_unique_entries( + file_path, + n, + language, + unique_arg1=False, + unique_arg2=False, + unique_combination=False, +): seen_arg1 = set() seen_arg2 = set() seen_combinations = set() @@ -38,9 +41,11 @@ def get_unique_entries(file_path, key2 = entry.get('arg2', '') if unique_arg2 else '' combination = (key1, key2) if unique_combination else '' - if (key1 not in seen_arg1 or not unique_arg1) and \ - (key2 not in seen_arg2 or not unique_arg2) and \ - (combination not in seen_combinations or not unique_combination): + if ((key1 not in seen_arg1 or not unique_arg1) # noqa: E501 + and (key2 not in seen_arg2 or not unique_arg2) + and # noqa: E501 + (combination not in seen_combinations + or not unique_combination)): # noqa: E501 seen_arg1.add(key1) seen_arg2.add(key2) seen_combinations.add(combination) @@ -57,7 +62,7 @@ class NeedleBenchParallelDataset(BaseDataset): @staticmethod def load( - path: str, # depreciated + path: str, needle_file_name: str, length: int, depths: list[int], @@ -72,30 +77,32 @@ class NeedleBenchParallelDataset(BaseDataset): data = {'prompt': [], 'answer': []} tokenizer = tiktoken.encoding_for_model(tokenizer_model) - repo_id = 'opencompass/NeedleBench' file_names = [ - 'PaulGrahamEssays.jsonl', 'needles.jsonl', 'zh_finance.jsonl', - 'zh_game.jsonl', 'zh_general.jsonl', 'zh_government.jsonl', - 'zh_movie.jsonl', 'zh_tech.jsonl' + 'PaulGrahamEssays.jsonl', + 'multi_needle_reasoning_en.json', + 'multi_needle_reasoning_zh.json', + 'zh_finance.jsonl', + 'zh_game.jsonl', + 'zh_general.jsonl', + 'zh_government.jsonl', + 'zh_movie.jsonl', + 'zh_tech.jsonl', ] + path = get_data_path(path) + if os.environ.get('DATASET_SOURCE') == 'HF': + from huggingface_hub import snapshot_download - downloaded_files = [] - for file_name in file_names: - file_path = hf_hub_download(repo_id=repo_id, - filename=file_name, - repo_type='dataset') - downloaded_files.append(file_path) + path = snapshot_download(repo_id=path, 
repo_type='dataset') + needle_file_path = os.path.join(path, needle_file_name) - for file in downloaded_files: - if file.split('/')[-1] == needle_file_name: - needle_file_path = file - - predefined_needles_bak = get_unique_entries(needle_file_path, - len(depths), - language, - unique_arg1=True, - unique_arg2=True, - unique_combination=True) + predefined_needles_bak = get_unique_entries( + needle_file_path, + len(depths), + language, + unique_arg1=True, + unique_arg2=True, + unique_combination=True, + ) def _generate_context(tokens_context, depths, needles): insertion_points = [ @@ -108,10 +115,12 @@ class NeedleBenchParallelDataset(BaseDataset): needle_tokens = _get_tokens_from_context(needle) current_insertion_point = min( insertion_points[i] + cumulative_inserted_length, - len(tokens_context)) + len(tokens_context), + ) - tokens_context = tokens_context[:current_insertion_point] + \ - needle_tokens + tokens_context[current_insertion_point:] + tokens_context = (tokens_context[:current_insertion_point] + + needle_tokens + + tokens_context[current_insertion_point:]) cumulative_inserted_length += len(needle_tokens) new_context = _decode_tokens(tokens_context) @@ -191,8 +200,9 @@ class NeedleBenchParallelDataset(BaseDataset): return prompt - for file_path in downloaded_files: - if file_path.split('/')[-1] not in file_list: + for file_name in file_names: + file_path = os.path.join(path, file_name) + if file_name not in file_list: continue with open(file_path, 'r', encoding='utf-8') as f: @@ -219,8 +229,8 @@ class NeedleBenchParallelDataset(BaseDataset): item['retrieval_question'].split("'")[1].split('。')[0] for item in predefined_needles ]) - retrieval_question = questions + "请按照'" + \ - answers_format + "'的格式回答。" + retrieval_question = (questions + "请按照'" + answers_format + + "'的格式回答。") elif language == 'English': questions = '、'.join([ item['retrieval_question'].split('?')[0] + '?' 
@@ -231,14 +241,14 @@ class NeedleBenchParallelDataset(BaseDataset): item['retrieval_question'].split("'")[1].split('.')[0] for item in predefined_needles ]) - retrieval_question = questions + \ - "Please answer in the format of '" + \ - answers_format + "'" + retrieval_question = (questions + + "Please answer in the format of '" + + answers_format + "'") context_length = length - length_buffer - target_length_per_record = context_length - \ - sum(len(tokens) for tokens - in _get_tokens_from_context(needles)) + target_length_per_record = context_length - sum( + len(tokens) + for tokens in _get_tokens_from_context(needles)) target_length_per_record = max(target_length_per_record, 0) accumulated_tokens = [] for line in lines: @@ -317,7 +327,8 @@ class NeedleBenchParallelEvaluator(BaseEvaluator): } result = { - **flattened_scores, 'details': details, - 'average_score': average_score + **flattened_scores, + 'details': details, + 'average_score': average_score, } return result diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 617c796c..aa75ea4d 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -265,6 +265,12 @@ DATASETS_MAPPING = { "hf_id": "opencompass/xsum", "local": "./data/Xsum/dev.jsonl", }, + # Needlebench + "opencompass/needlebench": { + "ms_id": "", + "hf_id": "opencompass/needlebench", + "local": "./data/needlebench", + } } DATASETS_URL = { @@ -396,4 +402,8 @@ DATASETS_URL = { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu_pro.zip", "md5": "e3200c7380f4cea5f13c768f2815fabb", }, + "/needlebench": { + "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/needlebench.zip", + "md5": "b546da0397746eaff4d3ff0f20d6ede2", + } } From 45efdc994da33947ccfb388842a94328fabb9031 Mon Sep 17 00:00:00 2001 From: Maxime SHE <72982050+chengyingshe@users.noreply.github.com> Date: Thu, 5 Sep 2024 17:51:16 +0800 Subject: [PATCH 6/9] [Feature] Add an 
attribute api_key into TurboMindAPIModel default None (#1475) Co-authored-by: Maxime Add an attribute api_key into TurboMindAPIModel default None then we can set the api_key while using lmdeploy to deploy the llm model --- configs/eval_internlm_chat_lmdeploy_apiserver.py | 2 ++ opencompass/models/turbomind_api.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/configs/eval_internlm_chat_lmdeploy_apiserver.py b/configs/eval_internlm_chat_lmdeploy_apiserver.py index cc815580..b69b9eab 100644 --- a/configs/eval_internlm_chat_lmdeploy_apiserver.py +++ b/configs/eval_internlm_chat_lmdeploy_apiserver.py @@ -28,6 +28,7 @@ internlm_chat_20b = dict( type=TurboMindAPIModel, abbr='internlm-chat-20b-turbomind', api_addr='http://0.0.0.0:23333', + api_key='internlm-chat-20b', # api_key max_out_len=100, max_seq_len=2048, batch_size=8, @@ -40,6 +41,7 @@ internlm_chat_7b = dict( type=TurboMindAPIModel, abbr='internlm-chat-7b-turbomind', api_addr='http://0.0.0.0:23333', + api_key='interlm-chat-7b', # api_key max_out_len=100, max_seq_len=2048, batch_size=16, diff --git a/opencompass/models/turbomind_api.py b/opencompass/models/turbomind_api.py index a4efd367..fb3f0ac2 100644 --- a/opencompass/models/turbomind_api.py +++ b/opencompass/models/turbomind_api.py @@ -40,6 +40,7 @@ class TurboMindAPIModel(BaseModel): def __init__(self, api_addr: str = 'http://0.0.0.0:23333', + api_key: str | None = None, max_seq_len: int = 2048, meta_template: Optional[Dict] = None, end_str: Optional[str] = None, @@ -48,7 +49,7 @@ class TurboMindAPIModel(BaseModel): max_seq_len=max_seq_len, meta_template=meta_template) from lmdeploy.serve.openai.api_client import APIClient - self.chatbot = APIClient(api_addr) + self.chatbot = APIClient(api_addr, api_key) self.model_name = self.chatbot.available_models[0] self.logger = get_logger() self.template_parser = LMTemplateParser(meta_template) From 00fc8da5be3abaa4f83d1b8dc6cbc58b430a8ca3 Mon Sep 17 00:00:00 2001 From: liushz Date: Thu, 5 Sep 
2024 21:10:29 +0800 Subject: [PATCH 7/9] [Feature] Add model postprocess function (#1484) * Add model postprocess function * Add model postprocess function * Add model postprocess function * Add model postprocess function * Add model postprocess function * Add model postprocess function * Add model postprocess function * Add model postprocess function --------- Co-authored-by: liushz --- README.md | 1 + README_zh-CN.md | 1 + .../gsm8k/gsm8k_xfinder_gen_a58960.py | 43 ++++ .../datasets/mmlu/mmlu_xfinder_gen_4d595a.py | 130 ++++++++++++ configs/datasets/nq/nq_xfinder_gen_3dcea1.py | 37 ++++ .../gsm8k/gsm8k_xfinder_gen_a58960.py | 43 ++++ .../datasets/mmlu/mmlu_xfinder_gen_4d595a.py | 130 ++++++++++++ .../datasets/nq/nq_xfinder_gen_3dcea1.py | 37 ++++ opencompass/tasks/openicl_eval.py | 71 ++++++- opencompass/utils/__init__.py | 1 + opencompass/utils/model_postprocessors.py | 77 +++++++ .../utils/postprocessors/xfinder/README.md | 194 ++++++++++++++++++ .../utils/postprocessors/xfinder/__init__.py | 0 .../utils/postprocessors/xfinder/extractor.py | 175 ++++++++++++++++ .../xfinder/xfinder_utils/PROMPT_TEMPLATE.py | 14 ++ .../xfinder/xfinder_utils/__init__.py | 3 + .../xfinder/xfinder_utils/convert_data.py | 123 +++++++++++ .../xfinder/xfinder_utils/data_process.py | 24 +++ 18 files changed, 1101 insertions(+), 3 deletions(-) create mode 100644 configs/datasets/gsm8k/gsm8k_xfinder_gen_a58960.py create mode 100644 configs/datasets/mmlu/mmlu_xfinder_gen_4d595a.py create mode 100644 configs/datasets/nq/nq_xfinder_gen_3dcea1.py create mode 100644 opencompass/configs/datasets/gsm8k/gsm8k_xfinder_gen_a58960.py create mode 100644 opencompass/configs/datasets/mmlu/mmlu_xfinder_gen_4d595a.py create mode 100644 opencompass/configs/datasets/nq/nq_xfinder_gen_3dcea1.py create mode 100644 opencompass/utils/model_postprocessors.py create mode 100644 opencompass/utils/postprocessors/xfinder/README.md create mode 100644 opencompass/utils/postprocessors/xfinder/__init__.py create mode 
100644 opencompass/utils/postprocessors/xfinder/extractor.py create mode 100644 opencompass/utils/postprocessors/xfinder/xfinder_utils/PROMPT_TEMPLATE.py create mode 100644 opencompass/utils/postprocessors/xfinder/xfinder_utils/__init__.py create mode 100644 opencompass/utils/postprocessors/xfinder/xfinder_utils/convert_data.py create mode 100644 opencompass/utils/postprocessors/xfinder/xfinder_utils/data_process.py diff --git a/README.md b/README.md index d30fa18a..ffcca3fd 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,7 @@ Just like a compass guides us on our journey, OpenCompass will guide you through ## 🚀 What's New +- **\[2024.09.05\]** We now support answer extraction through model post-processing to provide a more accurate representation of the model's capabilities. As part of this update, we have integrated [XFinder](https://github.com/IAAR-Shanghai/xFinder) as our first post-processing model. For more detailed information, please refer to the [documentation](opencompass/utils/postprocessors/xfinder/README.md), and give it a try! 🔥🔥🔥 - **\[2024.08.20\]** OpenCompass now supports the [SciCode](https://github.com/scicode-bench/SciCode): A Research Coding Benchmark Curated by Scientists. 🔥🔥🔥 - **\[2024.08.16\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [RULER](https://arxiv.org/pdf/2404.06654). RULER provides an evaluation of long-context including retrieval, multi-hop tracing, aggregation, and question answering through flexible configurations. Check out the [RULER](configs/datasets/ruler/README.md) evaluation config now! 🔥🔥🔥 - **\[2024.08.09\]** We have released the example data and configuration for the CompassBench-202408, welcome to [CompassBench](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/compassbench_intro.html) for more details. 
🔥🔥🔥 diff --git a/README_zh-CN.md b/README_zh-CN.md index 7c3a7a9e..20a131b3 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -69,6 +69,7 @@ ## 🚀 最新进展 +- **\[2024.09.05\]** OpenCompass 现在支持通过模型后处理来进行答案提取,以更准确地展示模型的能力。作为此次更新的一部分,我们集成了 [XFinder](https://github.com/IAAR-Shanghai/xFinder) 作为首个后处理模型。具体信息请参阅 [文档](opencompass/utils/postprocessors/xfinder/README.md),欢迎尝试! 🔥🔥🔥 - **\[2024.08.20\]** OpenCompass 现已支持 [SciCode](https://github.com/scicode-bench/SciCode): A Research Coding Benchmark Curated by Scientists。 🔥🔥🔥 - **\[2024.08.16\]** OpenCompass 现已支持全新的长上下文语言模型评估基准——[RULER](https://arxiv.org/pdf/2404.06654)。RULER 通过灵活的配置,提供了对长上下文包括检索、多跳追踪、聚合和问答等多种任务类型的评测,欢迎访问[RULER](configs/datasets/ruler/README.md)。🔥🔥🔥 - **\[2024.07.23\]** 我们支持了[Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)模型,欢迎试用!🔥🔥🔥 diff --git a/configs/datasets/gsm8k/gsm8k_xfinder_gen_a58960.py b/configs/datasets/gsm8k/gsm8k_xfinder_gen_a58960.py new file mode 100644 index 00000000..2a2a818c --- /dev/null +++ b/configs/datasets/gsm8k/gsm8k_xfinder_gen_a58960.py @@ -0,0 +1,43 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator +from opencompass.datasets import MATHEvaluator, math_postprocess_v2 +from opencompass.utils.model_postprocessors import xfinder_postprocess + +gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') + +gsm8k_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +gsm8k_eval_cfg = dict( + 
evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2), + dataset_postprocessor=dict(type=gsm8k_dataset_postprocess), + model_postprocessor=dict( + type=xfinder_postprocess, + question_type='math', + xfinder_model_name='xFinder-qwen1505', + xfiner_api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1') + ) + +gsm8k_datasets = [ + dict( + abbr='gsm8k', + type=GSM8KDataset, + path='opencompass/gsm8k', + reader_cfg=gsm8k_reader_cfg, + infer_cfg=gsm8k_infer_cfg, + eval_cfg=gsm8k_eval_cfg, + ) +] diff --git a/configs/datasets/mmlu/mmlu_xfinder_gen_4d595a.py b/configs/datasets/mmlu/mmlu_xfinder_gen_4d595a.py new file mode 100644 index 00000000..8dff3a57 --- /dev/null +++ b/configs/datasets/mmlu/mmlu_xfinder_gen_4d595a.py @@ -0,0 +1,130 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import first_option_postprocess +from opencompass.utils.model_postprocessors import xfinder_postprocess + +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_all_sets = [ + 'college_biology', + 'college_chemistry', + 'college_computer_science', + 'college_mathematics', + 'college_physics', + 'electrical_engineering', + 'astronomy', + 'anatomy', + 'abstract_algebra', + 'machine_learning', + 'clinical_knowledge', + 'global_facts', + 'management', + 'nutrition', + 'marketing', + 'professional_accounting', + 'high_school_geography', + 'international_law', + 'moral_scenarios', + 
'computer_security', + 'high_school_microeconomics', + 'professional_law', + 'medical_genetics', + 'professional_psychology', + 'jurisprudence', + 'world_religions', + 'philosophy', + 'virology', + 'high_school_chemistry', + 'public_relations', + 'high_school_macroeconomics', + 'human_sexuality', + 'elementary_mathematics', + 'high_school_physics', + 'high_school_computer_science', + 'high_school_european_history', + 'business_ethics', + 'moral_disputes', + 'high_school_statistics', + 'miscellaneous', + 'formal_logic', + 'high_school_government_and_politics', + 'prehistory', + 'security_studies', + 'high_school_biology', + 'logical_fallacies', + 'high_school_world_history', + 'professional_medicine', + 'high_school_mathematics', + 'college_medicine', + 'high_school_us_history', + 'sociology', + 'econometrics', + 'high_school_psychology', + 'human_aging', + 'us_foreign_policy', + 'conceptual_physics', +] + +mmlu_datasets = [] +for _name in mmlu_all_sets: + _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.' + mmlu_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + mmlu_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'), + model_postprocessor=dict( + type=xfinder_postprocess, + question_type='alphabet_option', + xfinder_model_name='xFinder-qwen1505', + xfiner_api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1') + ) + + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{_name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=_name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + )) + +del _name, _hint diff --git a/configs/datasets/nq/nq_xfinder_gen_3dcea1.py b/configs/datasets/nq/nq_xfinder_gen_3dcea1.py new file mode 100644 index 00000000..318c288b --- /dev/null +++ b/configs/datasets/nq/nq_xfinder_gen_3dcea1.py @@ -0,0 +1,37 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import NaturalQuestionDataset, NQEvaluator +from opencompass.utils.model_postprocessors import xfinder_postprocess + +nq_reader_cfg = dict( + input_columns=['question'], output_column='answer', train_split='test') + +nq_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Question: {question}?\nAnswer: '), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +nq_eval_cfg = dict( + evaluator=dict(type=NQEvaluator), pred_role='BOT', + model_postprocessor=dict( + type=xfinder_postprocess, + question_type='short_text', + xfinder_model_name='xFinder-qwen1505', + xfiner_api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1') + ) + +nq_datasets = [ + dict( + type=NaturalQuestionDataset, + 
abbr='nq', + path='opencompass/natural_question', + reader_cfg=nq_reader_cfg, + infer_cfg=nq_infer_cfg, + eval_cfg=nq_eval_cfg) +] diff --git a/opencompass/configs/datasets/gsm8k/gsm8k_xfinder_gen_a58960.py b/opencompass/configs/datasets/gsm8k/gsm8k_xfinder_gen_a58960.py new file mode 100644 index 00000000..2a2a818c --- /dev/null +++ b/opencompass/configs/datasets/gsm8k/gsm8k_xfinder_gen_a58960.py @@ -0,0 +1,43 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator +from opencompass.datasets import MATHEvaluator, math_postprocess_v2 +from opencompass.utils.model_postprocessors import xfinder_postprocess + +gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') + +gsm8k_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +gsm8k_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2), + dataset_postprocessor=dict(type=gsm8k_dataset_postprocess), + model_postprocessor=dict( + type=xfinder_postprocess, + question_type='math', + xfinder_model_name='xFinder-qwen1505', + xfiner_api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1') + ) + +gsm8k_datasets = [ + dict( + abbr='gsm8k', + type=GSM8KDataset, + path='opencompass/gsm8k', + reader_cfg=gsm8k_reader_cfg, + infer_cfg=gsm8k_infer_cfg, + eval_cfg=gsm8k_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/mmlu/mmlu_xfinder_gen_4d595a.py b/opencompass/configs/datasets/mmlu/mmlu_xfinder_gen_4d595a.py new file mode 100644 
index 00000000..8dff3a57 --- /dev/null +++ b/opencompass/configs/datasets/mmlu/mmlu_xfinder_gen_4d595a.py @@ -0,0 +1,130 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import first_option_postprocess +from opencompass.utils.model_postprocessors import xfinder_postprocess + +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_all_sets = [ + 'college_biology', + 'college_chemistry', + 'college_computer_science', + 'college_mathematics', + 'college_physics', + 'electrical_engineering', + 'astronomy', + 'anatomy', + 'abstract_algebra', + 'machine_learning', + 'clinical_knowledge', + 'global_facts', + 'management', + 'nutrition', + 'marketing', + 'professional_accounting', + 'high_school_geography', + 'international_law', + 'moral_scenarios', + 'computer_security', + 'high_school_microeconomics', + 'professional_law', + 'medical_genetics', + 'professional_psychology', + 'jurisprudence', + 'world_religions', + 'philosophy', + 'virology', + 'high_school_chemistry', + 'public_relations', + 'high_school_macroeconomics', + 'human_sexuality', + 'elementary_mathematics', + 'high_school_physics', + 'high_school_computer_science', + 'high_school_european_history', + 'business_ethics', + 'moral_disputes', + 'high_school_statistics', + 'miscellaneous', + 'formal_logic', + 'high_school_government_and_politics', + 'prehistory', + 'security_studies', + 'high_school_biology', + 'logical_fallacies', + 
'high_school_world_history', + 'professional_medicine', + 'high_school_mathematics', + 'college_medicine', + 'high_school_us_history', + 'sociology', + 'econometrics', + 'high_school_psychology', + 'human_aging', + 'us_foreign_policy', + 'conceptual_physics', +] + +mmlu_datasets = [] +for _name in mmlu_all_sets: + _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.' + mmlu_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + mmlu_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'), + model_postprocessor=dict( + type=xfinder_postprocess, + question_type='alphabet_option', + xfinder_model_name='xFinder-qwen1505', + xfiner_api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1') + ) + + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{_name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=_name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + )) + +del _name, _hint diff --git a/opencompass/configs/datasets/nq/nq_xfinder_gen_3dcea1.py b/opencompass/configs/datasets/nq/nq_xfinder_gen_3dcea1.py new file mode 100644 index 00000000..318c288b --- /dev/null +++ b/opencompass/configs/datasets/nq/nq_xfinder_gen_3dcea1.py @@ -0,0 +1,37 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from 
opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import NaturalQuestionDataset, NQEvaluator +from opencompass.utils.model_postprocessors import xfinder_postprocess + +nq_reader_cfg = dict( + input_columns=['question'], output_column='answer', train_split='test') + +nq_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Question: {question}?\nAnswer: '), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +nq_eval_cfg = dict( + evaluator=dict(type=NQEvaluator), pred_role='BOT', + model_postprocessor=dict( + type=xfinder_postprocess, + question_type='short_text', + xfinder_model_name='xFinder-qwen1505', + xfiner_api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1') + ) + +nq_datasets = [ + dict( + type=NaturalQuestionDataset, + abbr='nq', + path='opencompass/natural_question', + reader_cfg=nq_reader_cfg, + infer_cfg=nq_infer_cfg, + eval_cfg=nq_eval_cfg) +] diff --git a/opencompass/tasks/openicl_eval.py b/opencompass/tasks/openicl_eval.py index 85302988..2f5a0957 100644 --- a/opencompass/tasks/openicl_eval.py +++ b/opencompass/tasks/openicl_eval.py @@ -198,6 +198,26 @@ class OpenICLEvalTask(BaseTask): else: pred_strs = [proc(s, **kwargs) for s in pred_strs] + model_pred_strs = [] + if 'model_postprocessor' in self.eval_cfg: + references = (test_set[self.output_column] + if self.output_column else None) + model_pred_dicts = copy.deepcopy(pred_dicts) + for i, pred_dict in enumerate(model_pred_dicts): + pred_dict['reference'] = [references[i]] + self.logger.info('Postprocessing model predictions...') + kwargs = self.eval_cfg['model_postprocessor'] + proc = kwargs.pop('type') + if isinstance(proc, str): + proc = TEXT_POSTPROCESSORS.get(proc) + if pred_list_flag: + model_pred_strs = [[ + proc(model_pred_dict, **kwargs) + for model_pred_dict in model_pred_dicts + ]] + 
else: + model_pred_strs = proc(model_pred_dicts, **kwargs) + # Get majority voting predictions if use self-consistency if sc_size is not None: pred_strs = [ @@ -229,12 +249,29 @@ class OpenICLEvalTask(BaseTask): } result = icl_evaluator.score(**preds) + # Get model postprocess result + model_details = None + model_result = None + if 'model_postprocessor' in self.eval_cfg: + model_preds = copy.deepcopy(preds) + model_preds['predictions'] = model_pred_strs + model_result = icl_evaluator.score(**model_preds) + for key in model_result: + if key == 'details': + model_details = model_result[key] + continue + new_key = 'model_postprocess_' + key + result[new_key] = model_result[key] + if self.dump_details: details = result.get('details', None) try: result['details'] = self.format_details( - pred_strs, test_set[self.output_column], details, + pred_strs, model_pred_strs, + test_set[self.output_column], details, model_details, pred_dicts) + self.logger.warning( + f"result['details'] : {result['details']}"), result['type'] = result['details'].pop('type', None) if self.cal_extract_rate: # Calculate the extraction success rate for prediction @@ -253,13 +290,27 @@ class OpenICLEvalTask(BaseTask): self.logger.error( f'Task {task_abbr_from_cfg(self.cfg)}: {result["error"]}') return - else: + elif model_result is None: result_wo_details = { i: result[i] for i in result if i != 'details' } self.logger.info( f'Task {task_abbr_from_cfg(self.cfg)}: {result_wo_details}') + else: + result_wo_details = { + i: result[i] + for i in result if i != 'details' + } + model_result_wo_details = { + i: model_result[i] + for i in model_result if i != 'details' + } + self.logger.info( + f'Task {task_abbr_from_cfg(self.cfg)}: {result_wo_details}') + self.logger.info( + 'Model Postprocess Task: ' + + f'{task_abbr_from_cfg(self.cfg)}:{model_result_wo_details}') # Save result out_path = get_infer_output_path(self.model_cfg, self.dataset_cfg, @@ -286,7 +337,8 @@ class OpenICLEvalTask(BaseTask): 
success_rate = 100 - len(invalid_extractions) / len(details) * 100 return success_rate - def format_details(self, predictions, references, details, pred_dicts): + def format_details(self, predictions, model_pred_strs, references, details, + model_details, pred_dicts): """This function is responsible for formatting prediction details. Args: @@ -323,6 +375,19 @@ class OpenICLEvalTask(BaseTask): result['predictions'] = str(predictions[i]) result['references'] = str(references[i]) result['correct'] = str(predictions[i]) == str(references[i]) + elif details is not None and model_details is not None: + assert model_pred_strs != [], \ + 'Model details is not None, but model_pred_strs is empty' + self.logger.info( + f"model_details[i]['pred']: {model_details[i]['pred']}") + results['type'] = 'GEN' + result['prompt'] = origin_prediction['origin_prompt'] + result['origin_prediction'] = pred_dicts[i]['prediction'] + result['predictions'] = details[i]['pred'] + result['model_extract_predictions'] = model_details[i]['pred'] + result['references'] = details[i]['answer'] + result['correct'] = details[i]['correct'] + result['model_extract_correct'] = model_details[i]['correct'] elif details is not None: results['type'] = 'GEN' result['prompt'] = origin_prediction['origin_prompt'] diff --git a/opencompass/utils/__init__.py b/opencompass/utils/__init__.py index 466e6222..0e424f4f 100644 --- a/opencompass/utils/__init__.py +++ b/opencompass/utils/__init__.py @@ -9,5 +9,6 @@ from .fileio import * # noqa from .lark import * # noqa from .logging import * # noqa from .menu import * # noqa +from .model_postprocessors import * # noqa from .prompt import * # noqa from .text_postprocessors import * # noqa diff --git a/opencompass/utils/model_postprocessors.py b/opencompass/utils/model_postprocessors.py new file mode 100644 index 00000000..887ffa2c --- /dev/null +++ b/opencompass/utils/model_postprocessors.py @@ -0,0 +1,77 @@ +from functools import partial +from multiprocessing import Pool 
+from typing import Union + +from tqdm import tqdm + +from opencompass.registry import TEXT_POSTPROCESSORS + +from .postprocessors.xfinder.extractor import Extractor +from .postprocessors.xfinder.xfinder_utils import (DataProcessor, + convert_to_xfinder_format) + + +def gen_output(ori_data, extractor): + ext_cor_pairs = [] + extracted_data = [] + extracted_answers = [] + for item in tqdm(ori_data): + user_input = extractor.prepare_input(item) + extracted_answer = extractor.gen_output(user_input) + ext_cor_pairs.append([ + item['key_answer_type'], item['standard_answer_range'], + extracted_answer, item['correct_answer'] + ]) + item['xfinder_extracted_answer'] = extracted_answer + extracted_answers.append(extracted_answer) + extracted_data.append(item) + + return extracted_answers, ext_cor_pairs, extracted_data + + +@TEXT_POSTPROCESSORS.register_module('xfinder') +def xfinder_postprocess(preds: list, question_type: str, + xfinder_model_name: str, + xfiner_api_url: Union[str, list], **kwargs) -> list: + """Postprocess the text extracted by xFinder model. + Args: + preds (list): The question, reference answer and model prediction. + question_type (str): The type of the question. + url (Union[str, list]): The api url of the xFinder model. + + + Returns: + list: The postprocessed texts. 
+ """ + + def _eval_pred(texts, data_processor, extractor, num_processes=8): + ori_data = data_processor.read_data(texts) + extracted_correct_pairs = [] + extracted_data = [] + extracted_answers = [] + batched_ori_data = [] + # Split data into batches + num_processes = min(num_processes, len(ori_data)) + batch_size = len(ori_data) // num_processes + for i in range(0, len(ori_data), batch_size): + batched_ori_data.append(ori_data[i:i + batch_size]) + with Pool(num_processes) as p: + results = p.map(partial(gen_output, extractor=extractor), + batched_ori_data) + for result in results: + extracted_answers += result[0] + extracted_correct_pairs += result[1] + extracted_data += result[2] + return extracted_answers + + format_data = convert_to_xfinder_format(question_type, preds) + assert xfiner_api_url is not None, 'Please provide the api url.' + data_processor = DataProcessor() + extractor = Extractor(model_name=xfinder_model_name, + url=xfiner_api_url.split(',') + if ',' in xfiner_api_url else xfiner_api_url) + calc_acc_func = partial(_eval_pred, + data_processor=data_processor, + extractor=extractor) + extracted_answers = calc_acc_func(format_data) + return extracted_answers diff --git a/opencompass/utils/postprocessors/xfinder/README.md b/opencompass/utils/postprocessors/xfinder/README.md new file mode 100644 index 00000000..4f7b4ace --- /dev/null +++ b/opencompass/utils/postprocessors/xfinder/README.md @@ -0,0 +1,194 @@ +## Extract Final Answers with Postprocess Models + +OpenCompass now support postprocess (extract) prediction answers with postprocess models, to get the true ability level of models. Now, we use [XFinder](https://github.com/IAAR-Shanghai/xFinder) as our first postprocess model to extract the final answers from the model outputs. + +We support four types of task types now: + +1. **math**: for math questions with numerical pr formula answers, like GSM8k, Math, etc. +2. 
**alphabet_option**: for alphabet option questions with alphabet answers, like CommonsenseQA, MMLU, etc. +3. **short_text**: for questions answer type is a short text with selected short text answers. + +Here are some examples of the question type supported from the official xFinder repo: + +```json +[ + { + "model_name": "Llama3_8B_Instruct", + "dataset": "MetaMathQA", + "key_answer_type": "math", + "question": "Find the smallest, positive four-digit multiple of $15.$", + "llm_output": "The smallest four-digit multiple of $15$ is $\\boxed{1005}.$", + "correct_answer": "1005", + "standard_answer_range": "a(n) number / set / vector / matrix / interval / expression / function / equation / inequality" + }, + { + "model_name": "LLaMA2_7B_Chat", + "dataset": "SIQA_text", + "key_answer_type": "short_text", + "question": "Jordan kept Carson up all night talking because Jordan's girlfriend left them and they were quite upset. How would Carson feel as a result? Answer Choices: drowsy / upset their girlfriend left / energetic", + "llm_output": "Based on the information provided in the question, Carson would likely feel upset because Jordan's girlfriend left them and they were quite upset. Therefore, the correct answer is (B) upset their girlfriend left.", + "correct_answer": "drowsy", + "standard_answer_range": [ + "drowsy", + "upset their girlfriend left", + "energetic" + ] + }, + { + "model_name": "PHI2", + "dataset": "CommonsenseQA_alpha", + "key_answer_type": "alphabet_option", + "question": "Why would you go to an office? Answer Choices: (A) work (B) school building (C) paper (D) city (E) habit", + "llm_output": "A) work (B) school building (C) paper (D) city (E) habit\n\nExplanation:\n We are looking for a reason why someone would go to an office. The only answer choice that matches this is option A) work. 
Therefore, the correct answer is A) work.", + "correct_answer": "A", + "standard_answer_range": [ + [ + "A", + "work" + ], + [ + "B", + "school building" + ], + [ + "C", + "paper" + ], + [ + "D", + "city" + ], + [ + "E", + "habit" + ] + ] + } +] +``` + +## How to Use Model Postprocess in OpenCompass + +### Step 1: Deploy the Postprocess Model Server + +For now, there are two xFinder models can use, you can download them from Huggingface model hub: + +1. **IAAR-Shanghai/xFinder-qwen1505** +2. **IAAR-Shanghai/xFinder-llama38it** + +You can use LMDeploy or vLLM to deploy the xFinder model server, for example, you can use the following command to deploy the xFinder model server with LMDeploy: + +```bash +lmdeploy serve api_server IAAR-Shanghai/xFinder-qwen1505 --model-name xFinder-qwen1505 --server-port 23333 --backend turbomind --tp 1 +``` + +### Step 2: Set the Postprocess Model Config in the Dataset Configuration + +We make the postprocess as a common postprocess function in OpenCompass, so you can use it by setting the `postprocess` parameter in the `predict` function of OpenCompass. It can be used with the default postprocess regularization extract function at the same time. The only thing you need to do is to deploy the postprocess model server and set the `model_postprocessor` to the original `eval_cfg` in the dataset configuration, like the following example: + +```python +from opencompass.utils.model_postprocessors import xfinder_postprocess + +... + + model_postprocessor=dict( + type=xfinder_postprocess, + question_type='math', + xfinder_model_name='xFinder-qwen1505', + xfiner_api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1') +``` + +Explanation of the parameters: + +- `question_type`: the type of the question, which can be one of the three types mentioned above. +- `xfinder_model_name`: the name of the model you deploying the model server. 
+- `xfiner_api_url`: the URL of the model server; you can set multiple URLs separated by `,` to use multiple model servers, which can accelerate the postprocess speed.
+
+📢:**Please pay attention to the following points**:
+
+1. Currently, we only support extracting answers under the zero-shot setting.
+2. For alphabet_option problems, the option should be like '\\nA. xxx\\nB. xxx\\nC. xxx\\nD. xxx\\nE. xxx\\n ...' or '\\n(A) xxx\\n(B) xxx\\n(C) xxx\\n(D) xxx\\n(E) xxx\\n ...' format, and the correct answer should be the alphabet of the correct answer, like 'A', 'B', 'C', 'D', 'E'.
+
+For more details about the xFinder model, you can refer to the [xFinder](https://github.com/IAAR-Shanghai/xFinder), and for a complete example, you can refer to the following example, which is the configuration of the GSM8K dataset with the xFinder postprocess model:
+
+```python
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import GSM8KDataset, gsm8k_dataset_postprocess, Gsm8kEvaluator
+from opencompass.datasets import MATHEvaluator, math_postprocess_v2
+from opencompass.utils.model_postprocessors import xfinder_postprocess
+
+gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
+
+gsm8k_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512),
+)
+
+gsm8k_eval_cfg = dict(
+    evaluator=dict(type=MATHEvaluator, version='v2'),
+    pred_postprocessor=dict(type=math_postprocess_v2),
+    dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
+    model_postprocessor=dict(
+        type=xfinder_postprocess,
+        question_type='math',
+        xfinder_model_name='xFinder-qwen1505',
+        
xfiner_api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1') + ) + +gsm8k_datasets = [ + dict( + abbr='gsm8k', + type=GSM8KDataset, + path='opencompass/gsm8k', + reader_cfg=gsm8k_reader_cfg, + infer_cfg=gsm8k_infer_cfg, + eval_cfg=gsm8k_eval_cfg, + ) +] +``` + +For evaluation results, `accuracy` is the result using default postprocess, and `model_postprocess_accuracy` is the result using xFinder postprocess, the gap can be wider when the model is not good answering the questions properly. + +You can also use the `--dump-eval-details` command to dump the detailed evaluation details to see the model postprocess results from the `results` folder. + +## Results Comparison with Different Question Types + +We have tested the model postprocess method with XFinder model on the GSM8K, MMLU, Natural Questions (NQ) datasets for `Meta-Llama-3-8B-Instruct` with above settings, and the results are as follows: + +| Dataset | Type | Config Name | Regex Postprocess Score | Model Postprocess Score | +| ------- | --------------- | ------------------------ | ----------------------- | ----------------------- | +| gsm8k | math | gsm8k_xfinder_gen_a58960 | 73.46 | 78.09 | +| nq | short_text | nq_xfinder_gen_3dcea1 | 22.33 | 37.53 | +| mmlu | alphabet_option | mmlu_xfinder_gen_4d595a | 67.89 | 67.93 | + +## Citation + +```bibtex +@misc{2023opencompass, + title={OpenCompass: A Universal Evaluation Platform for Foundation Models}, + author={OpenCompass Contributors}, + howpublished = {\url{https://github.com/open-compass/opencompass}}, + year={2023} +} + +@misc{yu2024xfinderrobustpinpointanswer, + title={xFinder: Robust and Pinpoint Answer Extraction for Large Language Models}, + author={Qingchen Yu and Zifan Zheng and Shichao Song and Zhiyu Li and Feiyu Xiong and Bo Tang and Ding Chen}, + year={2024}, + eprint={2405.11874}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2405.11874}, +} + +``` diff --git 
a/opencompass/utils/postprocessors/xfinder/__init__.py b/opencompass/utils/postprocessors/xfinder/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/opencompass/utils/postprocessors/xfinder/extractor.py b/opencompass/utils/postprocessors/xfinder/extractor.py new file mode 100644 index 00000000..de2abb08 --- /dev/null +++ b/opencompass/utils/postprocessors/xfinder/extractor.py @@ -0,0 +1,175 @@ +import json +import time +from logging import getLogger + +import requests +from openai import OpenAI + +from .xfinder_utils import PROMPT_TEMPLATE + +Instruction = """I will provide you with a question, output sentences along with an answer range. The output sentences are the response of the question provided. The answer range could either describe the type of answer expected or list all possible valid answers. Using the information provided, you must accurately and precisely determine and extract the intended key answer from the output sentences. Please don't have your subjective thoughts about the question. +First, you need to determine whether the content of the output sentences is relevant to the given question. If the entire output sentences are unrelated to the question (meaning the output sentences are not addressing the question), then output [No valid answer]. +Otherwise, ignore the parts of the output sentences that have no relevance to the question and then extract the key answer that matches the answer range. +Below are some special cases you need to be aware of: + (1) If the output sentences present multiple different answers, carefully determine if the later provided answer is a correction or modification of a previous one. If so, extract this corrected or modified answer as the final response. Conversely, if the output sentences fluctuate between multiple answers without a clear final answer, you should output [No valid answer]. 
+ (2) If the answer range is a list and the key answer in the output sentences is not explicitly listed among the candidate options in the answer range, also output [No valid answer]. + +""" # noqa + + +class Extractor: + + def __init__( + self, + model_name, + model_path=None, + url=None, + temperature=0, + max_tokens=3000, + api_key='EMPTY', + SYSTEM='You are a help assistant tasked with extracting the precise key answer from given output sentences. You must only provide the extracted key answer without including any additional text.' # noqa + ): + self.model_name = model_name + self.PROMPT_TEMPLATE = PROMPT_TEMPLATE[model_name] + self.SYSTEM = SYSTEM + self.model_path = model_path + self.url = url + self.api_key = api_key + self.temperature = temperature + self.max_tokens = max_tokens + self.mode = 'API' if self.url is not None else 'Local' + self.logger = getLogger(__name__) + + if self.mode == 'Local': + from vllm import LLM, SamplingParams + self.sampling_params = SamplingParams(temperature=self.temperature, + max_tokens=self.max_tokens, + stop=[ + '<|endoftext|>', + '<|im_end|>', '', + '<||>', '', + '<|eot_id|>' + ]) + self.llm = LLM(model=self.model_path, gpu_memory_utilization=0.5) + + @staticmethod + def prepare_input(item): + user_input = Instruction + \ + "Question: \"\"\"" + item['question'] + "\"\"\"\n\n" + \ + "Output sentences: \"\"\"" + item['llm_output'] + "\"\"\"\n\n" + \ + 'Answer range: ' + item['standard_answer_range'] + '\n\n' + \ + 'Key extracted answer: ' + + return user_input + + def gen_output(self, query): + if self.mode == 'API': + # return self.send_request(query) + return self.openai_infer(query) + else: + return self.offline_infer(query) + + def send_request(self, query: str) -> str: + """Send a request to the model's API and return the response. + + Args: + query (str): The input query. + + Returns: + str: The extracted answer (xFinder's output). 
+ """ + prompt = self.PROMPT_TEMPLATE.format(system=self.SYSTEM, input=query) + payload = json.dumps({ + 'prompt': + prompt, + 'temperature': + self.temperature, + 'max_tokens': + self.max_tokens, + 'stop': [ + '<|endoftext|>', '<|im_end|>', '', '<||>', + '', '<|eot_id|>' + ], + }) + headers = {'Content-Type': 'application/json'} + res = requests.request('POST', self.url, headers=headers, data=payload) + res = res.json()['text'][0] + res = res.replace(prompt, '') + # res = requests.post(self.url, json=payload) + # res = res.json()['text'] + res = res.strip() + return res + + def openai_infer(self, query: str, retry=9) -> str: + """Perform inference on the OpenAI model. + + Args: + query (str): The input query. + + Returns: + str: The extracted answer (xFinder's output). + """ + if isinstance(self.url, list): + # Randomly api for better load balancing + import random + self.url = random.choice(self.url) + self.client = OpenAI( + api_key=self.api_key, + base_url=self.url, + ) + self.retry = retry + + t = time.time() + retry = self.retry + response = '' + while retry > 0: + try: + chat_response = self.client.chat.completions.create( + model=self.client.models.list().data[0].id + if self.model_name == '' else self.model_name, + messages=[ + { + 'role': 'system', + 'content': self.SYSTEM + }, + { + 'role': 'user', + 'content': query + }, + ], + stop=[ + '<|endoftext|>', '<|im_end|>', '', '<||>', + '', '<|eot_id|>' + ], + temperature=self.temperature, + max_tokens=self.max_tokens, + ) + js_response = json.loads(chat_response.model_dump_json()) + response = js_response['choices'][0]['message']['content'] + break + except Exception as e: + self.logger.info(f'Error: {e}') + self.logger.info(f'{self.url} is down. Retrying...') + self.logger.info(f'Time elapsed: {time.time() - t} seconds') + time.sleep(6) + retry -= 1 + if retry == 0: + response = 'Error: Failed to get response.' 
+ self.logger.info(f'{response} after {self.retry} tries.') + raise ValueError('The api is down') + return response.strip() + + def offline_infer(self, query: str) -> str: + """Perform inference on the local xFinder model. + + Args: + query (str): The input query. + + Returns: + str: The extracted answer (xFinder's output). + """ + prompt = self.PROMPT_TEMPLATE.format(system=self.SYSTEM, input=query) + res = self.llm.generate(prompt, self.sampling_params) + res = res[0] + res = res.outputs[0].text.strip() + return res diff --git a/opencompass/utils/postprocessors/xfinder/xfinder_utils/PROMPT_TEMPLATE.py b/opencompass/utils/postprocessors/xfinder/xfinder_utils/PROMPT_TEMPLATE.py new file mode 100644 index 00000000..3fb19fd2 --- /dev/null +++ b/opencompass/utils/postprocessors/xfinder/xfinder_utils/PROMPT_TEMPLATE.py @@ -0,0 +1,14 @@ +PROMPT_TEMPLATE = { + 'xFinder-qwen1505': + """<|System|>:{system} +<|User|>:{input} +<|Bot|>:""", + 'xFinder-llama38it': + """<|start_header_id|>system<|end_header_id|> + +{system}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +""", +} diff --git a/opencompass/utils/postprocessors/xfinder/xfinder_utils/__init__.py b/opencompass/utils/postprocessors/xfinder/xfinder_utils/__init__.py new file mode 100644 index 00000000..b875d98d --- /dev/null +++ b/opencompass/utils/postprocessors/xfinder/xfinder_utils/__init__.py @@ -0,0 +1,3 @@ +from .convert_data import * # noqa +from .data_process import * # noqa +from .PROMPT_TEMPLATE import * # noqa diff --git a/opencompass/utils/postprocessors/xfinder/xfinder_utils/convert_data.py b/opencompass/utils/postprocessors/xfinder/xfinder_utils/convert_data.py new file mode 100644 index 00000000..ace88222 --- /dev/null +++ b/opencompass/utils/postprocessors/xfinder/xfinder_utils/convert_data.py @@ -0,0 +1,123 @@ +# Convert OpenCompass prediction data to XFinder format +import copy +import json +import re + +xfinder_template = { + 
'math': { + 'model_name': + '', + 'dataset': + '', + 'key_answer_type': + 'math', + 'question': + '', + 'llm_output': + '', + 'correct_answer': + '', + 'standard_answer_range': + 'a(n) number / set / vector / matrix / interval / expression / function / equation / inequality' # noqa + }, + 'alphabet_option': { + 'model_name': '', + 'dataset': '', + 'key_answer_type': 'alphabet_option', + 'question': '', + 'llm_output': '.', + 'correct_answer': '', + 'standard_answer_range': [] + }, + 'categorical_label': { + 'model_name': '', + 'dataset': '', + 'key_answer_type': '', + 'question': '', + 'llm_output': '', + 'correct_answer': '', + 'standard_answer_range': [] + }, + 'short_text': { + 'model_name': '', + 'dataset': '', + 'key_answer_type': 'short_text', + 'question': '', + 'llm_output': '', + 'correct_answer': '', + 'standard_answer_range': [] + } +} + + +def parse_options(text: str): + lines = text.split('\n') + parsed_options = [] + option_pattern = r'^[A-Z]\)|[A-Z]\.|[A-Z]\)|[A-Z]:|\([A-Z]\)' + for line in lines: + line = line.strip() + match = re.match(option_pattern, line) + if match: + option = '' + # 等于第一个属于选项的字符 + for c in line: + if c.isalpha(): + option = c + break + content_start = match.end() + 1 + content = line[content_start:].strip() + parsed_options.append([option, content]) + + return parsed_options + + +def convert_to_xfinder_format(typ, data, model_name='', dataset_name=''): + assert typ in xfinder_template.keys(), f'Invalid type {typ}' + format_data = [] + for item in data: + template = copy.deepcopy(xfinder_template[typ]) + question = item['origin_prompt'][-1]['prompt'] + llm_output = item['prediction'] + correct_answer = item['reference'] if item['reference'] else item[ + 'gold'] + template['correct_answer'] = correct_answer + template['model_name'] = model_name + template['dataset'] = dataset_name + template['question'] = question + template['llm_output'] = llm_output + try: + assert typ in list(xfinder_template.keys()) + if typ == 
'alphabet_option': + options = parse_options(question) + template['standard_answer_range'] = options + elif typ == 'short_text': + template['standard_answer_range'] = item['gold'] + elif typ == 'categorical_label': + pass + except Exception as e: + print(f'Error when parsing question options: {e}, skipping...') + continue + + format_data.append(template) + return format_data + + +if __name__ == '__main__': + # Test + example_data = { + 'origin_prompt': [{ + 'role': + 'HUMAN', + 'prompt': + 'Alice, Bob, Claire, Dave, and Eve are dancers at a square dance. At the start of a song, they each have a partner: Alice is dancing with Ophelia, Bob is dancing with Jamie, Claire is dancing with Melissa, Dave is dancing with Rodrigo, and Eve is dancing with Patrick.\nThroughout the song, the dancers often trade partners. First, Claire and Bob switch partners. Then, Claire and Eve switch partners. Then, Claire and Bob switch partners. Then, Eve and Dave switch partners. Finally, Claire and Alice switch partners. 
At the end of the dance, Alice is dancing with\nOptions:\n(A) Ophelia\n(B) Jamie\n(C) Melissa\n(D) Rodrigo\n(E) Patrick' # noqa + }], + 'origin_prediction': + '\n 答案: B) 前者小于后者', + 'prediction': + 'B', + 'reference': + 'A' + } + example_data = convert_to_xfinder_format('alphabet_option', [example_data], + 'GPT-3', 'OpenAI') + print(json.dumps(example_data, indent=4, ensure_ascii=False)) diff --git a/opencompass/utils/postprocessors/xfinder/xfinder_utils/data_process.py b/opencompass/utils/postprocessors/xfinder/xfinder_utils/data_process.py new file mode 100644 index 00000000..0cacd08b --- /dev/null +++ b/opencompass/utils/postprocessors/xfinder/xfinder_utils/data_process.py @@ -0,0 +1,24 @@ +import ast + + +class DataProcessor: + + def __init__(self): + pass + + def read_data(self, data): + for item in data: + if isinstance(item['standard_answer_range'], + str) and item['key_answer_type'] != 'math': + try: + item['standard_answer_range'] = ast.literal_eval( + item['standard_answer_range']) + except Exception as e: + print(f'Error: {e}') + print('Please check the form of standard_answer_range') + exit(0) + + item['standard_answer_range'] = str(item['standard_answer_range']) + item['key_answer_type'] = str(item['key_answer_type']) + + return data From faf5260155beb3aac0d4cfe93c8ba198edee8437 Mon Sep 17 00:00:00 2001 From: Hari Seldon <95674173+HariSeldon0@users.noreply.github.com> Date: Fri, 6 Sep 2024 00:59:41 +0800 Subject: [PATCH 8/9] [Feature] Optimize Evaluation Speed of SciCode (#1489) * update scicode * update comments * remove redundant variable * Update --------- Co-authored-by: tonysy --- opencompass/datasets/scicode.py | 74 ++++++++++++++++++------------ opencompass/utils/datasets_info.py | 4 +- 2 files changed, 46 insertions(+), 32 deletions(-) diff --git a/opencompass/datasets/scicode.py b/opencompass/datasets/scicode.py index e6a09e8b..bf8ee96e 100644 --- a/opencompass/datasets/scicode.py +++ b/opencompass/datasets/scicode.py @@ -1,3 +1,4 @@ +import 
concurrent.futures import json import os import os.path as osp @@ -95,10 +96,10 @@ def process_hdf5_datagroup(group): def process_hdf5_to_tuple(step_id, test_num): - H5PY_FILE_FOLDER = './data/scicode/' + H5PY_FILE_FOLDER = './data/scicode/test_data' H5PY_FILE_FOLDER = get_data_path(H5PY_FILE_FOLDER, local_mode=True) data_lst = [] - H5PY_FILE = os.path.join(H5PY_FILE_FOLDER, 'test_data.h5') + H5PY_FILE = os.path.join(H5PY_FILE_FOLDER, f'{step_id}.h5') assert os.path.exists( H5PY_FILE ), f"Please manually download 'test_data.h5' from https://github.com/open-compass/storage/releases/download/v0.1.0/scicode_test_data.zip and put the file in {H5PY_FILE}" # noqa: E501 @@ -217,7 +218,7 @@ def cmp_tuple_or_list(var1, var2): @ICL_EVALUATORS.register_module() class SciCodeEvaluator(BaseEvaluator): - def __init__(self, dataset_path, with_bg, testcode_path='./tmp/scicode'): + def __init__(self, dataset_path, with_bg): super().__init__() test_data = [] dataset_path = get_data_path(dataset_path, local_mode=True) @@ -229,8 +230,6 @@ class SciCodeEvaluator(BaseEvaluator): with open(file_path, 'r', encoding='utf-8') as file: test_data = json.load(file) self.dataset = Dataset.from_list(test_data) - self.testcode_path = testcode_path - H5PY_FILE = osp.join(dataset_path, 'test_data.h5') # noqa: F841 def extract_python_script(self, response: str): start_marker = '```python' @@ -271,25 +270,20 @@ class SciCodeEvaluator(BaseEvaluator): return 2 def score(self, predictions, references): - correct, sub_correct = 0, 0 - count, sub_count = 0, 0 - details = [] - - # generate all python test codes and than test + # generate all python test codes for idx, prediction_list in enumerate(predictions): # traverse each test sample problem_id = self.dataset[idx]['id'] num_of_subproblems = len(prediction_list) # create dir for each test sample - testdir_path = os.path.join(self.testcode_path, str(problem_id)) + testdir_path = os.path.join(self._out_dir, str(problem_id)) os.makedirs(testdir_path, 
exist_ok=True) python_code = '' # add import statement python_code += self.dataset[idx]['import'] - is_all_correct = True for sub_idx in range(num_of_subproblems): # extract code response = prediction_list[sub_idx] @@ -319,30 +313,50 @@ from opencompass.datasets.scicode import process_hdf5_to_tuple '\n') for idx2 in range(len(test_lst)): f.write(f'target = targets[{idx2}]\n\n') - for line in test_lst[idx2].split('\n'): - f.write(line + '\n') + for line in test_lst[idx2].split('\n'): + f.write(line + '\n') - # test - ret = self.run_script(testfile_path) - msg = {'problem': f'{problem_id}-{sub_idx + 1}'} - if ret == 0: # correct - sub_correct += 1 - msg['is_correct'] = True - elif ret == 1: # error - is_all_correct = False - msg['is_correct'] = False - else: # time out - is_all_correct = False - msg['is_correct'] = False - sub_count += 1 - details.append(msg) + # find all scripts + python_scripts = [] + for root, dirs, files in os.walk(self._out_dir): + for file in files: + if file.endswith('.py'): + python_scripts.append(os.path.join(root, file)) - correct += is_all_correct + # Use ThreadPoolExecutor to concurrently execute scripts + with concurrent.futures.ThreadPoolExecutor() as executor: + # Submit task and obtain Future object + futures = [ + executor.submit(self.run_script, script) + for script in python_scripts + ] + + results = [] + for future in concurrent.futures.as_completed(futures): + result = future.result() + results.append(result) + + all_results = {} + for script_path, result in zip(python_scripts, results): + basename = os.path.basename(script_path) + main_id = basename.split('-')[0] + if all_results.get(main_id): + all_results[main_id].append(result) + else: + all_results[main_id] = [result] + + correct, sub_correct = 0, 0 + count, sub_count = 0, 0 + + for main_id in all_results: + correct += sum(all_results[main_id]) == 0 count += 1 + for sub in all_results[main_id]: + sub_correct += sub == 0 + sub_count += 1 result = { 'accuracy': 100 * correct / 
count, 'sub_accuracy': 100 * sub_correct / sub_count, - 'details': details } return result diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index aa75ea4d..8ee208ea 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -370,9 +370,9 @@ DATASETS_URL = { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ruler.zip", "md5": "c60bdfff3d02358067104cc1dea7c0f7", }, - "scicode/": { + "/scicode": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/scicode.zip", - "md5": "06f64edad6680072e5bca3f0ce892d0c", + "md5": "9c6c64b8c70edc418f713419ea39989c", }, "/commonsenseqa": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/commonsenseqa.zip", From 928d0cfc3ac6b1727d814a23cb861ad15bad51d5 Mon Sep 17 00:00:00 2001 From: Albert Yan Date: Fri, 6 Sep 2024 02:00:43 +0900 Subject: [PATCH 9/9] [Feature] Add support for Rendu API (#1468) * Add support for Rendu API * fix lint issue * fix lint issue * fix lint issue * Update --------- Co-authored-by: 13190 Co-authored-by: tonysy --- configs/api_examples/eval_api_rendu.py | 39 ++++++ opencompass/models/__init__.py | 1 + opencompass/models/rendu_api.py | 175 +++++++++++++++++++++++++ 3 files changed, 215 insertions(+) create mode 100644 configs/api_examples/eval_api_rendu.py create mode 100644 opencompass/models/rendu_api.py diff --git a/configs/api_examples/eval_api_rendu.py b/configs/api_examples/eval_api_rendu.py new file mode 100644 index 00000000..2b2a2647 --- /dev/null +++ b/configs/api_examples/eval_api_rendu.py @@ -0,0 +1,39 @@ +from mmengine.config import read_base +from opencompass.models import Rendu +from opencompass.partitioners import NaivePartitioner +from opencompass.runners.local_api import LocalAPIRunner +from opencompass.tasks import OpenICLInferTask + +with read_base(): + from ..summarizers.medium import summarizer + from ..datasets.ceval.ceval_gen import ceval_datasets + +datasets = [ 
+ *ceval_datasets +] + +models = [ + dict( + abbr='Rendu', + type=Rendu, + path='rendu', + key='xxxxxx', + url='xxxxxx', + generation_kwargs={ + 'temperature': 0.1, + 'top_p': 0.9, + }, + query_per_second=10, + max_out_len=2048, + max_seq_len=2048, + batch_size=8), +] + +infer = dict(partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalAPIRunner, + max_num_workers=1, + concurrent_users=1, + task=dict(type=OpenICLInferTask)), ) + +work_dir = 'outputs/api_rendu/' diff --git a/opencompass/models/__init__.py b/opencompass/models/__init__.py index ab75a4ab..0d384fed 100644 --- a/opencompass/models/__init__.py +++ b/opencompass/models/__init__.py @@ -35,6 +35,7 @@ from .openai_api import OpenAI # noqa: F401 from .openai_api import OpenAISDK # noqa: F401 from .pangu_api import PanGu # noqa: F401 from .qwen_api import Qwen # noqa: F401 +from .rendu_api import Rendu # noqa: F401 from .sensetime_api import SenseTime # noqa: F401 from .stepfun_api import StepFun # noqa: F401 from .turbomind import TurboMindModel # noqa: F401 diff --git a/opencompass/models/rendu_api.py b/opencompass/models/rendu_api.py new file mode 100644 index 00000000..a88741a7 --- /dev/null +++ b/opencompass/models/rendu_api.py @@ -0,0 +1,175 @@ +import time +from concurrent.futures import ThreadPoolExecutor +from typing import Dict, List, Optional, Union + +import requests + +from opencompass.utils.prompt import PromptList + +from .base_api import BaseAPIModel + +PromptType = Union[PromptList, str] + + +class Rendu(BaseAPIModel): + """Model wrapper around Rendu. + Documentation: + + Args: + path (str): The name of Rendu model. + e.g. `Rendu` + key (str): Authorization key. + url (str): model url. + query_per_second (int): The maximum queries allowed per second + between two consecutive calls of the API. Defaults to 1. + max_seq_len (int): Unused here. 
+ meta_template (Dict, optional): The model's meta prompt + template if needed, in case the requirement of injecting or + wrapping of any meta instructions. + retry (int): Number of retires if the API call fails. Defaults to 2. + """ + is_api: bool = True + + def __init__(self, + path: str, + key: str, + url: str, + query_per_second: int = 2, + max_seq_len: int = 2048, + meta_template: Optional[Dict] = None, + retry: int = 2, + generation_kwargs: Dict = { + 'temperature': 0.7, + 'top_p': 0.9, + }): + super().__init__(path=path, + max_seq_len=max_seq_len, + query_per_second=query_per_second, + meta_template=meta_template, + retry=retry, + generation_kwargs=generation_kwargs) + + self.url = url + self.key = key + self.model = path + self.headers = { + 'Content-Type': 'application/json', + 'Authorization': 'Bearer ' + self.key, + } + + def generate( + self, + inputs: List[PromptType], + max_out_len: int = 512, + ) -> List[str]: + """Generate results given a list of inputs. + + Args: + inputs (List[PromptType]): A list of strings or PromptDicts. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. + """ + with ThreadPoolExecutor() as executor: + results = list( + executor.map(self._generate, inputs, + [max_out_len] * len(inputs))) + self.flush() + return results + + def _generate( + self, + input: PromptType, + max_out_len: int = 512, + ) -> str: + """Generate results given an input. + + Args: + input (PromptType): A string or PromptDict. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + str: The generated string. 
+ """ + assert isinstance(input, (str, PromptList)) + + if isinstance(input, str): + messages = [{'role': 'user', 'content': input}] + else: + messages = [] + msg_buffer, last_role = [], None + for item in input: + item['role'] = 'assistant' if item['role'] == 'BOT' else 'user' + if item['role'] != last_role and last_role is not None: + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + msg_buffer = [] + msg_buffer.append(item['prompt']) + last_role = item['role'] + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + + data = { + 'model': self.model, + 'messages': messages, + } + data.update(self.generation_kwargs) + + max_num_retries = 0 + while max_num_retries < self.retry: + self.acquire() + try: + raw_response = requests.request('POST', + url=self.url, + headers=self.headers, + json=data) + except Exception as err: + print('Request Error:{}'.format(err)) + time.sleep(2) + continue + + response = raw_response.json() + self.release() + + if response is None: + print('Connection error, reconnect.') + # if connect error, frequent requests will casuse + # continuous unstable network, therefore wait here + # to slow down the request + self.wait() + continue + + if raw_response.status_code == 200: + # msg = json.load(response.text) + # response + msg = response['choices'][0]['message']['content'] + return msg + + if raw_response.status_code == 403: + print('请求被拒绝 api_key错误') + continue + elif raw_response.status_code == 400: + print(messages, response) + print('请求失败,状态码:', raw_response) + msg = 'The request was rejected because high risk' + return msg + time.sleep(1) + continue + elif raw_response.status_code == 429: + print(messages, response) + print('请求失败,状态码:', raw_response) + time.sleep(5) + continue + + max_num_retries += 1 + + raise RuntimeError(raw_response)