Merge branch 'main' of github.com:open-compass/opencompass into olymmath

This commit is contained in:
liushz 2025-04-02 09:17:52 +00:00
commit 4601839ba8
15 changed files with 212 additions and 108 deletions


@@ -24,8 +24,8 @@ internlm2_5-7b-chat-hf_fullbench:
     lcb_test_output_pass@1: 18.75
     bbh-logical_deduction_seven_objects_score: 50
     bbh-multistep_arithmetic_two_score: 68.75
-    mmlu-other_naive_average: 72.6
-    cmmlu-china-specific_naive_average: 76.25
+    mmlu-other_accuracy: 72.6
+    cmmlu-china-specific_accuracy: 76.25
     mmlu_pro_math_accuracy: 25
     ds1000_Pandas_accuracy: 12.5
     ds1000_Numpy_accuracy: 0
@@ -101,8 +101,8 @@ internlm2_5-7b-chat-turbomind_fullbench:
     lcb_test_output_pass@1: 25.00
     bbh-logical_deduction_seven_objects_score: 50.00
     bbh-multistep_arithmetic_two_score: 68.75
-    mmlu-other_naive_average: 69.71
-    cmmlu-china-specific_naive_average: 75.83
+    mmlu-other_accuracy: 69.71
+    cmmlu-china-specific_accuracy: 75.83
     mmlu_pro_math_accuracy: 31.25
     ds1000_Pandas_accuracy: 0
     ds1000_Numpy_accuracy: 0
@@ -234,15 +234,15 @@ internlm2_5-7b-turbomind:
     sanitized_mbpp_score: 55.25
     dingo_en_192_score: 60.94
     dingo_zh_170_score: 67.65
-    mmlu-stem_naive_average: 63.72
-    mmlu-social-science_naive_average: 80.15
-    mmlu-humanities_naive_average: 74.27
-    mmlu-other_naive_average: 71.85
-    cmmlu-stem_naive_average: 67.07
-    cmmlu-social-science_naive_average: 81.49
-    cmmlu-humanities_naive_average: 85.84
-    cmmlu-other_naive_average: 82.69
-    cmmlu-china-specific_naive_average: 79.88
+    mmlu-stem_accuracy: 63.72
+    mmlu-social-science_accuracy: 80.15
+    mmlu-humanities_accuracy: 74.27
+    mmlu-other_accuracy: 71.85
+    cmmlu-stem_accuracy: 67.07
+    cmmlu-social-science_accuracy: 81.49
+    cmmlu-humanities_accuracy: 85.84
+    cmmlu-other_accuracy: 82.69
+    cmmlu-china-specific_accuracy: 79.88
     mmlu_pro_biology_accuracy: 58.58
     mmlu_pro_business_accuracy: 28.01
     mmlu_pro_chemistry_accuracy: 22.79
@@ -281,12 +281,12 @@ internlm2_5-7b-turbomind:
     longbench_naive_average: 46.19
     longbench_zh_naive_average: 49.3
     longbench_en_naive_average: 43.97
-    longbench_single-document-qa_naive_average: 42.84
-    longbench_multi-document-qa_naive_average: 37.29
-    longbench_summarization_naive_average: 23.21
-    longbench_few-shot-learning_naive_average: 61.67
-    longbench_synthetic-tasks_naive_average: 60.05
-    longbench_code-completion_naive_average: 52.09
+    longbench_single-document-qa_score: 42.84
+    longbench_multi-document-qa_score: 41.25
+    longbench_summarization_score: 23.21
+    longbench_few-shot-learning_score: 61.67
+    longbench_synthetic-tasks_score: 60.05
+    longbench_code-completion_score: 52.09
 
 internlm2_5-7b-chat-turbomind:
   objective:
@@ -327,15 +327,15 @@ internlm2_5-7b-chat-turbomind:
     teval_naive_average: 80
     SciCode_sub_accuracy: 5.56
     qa_dingo_cn_score: 99.01
-    mmlu-stem_naive_average: 68.2
-    mmlu-social-science_naive_average: 75.8
-    mmlu-humanities_naive_average: 69.3
-    mmlu-other_naive_average: 71.3
-    cmmlu-stem_naive_average: 66.64
-    cmmlu-social-science_naive_average: 76
-    cmmlu-humanities_naive_average: 77.9
-    cmmlu-other_naive_average: 77.25
-    cmmlu-china-specific_naive_average: 73.6
+    mmlu-stem_accuracy: 68.2
+    mmlu-social-science_accuracy: 75.8
+    mmlu-humanities_accuracy: 69.3
+    mmlu-other_accuracy: 71.3
+    cmmlu-stem_accuracy: 66.64
+    cmmlu-social-science_accuracy: 76
+    cmmlu-humanities_accuracy: 77.9
+    cmmlu-other_accuracy: 77.25
+    cmmlu-china-specific_accuracy: 73.6
     mmlu_pro_biology_accuracy: 66.67
     mmlu_pro_business_accuracy: 47.91
     mmlu_pro_chemistry_accuracy: 35
@@ -448,9 +448,9 @@ internlm2_5-7b-chat-1m-turbomind:
     babilong_32k_naive_average: 48.9
     babilong_128k_naive_average: 40.8
     babilong_256k_naive_average: 23.5
-    longbench_single-document-qa_naive_average: 43.56
-    longbench_multi-document-qa_naive_average: 46.24
-    longbench_summarization_naive_average: 24.32
-    longbench_few-shot-learning_naive_average: 51.67
-    longbench_synthetic-tasks_naive_average: 66.83
-    longbench_code-completion_naive_average: 45.99
+    longbench_single-document-qa_score: 43.56
+    longbench_multi-document-qa_score: 46.24
+    longbench_summarization_score: 24.32
+    longbench_few-shot-learning_score: 51.67
+    longbench_synthetic-tasks_score: 66.83
+    longbench_code-completion_score: 45.99


@@ -157,7 +157,9 @@ jobs:
 pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
 pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
 pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
-pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}}
+pip install opencompass[full] --cache-dir ${{env.PIP_CACHE_PATH}}
+pip install opencompass[api] --cache-dir ${{env.PIP_CACHE_PATH}}
+pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --cache-dir ${{env.PIP_CACHE_PATH}}
 FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /fs-computility/llm/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
 pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
 cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data


@@ -45,7 +45,7 @@ jobs:
 . ${{env.CONDA_PATH}}/bin/activate
 conda activate ${{env.CONDA_ENV}}
 python3 -m pip uninstall opencompass -y
-python3 -m pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
+python3 -m pip install -e ".[full]" --cache-dir ${{env.PIP_CACHE_PATH}}
 conda info --envs
 - name: conda env
   run: |

README.md (102 lines changed)

@@ -176,69 +176,83 @@ Some third-party features, like Humaneval and Llama, may require additional step
 After ensuring that OpenCompass is installed correctly according to the above steps and the datasets are prepared. Now you can start your first evaluation using OpenCompass!
 
-- Your first evaluation with OpenCompass!
+### Your first evaluation with OpenCompass!
 
 OpenCompass support setting your configs via CLI or a python script. For simple evaluation settings we recommend using CLI, for more complex evaluation, it is suggested using the script way. You can find more example scripts under the configs folder.
 
 ```bash
 # CLI
 opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen
 # Python scripts
 opencompass examples/eval_chat_demo.py
 ```
 
 You can find more script examples under [examples](./examples) folder.
 
-- API evaluation
+### API evaluation
 
 OpenCompass, by its design, does not really discriminate between open-source models and API models. You can evaluate both model types in the same way or even in one settings.
 
 ```bash
 export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
 # CLI
 opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen
 # Python scripts
 opencompass examples/eval_api_demo.py
 # You can use o1_mini_2024_09_12/o1_preview_2024_09_12 for o1 models, we set max_completion_tokens=8192 as default.
 ```
 
-- Accelerated Evaluation
+### Accelerated Evaluation
 
 Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
 
 ```bash
 # CLI
 opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
 # Python scripts
 opencompass examples/eval_lmdeploy_demo.py
 ```
 
-- Supported Models
+### Supported Models and Datasets
 
 OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
 
 ```bash
 # List all configurations
 python tools/list_configs.py
 # List all configurations related to llama and mmlu
 python tools/list_configs.py llama mmlu
 ```
 
-If the model is not on the list but supported by Huggingface AutoModel class, you can also evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists.
+#### Supported Models
+
+If the model is not on the list but supported by Huggingface AutoModel class or encapsulation of inference engine based on OpenAI interface (see [docs](https://opencompass.readthedocs.io/en/latest/advanced_guides/new_model.html) for details), you can also evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists.
 
 ```bash
 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
 ```
 
-If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`.
+#### Supported Datasets
+
+Currently, OpenCompass have provided standard recommended configurations for datasets. Generally, config files ending with `_gen.py` or `_llm_judge_gen.py` will point to the recommended config we provide for this dataset. You can refer to [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for more details.
+
+```bash
+# Recommended Evaluation Config based on Rules
+opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat
+# Recommended Evaluation Config based on LLM Judge
+opencompass --datasets aime2024_llm_judge_gen --models hf_internlm2_5_1_8b_chat
+```
+
+If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`.
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
 ```
 
 > \[!TIP\]
 >
@@ -288,7 +302,7 @@ You can quickly find the dataset you need from the list through sorting, filteri
 In addition, we provide a recommended configuration for each dataset, and some datasets also support LLM Judge-based configurations.
 
-Please refer to the dataset statistics chapter of [official document](https://opencompass.org.cn/doc) for details.
+Please refer to the dataset statistics chapter of [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for details.
 
 <p align="right"><a href="#top">🔝Back to top</a></p>


@@ -208,9 +208,9 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce
 opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
 ```
 
-OpenCompass 预定义了许多模型和数据集的配置,你可以通过 [工具](./docs/zh_cn/tools.md#ListConfigs) 列出所有可用的模型和数据集配置。
-- ### 支持的模型
+- ### 支持的模型与数据集
+OpenCompass 预定义了许多模型和数据集的配置,你可以通过 [工具](./docs/zh_cn/tools.md#ListConfigs) 列出所有可用的模型和数据集配置。
 
 ```bash
 # 列出所有配置
@@ -219,13 +219,27 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce
 python tools/list_configs.py llama mmlu
 ```
 
-如果模型不在列表中但支持 Huggingface AutoModel 类,您仍然可以使用 OpenCompass 对其进行评估。欢迎您贡献维护 OpenCompass 支持的模型和数据集列表。
+#### 支持的模型
+
+如果模型不在列表中,但支持 Huggingface AutoModel 类或支持针对 OpenAI 接口的推理引擎封装(详见[官方文档](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/new_model.html)),您仍然可以使用 OpenCompass 对其进行评估。欢迎您贡献维护 OpenCompass 支持的模型和数据集列表。
 
 ```bash
 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
 ```
 
-如果你想在多块 GPU 上使用模型进行推理,您可以使用 `--max-num-worker` 参数。
+#### 支持的数据集
+
+目前OpenCompass针对数据集给出了标准的推荐配置。通常`_gen.py`或`_llm_judge_gen.py`为结尾的配置文件将指向我们为该数据集提供的推荐配置。您可以参阅[官方文档](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html) 的数据集统计章节来获取详细信息。
+
+```bash
+# 基于规则的推荐配置
+opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat
+# 基于LLM Judge的推荐配置
+opencompass --datasets aime2024_llm_judge_gen --models hf_internlm2_5_1_8b_chat
+```
+
+此外,如果你想在多块 GPU 上使用模型进行推理,您可以使用 `--max-num-worker` 参数。
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
@@ -281,9 +295,7 @@ OpenCompass 是面向大模型评测的一站式平台。其主要特点如下
 您可以通过排序、筛选和搜索等功能从列表中快速找到您需要的数据集。
 
-另外,我们为每个数据集都提供了一种推荐配置,部分数据集还支持了基于LLM Judge的配置。
-详情请参阅 [官方文档](https://opencompass.org.cn/doc) 的数据集统计章节。
+详情请参阅 [官方文档](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html) 的数据集统计章节。
 
 <p align="right"><a href="#top">🔝返回顶部</a></p>


@@ -121,7 +121,7 @@
     category: Reasoning
     paper: https://arxiv.org/pdf/2310.16049
     configpath: opencompass/configs/datasets/musr/musr_gen.py
-    configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/musr/musr_llm_judge_gen.py
 - needlebench:
     name: NeedleBench
     category: Long Context


@@ -32,12 +32,23 @@ with open(load_path, 'r') as f2:
 HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
 
+recommanded_dataset_list = [
+    'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
+    'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
+    'mmlu_pro', 'musr'
+]
+
 
 def table_format(data_list):
     table_format_list = []
     for i in data_list:
         table_format_list_sub = []
         for j in i:
+            if j in recommanded_dataset_list:
+                link_token = '[link]('
+            else:
+                link_token = '[link(TBD)]('
             for index in HEADER:
                 if index == 'paper':
                     table_format_list_sub.append('[link](' + i[j][index] + ')')
@@ -45,18 +56,18 @@ def table_format(data_list):
                     if i[j][index] == '':
                         table_format_list_sub.append(i[j][index])
                     else:
-                        table_format_list_sub.append('[link](' +
+                        table_format_list_sub.append(link_token +
                                                      GITHUB_PREFIX +
                                                      i[j][index] + ')')
                 elif index == 'configpath':
                     if isinstance(i[j][index], list):
                         sub_list_text = ''
                         for k in i[j][index]:
-                            sub_list_text += ('[link](' + GITHUB_PREFIX + k +
+                            sub_list_text += (link_token + GITHUB_PREFIX + k +
                                               ') / ')
                         table_format_list_sub.append(sub_list_text[:-2])
                     else:
-                        table_format_list_sub.append('[link](' +
+                        table_format_list_sub.append(link_token +
                                                      GITHUB_PREFIX +
                                                      i[j][index] + ')')
                 else:
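
The effect of the `link_token` switch above: datasets listed in `recommanded_dataset_list` keep a plain `[link](...)` cell in the generated statistics table, while every other dataset is rendered as `[link(TBD)](...)` to mark that its recommended config is still pending. A minimal standalone sketch of that cell-rendering logic (the `render_config_cell` helper and the `GITHUB_PREFIX` value below are illustrative assumptions, not taken from the script):

```python
# Standalone sketch of the link_token selection shown in the diff above.
# render_config_cell and the GITHUB_PREFIX value are illustrative assumptions.
recommanded_dataset_list = [
    'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
    'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
    'mmlu_pro', 'musr'
]
GITHUB_PREFIX = 'https://github.com/open-compass/opencompass/blob/main/'


def render_config_cell(dataset_key: str, config_path: str) -> str:
    """Render one markdown cell linking to a dataset's config file."""
    if dataset_key in recommanded_dataset_list:
        link_token = '[link]('       # recommended config available
    else:
        link_token = '[link(TBD)]('  # recommended config still to be added
    return link_token + GITHUB_PREFIX + config_path + ')'


print(render_config_cell('mmlu', 'opencompass/configs/datasets/mmlu/mmlu_gen.py'))
print(render_config_cell('needlebench', 'opencompass/configs/datasets/needlebench/needlebench_gen.py'))
```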


@@ -30,12 +30,23 @@ with open(load_path, 'r') as f2:
 HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
 
+recommanded_dataset_list = [
+    'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
+    'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
+    'mmlu_pro', 'musr'
+]
+
 
 def table_format(data_list):
     table_format_list = []
     for i in data_list:
         table_format_list_sub = []
         for j in i:
+            if j in recommanded_dataset_list:
+                link_token = '[链接]('
+            else:
+                link_token = '[链接(TBD)]('
             for index in HEADER:
                 if index == 'paper':
                     table_format_list_sub.append('[链接](' + i[j][index] + ')')
@@ -43,17 +54,19 @@ def table_format(data_list):
                     if i[j][index] == '':
                         table_format_list_sub.append(i[j][index])
                     else:
-                        table_format_list_sub.append('[链接](' + GITHUB_PREFIX +
+                        table_format_list_sub.append(link_token +
+                                                     GITHUB_PREFIX +
                                                      i[j][index] + ')')
                 elif index == 'configpath':
                     if isinstance(i[j][index], list):
                         sub_list_text = ''
                         for k in i[j][index]:
-                            sub_list_text += ('[链接](' + GITHUB_PREFIX + k +
+                            sub_list_text += (link_token + GITHUB_PREFIX + k +
                                               ') / ')
                         table_format_list_sub.append(sub_list_text[:-2])
                     else:
-                        table_format_list_sub.append('[链接](' + GITHUB_PREFIX +
+                        table_format_list_sub.append(link_token +
+                                                     GITHUB_PREFIX +
                                                      i[j][index] + ')')
                 else:
                     table_format_list_sub.append(i[j][index])


@@ -0,0 +1,15 @@
+from opencompass.models import TurboMindModel
+
+models = [
+    dict(
+        type=TurboMindModel,
+        abbr='internvl2_5-38b-turbomind',
+        path='OpenGVLab/InternVL2_5-38B',
+        engine_config=dict(session_len=8192, max_batch_size=8, tp=4),
+        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
+        max_seq_len=8192,
+        max_out_len=8192,
+        batch_size=8,
+        run_cfg=dict(num_gpus=4),
+    )
+]


@@ -0,0 +1,15 @@
+from opencompass.models import TurboMindModel
+
+models = [
+    dict(
+        type=TurboMindModel,
+        abbr='internvl2_5-8b-turbomind',
+        path='OpenGVLab/InternVL2_5-8B',
+        engine_config=dict(session_len=8192, max_batch_size=16, tp=1),
+        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=8192),
+        max_seq_len=8192,
+        max_out_len=8192,
+        batch_size=16,
+        run_cfg=dict(num_gpus=1),
+    )
+]
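
For reference, predefined model configs like the two new TurboMind entries above are usually pulled into an evaluation script with `read_base`. A sketch under assumptions (the import path for the new InternVL2.5 config is a hypothetical placeholder, since the diff does not show where the file lives):

```python
# Sketch only: consuming a predefined model config in an eval script.
from mmengine.config import read_base

with read_base():
    # Hypothetical module path for the InternVL2.5-8B config added above;
    # replace it with the actual location of the new file.
    from opencompass.configs.models.internvl.lmdeploy_internvl2_5_8b import \
        models as internvl2_5_8b_models

models = internvl2_5_8b_models
```

The resulting script can then be run with `opencompass <script>.py`, as in the README examples earlier in this commit.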


@@ -48,7 +48,7 @@ def clean_units(pred_str: str):
 def number_it(num):
-    from latex2sympy2 import latex2sympy
+    from latex2sympy2_extended import latex2sympy
     if isinstance(num, (int, float)):
         return num


@@ -17,7 +17,7 @@ def time_limit(seconds: float):
 def extract_theoremqa_answer(pred: str, answer_flag: bool = True):
-    from latex2sympy2 import latex2sympy
+    from latex2sympy2_extended import latex2sympy
     if any([option in pred.lower() for option in ['yes', 'true']]):
         pred = 'True'
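
Both swaps above replace `latex2sympy2` with `latex2sympy2_extended` while keeping the same `latex2sympy` entry point, so the call sites are untouched. A quick, hedged usage example (the sample expression and printed values are illustrative, not taken from the diff):

```python
# latex2sympy2_extended exposes the same latex2sympy function used above,
# so only the import line changes at existing call sites.
from latex2sympy2_extended import latex2sympy

expr = latex2sympy(r'\frac{1}{2} + \frac{1}{4}')  # parse LaTeX into a sympy object
print(expr)         # a sympy expression equivalent to 3/4
print(float(expr))  # 0.75
```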


@@ -33,6 +33,7 @@ class ClaudeSDK(BaseAPIModel):
         max_seq_len: int = 2048,
         meta_template: Optional[Dict] = None,
         temperature: Optional[float] = 0.0,
+        thinking: Optional[Dict] = None,
         retry: int = 2,
     ):
         super().__init__(path=path,
@@ -49,6 +50,7 @@ class ClaudeSDK(BaseAPIModel):
         self.anthropic = Anthropic(api_key=key)
         self.model = path
         self.temperature = temperature
+        self.thinking = thinking
 
     def generate(
         self,
@@ -108,11 +110,26 @@ class ClaudeSDK(BaseAPIModel):
         while num_retries < self.retry:
             self.wait()
             try:
-                responses = self.anthropic.messages.create(
-                    model=self.model,
-                    max_tokens=max_out_len,
-                    temperature=self.temperature,
-                    messages=messages)
+                api_params = {
+                    'model': self.model,
+                    'max_tokens': max_out_len,
+                    'temperature': self.temperature,
+                    'messages': messages,
+                }
+
+                if self.thinking is not None:
+                    api_params['thinking'] = self.thinking
+                    api_params['stream'] = True
+
+                responses = self.anthropic.messages.create(**api_params)
+
+                # Handle new response format
+                for content in responses.content:
+                    if content.type == 'text':
+                        return content.text
+                # If no text type content is found, return the first
+                # content (backward compatibility)
                 return responses.content[0].text
             except Exception as e:
                 self.logger.error(e)
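
With the new `thinking` argument, Anthropic's extended thinking can be switched on from a model config. A sketch under assumptions (the model name, key placeholder, token budget, and the `ClaudeSDK` import path are illustrative; the `thinking` dict follows Anthropic's documented `{'type': 'enabled', 'budget_tokens': ...}` format):

```python
# Sketch only: enabling extended thinking via the new `thinking` argument.
# Model name, key and budget are illustrative placeholders.
from opencompass.models import ClaudeSDK  # export path assumed

models = [
    dict(
        type=ClaudeSDK,
        abbr='claude-thinking',
        path='claude-3-7-sonnet-20250219',
        key='YOUR_ANTHROPIC_API_KEY',
        max_out_len=16384,
        thinking=dict(type='enabled', budget_tokens=4096),
        temperature=1,  # Anthropic's docs call for temperature=1 with thinking
    )
]
```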


@@ -652,7 +652,6 @@ class OpenAISDK(OpenAI):
                     self.logger.info('Start calling OpenAI API')
                 responses = self.openai_client.chat.completions.create(
                     **query_data, timeout=timeout)  # timeout in seconds
-
                 if self.verbose:
                     self.logger.info(
                         'Successfully get response from OpenAI API')
@@ -660,10 +659,18 @@
                         self.logger.info(responses)
                     except Exception:
                         pass  # noqa F841
 
-                if not responses.choices:
+                # Check if response is empty or content is empty
+                if not responses.choices or not responses.choices[
+                        0].message.content:
                     self.logger.error(
-                        'Response is empty, it is an internal server error \
-                        from the API provider.')
+                        'API response is empty, it might be due to excessive '
+                        'input length or an internal server error '
+                        'from your API provider.')
+                    num_retries += 1
+                    # Continue to retry instead of returning empty response
+                    continue
+
                 return responses.choices[0].message.content
             except (BadRequestError, APIStatusError) as e:
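
The change above makes an empty API response trigger another retry instead of being handed back to the caller as an empty string. A condensed, self-contained sketch of that pattern (the function and parameter names here are illustrative, not the real `OpenAISDK` attributes):

```python
# Condensed sketch of the retry-on-empty-response behaviour introduced above.
# `client` is an openai.OpenAI instance; all names are illustrative.
def chat_with_retry(client, query_data: dict, max_retries: int = 3,
                    timeout: int = 60) -> str:
    num_retries = 0
    while num_retries < max_retries:
        responses = client.chat.completions.create(**query_data,
                                                    timeout=timeout)
        # Retry when the provider returns no choices or an empty message,
        # e.g. for over-long inputs or transient internal errors.
        if not responses.choices or not responses.choices[0].message.content:
            num_retries += 1
            continue
        return responses.choices[0].message.content
    raise RuntimeError('OpenAI API returned an empty response after retries.')
```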


@@ -11,12 +11,10 @@ faiss_gpu==1.7.2
 -e git+https://github.com/open-compass/human-eval.git#egg=human-eval
 # IFEval
 langdetect
-# TheoremQA
-latex2sympy2==1.9.1
 # Lawbench, leval
 ltp
 # Math
-math-verify
+math-verify[antlr4_11_0]
 # Taco, apps Dataset
 pyext
 # Law Bench