Merge branch 'main' of github.com:open-compass/opencompass into olymmath

This commit is contained in:
liushz 2025-04-02 09:17:52 +00:00
commit 4601839ba8
15 changed files with 212 additions and 108 deletions


@@ -24,8 +24,8 @@ internlm2_5-7b-chat-hf_fullbench:
     lcb_test_output_pass@1: 18.75
     bbh-logical_deduction_seven_objects_score: 50
     bbh-multistep_arithmetic_two_score: 68.75
-    mmlu-other_naive_average: 72.6
-    cmmlu-china-specific_naive_average: 76.25
+    mmlu-other_accuracy: 72.6
+    cmmlu-china-specific_accuracy: 76.25
     mmlu_pro_math_accuracy: 25
     ds1000_Pandas_accuracy: 12.5
     ds1000_Numpy_accuracy: 0
@@ -101,8 +101,8 @@ internlm2_5-7b-chat-turbomind_fullbench:
     lcb_test_output_pass@1: 25.00
     bbh-logical_deduction_seven_objects_score: 50.00
     bbh-multistep_arithmetic_two_score: 68.75
-    mmlu-other_naive_average: 69.71
-    cmmlu-china-specific_naive_average: 75.83
+    mmlu-other_accuracy: 69.71
+    cmmlu-china-specific_accuracy: 75.83
     mmlu_pro_math_accuracy: 31.25
     ds1000_Pandas_accuracy: 0
     ds1000_Numpy_accuracy: 0
@@ -234,15 +234,15 @@ internlm2_5-7b-turbomind:
     sanitized_mbpp_score: 55.25
     dingo_en_192_score: 60.94
     dingo_zh_170_score: 67.65
-    mmlu-stem_naive_average: 63.72
-    mmlu-social-science_naive_average: 80.15
-    mmlu-humanities_naive_average: 74.27
-    mmlu-other_naive_average: 71.85
-    cmmlu-stem_naive_average: 67.07
-    cmmlu-social-science_naive_average: 81.49
-    cmmlu-humanities_naive_average: 85.84
-    cmmlu-other_naive_average: 82.69
-    cmmlu-china-specific_naive_average: 79.88
+    mmlu-stem_accuracy: 63.72
+    mmlu-social-science_accuracy: 80.15
+    mmlu-humanities_accuracy: 74.27
+    mmlu-other_accuracy: 71.85
+    cmmlu-stem_accuracy: 67.07
+    cmmlu-social-science_accuracy: 81.49
+    cmmlu-humanities_accuracy: 85.84
+    cmmlu-other_accuracy: 82.69
+    cmmlu-china-specific_accuracy: 79.88
     mmlu_pro_biology_accuracy: 58.58
     mmlu_pro_business_accuracy: 28.01
     mmlu_pro_chemistry_accuracy: 22.79
@@ -281,12 +281,12 @@ internlm2_5-7b-turbomind:
     longbench_naive_average: 46.19
     longbench_zh_naive_average: 49.3
     longbench_en_naive_average: 43.97
-    longbench_single-document-qa_naive_average: 42.84
-    longbench_multi-document-qa_naive_average: 37.29
-    longbench_summarization_naive_average: 23.21
-    longbench_few-shot-learning_naive_average: 61.67
-    longbench_synthetic-tasks_naive_average: 60.05
-    longbench_code-completion_naive_average: 52.09
+    longbench_single-document-qa_score: 42.84
+    longbench_multi-document-qa_score: 41.25
+    longbench_summarization_score: 23.21
+    longbench_few-shot-learning_score: 61.67
+    longbench_synthetic-tasks_score: 60.05
+    longbench_code-completion_score: 52.09
 
 internlm2_5-7b-chat-turbomind:
   objective:
@@ -327,15 +327,15 @@ internlm2_5-7b-chat-turbomind:
     teval_naive_average: 80
     SciCode_sub_accuracy: 5.56
     qa_dingo_cn_score: 99.01
-    mmlu-stem_naive_average: 68.2
-    mmlu-social-science_naive_average: 75.8
-    mmlu-humanities_naive_average: 69.3
-    mmlu-other_naive_average: 71.3
-    cmmlu-stem_naive_average: 66.64
-    cmmlu-social-science_naive_average: 76
-    cmmlu-humanities_naive_average: 77.9
-    cmmlu-other_naive_average: 77.25
-    cmmlu-china-specific_naive_average: 73.6
+    mmlu-stem_accuracy: 68.2
+    mmlu-social-science_accuracy: 75.8
+    mmlu-humanities_accuracy: 69.3
+    mmlu-other_accuracy: 71.3
+    cmmlu-stem_accuracy: 66.64
+    cmmlu-social-science_accuracy: 76
+    cmmlu-humanities_accuracy: 77.9
+    cmmlu-other_accuracy: 77.25
+    cmmlu-china-specific_accuracy: 73.6
     mmlu_pro_biology_accuracy: 66.67
     mmlu_pro_business_accuracy: 47.91
     mmlu_pro_chemistry_accuracy: 35
@@ -448,9 +448,9 @@ internlm2_5-7b-chat-1m-turbomind:
     babilong_32k_naive_average: 48.9
     babilong_128k_naive_average: 40.8
     babilong_256k_naive_average: 23.5
-    longbench_single-document-qa_naive_average: 43.56
-    longbench_multi-document-qa_naive_average: 46.24
-    longbench_summarization_naive_average: 24.32
-    longbench_few-shot-learning_naive_average: 51.67
-    longbench_synthetic-tasks_naive_average: 66.83
-    longbench_code-completion_naive_average: 45.99
+    longbench_single-document-qa_score: 43.56
+    longbench_multi-document-qa_score: 46.24
+    longbench_summarization_score: 24.32
+    longbench_few-shot-learning_score: 51.67
+    longbench_synthetic-tasks_score: 66.83
+    longbench_code-completion_score: 45.99


@@ -157,7 +157,9 @@ jobs:
 pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
 pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
 pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
-pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}}
+pip install opencompass[full] --cache-dir ${{env.PIP_CACHE_PATH}}
+pip install opencompass[api] --cache-dir ${{env.PIP_CACHE_PATH}}
+pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --cache-dir ${{env.PIP_CACHE_PATH}}
 FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /fs-computility/llm/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
 pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
 cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data


@@ -45,7 +45,7 @@ jobs:
 . ${{env.CONDA_PATH}}/bin/activate
 conda activate ${{env.CONDA_ENV}}
 python3 -m pip uninstall opencompass -y
-python3 -m pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
+python3 -m pip install -e ".[full]" --cache-dir ${{env.PIP_CACHE_PATH}}
 conda info --envs
 - name: conda env
   run: |

README.md (102 lines changed)

@@ -176,69 +176,83 @@ Some third-party features, like Humaneval and Llama, may require additional step
 After ensuring that OpenCompass is installed correctly according to the above steps and the datasets are prepared. Now you can start your first evaluation using OpenCompass!
 
-- Your first evaluation with OpenCompass!
+### Your first evaluation with OpenCompass!
 
 OpenCompass support setting your configs via CLI or a python script. For simple evaluation settings we recommend using CLI, for more complex evaluation, it is suggested using the script way. You can find more example scripts under the configs folder.
 
 ```bash
 # CLI
 opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen
 # Python scripts
 opencompass examples/eval_chat_demo.py
 ```
 
 You can find more script examples under [examples](./examples) folder.
 
-- API evaluation
+### API evaluation
 
 OpenCompass, by its design, does not really discriminate between open-source models and API models. You can evaluate both model types in the same way or even in one settings.
 
 ```bash
 export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
 # CLI
 opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen
 # Python scripts
 opencompass examples/eval_api_demo.py
 # You can use o1_mini_2024_09_12/o1_preview_2024_09_12 for o1 models, we set max_completion_tokens=8192 as default.
 ```
 
-- Accelerated Evaluation
+### Accelerated Evaluation
 
 Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
 
 ```bash
 # CLI
 opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
 # Python scripts
 opencompass examples/eval_lmdeploy_demo.py
 ```
 
-- Supported Models
+### Supported Models and Datasets
 
 OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
 
 ```bash
 # List all configurations
 python tools/list_configs.py
 # List all configurations related to llama and mmlu
 python tools/list_configs.py llama mmlu
 ```
 
-If the model is not on the list but supported by Huggingface AutoModel class, you can also evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists.
+#### Supported Models
+
+If the model is not on the list but supported by Huggingface AutoModel class or encapsulation of inference engine based on OpenAI interface (see [docs](https://opencompass.readthedocs.io/en/latest/advanced_guides/new_model.html) for details), you can also evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists.
 
 ```bash
 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
 ```
 
-If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`.
+#### Supported Datasets
+
+Currently, OpenCompass have provided standard recommended configurations for datasets. Generally, config files ending with `_gen.py` or `_llm_judge_gen.py` will point to the recommended config we provide for this dataset. You can refer to [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for more details.
+
+```bash
+# Recommended Evaluation Config based on Rules
+opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat
+# Recommended Evaluation Config based on LLM Judge
+opencompass --datasets aime2024_llm_judge_gen --models hf_internlm2_5_1_8b_chat
+```
+
+If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`.
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
 ```
 
 > \[!TIP\]
 >
@@ -288,7 +302,7 @@ You can quickly find the dataset you need from the list through sorting, filteri
 In addition, we provide a recommended configuration for each dataset, and some datasets also support LLM Judge-based configurations.
 
-Please refer to the dataset statistics chapter of [official document](https://opencompass.org.cn/doc) for details.
+Please refer to the dataset statistics chapter of [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for details.
 
 <p align="right"><a href="#top">🔝Back to top</a></p>


@@ -208,9 +208,9 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce
 opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
 ```
 
-OpenCompass 预定义了许多模型和数据集的配置,你可以通过 [工具](./docs/zh_cn/tools.md#ListConfigs) 列出所有可用的模型和数据集配置。
-- ### 支持的模型
+- ### 支持的模型与数据集
+OpenCompass 预定义了许多模型和数据集的配置,你可以通过 [工具](./docs/zh_cn/tools.md#ListConfigs) 列出所有可用的模型和数据集配置。
 
 ```bash
 # 列出所有配置
@@ -219,13 +219,27 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce
 python tools/list_configs.py llama mmlu
 ```
 
-如果模型不在列表中但支持 Huggingface AutoModel 类,您仍然可以使用 OpenCompass 对其进行评估。欢迎您贡献维护 OpenCompass 支持的模型和数据集列表。
+#### 支持的模型
+
+如果模型不在列表中,但支持 Huggingface AutoModel 类或支持针对 OpenAI 接口的推理引擎封装(详见[官方文档](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/new_model.html)),您仍然可以使用 OpenCompass 对其进行评估。欢迎您贡献维护 OpenCompass 支持的模型和数据集列表。
 
 ```bash
 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
 ```
 
-如果你想在多块 GPU 上使用模型进行推理,您可以使用 `--max-num-worker` 参数。
+#### 支持的数据集
+
+目前OpenCompass针对数据集给出了标准的推荐配置。通常`_gen.py`或`_llm_judge_gen.py`为结尾的配置文件将指向我们为该数据集提供的推荐配置。您可以参阅[官方文档](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html) 的数据集统计章节来获取详细信息。
+
+```bash
+# 基于规则的推荐配置
+opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat
+# 基于LLM Judge的推荐配置
+opencompass --datasets aime2024_llm_judge_gen --models hf_internlm2_5_1_8b_chat
+```
+
+此外,如果你想在多块 GPU 上使用模型进行推理,您可以使用 `--max-num-worker` 参数。
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
@@ -281,9 +295,7 @@ OpenCompass 是面向大模型评测的一站式平台。其主要特点如下
 您可以通过排序、筛选和搜索等功能从列表中快速找到您需要的数据集。
 
-另外,我们为每个数据集都提供了一种推荐配置,部分数据集还支持了基于LLM Judge的配置。
-详情请参阅 [官方文档](https://opencompass.org.cn/doc) 的数据集统计章节。
+详情请参阅 [官方文档](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html) 的数据集统计章节。
 
 <p align="right"><a href="#top">🔝返回顶部</a></p>


@@ -121,7 +121,7 @@
     category: Reasoning
     paper: https://arxiv.org/pdf/2310.16049
     configpath: opencompass/configs/datasets/musr/musr_gen.py
-    configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/musr/musr_llm_judge_gen.py
 - needlebench:
     name: NeedleBench
     category: Long Context


@@ -32,12 +32,23 @@ with open(load_path, 'r') as f2:
 HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
 
+recommanded_dataset_list = [
+    'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
+    'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
+    'mmlu_pro', 'musr'
+]
+
 
 def table_format(data_list):
     table_format_list = []
     for i in data_list:
         table_format_list_sub = []
         for j in i:
+            if j in recommanded_dataset_list:
+                link_token = '[link]('
+            else:
+                link_token = '[link(TBD)]('
             for index in HEADER:
                 if index == 'paper':
                     table_format_list_sub.append('[link](' + i[j][index] + ')')
@@ -45,18 +56,18 @@ def table_format(data_list):
                     if i[j][index] == '':
                         table_format_list_sub.append(i[j][index])
                     else:
-                        table_format_list_sub.append('[link](' +
+                        table_format_list_sub.append(link_token +
                                                      GITHUB_PREFIX +
                                                      i[j][index] + ')')
                 elif index == 'configpath':
                     if isinstance(i[j][index], list):
                         sub_list_text = ''
                         for k in i[j][index]:
-                            sub_list_text += ('[link](' + GITHUB_PREFIX + k +
+                            sub_list_text += (link_token + GITHUB_PREFIX + k +
                                               ') / ')
                         table_format_list_sub.append(sub_list_text[:-2])
                     else:
-                        table_format_list_sub.append('[link](' +
+                        table_format_list_sub.append(link_token +
                                                      GITHUB_PREFIX +
                                                      i[j][index] + ')')
                 else:
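
The effect of the `link_token` switch above: datasets listed in `recommanded_dataset_list` keep a plain `[link](...)` cell in the generated statistics table, while every other dataset is rendered as `[link(TBD)](...)` to mark that its recommended config is still pending. A minimal standalone sketch of that cell-rendering logic (the `render_config_cell` helper and the `GITHUB_PREFIX` value below are illustrative assumptions, not taken from the script):

```python
# Standalone sketch of the link_token selection shown in the diff above.
# render_config_cell and the GITHUB_PREFIX value are illustrative assumptions.
recommanded_dataset_list = [
    'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
    'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
    'mmlu_pro', 'musr'
]
GITHUB_PREFIX = 'https://github.com/open-compass/opencompass/blob/main/'


def render_config_cell(dataset_key: str, config_path: str) -> str:
    """Render one markdown cell linking to a dataset's config file."""
    if dataset_key in recommanded_dataset_list:
        link_token = '[link]('       # recommended config available
    else:
        link_token = '[link(TBD)]('  # recommended config still to be added
    return link_token + GITHUB_PREFIX + config_path + ')'


print(render_config_cell('mmlu', 'opencompass/configs/datasets/mmlu/mmlu_gen.py'))
print(render_config_cell('needlebench', 'opencompass/configs/datasets/needlebench/needlebench_gen.py'))
```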


@@ -30,12 +30,23 @@ with open(load_path, 'r') as f2:
 HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
 
+recommanded_dataset_list = [
+    'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
+    'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
+    'mmlu_pro', 'musr'
+]
+
 
 def table_format(data_list):
     table_format_list = []
     for i in data_list:
         table_format_list_sub = []
         for j in i:
+            if j in recommanded_dataset_list:
+                link_token = '[链接]('
+            else:
+                link_token = '[链接(TBD)]('
             for index in HEADER:
                 if index == 'paper':
                     table_format_list_sub.append('[链接](' + i[j][index] + ')')
@@ -43,17 +54,19 @@ def table_format(data_list):
                     if i[j][index] == '':
                         table_format_list_sub.append(i[j][index])
                     else:
-                        table_format_list_sub.append('[链接](' + GITHUB_PREFIX +
+                        table_format_list_sub.append(link_token +
+                                                     GITHUB_PREFIX +
                                                      i[j][index] + ')')
                 elif index == 'configpath':
                     if isinstance(i[j][index], list):
                         sub_list_text = ''
                         for k in i[j][index]:
-                            sub_list_text += ('[链接](' + GITHUB_PREFIX + k +
+                            sub_list_text += (link_token + GITHUB_PREFIX + k +
                                               ') / ')
                         table_format_list_sub.append(sub_list_text[:-2])
                     else:
-                        table_format_list_sub.append('[链接](' + GITHUB_PREFIX +
+                        table_format_list_sub.append(link_token +
+                                                     GITHUB_PREFIX +
                                                      i[j][index] + ')')
                 else:
                     table_format_list_sub.append(i[j][index])


@@ -0,0 +1,15 @@
+from opencompass.models import TurboMindModel
+
+models = [
+    dict(
+        type=TurboMindModel,
+        abbr='internvl2_5-38b-turbomind',
+        path='OpenGVLab/InternVL2_5-38B',
+        engine_config=dict(session_len=8192, max_batch_size=8, tp=4),
+        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
+        max_seq_len=8192,
+        max_out_len=8192,
+        batch_size=8,
+        run_cfg=dict(num_gpus=4),
+    )
+]


@@ -0,0 +1,15 @@
+from opencompass.models import TurboMindModel
+
+models = [
+    dict(
+        type=TurboMindModel,
+        abbr='internvl2_5-8b-turbomind',
+        path='OpenGVLab/InternVL2_5-8B',
+        engine_config=dict(session_len=8192, max_batch_size=16, tp=1),
+        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=8192),
+        max_seq_len=8192,
+        max_out_len=8192,
+        batch_size=16,
+        run_cfg=dict(num_gpus=1),
+    )
+]
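
For reference, predefined model configs like the two new TurboMind entries above are usually pulled into an evaluation script with `read_base`. A sketch under assumptions (the import path for the new InternVL2.5 config is a hypothetical placeholder, since the diff does not show where the file lives):

```python
# Sketch only: consuming a predefined model config in an eval script.
from mmengine.config import read_base

with read_base():
    # Hypothetical module path for the InternVL2.5-8B config added above;
    # replace it with the actual location of the new file.
    from opencompass.configs.models.internvl.lmdeploy_internvl2_5_8b import \
        models as internvl2_5_8b_models

models = internvl2_5_8b_models
```

The resulting script can then be run with `opencompass <script>.py`, as in the README examples earlier in this commit.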


@@ -48,7 +48,7 @@ def clean_units(pred_str: str):
 def number_it(num):
-    from latex2sympy2 import latex2sympy
+    from latex2sympy2_extended import latex2sympy
     if isinstance(num, (int, float)):
         return num


@@ -17,7 +17,7 @@ def time_limit(seconds: float):
 def extract_theoremqa_answer(pred: str, answer_flag: bool = True):
-    from latex2sympy2 import latex2sympy
+    from latex2sympy2_extended import latex2sympy
     if any([option in pred.lower() for option in ['yes', 'true']]):
         pred = 'True'
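
Both swaps above replace `latex2sympy2` with `latex2sympy2_extended` while keeping the same `latex2sympy` entry point, so the call sites are untouched. A quick, hedged usage example (the sample expression and printed values are illustrative, not taken from the diff):

```python
# latex2sympy2_extended exposes the same latex2sympy function used above,
# so only the import line changes at existing call sites.
from latex2sympy2_extended import latex2sympy

expr = latex2sympy(r'\frac{1}{2} + \frac{1}{4}')  # parse LaTeX into a sympy object
print(expr)         # a sympy expression equivalent to 3/4
print(float(expr))  # 0.75
```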


@@ -33,6 +33,7 @@ class ClaudeSDK(BaseAPIModel):
         max_seq_len: int = 2048,
         meta_template: Optional[Dict] = None,
         temperature: Optional[float] = 0.0,
+        thinking: Optional[Dict] = None,
         retry: int = 2,
     ):
         super().__init__(path=path,
@@ -49,6 +50,7 @@ class ClaudeSDK(BaseAPIModel):
         self.anthropic = Anthropic(api_key=key)
         self.model = path
         self.temperature = temperature
+        self.thinking = thinking
 
     def generate(
         self,
@@ -108,11 +110,26 @@ class ClaudeSDK(BaseAPIModel):
         while num_retries < self.retry:
             self.wait()
             try:
-                responses = self.anthropic.messages.create(
-                    model=self.model,
-                    max_tokens=max_out_len,
-                    temperature=self.temperature,
-                    messages=messages)
+                api_params = {
+                    'model': self.model,
+                    'max_tokens': max_out_len,
+                    'temperature': self.temperature,
+                    'messages': messages,
+                }
+
+                if self.thinking is not None:
+                    api_params['thinking'] = self.thinking
+                    api_params['stream'] = True
+
+                responses = self.anthropic.messages.create(**api_params)
+
+                # Handle new response format
+                for content in responses.content:
+                    if content.type == 'text':
+                        return content.text
+                # If no text type content is found, return the first
+                # content (backward compatibility)
                 return responses.content[0].text
             except Exception as e:
                 self.logger.error(e)
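
With the new `thinking` argument, Anthropic's extended thinking can be switched on from a model config. A sketch under assumptions (the model name, key placeholder, token budget, and the `ClaudeSDK` import path are illustrative; the `thinking` dict follows Anthropic's documented `{'type': 'enabled', 'budget_tokens': ...}` format):

```python
# Sketch only: enabling extended thinking via the new `thinking` argument.
# Model name, key and budget are illustrative placeholders.
from opencompass.models import ClaudeSDK  # export path assumed

models = [
    dict(
        type=ClaudeSDK,
        abbr='claude-thinking',
        path='claude-3-7-sonnet-20250219',
        key='YOUR_ANTHROPIC_API_KEY',
        max_out_len=16384,
        thinking=dict(type='enabled', budget_tokens=4096),
        temperature=1,  # Anthropic's docs call for temperature=1 with thinking
    )
]
```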


@@ -652,7 +652,6 @@ class OpenAISDK(OpenAI):
                     self.logger.info('Start calling OpenAI API')
                 responses = self.openai_client.chat.completions.create(
                     **query_data, timeout=timeout)  # timeout in seconds
-
                 if self.verbose:
                     self.logger.info(
                         'Successfully get response from OpenAI API')
@@ -660,10 +659,18 @@
                         self.logger.info(responses)
                     except Exception:
                         pass  # noqa F841
 
-                if not responses.choices:
+                # Check if response is empty or content is empty
+                if not responses.choices or not responses.choices[
+                        0].message.content:
                     self.logger.error(
-                        'Response is empty, it is an internal server error \
-                        from the API provider.')
+                        'API response is empty, it might be due to excessive '
+                        'input length or an internal server error '
+                        'from your API provider.')
+                    num_retries += 1
+                    # Continue to retry instead of returning empty response
+                    continue
+
                 return responses.choices[0].message.content
             except (BadRequestError, APIStatusError) as e:
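
The change above makes an empty API response trigger another retry instead of being handed back to the caller as an empty string. A condensed, self-contained sketch of that pattern (the function and parameter names here are illustrative, not the real `OpenAISDK` attributes):

```python
# Condensed sketch of the retry-on-empty-response behaviour introduced above.
# `client` is an openai.OpenAI instance; all names are illustrative.
def chat_with_retry(client, query_data: dict, max_retries: int = 3,
                    timeout: int = 60) -> str:
    num_retries = 0
    while num_retries < max_retries:
        responses = client.chat.completions.create(**query_data,
                                                    timeout=timeout)
        # Retry when the provider returns no choices or an empty message,
        # e.g. for over-long inputs or transient internal errors.
        if not responses.choices or not responses.choices[0].message.content:
            num_retries += 1
            continue
        return responses.choices[0].message.content
    raise RuntimeError('OpenAI API returned an empty response after retries.')
```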


@@ -11,12 +11,10 @@ faiss_gpu==1.7.2
 -e git+https://github.com/open-compass/human-eval.git#egg=human-eval
 # IFEval
 langdetect
-# TheoremQA
-latex2sympy2==1.9.1
 # Lawbench, leval
 ltp
 # Math
-math-verify
+math-verify[antlr4_11_0]
 # Taco, apps Dataset
 pyext
 # Law Bench