From 62dbf047086fe9c9ee950a06168f88283ebadcee Mon Sep 17 00:00:00 2001
From: Fengzhe Zhou
Date: Tue, 14 May 2024 22:42:23 +0800
Subject: [PATCH] [Sync] update github workflow (#1156)

---
 .github/scripts/pr_oc_score_assert.py | 4 +-
 .github/workflows/daily-run-test.yml | 5 +-
 .github/workflows/pr-run-test.yml | 12 +-
 .../{bbh_gen_0a5495.py => bbh_gen_98fba6.py} | 5 +-
 configs/datasets/collections/base_core.py | 4 +-
 configs/datasets/gsm8k/gsm8k_gen_17d0dc.py | 39 ++++
 .../math/math_4shot_base_gen_db136b.py | 30 +++
 ...math_4shot_example_from_google_research.py | 40 ++++
 ...2049.py => math_evaluatorv2_gen_2f4a71.py} | 2 +-
 .../mbpp/sanitized_mbpp_gen_742f0c.py | 82 ++++++++
 .../mbpp/sanitized_mbpp_gen_a0fc46.py | 41 ++++
 configs/models/deepseek/hf_deepseek_v2.py | 18 ++
 .../models/deepseek/hf_deepseek_v2_chat.py | 18 ++
 .../deepseek/lmdeploy_deepseek_series.py | 23 +++
 .../hf_internlm/lmdeploy_internlm2_series.py | 24 +++
 .../models/hf_llama/lmdeploy_llama_series.py | 30 +++
 .../models/mistral/lmdeploy_mistral_series.py | 24 +++
 configs/models/qwen/hf_qwen1_5_110b.py | 12 ++
 configs/models/qwen/hf_qwen1_5_110b_chat.py | 12 ++
 .../models/qwen/lmdeploy_qwen1_5_series.py | 29 +++
 configs/models/qwen/lmdeploy_qwen_series.py | 25 +++
 configs/models/yi/lmdeploy_yi_series.py | 23 +++
 .../summarizers/compassbench_v1_objective.py | 2 +-
 configs/summarizers/groups/legacy/cibench.py | 109 +++++++++++
 opencompass/cli/main.py | 5 +-
 opencompass/models/__init__.py | 8 +-
 opencompass/models/ai360_api.py | 44 +++--
 opencompass/models/baichuan_api.py | 2 +-
 opencompass/models/baidu_api.py | 35 +++-
 opencompass/models/deepseek_api.py | 178 +++++++++++++++++
 opencompass/models/gemini_api.py | 63 ------
 opencompass/models/huggingface.py | 11 +-
 opencompass/models/minimax_api.py | 170 ++++++++++++++++
 opencompass/models/qwen_api.py | 3 +-
 opencompass/models/stepfun_api.py | 182 ++++++++++++++++++
 opencompass/models/turbomind.py | 7 +-
 opencompass/models/xunfei_api.py | 149 ++++++++++++++
 opencompass/runners/dlc.py | 13 +-
 opencompass/runners/local.py | 18 +-
 opencompass/utils/text_postprocessors.py | 12 +-
 40 files changed, 1377 insertions(+), 136 deletions(-)
 rename configs/datasets/bbh/{bbh_gen_0a5495.py => bbh_gen_98fba6.py} (94%)
 create mode 100644 configs/datasets/gsm8k/gsm8k_gen_17d0dc.py
 create mode 100644 configs/datasets/math/math_4shot_base_gen_db136b.py
 create mode 100644 configs/datasets/math/math_4shot_example_from_google_research.py
 rename configs/datasets/math/{math_evaluatorv2_gen_9d2049.py => math_evaluatorv2_gen_2f4a71.py} (96%)
 create mode 100644 configs/datasets/mbpp/sanitized_mbpp_gen_742f0c.py
 create mode 100644 configs/datasets/mbpp/sanitized_mbpp_gen_a0fc46.py
 create mode 100644 configs/models/deepseek/hf_deepseek_v2.py
 create mode 100644 configs/models/deepseek/hf_deepseek_v2_chat.py
 create mode 100644 configs/models/deepseek/lmdeploy_deepseek_series.py
 create mode 100644 configs/models/hf_internlm/lmdeploy_internlm2_series.py
 create mode 100644 configs/models/hf_llama/lmdeploy_llama_series.py
 create mode 100644 configs/models/mistral/lmdeploy_mistral_series.py
 create mode 100644 configs/models/qwen/hf_qwen1_5_110b.py
 create mode 100644 configs/models/qwen/hf_qwen1_5_110b_chat.py
 create mode 100644 configs/models/qwen/lmdeploy_qwen1_5_series.py
 create mode 100644 configs/models/qwen/lmdeploy_qwen_series.py
 create mode 100644 configs/models/yi/lmdeploy_yi_series.py
 create mode 100644 configs/summarizers/groups/legacy/cibench.py
 create mode 100644 opencompass/models/deepseek_api.py
 create
mode 100644 opencompass/models/stepfun_api.py diff --git a/.github/scripts/pr_oc_score_assert.py b/.github/scripts/pr_oc_score_assert.py index 5c4bb85b..6ac8750c 100644 --- a/.github/scripts/pr_oc_score_assert.py +++ b/.github/scripts/pr_oc_score_assert.py @@ -4,7 +4,7 @@ import os import pytest output_path = 'regression_result' -model = 'internlm-chat-7b-hf' +model = 'internlm2-chat-7b-hf' dataset = 'siqa' @@ -22,7 +22,7 @@ class TestChatScore: def test_model_dataset_score(self, result_scores): result_score = result_scores.get(model).get(dataset) - assert_score(result_score, 73.59) + assert_score(result_score, 79.53) def assert_score(score, baseline): diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 922bf433..1b887b23 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -14,6 +14,9 @@ env: PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub + HF_DATASETS_OFFLINE: 1 + TRANSFORMERS_OFFLINE: 1 + HF_HUB_OFFLINE: 1 jobs: daily_run_test: @@ -42,7 +45,7 @@ jobs: cp -r ${{env.USERSPACE_PREFIX}}/data . rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub - export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; + export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; export HF_HUB_OFFLINE=1; - name: Run test run: | eval "$(conda shell.bash hook)" diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml index 7ada6945..a754c4aa 100644 --- a/.github/workflows/pr-run-test.yml +++ b/.github/workflows/pr-run-test.yml @@ -21,6 +21,9 @@ env: CONDA_ENV: opencompass_base USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub + HF_DATASETS_OFFLINE: 1 + TRANSFORMERS_OFFLINE: 1 + HF_HUB_OFFLINE: 1 jobs: pr_run_test: @@ -42,21 +45,20 @@ jobs: cp -r ${{env.USERSPACE_PREFIX}}/data . 
rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub - export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; - name: Run test run: | eval "$(conda shell.bash hook)" conda activate ${{env.CONDA_ENV}} conda info --envs rm -rf regression_result - python3 run.py --models hf_internlm_chat_7b --datasets siqa_gen --work-dir regression_result --debug + python3 run.py --models hf_internlm2_chat_7b --datasets siqa_gen --work-dir regression_result --debug - name: Get result run: | score=$(sed -n '$p' regression_result/*/summary/*.csv | awk -F ',' '{print $NF}') - if (( ${score%.*} >= 70 && ${score%.*} <= 75 )); then - echo "score is $score between 70 and 75" + if (( ${score%.*} >= 75 && ${score%.*} <= 85 )); then + echo "score is $score between 75 and 85" else - echo "score is $score not between 70 and 75" + echo "score is $score not between 75 and 85" exit 1 fi rm -rf regression_result diff --git a/configs/datasets/bbh/bbh_gen_0a5495.py b/configs/datasets/bbh/bbh_gen_98fba6.py similarity index 94% rename from configs/datasets/bbh/bbh_gen_0a5495.py rename to configs/datasets/bbh/bbh_gen_98fba6.py index 6aebc233..78edd95b 100644 --- a/configs/datasets/bbh/bbh_gen_0a5495.py +++ b/configs/datasets/bbh/bbh_gen_98fba6.py @@ -49,7 +49,7 @@ for _name in bbh_multiple_choice_sets: template=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step." ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) + inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Q:'])) bbh_eval_cfg = dict( evaluator=dict(type=BBHEvaluator_mcq), pred_role='BOT', @@ -66,6 +66,7 @@ for _name in bbh_multiple_choice_sets: infer_cfg=bbh_infer_cfg.copy(), eval_cfg=bbh_eval_cfg.copy())) + for _name in bbh_free_form_sets: with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f: _hint = f.read() @@ -75,7 +76,7 @@ for _name in bbh_free_form_sets: template=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step." 
), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) + inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Q:'])) bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT') bbh_datasets.append( diff --git a/configs/datasets/collections/base_core.py b/configs/datasets/collections/base_core.py index 6b677ee1..9df244f2 100644 --- a/configs/datasets/collections/base_core.py +++ b/configs/datasets/collections/base_core.py @@ -10,9 +10,9 @@ with read_base(): from ..race.race_ppl_abed12 import race_datasets from ..winogrande.winogrande_5shot_ll_252f01 import winogrande_datasets from ..hellaswag.hellaswag_10shot_ppl_59c85e import hellaswag_datasets - from ..bbh.bbh_gen_0a5495 import bbh_datasets + from ..bbh.bbh_gen_98fba6 import bbh_datasets from ..gsm8k.gsm8k_gen_ee684f import gsm8k_datasets - from ..math.math_evaluatorv2_gen_9d2049 import math_datasets + from ..math.math_evaluatorv2_gen_2f4a71 import math_datasets from ..TheoremQA.TheoremQA_post_v2_gen_2c2583 import TheoremQA_datasets from ..humaneval.humaneval_gen_d2537e import humaneval_datasets from ..mbpp.deprecated_sanitized_mbpp_gen_cb43ef import sanitized_mbpp_datasets diff --git a/configs/datasets/gsm8k/gsm8k_gen_17d0dc.py b/configs/datasets/gsm8k/gsm8k_gen_17d0dc.py new file mode 100644 index 00000000..fd448cc8 --- /dev/null +++ b/configs/datasets/gsm8k/gsm8k_gen_17d0dc.py @@ -0,0 +1,39 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator + +gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') + +gsm8k_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt="Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\nAnswer:"), + dict(role='BOT', prompt='Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. 
Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n'), + dict(role='HUMAN', prompt="Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?\nLet's think step by step\nAnswer:"), + dict(role='BOT', prompt="Mark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\nHis team also scores 6 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\nThey scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\nAll together his team scored 50+24+10= 84 points\nMark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\nHis opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\nThey also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\nAll together Mark's opponents scored 100+12+5=117 points\nThe total score for the game is both team's scores added together, so it is 84+117=201 points\nThe answer is 201\n"), + dict(role='HUMAN', prompt="Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nLet's think step by step\nAnswer:"), + dict(role='BOT', prompt="When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n"), + dict(role='HUMAN', prompt="Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. 
How many fruits are there?\nLet's think step by step\nAnswer:"), + dict(role='BOT', prompt='For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n'), + dict(role='HUMAN', prompt="Question: {question}\nLet's think step by step\nAnswer:"), + ], + )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Question'])) + +gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator), + pred_postprocessor=dict(type=gsm8k_postprocess), + dataset_postprocessor=dict(type=gsm8k_dataset_postprocess)) + +gsm8k_datasets = [ + dict( + abbr='gsm8k', + type=GSM8KDataset, + path='./data/gsm8k', + reader_cfg=gsm8k_reader_cfg, + infer_cfg=gsm8k_infer_cfg, + eval_cfg=gsm8k_eval_cfg) +] diff --git a/configs/datasets/math/math_4shot_base_gen_db136b.py b/configs/datasets/math/math_4shot_base_gen_db136b.py new file mode 100644 index 00000000..16883f37 --- /dev/null +++ b/configs/datasets/math/math_4shot_base_gen_db136b.py @@ -0,0 +1,30 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2 + +with read_base(): + from .math_4shot_example_from_google_research import prompt + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=prompt + '\n\nProblem:\n{problem}\nSolution:'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024, stopping_criteria=['Problem'])) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='./data/math/math.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/configs/datasets/math/math_4shot_example_from_google_research.py b/configs/datasets/math/math_4shot_example_from_google_research.py new file mode 100644 index 00000000..80feee44 --- /dev/null +++ b/configs/datasets/math/math_4shot_example_from_google_research.py @@ -0,0 +1,40 @@ +# Solving Quantitative Reasoning Problems with Language Models + +prompt = ''' +Problem: +Find the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$. + +Solution: +The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$. +Final Answer: The final answer is $[2,5)$. I hope it is correct. 
+ +Problem: +If $\\det \\mathbf{A} = 2$ and $\\det \\mathbf{B} = 12,$ then find $\\det (\\mathbf{A} \\mathbf{B}).$ + +Solution: +We have that $\\det (\\mathbf{A} \\mathbf{B}) = (\\det \\mathbf{A})(\\det \\mathbf{B}) = (2)(12) = \\boxed{24}.$ +Final Answer: The final answer is $24$. I hope it is correct. + +Problem: +Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight? + +Solution: +If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: +\\begin{align*} +30n&=480\\ +\\Rightarrow\\qquad n&=480/30=\\boxed{16} +\\end{align*} +Final Answer: The final answer is $16$. I hope it is correct. + +Problem: +If the system of equations +\\begin{align*} +6x-4y&=a,\\ +6y-9x &=b. +\\end{align*} +has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{a}{b},$ assuming $b$ is nonzero. + +Solution: +If we multiply the first equation by $-\\frac{3}{2}$, we obtain $$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{3}{2}a=b\\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$ +Final Answer: The final answer is $-\\frac{2}{3}$. I hope it is correct. +'''.strip() diff --git a/configs/datasets/math/math_evaluatorv2_gen_9d2049.py b/configs/datasets/math/math_evaluatorv2_gen_2f4a71.py similarity index 96% rename from configs/datasets/math/math_evaluatorv2_gen_9d2049.py rename to configs/datasets/math/math_evaluatorv2_gen_2f4a71.py index e777e1e3..ca9b9b90 100644 --- a/configs/datasets/math/math_evaluatorv2_gen_9d2049.py +++ b/configs/datasets/math/math_evaluatorv2_gen_2f4a71.py @@ -38,7 +38,7 @@ Problem: Solution:""" ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) + inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Problem'])) # postprocess v2 math_eval_cfg = dict( diff --git a/configs/datasets/mbpp/sanitized_mbpp_gen_742f0c.py b/configs/datasets/mbpp/sanitized_mbpp_gen_742f0c.py new file mode 100644 index 00000000..5ed9f457 --- /dev/null +++ b/configs/datasets/mbpp/sanitized_mbpp_gen_742f0c.py @@ -0,0 +1,82 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +prompt = ''' +You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests: + +assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5) +assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) +assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) + +[BEGIN] + '\ +def similar_elements(test_tup1, test_tup2): + res = tuple(set(test_tup1) & set(test_tup2)) + return (res)\ +' +[DONE] + + +You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. 
Your code should pass these tests: + +assert is_not_prime(2) == False +assert is_not_prime(10) == True +assert is_not_prime(35) == True + +[BEGIN] + '\ +import math +def is_not_prime(n): + result = False + for i in range(2,int(math.sqrt(n)) + 1): + if n % i == 0: + result = True + return result\ +' +[DONE] + + +You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests: + +assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] +assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] +assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] + +[BEGIN] + '\ +import heapq as hq +def heap_queue_largest(nums,n): + largest_nums = hq.nlargest(n, nums) + return largest_nums\ +' +[DONE] + + +You are an expert Python programmer, and here is your task: {text} Your code should pass these tests: + +{test_list} + +'''.strip() + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=prompt), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr='sanitized_mbpp', + path='./data/mbpp/sanitized-mbpp.jsonl', + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/configs/datasets/mbpp/sanitized_mbpp_gen_a0fc46.py b/configs/datasets/mbpp/sanitized_mbpp_gen_a0fc46.py new file mode 100644 index 00000000..48c12835 --- /dev/null +++ b/configs/datasets/mbpp/sanitized_mbpp_gen_a0fc46.py @@ -0,0 +1,41 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the similar elements from the given two tuple lists.\nYour code should pass these tests:\n\nassert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)\nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)\n',), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\n res = tuple(set(test_tup1) & set(test_tup2))\n return (res)' \n[DONE]\n\n",), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a python function to identify non-prime numbers.\nYour code should pass these tests:\n\nassert is_not_prime(2) == False\nassert is_not_prime(10) == True\nassert is_not_prime(35) == True\n',), + dict(role='BOT', prompt="[BEGIN]\n 'import math\ndef is_not_prime(n):\n result = False\n for i in range(2,int(math.sqrt(n)) + 1):\n if n %% i == 0:\n result = True\n return result' \n[DONE]\n\n",), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the largest integers from a given list of numbers using heap queue 
algorithm.\nYour code should pass these tests:\n\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]\n',), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\ndef heap_queue_largest(nums,n):\n largest_nums = hq.nlargest(n, nums)\n return largest_nums' \n[DONE]\n\n",), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\n{text}\nYour code should pass these tests:\n\n{test_list}\n',), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr='sanitized_mbpp', + path='./data/mbpp/sanitized-mbpp.jsonl', + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/configs/models/deepseek/hf_deepseek_v2.py b/configs/models/deepseek/hf_deepseek_v2.py new file mode 100644 index 00000000..e05be313 --- /dev/null +++ b/configs/models/deepseek/hf_deepseek_v2.py @@ -0,0 +1,18 @@ +from opencompass.models import HuggingFaceBaseModel + +models = [ + dict( + type=HuggingFaceBaseModel, + abbr='deepseek-v2-hf', + path='deepseek-ai/DeepSeek-V2', + max_out_len=1024, + batch_size=4, + model_kwargs=dict( + device_map='sequential', + torch_dtype='torch.bfloat16', + max_memory={i: '75GB' for i in range(8)}, + attn_implementation='eager' + ), + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/deepseek/hf_deepseek_v2_chat.py b/configs/models/deepseek/hf_deepseek_v2_chat.py new file mode 100644 index 00000000..67dfd0bd --- /dev/null +++ b/configs/models/deepseek/hf_deepseek_v2_chat.py @@ -0,0 +1,18 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='deepseek-v2-chat-hf', + path='deepseek-ai/DeepSeek-V2-Chat', + max_out_len=1024, + batch_size=4, + model_kwargs=dict( + device_map='sequential', + torch_dtype='torch.bfloat16', + max_memory={i: '75GB' for i in range(8)}, + attn_implementation='eager' + ), + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/deepseek/lmdeploy_deepseek_series.py b/configs/models/deepseek/lmdeploy_deepseek_series.py new file mode 100644 index 00000000..5060b0c0 --- /dev/null +++ b/configs/models/deepseek/lmdeploy_deepseek_series.py @@ -0,0 +1,23 @@ +from opencompass.models import LmdeployPytorchModel + +settings = [ + ('deepseek-7b-base-hf', 'deepseek-ai/deepseek-llm-7b-base', 1), + ('deepseek-67b-base-hf', 'deepseek-ai/deepseek-llm-67b-base', 4), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=LmdeployPytorchModel, + abbr=abbr, + path=path, + engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus), + gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024), + max_out_len=1024, + max_seq_len=2048, + batch_size=16, + concurrency=16, + run_cfg=dict(num_gpus=num_gpus), + ) + ) diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_series.py b/configs/models/hf_internlm/lmdeploy_internlm2_series.py new file mode 100644 index 00000000..f01ee714 --- /dev/null +++ b/configs/models/hf_internlm/lmdeploy_internlm2_series.py @@ -0,0 +1,24 @@ +from opencompass.models import TurboMindModel + +settings = [ + 
('internlm2-1.8b-turbomind', 'internlm/internlm2-1_8b', 1), + ('internlm2-7b-turbomind', 'internlm/internlm2-7b', 1), + ('internlm2-20b-turbomind', 'internlm/internlm2-20b', 2), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=TurboMindModel, + abbr=abbr, + path=path, + engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus), + gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024), + max_out_len=1024, + max_seq_len=2048, + batch_size=16, + concurrency=16, + run_cfg=dict(num_gpus=num_gpus), + ) + ) diff --git a/configs/models/hf_llama/lmdeploy_llama_series.py b/configs/models/hf_llama/lmdeploy_llama_series.py new file mode 100644 index 00000000..346d920d --- /dev/null +++ b/configs/models/hf_llama/lmdeploy_llama_series.py @@ -0,0 +1,30 @@ +from opencompass.models import TurboMindModel + +settings = [ + ('llama-7b-turbomind', 'huggyllama/llama-7b', 1), + ('llama-13b-turbomind', 'huggyllama/llama-13b', 1), + ('llama-30b-turbomind', 'huggyllama/llama-30b', 2), + ('llama-65b-turbomind', 'huggyllama/llama-65b', 4), + ('llama-2-7b-turbomind', 'meta-llama/Llama-2-7b-hf', 1), + ('llama-2-13b-turbomind', 'meta-llama/Llama-2-13b-hf', 1), + ('llama-2-70b-turbomind', 'meta-llama/Llama-2-70b-hf', 4), + ('llama-3-8b-turbomind', 'meta-llama/Meta-Llama-3-8B', 1), + ('llama-3-70b-turbomind', 'meta-llama/Meta-Llama-3-70B', 4), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=TurboMindModel, + abbr=abbr, + path=path, + engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus), + gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024), + max_out_len=1024, + max_seq_len=2048, + batch_size=16, + concurrency=16, + run_cfg=dict(num_gpus=num_gpus), + ) + ) diff --git a/configs/models/mistral/lmdeploy_mistral_series.py b/configs/models/mistral/lmdeploy_mistral_series.py new file mode 100644 index 00000000..0bb07c52 --- /dev/null +++ b/configs/models/mistral/lmdeploy_mistral_series.py @@ -0,0 +1,24 @@ +from opencompass.models import LmdeployPytorchModel + +settings = [ + ('mistral-7b-v0.1-pytorch', 'mistralai/Mistral-7B-v0.1', 1), + ('mixtral-8x7b-v0.1-pytorch', 'mistralai/Mixtral-8x7B-v0.1', 2), + ('mixtral-8x22b-v0.1-pytorch', 'mistralai/Mixtral-8x22B-v0.1', 4), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=LmdeployPytorchModel, + abbr=abbr, + path=path, + engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus), + gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024), + max_out_len=1024, + max_seq_len=2048, + batch_size=16, + concurrency=16, + run_cfg=dict(num_gpus=num_gpus), + ) + ) diff --git a/configs/models/qwen/hf_qwen1_5_110b.py b/configs/models/qwen/hf_qwen1_5_110b.py new file mode 100644 index 00000000..1ba10658 --- /dev/null +++ b/configs/models/qwen/hf_qwen1_5_110b.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFaceBaseModel + +models = [ + dict( + type=HuggingFaceBaseModel, + abbr='qwen1.5-110b-hf', + path='Qwen/Qwen1.5-110B', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/qwen/hf_qwen1_5_110b_chat.py b/configs/models/qwen/hf_qwen1_5_110b_chat.py new file mode 100644 index 00000000..e77bad42 --- /dev/null +++ b/configs/models/qwen/hf_qwen1_5_110b_chat.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='qwen1.5-110b-chat-hf', + 
path='Qwen/Qwen1.5-110B-Chat', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen1_5_series.py b/configs/models/qwen/lmdeploy_qwen1_5_series.py new file mode 100644 index 00000000..f3aef87f --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen1_5_series.py @@ -0,0 +1,29 @@ +from opencompass.models import LmdeployPytorchModel + +settings = [ + ('qwen1.5-0.5b-pytorch', 'Qwen/Qwen1.5-0.5B', 1), + ('qwen1.5-1.8b-pytorch', 'Qwen/Qwen1.5-1.8B', 1), + ('qwen1.5-4b-pytorch', 'Qwen/Qwen1.5-4B', 1), + ('qwen1.5-7b-pytorch', 'Qwen/Qwen1.5-7B', 1), + ('qwen1.5-14b-pytorch', 'Qwen/Qwen1.5-14B', 1), + ('qwen1.5-32b-pytorch', 'Qwen/Qwen1.5-32B', 2), + ('qwen1.5-72b-pytorch', 'Qwen/Qwen1.5-72B', 4), + ('qwen1.5-moe-a2.7b-pytorch', 'Qwen/Qwen1.5-MoE-A2.7B', 1), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=LmdeployPytorchModel, + abbr=abbr, + path=path, + engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus), + gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024), + max_out_len=1024, + max_seq_len=2048, + batch_size=16, + concurrency=16, + run_cfg=dict(num_gpus=num_gpus), + ) + ) diff --git a/configs/models/qwen/lmdeploy_qwen_series.py b/configs/models/qwen/lmdeploy_qwen_series.py new file mode 100644 index 00000000..12b16820 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen_series.py @@ -0,0 +1,25 @@ +from opencompass.models import TurboMindModel + +settings = [ + ('qwen-1.8b-turbomind', 'Qwen/Qwen-1_8B', 1), + ('qwen-7b-turbomind', 'Qwen/Qwen-7B', 1), + ('qwen-14b-turbomind', 'Qwen/Qwen-14B', 1), + ('qwen-72b-turbomind', 'Qwen/Qwen-72B', 4), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=TurboMindModel, + abbr=abbr, + path=path, + engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus), + gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024), + max_out_len=1024, + max_seq_len=2048, + batch_size=16, + concurrency=16, + run_cfg=dict(num_gpus=num_gpus), + ) + ) diff --git a/configs/models/yi/lmdeploy_yi_series.py b/configs/models/yi/lmdeploy_yi_series.py new file mode 100644 index 00000000..f19476b7 --- /dev/null +++ b/configs/models/yi/lmdeploy_yi_series.py @@ -0,0 +1,23 @@ +from opencompass.models import LmdeployPytorchModel + +settings = [ + ('yi-6b-pytorch', '01-ai/Yi-6B', 1), + ('yi-34b-pytorch', '01-ai/Yi-34B', 2), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=LmdeployPytorchModel, + abbr=abbr, + path=path, + engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus), + gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024), + max_out_len=1024, + max_seq_len=2048, + batch_size=16, + concurrency=16, + run_cfg=dict(num_gpus=num_gpus), + ) + ) diff --git a/configs/summarizers/compassbench_v1_objective.py b/configs/summarizers/compassbench_v1_objective.py index 79f8ec28..2d677d0a 100644 --- a/configs/summarizers/compassbench_v1_objective.py +++ b/configs/summarizers/compassbench_v1_objective.py @@ -2,7 +2,7 @@ from mmengine.config import read_base with read_base(): - from .groups.cibench import cibench_summary_groups + from .groups.legacy.cibench import cibench_summary_groups from .groups.plugineval import plugineval_summary_groups diff --git a/configs/summarizers/groups/legacy/cibench.py b/configs/summarizers/groups/legacy/cibench.py new file mode 100644 index 00000000..bc2ab94c --- /dev/null +++ 
b/configs/summarizers/groups/legacy/cibench.py @@ -0,0 +1,109 @@ + +_cibench = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch'] +_cibench = ['cibench_' + i for i in _cibench] +cibench_summary_groups = [{'name': 'cibench', 'subsets': _cibench}] + +_cibench_template = ['lightgbm', 'matplotlib', 'nltk', 'opencv', 'pandas', 'pytorch', + 'scipy', 'seaborn', 'sklearn', 'tensorflow'] +_cibench_template = ['cibench_template/' + i for i in _cibench_template] +# number of total exec questions in this module +_cibench_template_weight = { + 'lightgbm': [30, 15, 0, 0], + 'matplotlib': [42, 0, 0, 36], + 'nltk': [70, 30, 20, 10], + 'opencv': [60, 10, 0, 40], + 'pandas': [60, 40, 0, 10], + 'pytorch': [28, 0, 0, 0], + 'scipy': [60, 40, 0, 0], + 'seaborn': [42, 0, 0, 35], + 'sklearn': [42, 6, 0, 18], + 'tensorflow': [36, 6, 0, 12], +} +cibench_summary_groups.extend([ + { + 'name': 'cibench_template:executable', + 'subsets': [[i, 'executable'] for i in _cibench_template], + 'weights': {'cibench_template/' + k : v[0] for k,v in _cibench_template_weight.items()}, + }, + { + 'name': 'cibench_template:numeric_correct', + 'subsets': [[i, 'numeric_correct'] for i in _cibench_template], + 'weights': {'cibench_template/' + k : v[1] for k,v in _cibench_template_weight.items()}, + }, + { + 'name': 'cibench_template:text_score', + 'subsets': [[i, 'text_score'] for i in _cibench_template], + 'weights': {'cibench_template/' + k : v[2] for k,v in _cibench_template_weight.items()}, + }, + { + 'name': 'cibench_template:vis_sim', + 'subsets': [[i, 'vis_sim'] for i in _cibench_template], + 'weights': {'cibench_template/' + k : v[3] for k,v in _cibench_template_weight.items()}, + }, +]) + + +## chinese +_cibench_template_cn = ['lightgbm', 'matplotlib', 'nltk', 'opencv', 'pandas', 'pytorch', + 'scipy', 'seaborn', 'sklearn', 'tensorflow'] +_cibench_template_cn = ['cibench_template_chinese/' + i for i in _cibench_template_cn] +cibench_summary_groups.extend([ + { + 'name': 'cibench_template_cn:executable', + 'subsets': [[i, 'executable'] for i in _cibench_template_cn], + 'weights': {'cibench_template_chinese/' + k : v[0] for k,v in _cibench_template_weight.items()}, + }, + { + 'name': 'cibench_template_cn:numeric_correct', + 'subsets': [[i, 'numeric_correct'] for i in _cibench_template_cn], + 'weights': {'cibench_template_chinese/' + k : v[1] for k,v in _cibench_template_weight.items()}, + }, + { + 'name': 'cibench_template_cn:text_score', + 'subsets': [[i, 'text_score'] for i in _cibench_template_cn], + 'weights': {'cibench_template_chinese/' + k : v[2] for k,v in _cibench_template_weight.items()}, + }, + { + 'name': 'cibench_template_cn:vis_sim', + 'subsets': [[i, 'vis_sim'] for i in _cibench_template_cn], + 'weights': {'cibench_template_chinese/' + k : v[3] for k,v in _cibench_template_weight.items()}, + }, +]) + + +## add more without nltk +cibench_summary_groups.extend([ + { + 'name': 'cibench_template_wo_nltk:executable', + 'subsets': [[i, 'executable'] for i in _cibench_template if 'nltk' not in i], + 'weights': {'cibench_template/' + k : v[0] for k,v in _cibench_template_weight.items() if 'nltk' not in k}, + }, + { + 'name': 'cibench_template_wo_nltk:numeric_correct', + 'subsets': [[i, 'numeric_correct'] for i in _cibench_template if 'nltk' not in i], + 'weights': {'cibench_template/' + k : v[1] for k,v in _cibench_template_weight.items() if 'nltk' not in k}, + }, + { + 'name': 'cibench_template_wo_nltk:vis_sim', + 'subsets': [[i, 'vis_sim'] for i in _cibench_template if 'nltk' not in i], + 'weights': 
{'cibench_template/' + k : v[3] for k,v in _cibench_template_weight.items() if 'nltk' not in k}, + }, +]) + +cibench_summary_groups.extend([ + { + 'name': 'cibench_template_cn_wo_nltk:executable', + 'subsets': [[i, 'executable'] for i in _cibench_template_cn if 'nltk' not in i], + 'weights': {'cibench_template_chinese/' + k : v[0] for k,v in _cibench_template_weight.items() if 'nltk' not in k}, + }, + { + 'name': 'cibench_template_cn_wo_nltk:numeric_correct', + 'subsets': [[i, 'numeric_correct'] for i in _cibench_template_cn if 'nltk' not in i], + 'weights': {'cibench_template_chinese/' + k : v[1] for k,v in _cibench_template_weight.items() if 'nltk' not in k}, + }, + { + 'name': 'cibench_template_cn_wo_nltk:vis_sim', + 'subsets': [[i, 'vis_sim'] for i in _cibench_template_cn if 'nltk' not in i], + 'weights': {'cibench_template_chinese/' + k : v[3] for k,v in _cibench_template_weight.items() if 'nltk' not in k}, + }, +]) diff --git a/opencompass/cli/main.py b/opencompass/cli/main.py index e9b5abb3..c2682ab2 100644 --- a/opencompass/cli/main.py +++ b/opencompass/cli/main.py @@ -170,6 +170,8 @@ def parse_dlc_args(dlc_parser): type=str) + + def parse_hf_args(hf_parser): """These args are all for the quick construction of HuggingFace models.""" hf_parser.add_argument('--hf-type', type=str, choices=['base', 'chat'], default='chat', help='The type of the HuggingFace model, base or chat') @@ -212,7 +214,7 @@ def main(): if args.work_dir is not None: cfg['work_dir'] = args.work_dir else: - cfg.setdefault('work_dir', osp.join('outputs', 'default')) + cfg.setdefault('work_dir', os.path.join('outputs', 'default')) # cfg_time_str defaults to the current time cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S') @@ -340,5 +342,6 @@ def main(): summarizer.summarize(time_str=cfg_time_str) + if __name__ == '__main__': main() diff --git a/opencompass/models/__init__.py b/opencompass/models/__init__.py index fab0824b..d7f7c063 100644 --- a/opencompass/models/__init__.py +++ b/opencompass/models/__init__.py @@ -7,7 +7,8 @@ from .base import BaseModel, LMTemplateParser # noqa: F401 from .base_api import APITemplateParser, BaseAPIModel # noqa: F401 from .bytedance_api import ByteDance # noqa: F401 from .claude_api import Claude # noqa: F401 -from .gemini_api import Gemini, GeminiAllesAPIN # noqa: F401 +from .deepseek_api import DeepseekAPI # noqa: F401 +from .gemini_api import Gemini # noqa: F401 from .glm import GLM130B # noqa: F401 from .huggingface import HuggingFace # noqa: F401 from .huggingface import HuggingFaceCausalLM # noqa: F401 @@ -21,7 +22,7 @@ from .lightllm_api import LightllmAPI # noqa: F401 from .llama2 import Llama2, Llama2Chat # noqa: F401 from .lmdeploy_pytorch import LmdeployPytorchModel # noqa: F401 from .lmdeploy_tis import LmdeployTisModel # noqa: F401 -from .minimax_api import MiniMax # noqa: F401 +from .minimax_api import MiniMax, MiniMaxChatCompletionV2 # noqa: F401 from .mistral_api import Mistral # noqa: F401 from .mixtral import Mixtral # noqa: F401 from .modelscope import ModelScope, ModelScopeCausalLM # noqa: F401 @@ -31,11 +32,12 @@ from .openai_api import OpenAI # noqa: F401 from .pangu_api import PanGu # noqa: F401 from .qwen_api import Qwen # noqa: F401 from .sensetime_api import SenseTime # noqa: F401 +from .stepfun_api import StepFun # noqa: F401 from .turbomind import TurboMindModel # noqa: F401 from .turbomind_tis import TurboMindTisModel # noqa: F401 from .unigpt_api import UniGPT # noqa: F401 from .vllm import VLLM # noqa: F401 -from .xunfei_api import 
XunFei # noqa: F401 +from .xunfei_api import XunFei, XunFeiSpark # noqa: F401 from .yayi_api import Yayi # noqa: F401 from .zhipuai_api import ZhiPuAI # noqa: F401 from .zhipuai_v2_api import ZhiPuV2AI # noqa: F401 diff --git a/opencompass/models/ai360_api.py b/opencompass/models/ai360_api.py index 87c80564..223a6d79 100644 --- a/opencompass/models/ai360_api.py +++ b/opencompass/models/ai360_api.py @@ -1,4 +1,3 @@ -import time from concurrent.futures import ThreadPoolExecutor from typing import Dict, List, Optional, Union @@ -141,29 +140,32 @@ class AI360GPT(BaseAPIModel): self.wait() continue if raw_response.status_code == 200: - try: - msg = response['choices'][0]['message']['content'].strip() - return msg - - except KeyError: - if 'error' in response: - # tpm(token per minitue) limit - if response['erro']['code'] == '1005': - time.sleep(1) - continue - - self.logger.error('Find error message in response: ', - str(response['error'])) + msg = response['choices'][0]['message']['content'].strip() + self.logger.debug(f'Generated: {msg}') + return msg # sensitive content, prompt overlength, network error # or illegal prompt - if (raw_response.status_code == 400 - or raw_response.status_code == 401 - or raw_response.status_code == 402 - or raw_response.status_code == 429 - or raw_response.status_code == 500): - print(raw_response.text) - continue + if raw_response.status_code in [400, 401, 402, 429, 500]: + if 'error' not in response: + print(raw_response.status_code) + print(raw_response.text) + continue + print(response) + # tpm(token per minitue) limit + if response['error']['code'] == '1005': + self.logger.debug('tpm limit, ignoring') + continue + elif response['error']['code'] == '1001': + msg = '参数错误:messages参数过长或max_tokens参数值过大' + self.logger.debug(f'Generated: {msg}') + return msg + else: + print(response) + + self.logger.error('Find error message in response: ', + str(response['error'])) + print(raw_response) max_num_retries += 1 diff --git a/opencompass/models/baichuan_api.py b/opencompass/models/baichuan_api.py index b4cc0dc2..9bca78e6 100644 --- a/opencompass/models/baichuan_api.py +++ b/opencompass/models/baichuan_api.py @@ -145,8 +145,8 @@ class BaiChuan(BaseAPIModel): self.wait() continue if raw_response.status_code == 200: - msg = response['choices'][0]['message']['content'] + self.logger.debug(f'Generated: {msg}') return msg if raw_response.status_code != 200: diff --git a/opencompass/models/baidu_api.py b/opencompass/models/baidu_api.py index ef20807e..51d81ff5 100644 --- a/opencompass/models/baidu_api.py +++ b/opencompass/models/baidu_api.py @@ -53,6 +53,8 @@ class ERNIEBot(BaseAPIModel): self.headers = {'Content_Type': 'application/json'} self.secretkey = secretkey self.key = key + if not url.endswith('?access_token='): + url += '?access_token=' self.url = url access_token, _ = self._generate_access_token() self.access_token = access_token @@ -143,14 +145,25 @@ class ERNIEBot(BaseAPIModel): messages = [{'role': 'user', 'content': input}] else: messages = [] + msg_buffer, last_role = [], None for item in input: - msg = {'content': item['prompt']} - if item['role'] == 'HUMAN': - msg['role'] = 'user' - elif item['role'] == 'BOT': - msg['role'] = 'assistant' + if item['role'] == 'BOT': + role = 'assistant' + else: # USER or SYSTEM + role = 'user' + if role != last_role and last_role is not None: + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + msg_buffer = [] + msg_buffer.append(item['prompt']) + last_role = role + messages.append({ + 
'content': '\n'.join(msg_buffer), + 'role': last_role + }) - messages.append(msg) data = {'messages': messages} data.update(self.generation_kwargs) @@ -181,6 +194,7 @@ class ERNIEBot(BaseAPIModel): if raw_response.status_code == 200: try: msg = response['result'] + self.logger.debug(msg) return msg except KeyError: print(response) @@ -188,9 +202,12 @@ class ERNIEBot(BaseAPIModel): if response['error_code'] == 336007: # exceed max length return '' - - time.sleep(1) - continue + elif response['error_code'] == 336103: + # prompt tokens too long + return '' + else: + time.sleep(1) + continue if (response['error_code'] == 110 or response['error_code'] == 100 or response['error_code'] == 111 diff --git a/opencompass/models/deepseek_api.py b/opencompass/models/deepseek_api.py new file mode 100644 index 00000000..dba51937 --- /dev/null +++ b/opencompass/models/deepseek_api.py @@ -0,0 +1,178 @@ +import time +from concurrent.futures import ThreadPoolExecutor +from typing import Dict, List, Optional, Union + +import requests + +from opencompass.utils.prompt import PromptList + +from .base_api import BaseAPIModel + +PromptType = Union[PromptList, str] + + +class DeepseekAPI(BaseAPIModel): + """Model wrapper around DeepseekAPI. + + Documentation: + + Args: + path (str): The name of DeepseekAPI model. + e.g. `moonshot-v1-32k` + key (str): Authorization key. + query_per_second (int): The maximum queries allowed per second + between two consecutive calls of the API. Defaults to 1. + max_seq_len (int): Unused here. + meta_template (Dict, optional): The model's meta prompt + template if needed, in case the requirement of injecting or + wrapping of any meta instructions. + retry (int): Number of retires if the API call fails. Defaults to 2. + """ + + def __init__( + self, + path: str, + key: str, + url: str, + query_per_second: int = 2, + max_seq_len: int = 2048, + meta_template: Optional[Dict] = None, + retry: int = 2, + system_prompt: str = '', + ): + super().__init__(path=path, + max_seq_len=max_seq_len, + query_per_second=query_per_second, + meta_template=meta_template, + retry=retry) + self.headers = { + 'Content-Type': 'application/json', + 'Authorization': 'Bearer ' + key, + } + self.url = url + self.model = path + self.system_prompt = system_prompt + + def generate( + self, + inputs: List[PromptType], + max_out_len: int = 512, + ) -> List[str]: + """Generate results given a list of inputs. + + Args: + inputs (List[PromptType]): A list of strings or PromptDicts. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. + """ + with ThreadPoolExecutor() as executor: + results = list( + executor.map(self._generate, inputs, + [max_out_len] * len(inputs))) + self.flush() + return results + + def _generate( + self, + input: PromptType, + max_out_len: int = 512, + ) -> str: + """Generate results given an input. + + Args: + inputs (PromptType): A string or PromptDict. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + str: The generated string. 
+ """ + assert isinstance(input, (str, PromptList)) + + if isinstance(input, str): + messages = [{'role': 'user', 'content': input}] + else: + messages = [] + msg_buffer, last_role = [], None + for item in input: + item['role'] = 'assistant' if item['role'] == 'BOT' else 'user' + if item['role'] != last_role and last_role is not None: + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + msg_buffer = [] + msg_buffer.append(item['prompt']) + last_role = item['role'] + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + + if self.system_prompt: + system = {'role': 'system', 'content': self.system_prompt} + messages.insert(0, system) + + data = {'model': self.model, 'messages': messages} + + max_num_retries = 0 + while max_num_retries < self.retry: + self.acquire() + try: + raw_response = requests.request('POST', + url=self.url, + headers=self.headers, + json=data) + except Exception as err: + print('Request Error:{}'.format(err)) + time.sleep(2) + continue + + try: + response = raw_response.json() + except Exception as err: + print('Response Error:{}'.format(err)) + response = None + self.release() + + if response is None: + print('Connection error, reconnect.') + # if connect error, frequent requests will casuse + # continuous unstable network, therefore wait here + # to slow down the request + self.wait() + continue + + if raw_response.status_code == 200: + # msg = json.load(response.text) + # response + msg = response['choices'][0]['message']['content'] + self.logger.debug(f'Generated: {msg}') + return msg + + if raw_response.status_code == 401: + print('请求被拒绝 api_key错误') + continue + elif raw_response.status_code == 400: + print(messages, response) + print('请求失败,状态码:', raw_response) + msg = 'The request was rejected because high risk' + return msg + elif raw_response.status_code == 429: + print(messages, response) + print('请求失败,状态码:', raw_response) + time.sleep(5) + continue + else: + print(messages, response) + print('请求失败,状态码:', raw_response) + time.sleep(1) + + max_num_retries += 1 + + raise RuntimeError(raw_response) diff --git a/opencompass/models/gemini_api.py b/opencompass/models/gemini_api.py index 5779d3d9..0e9b089f 100644 --- a/opencompass/models/gemini_api.py +++ b/opencompass/models/gemini_api.py @@ -186,66 +186,3 @@ class Gemini(BaseAPIModel): time.sleep(1) raise RuntimeError('API call failed.') - - -class GeminiAllesAPIN(Gemini): - """Model wrapper around Gemini models. - - Documentation: - - Args: - path (str): The name of Gemini model. - e.g. `gemini-pro` - key (str): Authorization key. - query_per_second (int): The maximum queries allowed per second - between two consecutive calls of the API. Defaults to 1. - max_seq_len (int): Unused here. - meta_template (Dict, optional): The model's meta prompt - template if needed, in case the requirement of injecting or - wrapping of any meta instructions. - retry (int): Number of retires if the API call fails. Defaults to 2. 
- """ - - def __init__( - self, - path: str, - key: str, - url: str, - query_per_second: int = 2, - max_seq_len: int = 2048, - meta_template: Optional[Dict] = None, - retry: int = 2, - temperature: float = 1.0, - top_p: float = 0.8, - top_k: float = 10.0, - ): - super().__init__(key=key, - path=path, - max_seq_len=max_seq_len, - query_per_second=query_per_second, - meta_template=meta_template, - retry=retry) - # Replace the url and headers into AllesApin - self.url = url - self.headers = { - 'alles-apin-token': key, - 'content-type': 'application/json', - } - - def generate( - self, - inputs: List[PromptType], - max_out_len: int = 512, - ) -> List[str]: - """Generate results given a list of inputs. - - Args: - inputs (List[PromptType]): A list of strings or PromptDicts. - The PromptDict should be organized in OpenCompass' - API format. - max_out_len (int): The maximum length of the output. - - Returns: - List[str]: A list of generated strings. - """ - return super().generate(inputs, max_out_len) diff --git a/opencompass/models/huggingface.py b/opencompass/models/huggingface.py index 3974ad52..a91ffea0 100644 --- a/opencompass/models/huggingface.py +++ b/opencompass/models/huggingface.py @@ -289,13 +289,13 @@ class HuggingFace(BaseModel): tokens = self.tokenizer.batch_encode_plus(inputs, padding=True, truncation=True, - max_length=self.max_seq_len - - max_out_len) + max_length=self.max_seq_len) tokens = { k: torch.tensor(np.array(tokens[k]), device=self.model.device) for k in tokens if k in ['input_ids', 'attention_mask'] } + origin_stopping_criteria = stopping_criteria if stopping_criteria: # Construct huggingface stopping criteria if self.tokenizer.eos_token is not None: @@ -332,6 +332,9 @@ class HuggingFace(BaseModel): if self.end_str: decodeds = [token.split(self.end_str)[0] for token in decodeds] + if origin_stopping_criteria: + for t in origin_stopping_criteria: + decodeds = [token.split(t)[0] for token in decodeds] return decodeds def _single_generate(self, @@ -382,6 +385,7 @@ class HuggingFace(BaseModel): max_length=self.max_seq_len - max_out_len)['input_ids'] input_ids = torch.tensor(input_ids, device=self.model.device) + origin_stopping_criteria = stopping_criteria if stopping_criteria: # Construct huggingface stopping criteria if self.tokenizer.eos_token is not None: @@ -419,6 +423,9 @@ class HuggingFace(BaseModel): if self.end_str: decodeds = [token.split(self.end_str)[0] for token in decodeds] + if origin_stopping_criteria: + for t in origin_stopping_criteria: + decodeds = [token.split(t)[0] for token in decodeds] return decodeds def get_logits(self, inputs: List[str]): diff --git a/opencompass/models/minimax_api.py b/opencompass/models/minimax_api.py index c069dc97..44ea267a 100644 --- a/opencompass/models/minimax_api.py +++ b/opencompass/models/minimax_api.py @@ -180,3 +180,173 @@ class MiniMax(BaseAPIModel): max_num_retries += 1 raise RuntimeError(response.text) + + +class MiniMaxChatCompletionV2(BaseAPIModel): + """Model wrapper around MiniMax ChatCompletionV2. + + Documentation: + + Args: + path (str): The name of MiniMax model. + e.g. `moonshot-v1-32k` + key (str): Authorization key. + query_per_second (int): The maximum queries allowed per second + between two consecutive calls of the API. Defaults to 1. + max_seq_len (int): Unused here. + meta_template (Dict, optional): The model's meta prompt + template if needed, in case the requirement of injecting or + wrapping of any meta instructions. + retry (int): Number of retires if the API call fails. Defaults to 2. 
+ """ + + def __init__( + self, + path: str, + key: str, + url: str, + query_per_second: int = 2, + max_seq_len: int = 2048, + meta_template: Optional[Dict] = None, + retry: int = 2, + ): + super().__init__(path=path, + max_seq_len=max_seq_len, + query_per_second=query_per_second, + meta_template=meta_template, + retry=retry) + self.headers = { + 'Content-Type': 'application/json', + 'Authorization': 'Bearer ' + key, + } + self.url = url + self.model = path + + def generate( + self, + inputs: List[PromptType], + max_out_len: int = 512, + ) -> List[str]: + """Generate results given a list of inputs. + + Args: + inputs (List[PromptType]): A list of strings or PromptDicts. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. + """ + with ThreadPoolExecutor() as executor: + results = list( + executor.map(self._generate, inputs, + [max_out_len] * len(inputs))) + self.flush() + return results + + def _generate( + self, + input: PromptType, + max_out_len: int = 512, + ) -> str: + """Generate results given an input. + + Args: + inputs (PromptType): A string or PromptDict. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + str: The generated string. + """ + assert isinstance(input, (str, PromptList)) + + if isinstance(input, str): + messages = [{'role': 'user', 'content': input}] + else: + messages = [] + msg_buffer, last_role = [], None + for item in input: + item['role'] = 'assistant' if item['role'] == 'BOT' else 'user' + if item['role'] != last_role and last_role is not None: + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + msg_buffer = [] + msg_buffer.append(item['prompt']) + last_role = item['role'] + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + + data = { + 'model': self.model, + 'messages': messages, + 'max_tokens': max_out_len + } + + max_num_retries = 0 + while max_num_retries < self.retry: + self.acquire() + try: + raw_response = requests.request('POST', + url=self.url, + headers=self.headers, + json=data) + except Exception as err: + print('Request Error:{}'.format(err)) + time.sleep(2) + continue + + response = raw_response.json() + self.release() + + if response is None: + print('Connection error, reconnect.') + # if connect error, frequent requests will casuse + # continuous unstable network, therefore wait here + # to slow down the request + self.wait() + continue + + if raw_response.status_code == 200: + try: + msg = response['choices'][0]['message']['content'] + self.logger.debug(f'Generated: {msg}') + return msg + except Exception: + code = response.get('base_resp', {}).get('status_code') + if code == 1002: + # rate limit + time.sleep(1) + continue + elif code == 1027: + return 'The request was rejected because high risk' + print(messages, response) + pass + + elif raw_response.status_code == 401: + print('请求被拒绝 api_key错误') + continue + elif raw_response.status_code == 400: + print(messages, response) + print('请求失败,状态码:', raw_response) + msg = 'The request was rejected because high risk' + return msg + elif raw_response.status_code == 429: + print(messages, response) + print('请求失败,状态码:', raw_response) + time.sleep(5) + continue + else: + print(messages, response) + print('请求失败,状态码:', raw_response) + time.sleep(1) + + max_num_retries += 1 + + raise RuntimeError(raw_response) diff --git 
diff --git a/opencompass/models/qwen_api.py b/opencompass/models/qwen_api.py
index 1f34cd2c..d22c0785 100644
--- a/opencompass/models/qwen_api.py
+++ b/opencompass/models/qwen_api.py
@@ -152,8 +152,7 @@ class Qwen(BaseAPIModel):
             if response.status_code == 200:
                 try:
                     msg = response.output.text
-                    print('=' * 128)
-                    print(msg)
+                    self.logger.debug(msg)
                     return msg
                 except KeyError:
                     print(response)
diff --git a/opencompass/models/stepfun_api.py b/opencompass/models/stepfun_api.py
new file mode 100644
index 00000000..54d1a82b
--- /dev/null
+++ b/opencompass/models/stepfun_api.py
@@ -0,0 +1,182 @@
+import time
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, List, Optional, Union
+
+import requests
+
+from opencompass.utils.prompt import PromptList
+
+from .base_api import BaseAPIModel
+
+PromptType = Union[PromptList, str]
+
+
+class StepFun(BaseAPIModel):
+    """Model wrapper around StepFun.
+
+    Documentation:
+
+    Args:
+        path (str): The name of the StepFun model.
+        key (str): Authorization key.
+        query_per_second (int): The maximum queries allowed per second
+            between two consecutive calls of the API. Defaults to 1.
+        max_seq_len (int): Unused here.
+        meta_template (Dict, optional): The model's meta prompt
+            template if needed, in case the requirement of injecting or
+            wrapping of any meta instructions.
+        retry (int): Number of retries if the API call fails. Defaults to 2.
+    """
+
+    def __init__(
+        self,
+        path: str,
+        key: str,
+        url: str,
+        query_per_second: int = 2,
+        max_seq_len: int = 2048,
+        meta_template: Optional[Dict] = None,
+        retry: int = 2,
+        system_prompt: str = '',
+    ):
+        super().__init__(path=path,
+                         max_seq_len=max_seq_len,
+                         query_per_second=query_per_second,
+                         meta_template=meta_template,
+                         retry=retry)
+        self.headers = {
+            'Content-Type': 'application/json',
+            'Authorization': 'Bearer ' + key,
+        }
+        self.url = url
+        self.model = path
+        self.system_prompt = system_prompt
+
+    def generate(
+        self,
+        inputs: List[PromptType],
+        max_out_len: int = 512,
+    ) -> List[str]:
+        """Generate results given a list of inputs.
+
+        Args:
+            inputs (List[PromptType]): A list of strings or PromptDicts.
+                The PromptDict should be organized in OpenCompass'
+                API format.
+            max_out_len (int): The maximum length of the output.
+
+        Returns:
+            List[str]: A list of generated strings.
+        """
+        with ThreadPoolExecutor() as executor:
+            results = list(
+                executor.map(self._generate, inputs,
+                             [max_out_len] * len(inputs)))
+        self.flush()
+        return results
+
+    def _generate(
+        self,
+        input: PromptType,
+        max_out_len: int = 512,
+    ) -> str:
+        """Generate results given an input.
+
+        Args:
+            inputs (PromptType): A string or PromptDict.
+                The PromptDict should be organized in OpenCompass'
+                API format.
+            max_out_len (int): The maximum length of the output.
+
+        Returns:
+            str: The generated string.
+ """ + assert isinstance(input, (str, PromptList)) + + if isinstance(input, str): + messages = [{'role': 'user', 'content': input}] + else: + messages = [] + msg_buffer, last_role = [], None + for item in input: + item['role'] = 'assistant' if item['role'] == 'BOT' else 'user' + if item['role'] != last_role and last_role is not None: + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + msg_buffer = [] + msg_buffer.append(item['prompt']) + last_role = item['role'] + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + + if self.system_prompt: + system = {'role': 'system', 'content': self.system_prompt} + messages.insert(0, system) + + data = {'model': self.model, 'messages': messages} + + max_num_retries = 0 + while max_num_retries < self.retry: + self.acquire() + try: + raw_response = requests.request('POST', + url=self.url, + headers=self.headers, + json=data) + except Exception as err: + print('Request Error:{}'.format(err)) + time.sleep(2) + continue + + try: + response = raw_response.json() + except Exception: + response = None + self.release() + + if response is None: + print('Connection error, reconnect.') + # if connect error, frequent requests will casuse + # continuous unstable network, therefore wait here + # to slow down the request + self.wait() + continue + + if raw_response.status_code == 200: + # msg = json.load(response.text) + # response + msg = response['choices'][0]['message']['content'] + self.logger.debug(f'Generated: {msg}') + return msg + + if raw_response.status_code == 400: + print(messages, response) + print('请求失败,状态码:', raw_response) + msg = 'The context length exceeded' + return msg + elif raw_response.status_code == 403: + print('请求被拒绝 api_key错误') + continue + elif raw_response.status_code == 429: + print(messages, response) + print('请求失败,状态码:', raw_response) + time.sleep(5) + continue + elif raw_response.status_code == 451: + print(messages, response) + print('请求失败,状态码:', raw_response) + msg = 'The request was rejected because high risk' + return msg + else: + print(messages, response) + print('请求失败,状态码:', raw_response) + time.sleep(1) + + max_num_retries += 1 + + raise RuntimeError(raw_response) diff --git a/opencompass/models/turbomind.py b/opencompass/models/turbomind.py index f64249bc..8e573135 100644 --- a/opencompass/models/turbomind.py +++ b/opencompass/models/turbomind.py @@ -55,9 +55,6 @@ class TurboMindModel(BaseModel): if engine_config is not None: from lmdeploy.messages import TurbomindEngineConfig engine_config = TurbomindEngineConfig(**engine_config) - if gen_config is not None: - from lmdeploy.messages import EngineGenerationConfig - gen_config = EngineGenerationConfig(**gen_config) self.logger = get_logger() tm_model = TurboMind.from_pretrained(path, engine_config=engine_config) self.tokenizer = tm_model.tokenizer @@ -106,6 +103,7 @@ class TurboMindModel(BaseModel): t = self.tokenizer.encode(t, add_bos=False) stop_words.append(t[0]) gen_config['stop_words'] = list(set(stop_words)) + gen_config.setdefault('min_new_tokens', 1) from lmdeploy.messages import EngineGenerationConfig gen_config = EngineGenerationConfig(**gen_config) @@ -123,6 +121,9 @@ class TurboMindModel(BaseModel): [gen_config] * len(batch_input), )) results += _results + if stopping_criteria: + for s in stopping_criteria: + results = [r.split(s)[0] for r in results] return results def get_token_len(self, prompt: str) -> int: diff --git a/opencompass/models/xunfei_api.py b/opencompass/models/xunfei_api.py index 
index ee75ca12..8de6f607 100644
--- a/opencompass/models/xunfei_api.py
+++ b/opencompass/models/xunfei_api.py
@@ -1,4 +1,6 @@
 import json
+import re
+import time
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, List, Optional, Union

@@ -221,3 +223,150 @@ class XunFei(BaseAPIModel):
             if err_code == 10013:
                 return err_data['header']['message']
             raise RuntimeError(f'Code: {err_code}, data: {err_data}')
+
+
+class XunFeiSpark(BaseAPIModel):
+    """Model wrapper around XunFeiSpark.
+
+    Documentation:
+
+    Args:
+        path (str): The Spark domain to request, passed to the SDK as
+            ``spark_llm_domain``.
+        url (str): The Spark API url.
+        app_id (str): The Spark app id.
+        api_key (str): The Spark api key.
+        api_secret (str): The Spark api secret.
+        query_per_second (int): The maximum queries allowed per second
+            between two consecutive calls of the API. Defaults to 1.
+        max_seq_len (int): Unused here.
+        meta_template (Dict, optional): The model's meta prompt
+            template if needed, in case the requirement of injecting or
+            wrapping of any meta instructions.
+        retry (int): Number of retries if the API call fails. Defaults to 2.
+    """
+
+    def __init__(
+        self,
+        path: str,
+        url: str,
+        app_id: str,
+        api_key: str,
+        api_secret: str,
+        query_per_second: int = 2,
+        max_seq_len: int = 2048,
+        meta_template: Optional[Dict] = None,
+        retry: int = 2,
+    ):
+        super().__init__(path=path,
+                         max_seq_len=max_seq_len,
+                         query_per_second=query_per_second,
+                         meta_template=meta_template,
+                         retry=retry)
+        try:
+            from sparkai.llm.llm import ChatSparkLLM  # noqa: F401
+        except ImportError:
+            raise ImportError('run `pip install --upgrade spark_ai_python`')
+
+        self.spark_domain = path
+        self.url = url
+        self.app_id = app_id
+        self.api_key = api_key
+        self.api_secret = api_secret
+
+    def generate(
+        self,
+        inputs: List[PromptType],
+        max_out_len: int = 512,
+    ) -> List[str]:
+        results = [self._generate(input, max_out_len) for input in inputs]
+        return results
+
+    def _generate(
+        self,
+        input: PromptType,
+        max_out_len: int = 512,
+    ) -> str:
+        assert isinstance(input, (str, PromptList))
+
+        from sparkai.core.messages import ChatMessage
+        from sparkai.llm.llm import ChatSparkLLM
+
+        if isinstance(input, str):
+            messages = [ChatMessage(role='user', content=input)]
+        else:
+            messages = []
+            msg_buffer, last_role = [], None
+            for index, item in enumerate(input):
+                if index == 0 and item['role'] == 'SYSTEM':
+                    role = 'system'
+                elif item['role'] == 'BOT':
+                    role = 'assistant'
+                else:
+                    role = 'user'
+
+                if role != last_role and last_role is not None:
+                    content = '\n'.join(msg_buffer)
+                    messages.append(
+                        ChatMessage(role=last_role, content=content))
+                    msg_buffer = []
+
+                msg_buffer.append(item['prompt'])
+                last_role = role
+
+            content = '\n'.join(msg_buffer)
+            messages.append(ChatMessage(role=last_role, content=content))
+
+        spark = ChatSparkLLM(
+            spark_api_url=self.url,
+            spark_app_id=self.app_id,
+            spark_api_key=self.api_key,
+            spark_api_secret=self.api_secret,
+            spark_llm_domain=self.spark_domain,
+            streaming=False,
+            max_tokens=max_out_len,
+        )
+
+        all_empty_response = True
+        for _ in range(self.retry + 1):
+            try:
+                outputs = spark.generate([messages]).generations[0]
+                if len(outputs) == 0:
+                    self.logger.error('Empty response, retrying...')
+                    continue
+                msg = outputs[0].text
+                self.logger.debug(f'Generated: {msg}')
+                return msg
+            except ConnectionError as e:
+                match = re.match(r'Error Code: (\d+), Error: (.*)',
+                                 e.args[0],
+                                 flags=re.DOTALL)
+                if match:
+                    error_code = int(match.group(1))
+                    msg = match.group(2)
+                    if error_code == 10003:  # query data exceed limit
+                        self.logger.error(f'Error {error_code}: {msg}')
+                        return msg
+                    elif error_code in [10013, 10014]:  # skip safety problem
+                        self.logger.debug(f'Generated: {msg}')
+                        return msg
+                    elif error_code == 10020:  # plugin result is empty
+                        self.logger.error(f'Error {error_code}: {msg}')
+                        return msg
+                    elif error_code == 11202:  # qps limit
+                        time.sleep(1)
+                    else:
+                        self.logger.error(f'Error {error_code}: {msg}')
+                        raise e
+                raise e
+            except TimeoutError:
+                self.logger.error('TimeoutError, sleep 60, retrying...')
+                time.sleep(60)
+            except Exception as e:
+                self.logger.error(str(e))
+                pass
+
+            all_empty_response = False
+
+        if all_empty_response:
+            self.logger.error('All empty response')
+            return 'all empty response'
+
+        raise RuntimeError('Failed to generate response')
diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py
index 55fcb62a..bc4ca0dd 100644
--- a/opencompass/runners/dlc.py
+++ b/opencompass/runners/dlc.py
@@ -141,7 +141,7 @@ class DLCRunner(BaseRunner):

            hf_offline = self.aliyun_cfg.get('hf_offline', True)
            if hf_offline:
-                shell_cmd += 'export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; export HF_EVALUATE_OFFLINE=1; '  # noqa: E501
+                shell_cmd += 'export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; export HF_EVALUATE_OFFLINE=1; export HF_HUB_OFFLINE=1; '  # noqa: E501

            http_proxy = self.aliyun_cfg.get('http_proxy')
            if http_proxy is not None:
@@ -158,6 +158,7 @@ class DLCRunner(BaseRunner):
                shell_cmd += f'export {extra_env}; '

            shell_cmd += f'cd {pwd}; '
+            shell_cmd += 'umask 0000; '
            shell_cmd += '{task_cmd}'

            tmpl = ('dlc create job'
@@ -195,7 +196,10 @@ class DLCRunner(BaseRunner):
            index_to_start = 0
            while index_to_start < num_retry_to_start:
                index_to_start += 1
-                output = subprocess.getoutput(cmd)
+                try:
+                    output = subprocess.getoutput(cmd)
+                except BlockingIOError:
+                    output = ''
                match = re.search(r'\|\s+(dlc[0-9a-z]+)\s+\|', output)
                if match is None:
                    stdout.write('Failed to get job id from output:')
@@ -264,7 +268,10 @@ class DLCRunner(BaseRunner):
                    f" -c {self.aliyun_cfg['dlc_config_path']}"
                    f' --start_time {pri_time}'
                    f' --end_time {cur_time}')
-                log_output = subprocess.getoutput(logs_cmd)
+                try:
+                    log_output = subprocess.getoutput(logs_cmd)
+                except BlockingIOError:
+                    log_output = '[WARN] No logs found for the pod'
                if '[WARN] No logs found for the pod' not in log_output:
                    pri_time = cur_time
diff --git a/opencompass/runners/local.py b/opencompass/runners/local.py
index c7d3632d..3be17e4e 100644
--- a/opencompass/runners/local.py
+++ b/opencompass/runners/local.py
@@ -46,17 +46,19 @@ class LocalRunner(BaseRunner):
        lark_bot_url (str): Lark bot url.
    """

-    def __init__(
-        self,
-        task: ConfigDict,
-        max_num_workers: int = 16,
-        debug: bool = False,
-        max_workers_per_gpu: int = 1,
-        lark_bot_url: str = None,
-    ):
+    def __init__(self,
+                 task: ConfigDict,
+                 max_num_workers: int = 16,
+                 debug: bool = False,
+                 max_workers_per_gpu: int = 1,
+                 lark_bot_url: str = None,
+                 **kwargs):
        super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
        self.max_num_workers = max_num_workers
        self.max_workers_per_gpu = max_workers_per_gpu
+        logger = get_logger()
+        for k, v in kwargs.items():
+            logger.warning(f'Ignored argument in {self.__module__}: {k}={v}')

    def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
        """Launch multiple tasks.
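To illustrate the LocalRunner change above: keyword arguments it does not recognize are now swallowed with a warning instead of raising a TypeError, so one runner config can be shared across runner types. A rough sketch under assumed config values (the worker count and the stray partition key are illustrative only):

    # Sketch: `partition` is not a LocalRunner argument; with **kwargs it is now
    # reported via the "Ignored argument ..." warning loop instead of crashing.
    from opencompass.runners import LocalRunner
    from opencompass.tasks import OpenICLInferTask

    infer = dict(
        runner=dict(
            type=LocalRunner,
            max_num_workers=8,
            partition='llm',  # meant for a cluster runner; LocalRunner ignores it
            task=dict(type=OpenICLInferTask),
        ),
    )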
diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py
index 23ce8b77..b1b0c173 100644
--- a/opencompass/utils/text_postprocessors.py
+++ b/opencompass/utils/text_postprocessors.py
@@ -94,11 +94,11 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
        f'答案是\s?(\S+)(?:。|$)',
        f'答案应该是\s?(\S+)(?:。|$)',
        f'答案为\s?(\S+)(?:。|$)',
-        f'[Tt]he answer is \(?([{options}])\)?',
-        f'[Tt]he answer is option \(?([{options}])\)?',
-        f'[Tt]he correct answer is \(?([{options}])\)?',
-        f'[Tt]he correct answer is option \(?([{options}])\)?',
-        f'[Tt]he answer to the question is \(?([{options}])\)?',
+        f'[Tt]he answer is:?\s+\(?([{options}])\)?',
+        f'[Tt]he answer is option:?\s+\(?([{options}])\)?',
+        f'[Tt]he correct answer is:?\s+\(?([{options}])\)?',
+        f'[Tt]he correct answer is option:?\s+\(?([{options}])\)?',
+        f'[Tt]he answer to the question is:?\s+\(?([{options}])\)?',
        f'^选项\s?([{options}])',
        f'^([{options}])\s?选?项',
        f'(\s|^)[{options}][\s。,,::\.$]',
@@ -116,7 +116,7 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
    if cushion:
        patterns.extend(cushion_patterns)
    for pattern in patterns:
-        match = re.search(pattern, text)
+        match = re.search(pattern, text, re.DOTALL)
        if match:
            outputs = match.group(0)
            for i in options:
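The relaxed patterns plus re.DOTALL above let first_option_postprocess pick up answers where a colon or a line break separates the verdict from the option letter. A rough check with made-up inputs; the exact result still depends on the full pattern list in the module, so treat the expected values as a sketch rather than a guarantee:

    # Sketch: behaviour enabled by the ':?\s+' patterns and the re.DOTALL flag.
    from opencompass.utils.text_postprocessors import first_option_postprocess

    print(first_option_postprocess('The answer is:\nB', options='ABCD'))  # expected 'B'
    print(first_option_postprocess('The correct answer is option (C).', options='ABCD'))  # expected 'C'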