diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 63e3d484..f03049e7 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -10,6 +10,7 @@ exclude: |
opencompass/datasets/teval/|
opencompass/datasets/NPHardEval/|
opencompass/datasets/TheoremQA|
+ opencompass/datasets/subjective/mtbench101.py|
docs/zh_cn/advanced_guides/compassbench_intro.md
)
repos:
@@ -35,6 +36,7 @@ repos:
exclude: |
(?x)^(
.*\.jsonl|
+ .*\.md.template|
configs/
)
- repo: https://github.com/pre-commit/pre-commit-hooks
diff --git a/configs/datasets/CHARM/README.md b/configs/datasets/CHARM/README.md
new file mode 100644
index 00000000..b297f852
--- /dev/null
+++ b/configs/datasets/CHARM/README.md
@@ -0,0 +1,110 @@
+# CHARM✨ Benchmarking Chinese Commonsense Reasoning of LLMs: From Chinese-Specifics to Reasoning-Memorization Correlations [ACL2024]
+[arXiv](https://arxiv.org/abs/2403.14112)
+[License](./LICENSE)
+
+
+📃[Paper](https://arxiv.org/abs/2403.14112)
+🏰[Project Page](https://opendatalab.github.io/CHARM/)
+🏆[Leaderboard](https://opendatalab.github.io/CHARM/leaderboard.html)
+✨[Findings](https://opendatalab.github.io/CHARM/findings.html)
+
+
+
+
+
+## Dataset Description
+
+**CHARM** is the first benchmark that comprehensively and in depth evaluates the commonsense reasoning ability of large language models (LLMs) in Chinese, covering both globally known and Chinese-specific commonsense. In addition, CHARM can evaluate the LLMs' memorization-independent reasoning abilities and analyze their typical errors.
+
+## Comparison of commonsense reasoning benchmarks
+
+
+
+"CN-Lang" indicates the benchmark is presented in Chinese language. "CSR" means the benchmark is designed to focus on CommonSense Reasoning. "CN-specific" indicates the benchmark includes elements that are unique to Chinese culture, language, regional characteristics, history, etc. "Dual-Domain" indicates the benchmark encompasses both Chinese-specific and global domain tasks, with questions presented in the similar style and format. "Rea-Mem" indicates the benchmark includes closely-interconnected reasoning and memorization tasks.
+
+
+## 🛠️ How to Use
+Below are the steps for quickly downloading CHARM and using OpenCompass for evaluation.
+
+### 1. Download CHARM
+```bash
+git clone https://github.com/opendatalab/CHARM ${path_to_CHARM_repo}
+```
+### 2. Run Inference and Evaluation
+```bash
+cd ${path_to_opencompass}
+mkdir -p data
+ln -snf ${path_to_CHARM_repo}/data/CHARM ./data/CHARM
+
+# Run inference and evaluation on CHARM with the hf_llama3_8b_instruct model
+python run.py --models hf_llama3_8b_instruct --datasets charm_gen
+```
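+
+To evaluate only the reasoning tasks, you can point `--datasets` at the reasoning config instead. A minimal sketch, assuming OpenCompass resolves dataset configs by file name just as it does for `charm_gen` above:
+
+```bash
+# Run only the CHARM reasoning tasks (all prompt settings defined in this directory)
+python run.py --models hf_llama3_8b_instruct --datasets charm_reason_gen
+```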
+
+## 🖊️ Citation
+```bibtex
+@misc{sun2024benchmarking,
+ title={Benchmarking Chinese Commonsense Reasoning of LLMs: From Chinese-Specifics to Reasoning-Memorization Correlations},
+ author={Jiaxing Sun and Weiquan Huang and Jiang Wu and Chenya Gu and Wei Li and Songyang Zhang and Hang Yan and Conghui He},
+ year={2024},
+ eprint={2403.14112},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+}
+```
diff --git a/configs/datasets/CHARM/README_ZH.md b/configs/datasets/CHARM/README_ZH.md
new file mode 100644
index 00000000..414c65fe
--- /dev/null
+++ b/configs/datasets/CHARM/README_ZH.md
@@ -0,0 +1,108 @@
+# CHARM✨ Benchmarking Chinese Commonsense Reasoning of LLMs: From Chinese-Specifics to Reasoning-Memorization Correlations [ACL2024]
+[arXiv](https://arxiv.org/abs/2403.14112)
+[License](./LICENSE)
+
+
+📃[Paper](https://arxiv.org/abs/2403.14112)
+🏰[Project Page](https://opendatalab.github.io/CHARM/)
+🏆[Leaderboard](https://opendatalab.github.io/CHARM/leaderboard.html)
+✨[Findings](https://opendatalab.github.io/CHARM/findings.html)
+
+
+
+
+## 数据集介绍
+
+**CHARM** 是首个全面深入评估大型语言模型(LLMs)中文常识推理能力的基准测试,它覆盖了国际普遍认知的常识以及独特的中国文化常识。此外,CHARM 还可以评估 LLMs 独立于记忆的推理能力,并分析其典型错误。
+
+
+## 与其他常识推理评测基准的比较
+
+
+
+
+## 🛠️ 如何使用
+以下是快速下载 CHARM 并在 OpenCompass 上进行评估的步骤。
+
+### 1. 下载 CHARM
+```bash
+git clone https://github.com/opendatalab/CHARM ${path_to_CHARM_repo}
+```
+### 2. 推理和评测
+```bash
+cd ${path_to_opencompass}
+mkdir -p data
+ln -snf ${path_to_CHARM_repo}/data/CHARM ./data/CHARM
+
+# 在CHARM上对模型hf_llama3_8b_instruct做推理和评测
+python run.py --models hf_llama3_8b_instruct --datasets charm_gen
+```
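+
+如需仅评测推理任务,可将 `--datasets` 指向推理配置。下面是一个简单示例(假设 OpenCompass 与上文的 `charm_gen` 一样,按配置文件名解析数据集):
+
+```bash
+# 仅运行 CHARM 推理任务(包含本目录中定义的全部提示词设置)
+python run.py --models hf_llama3_8b_instruct --datasets charm_reason_gen
+```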
+
+## 🖊️ 引用
+```bibtex
+@misc{sun2024benchmarking,
+ title={Benchmarking Chinese Commonsense Reasoning of LLMs: From Chinese-Specifics to Reasoning-Memorization Correlations},
+ author={Jiaxing Sun and Weiquan Huang and Jiang Wu and Chenya Gu and Wei Li and Songyang Zhang and Hang Yan and Conghui He},
+ year={2024},
+ eprint={2403.14112},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+}
+```
diff --git a/configs/datasets/CHARM/charm_reason_cot_only_gen_f7b7d3.py b/configs/datasets/CHARM/charm_reason_cot_only_gen_f7b7d3.py
new file mode 100644
index 00000000..a9485f44
--- /dev/null
+++ b/configs/datasets/CHARM/charm_reason_cot_only_gen_f7b7d3.py
@@ -0,0 +1,50 @@
+import os
+from mmengine.config import read_base
+
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import CharmDataset, charm_reason_postprocess, CharmReasonEvaluator
+
+with read_base():
+ from .charm_reason_settings import charm_tasks, settings
+
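+# Keep only the chain-of-thought prompt settings; s[0] is the setting name
+# ('Direct', 'ZH-CoT', 'EN-CoT', 'XLT' or 'Translate-EN').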
+settings = [s for s in settings if s[0] in ['ZH-CoT', 'EN-CoT']]
+
+charm_reason_datasets = []
+
+for _cot, _cot_prefix, dataset_path, fewshot_example_path, prompt_template in settings:
+ for _task in charm_tasks:
+ _fewshot_example_file = os.path.join(fewshot_example_path, f'{_task}_{_cot}.txt')
+ with open(_fewshot_example_file, 'r') as f:
+ _hint = f.read()
+
+ charm_reason_reader_cfg = dict(input_columns=['input'], output_column='target')
+
+ charm_reason_infer_cfg = dict(
+ prompt_template=dict(
+ type=PromptTemplate,
+ template=dict(round=[dict(role='HUMAN', prompt=prompt_template.format(_hint=_hint) + _cot_prefix)]),
+ ),
+ retriever=dict(type=ZeroRetriever),
+ inferencer=dict(type=GenInferencer, max_out_len=512),
+ )
+
+ charm_reason_eval_cfg = dict(
+ evaluator=dict(type=CharmReasonEvaluator),
+ pred_role='BOT',
+ pred_postprocessor=dict(type=charm_reason_postprocess),
+ dataset_postprocessor=dict(type=charm_reason_postprocess),
+ )
+
+ charm_reason_datasets.append(
+ dict(
+ type=CharmDataset,
+ path=dataset_path,
+ name=_task,
+ abbr='charm-reason-' + _task + '_' + _cot,
+ reader_cfg=charm_reason_reader_cfg,
+ infer_cfg=charm_reason_infer_cfg.copy(),
+ eval_cfg=charm_reason_eval_cfg.copy(),
+ )
+ )
diff --git a/configs/datasets/CHARM/charm_reason_gen.py b/configs/datasets/CHARM/charm_reason_gen.py
new file mode 100644
index 00000000..78ba9697
--- /dev/null
+++ b/configs/datasets/CHARM/charm_reason_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+ from .charm_reason_gen_f8fca2 import charm_reason_datasets # noqa: F401, F403
diff --git a/configs/datasets/CHARM/charm_reason_gen_f8fca2.py b/configs/datasets/CHARM/charm_reason_gen_f8fca2.py
new file mode 100644
index 00000000..07374df9
--- /dev/null
+++ b/configs/datasets/CHARM/charm_reason_gen_f8fca2.py
@@ -0,0 +1,49 @@
+import os
+from mmengine.config import read_base
+
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import CharmDataset, charm_reason_postprocess, CharmReasonEvaluator
+
+with read_base():
+ from .charm_reason_settings import charm_tasks, settings
+
+
+charm_reason_datasets = []
+
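+# Build one dataset config per (prompt setting, task) pair; the few-shot hint
+# for each pair is read from the matching file in the few-shot example directory.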
+for _cot, _cot_prefix, dataset_path, fewshot_example_path, prompt_template in settings:
+ for _task in charm_tasks:
+ _fewshot_example_file = os.path.join(fewshot_example_path, f'{_task}_{_cot}.txt')
+ with open(_fewshot_example_file, 'r') as f:
+ _hint = f.read()
+
+ charm_reason_reader_cfg = dict(input_columns=['input'], output_column='target')
+
+ charm_reason_infer_cfg = dict(
+ prompt_template=dict(
+ type=PromptTemplate,
+ template=dict(round=[dict(role='HUMAN', prompt=prompt_template.format(_hint=_hint) + _cot_prefix)]),
+ ),
+ retriever=dict(type=ZeroRetriever),
+ inferencer=dict(type=GenInferencer, max_out_len=512),
+ )
+
+ charm_reason_eval_cfg = dict(
+ evaluator=dict(type=CharmReasonEvaluator),
+ pred_role='BOT',
+ pred_postprocessor=dict(type=charm_reason_postprocess),
+ dataset_postprocessor=dict(type=charm_reason_postprocess),
+ )
+
+ charm_reason_datasets.append(
+ dict(
+ type=CharmDataset,
+ path=dataset_path,
+ name=_task,
+ abbr='charm-reason-' + _task + '_' + _cot,
+ reader_cfg=charm_reason_reader_cfg,
+ infer_cfg=charm_reason_infer_cfg.copy(),
+ eval_cfg=charm_reason_eval_cfg.copy(),
+ )
+ )
diff --git a/configs/datasets/CHARM/charm_reason_settings.py b/configs/datasets/CHARM/charm_reason_settings.py
new file mode 100644
index 00000000..b5c4fbde
--- /dev/null
+++ b/configs/datasets/CHARM/charm_reason_settings.py
@@ -0,0 +1,36 @@
+import os
+
+charm_tasks = [
+ 'Chinese_Anachronisms_Judgment',
+ 'Chinese_Movie_and_Music_Recommendation',
+ 'Chinese_Natural_Language_Inference',
+ 'Chinese_Reading_Comprehension',
+ 'Chinese_Sequence_Understanding',
+ 'Chinese_Sport_Understanding',
+ 'Chinese_Time_Understanding',
+ 'Global_Anachronisms_Judgment',
+ 'Global_Movie_and_Music_Recommendation',
+ 'Global_Natural_Language_Inference',
+ 'Global_Reading_Comprehension',
+ 'Global_Sequence_Understanding',
+ 'Global_Sport_Understanding',
+ 'Global_Time_Understanding',
+]
+
+XLT_template = 'Follow the given examples and answer the question.\n{_hint}\n\n I want you to act as a commonsense reasoning expert for Chinese. \n Request: {{input}}\n'
+Translate_EN_template = 'Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: '
+Other_template = '请按照给定的例子回答问题。\n{_hint}\n\nQ:{{input}}\nA:'
+
+data_dir = 'data/CHARM'
+dataset_path_ZH = f'{data_dir}/reasoning'
+dataset_path_TransEn = f'{data_dir}/reasoning_Translate-EN'
+fewshot_example_path_ZH = os.path.join(os.path.dirname(__file__), 'few-shot-examples')
+fewshot_example_path_TransEn = os.path.join(os.path.dirname(__file__), 'few-shot-examples_Translate-EN')
+
+settings = [
+ ('Direct', '', dataset_path_ZH, fewshot_example_path_ZH, Other_template),
+ ('ZH-CoT', '让我们一步一步来思考。', dataset_path_ZH, fewshot_example_path_ZH, Other_template),
+ ('EN-CoT', "Let's think step by step.", dataset_path_ZH, fewshot_example_path_ZH, Other_template),
+ ('XLT', """You should retell the request in English.\nYou should do the answer step by step to choose the right answer.\nYou should step-by-step answer the request.\nYou should tell me the answer in this format 'So the answer is'.""", dataset_path_ZH, fewshot_example_path_ZH, XLT_template),
+ ('Translate-EN', "Let's think step by step.", dataset_path_TransEn, fewshot_example_path_TransEn, Translate_EN_template),
+]
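+
+# Illustration only (not executed at runtime): how one settings tuple becomes a
+# prompt in the dataset configs of this directory. For the ZH-CoT row and the
+# task 'Chinese_Sport_Understanding', they effectively do:
+#   hint = open(os.path.join(fewshot_example_path_ZH,
+#                            'Chinese_Sport_Understanding_ZH-CoT.txt')).read()
+#   prompt = Other_template.format(_hint=hint) + '让我们一步一步来思考。'
+# i.e. the few-shot examples are inlined into the template, followed by
+# 'Q:{input}\nA:' and then the CoT prefix.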
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_Direct.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_Direct.txt
new file mode 100644
index 00000000..c70e110c
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_Direct.txt
@@ -0,0 +1,22 @@
+识别给定陈述是否包含时代错误。
+
+Q:以下陈述是否包含时代错误,请选择正确选项。一个接受了义务教育、具备基本常识的人会如何选择?
+李白用钢笔写诗。
+选项:
+(A) 是
+(B) 否
+A:(A)
+
+Q:以下陈述是否包含时代错误,请选择正确选项。一个接受了义务教育、具备基本常识的人会如何选择?
+唐朝时期,人们可以在街头买到《红楼梦》。
+选项:
+(A) 是
+(B) 否
+A:(A)
+
+Q:以下陈述是否包含时代错误,请选择正确选项。一个接受了义务教育、具备基本常识的人会如何选择?
+1998年10月1日,人们手举五星红旗,在天安门广场上庆祝国庆
+选项:
+(A) 是
+(B) 否
+A:(B)
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_EN-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_EN-CoT.txt
new file mode 100644
index 00000000..4de255f6
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_EN-CoT.txt
@@ -0,0 +1,25 @@
+识别给定陈述是否包含时代错误。
+
+Q:以下陈述是否包含时代错误,请选择正确选项。一个接受了义务教育、具备基本常识的人会如何选择?
+李白用钢笔写诗。
+选项:
+(A) 是
+(B) 否
+A:Let's think step by step.
+This statement mentions "Li Bai", a poet from the Tang Dynasty in China. The "pen" mentioned in the statement is a modern device, so it is impossible for Li Bai to write poetry with a pen. This statement contains errors from the times. So the answer is (A).
+
+Q:以下陈述是否包含时代错误,请选择正确选项。一个接受了义务教育、具备基本常识的人会如何选择?
+唐朝时期,人们可以在街头买到《红楼梦》。
+选项:
+(A) 是
+(B) 否
+A:Let's think step by step.
+This statement mentions "Dream of the Red Chamber", which was written by Qing Dynasty writer Cao Xueqin. There was no "Dream of the Red Chamber" during the Tang Dynasty, so this statement contains historical errors. So the answer is (A).
+
+Q:以下陈述是否包含时代错误,请选择正确选项。一个接受了义务教育、具备基本常识的人会如何选择?
+1998年10月1日,人们手举五星红旗,在天安门广场上庆祝国庆
+选项:
+(A) 是
+(B) 否
+A:Let's think step by step.
+This statement mentions the year 1998. New China was founded in 1949, and the five-star red flag was designated as the national flag of China in 1949. Therefore, October 1, 1998 was National Day, and it is reasonable for people to celebrate it at Tiananmen Square, so the statement contains no anachronism. So the answer is (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_XLT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_XLT.txt
new file mode 100644
index 00000000..36948e20
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_XLT.txt
@@ -0,0 +1,63 @@
+识别给定陈述是否包含时代错误。
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:以下陈述是否包含时代错误,请选择正确选项。一个接受了义务教育、具备基本常识的人会如何选择?
+李白用钢笔写诗。
+选项:
+(A) 是
+(B) 否
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: How would a typical person answer whether the following statement contains an anachronism?
+Li Bai writes poetry with a pen.
+Option:
+(A) Yes
+(B) No
+Step-by-step answer:
+1.This statement mentions "Li Bai", a poet from the Tang Dynasty in China.
+2.The pen mentioned in the statement is a modern device.
+3.So it is impossible for Li Bai to write poetry with a pen; this statement contains an anachronism.
+So the answer is (A).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:以下陈述是否包含时代错误,请选择正确选项。一个接受了义务教育、具备基本常识的人会如何选择?
+唐朝时期,人们可以在街头买到《红楼梦》。
+选项:
+(A) 是
+(B) 否
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: How would a typical person answer whether the following statement contains an anachronism?
+During the Tang Dynasty, people could buy "Dream of the Red Chamber" on the streets.
+Option:
+(A) Yes
+(B) No
+Step-by-step answer:
+1.This statement mentions "Dream of the Red Chamber", which was written by Qing Dynasty writer Cao Xueqin
+2.During the Tang Dynasty, there was no "Dream of the Red Chamber", so this statement contains historical errors.
+So the answer is (A).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:以下陈述是否包含时代错误,请选择正确选项。一个接受了义务教育、具备基本常识的人会如何选择?
+1998年10月1日,人们手举五星红旗,在天安门广场上庆祝国庆
+选项:
+(A) 是
+(B) 否
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: How would a typical person answer whether the following statement contains an anachronism?
+On October 1, 1998, people held five-star red flags and celebrated National Day on Tiananmen Square
+Option:
+(A) Yes
+(B) No
+Step-by-step answer:
+1.This statement mentions the year 1998; New China was founded in 1949.
+2.The five-star red flag was designated as the national flag of China in 1949.
+3.So October 1, 1998 was National Day, and it is reasonable for people to celebrate National Day at Tiananmen Square; the statement does not contain an anachronism.
+So the answer is (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_ZH-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_ZH-CoT.txt
new file mode 100644
index 00000000..4b71c4b3
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_ZH-CoT.txt
@@ -0,0 +1,25 @@
+识别给定陈述是否包含时代错误。
+
+Q:以下陈述是否包含时代错误,请选择正确选项。一个接受了义务教育、具备基本常识的人会如何选择?
+李白用钢笔写诗。
+选项:
+(A) 是
+(B) 否
+A:让我们一步一步来思考。
+这个陈述提到了“李白”,他是中国唐朝时期的诗人。而陈述中提到的“钢笔”是现代设备,因此李白不可能使用钢笔写诗,该陈述包含时代错误。所以答案是(A)。
+
+Q:以下陈述是否包含时代错误,请选择正确选项。一个接受了义务教育、具备基本常识的人会如何选择?
+唐朝时期,人们可以在街头买到《红楼梦》。
+选项:
+(A) 是
+(B) 否
+A:让我们一步一步来思考。
+这个陈述提到了《红楼梦》,《红楼梦》是清代作家曹雪芹所写,唐朝时还没有《红楼梦》,因此该陈述包含时代错误。所以答案是(A)。
+
+Q:以下陈述是否包含时代错误,请选择正确选项。一个接受了义务教育、具备基本常识的人会如何选择?
+1998年10月1日,人们手举五星红旗,在天安门广场上庆祝国庆
+选项:
+(A) 是
+(B) 否
+A:让我们一步一步来思考。
+这个陈述提到了1998年,新中国是1949年成立的,五星红旗在1949年被确定为中国国旗,因此1998年10月1日是国庆节,人们在天安门庆祝国庆是合理的,因此陈述不包含时代错误。所以答案是(B)。
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_Direct.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_Direct.txt
new file mode 100644
index 00000000..4a391abe
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_Direct.txt
@@ -0,0 +1,25 @@
+根据给定艺术作品清单,找出最类似的。
+
+Q: 和这些电影《疯狂的外星人》、《斗牛》、《杀生》、《疯狂的石头》有共同点的电影是:
+选项:
+(A)《泰囧》
+(B)《少年派》
+(C)《江湖儿女》
+(D)《湄公河行动》
+A: (A)
+
+Q: 和这些电影《红高梁》、《活着》、《大红灯笼高高挂》、《英雄》有共同点的电影是:
+选项:
+(A)《一个都不能少》
+(B)《让子弹飞》
+(C)《阿飞正传》
+(D)《东邪西毒》
+A: (A)
+
+Q: 和这些歌曲《夜曲》、《本草纲目》、《听妈妈的话》、《七里香》有共同点的歌曲是:
+选项:
+(A)《双节棍》
+(B)《年少有为》
+(C)《浮夸》
+(D)《三人游》
+A: (A)
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_EN-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_EN-CoT.txt
new file mode 100644
index 00000000..272492bd
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_EN-CoT.txt
@@ -0,0 +1,40 @@
+根据给定艺术作品清单,找出最类似的。
+
+Q: 和这些电影《疯狂的外星人》、《斗牛》、《杀生》、《疯狂的石头》有共同点的电影是:
+选项:
+(A)《泰囧》
+(B)《少年派》
+(C)《江湖儿女》
+(D)《湄公河行动》
+A:Let's think step by step.
+"Crazy Alien" is a comedy science fiction film directed by Ning Hao, written by Liu Cixin and Sun Xiaohang, and starring Huang Bo, Shen Teng, and Xu Zheng. It was released in 2019.
+"Cow" is a dark comedy film directed by Guan Hu, starring Huang Bo and Yan Ni. It was released in 2009.
+"Design of Death" is an absurd and suspenseful comedy film directed by Guan Hu, featuring Huang Bo, Simon Yam, Su Youpeng, and Yu Nan. It was released in 2012.
+"Crazy Stone" is a dark comedy film directed by Ning Hao, featuring Guo Tao, Liu Hua, Lian Jin, Liu Gang, Xu Zheng, and Huang Bo. It was released in 2006.
+These are all famous classic Chinese comedy films featuring Huang Bo. The only film among the options that seems to have something in common with these films is "Lost in Thailand" (directed by Xu Zheng, starring Huang Bo, Xu Zheng, and Wang Baoqiang), a comedy film released in 2012. So the answer is (A).
+
+Q: 和这些电影《红高梁》、《活着》、《大红灯笼高高挂》、《英雄》有共同点的电影是:
+选项:
+(A)《一个都不能少》
+(B)《让子弹飞》
+(C)《阿飞正传》
+(D)《东邪西毒》
+A:Let's think step by step.
+"Red Sorghum," directed by Zhang Yimou and starring Jiang Wen, Gong Li, and Teng Rujun, is a war drama film that was released in China in 1987.
+"To Live," directed by Zhang Yimou and starring Ge You and Gong Li, is a drama film that was released in China in 1994.
+"Raise the Red Lantern," directed by Zhang Yimou and starring Gong Li, He Saifei, Ma Jingwu, Cao Cuifen, Kong Lin, and Jin Shuyuan, is a drama film that was released in China in 1991.
+"Hero," directed by Zhang Yimou and starring Jet Li, Tony Leung, Maggie Cheung, Chen Daoming, Zhang Ziyi, and Donnie Yen, is a wuxia film that was released in China in 2002.
+These are all famous classic Chinese films directed by Zhang Yimou. The only film among the options that seems to have something in common with these films is "Not One Less" (directed by Zhang Yimou, starring Wei Minzhi and Zhang Huike), a drama film released in 1999. So the answer is (A).
+
+Q: 和这些歌曲《夜曲》、《本草纲目》、《听妈妈的话》、《七里香》有共同点的歌曲是:
+选项:
+(A)《双节棍》
+(B)《年少有为》
+(C)《浮夸》
+(D)《三人游》
+A:Let's think step by step.
+"Nocturne" is a song performed by Jay Chou, with lyrics by Vincent Fang, music by Jay Chou, and arrangement by Michael Lin. It is included in Jay Chou's 2005 album "November's Chopin."
+"Herbalist's Manual" is a song performed by Jay Chou, with lyrics by Vincent Fang, music by Jay Chou, and arrangement by Michael Lin. It is included in Jay Chou's 2006 album "Still Fantasy."
+"Listen to Your Mother" is a song performed by Jay Chou, with lyrics and music by Jay Chou, arrangement by Michael Lin and Hong Jingyao. It is included in Jay Chou's 2006 album "Still Fantasy."
+"Common Jasmine Orange" is a song performed by Jay Chou, with lyrics by Vincent Fang, music by Jay Chou, and arrangement by Chung Hsin-min. It is included in Jay Chou's self-titled album "Common Jasmine Orange" released in 2004.
+These are all famous pop songs performed by Jay Chou. The only song among the options that seems to have something in common with these songs is "Nunchucks" (performed by Jay Chou, composed by Jay Chou, lyrics by Vincent Fang, arrangement by Chung Hsin-min, included in Jay Chou's 2001 album "Fantasy"). So the answer is (A).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_XLT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_XLT.txt
new file mode 100644
index 00000000..0e69a8d0
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_XLT.txt
@@ -0,0 +1,76 @@
+根据给定艺术作品清单,找出最类似的。
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:和这些电影《疯狂的外星人》、《斗牛》、《杀生》、《疯狂的石头》有共同点的电影是:
+选项:
+(A)《泰囧》
+(B)《少年派》
+(C)《江湖儿女》
+(D)《湄公河行动》
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: The movie that has something in common with these movies "Crazy Alien", "Cow", "Design of Death" and "Crazy Stone" is:
+Options:
+(A) "Lost in Thailand"
+(B) "The Young and the Restless"
+(C) "The Children of the River and the Lake"
+(D) "Operation Mekong"
+Step-by-step answer:
+1."Crazy Alien" is a comedy science fiction film directed by Ning Hao, written by Liu Cixin and Sun Xiaohang, and starring Huang Bo, Shen Teng, and Xu Zheng. It was released in 2019.
+2."Cow" is a dark comedy film directed by Guan Hu, starring Huang Bo and Yan Ni. It was released in 2009.
+3."Design of Death" is an absurd and suspenseful comedy film directed by Guan Hu, featuring Huang Bo, Simon Yam, Su Youpeng, and Yu Nan. It was released in 2012.
+4."Crazy Stone" is a dark comedy film directed by Ning Hao, featuring Guo Tao, Liu Hua, Lian Jin, Liu Gang, Xu Zheng, and Huang Bo. It was released in 2006.
+5.These are all famous classic Chinese comedy films featuring Huang Bo. The only film among the options that seems to have something in common with these films is "Lost in Thailand" (directed by Xu Zheng, starring Huang Bo, Xu Zheng, and Wang Baoqiang), a comedy film released in 2012.
+So the answer is (A).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:和这些电影《红高梁》、《活着》、《大红灯笼高高挂》、《英雄》有共同点的电影是:
+选项:
+(A)《一个都不能少》
+(B)《让子弹飞》
+(C)《阿飞正传》
+(D)《东邪西毒》
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: The movie that has something in common with these movies "Red Sorghum", "To Live", "Raise the Red Lantern" and "Hero" is:
+Options:
+(A) "Not One Less"
+(B) "Let the Bullets Fly"
+(C) "Days of Being Wild"
+(D) "Ashes of Time"
+Step-by-step answer:
+1."Red Sorghum," directed by Zhang Yimou and starring Jiang Wen, Gong Li, and Teng Rujun, is a war drama film that was released in China in 1987.
+2."To Live," directed by Zhang Yimou and starring Ge You and Gong Li, is a drama film that was released in China in 1994.
+3."Raise the Red Lantern," directed by Zhang Yimou and starring Gong Li, He Saifei, Ma Jingwu, Cao Cuifen, Kong Lin, and Jin Shuyuan, is a drama film that was released in China in 1991.
+4."Hero," directed by Zhang Yimou and starring Jet Li, Tony Leung, Maggie Cheung, Chen Daoming, Zhang Ziyi, and Donnie Yen, is a wuxia film that was released in China in 2002.
+5.These are all famous classic Chinese films directed by Zhang Yimou. The only film among the options that seems to have something in common with these films is "Not One Less" (directed by Zhang Yimou, starring Wei Minzhi and Zhang Huike), a drama film released in 1999.
+So the answer is (A).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:和这些歌曲《夜曲》、《本草纲目》、《听妈妈的话》、《七里香》有共同点的歌曲是:
+选项:
+(A)《双节棍》
+(B)《年少有为》
+(C)《浮夸》
+(D)《三人游》
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: The song that has something in common with these songs "Nocturne", "Herbalist's Manual", "Listen to Your Mother" and "Common Jasmine Orange" is:
+Options:
+(A) "Nunchucks"
+(B) "The Young and the Restless"
+(C) "Pomp and Circumstance"
+(D) "Three's a Crowd"
+Step-by-step answer:
+1."Nocturne" is a song performed by Jay Chou, with lyrics by Vincent Fang, music by Jay Chou, and arrangement by Michael Lin. It is included in Jay Chou's 2005 album "November's Chopin."
+2."Herbalist's Manual" is a song performed by Jay Chou, with lyrics by Vincent Fang, music by Jay Chou, and arrangement by Michael Lin. It is included in Jay Chou's 2006 album "Still Fantasy."
+3."Listen to Your Mother" is a song performed by Jay Chou, with lyrics and music by Jay Chou, arrangement by Michael Lin and Hong Jingyao. It is included in Jay Chou's 2006 album "Still Fantasy."
+4."Common Jasmine Orange" is a song performed by Jay Chou, with lyrics by Vincent Fang, music by Jay Chou, and arrangement by Chung Hsin-min. It is included in Jay Chou's self-titled album "Common Jasmine Orange" released in 2004.
+5.These are all famous pop songs performed by Jay Chou. The only song among the options that seems to have something in common with these songs is "Nunchucks" (performed by Jay Chou, composed by Jay Chou, lyrics by Vincent Fang, arrangement by Chung Hsin-min, included in Jay Chou's 2001 album "Fantasy").
+So the answer is (A).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_ZH-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_ZH-CoT.txt
new file mode 100644
index 00000000..f11dd14a
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_ZH-CoT.txt
@@ -0,0 +1,40 @@
+根据给定艺术作品清单,找出最类似的。
+
+Q: 和这些电影《疯狂的外星人》、《斗牛》、《杀生》、《疯狂的石头》有共同点的电影是:
+选项:
+(A)《泰囧》
+(B)《少年派》
+(C)《江湖儿女》
+(D)《湄公河行动》
+A: 让我们一步一步来思考。
+《疯狂的外星人》是由宁浩执导,刘慈欣、孙小杭编剧,黄渤、沈腾、徐峥等主演的喜剧科幻片,2019年上映。
+《斗牛》是由管虎执导,黄渤、闫妮等主演的黑色喜剧电影,2009年上映。
+《杀生》是由管虎执导,黄渤、任达华、苏有朋、余男等联袂主演的荒诞悬疑喜剧片,2012年上映。
+《疯狂的石头》是宁浩执导,郭涛、刘桦、连晋、刘刚、徐峥、黄渤等出演的黑色喜剧片,2006年上映。
+这些都是有黄渤出演的著名经典中国喜剧电影,在所有选项中,唯一与这些电影有相同点的电影似乎是《泰囧》(徐峥执导,黄渤、徐峥、王宝强主演的喜剧片,2012年上映)。所以答案是(A)。
+
+Q: 和这些电影《红高梁》、《活着》、《大红灯笼高高挂》、《英雄》有共同点的电影是:
+选项:
+(A)《一个都不能少》
+(B)《让子弹飞》
+(C)《阿飞正传》
+(D)《东邪西毒》
+A: 让我们一步一步来思考。
+《红高粱》由张艺谋执导,姜文、巩俐、滕汝骏等主演的战争文艺片,1987年在中国上映。
+《活着》是由张艺谋执导,葛优、巩俐等主演的剧情片,1994年在中国上映。
+《大红灯笼高高挂》是由张艺谋执导,巩俐、何赛飞、马精武、曹翠芬、孔琳、金淑媛等主演的剧情片,1991年在中国上映。
+《英雄》是张艺谋执导,由李连杰、梁朝伟、张曼玉、陈道明、章子怡及甄子丹主演的的武侠电影,2002年在中国上映。
+这些都是由张艺谋执导的著名经典中国电影,在所有选项中,唯一与这些电影有相同点的电影似乎是《一个都不能少》(张艺谋执导,魏敏芝、张慧科主演的剧情电影,1999年上映)。所以答案是(A)。
+
+Q: 和这些歌曲《夜曲》、《本草纲目》、《听妈妈的话》、《七里香》有共同点的歌曲是:
+选项:
+(A)《双节棍》
+(B)《年少有为》
+(C)《浮夸》
+(D)《三人游》
+A: 让我们一步一步来思考。
+《夜曲》是周杰伦演唱的一首歌曲,由方文山作词,周杰伦作曲,林迈可编曲,收录在周杰伦2005年发行的专辑《11月的萧邦》中
+《本草纲目》是周杰伦演唱的一首歌曲,由方文山作词,周杰伦作曲,林迈可编曲,收录在周杰伦2006年发行的专辑《依然范特西》中。
+《听妈妈的话》是周杰伦演唱的一首歌曲,由周杰伦作词、作曲,林迈可、洪敬尧编曲,收录在周杰伦2006年发行的专辑《依然范特西》中。
+《七里香》是周杰伦演唱的一首歌曲,由方文山作词,周杰伦谱曲,钟兴民编曲,收录在周杰伦2004年发行的同名专辑《七里香》中。
+这些都是由周杰伦演唱的中国著名流行音乐歌曲,在所有选项中,唯一与这些歌曲有相同点的歌曲似乎是《双节棍》(由周杰伦演唱,由周杰伦作曲,方文山作词,钟兴民编曲,收录于周杰伦2001年发行的专辑《范特西》中)。所以答案是(A)。
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_Direct.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_Direct.txt
new file mode 100644
index 00000000..83aa4179
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_Direct.txt
@@ -0,0 +1,25 @@
+请根据题目中两句话的关系选择正确答案。
+
+Q:语句一:小明刚刚去什刹海滑冰
+语句二:北京现在是冬季
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+A:(A)
+
+Q:语句一:下周,一股强降水将影响整个河北省
+语句二:下周,上海天气很好
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+A:(C)
+
+Q:语句一:昨天,小明在山上看落日,感叹道:"夕阳无限好,只是近黄昏"
+语句二:昨天下雨,小明没有出门
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+A:(B)
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_EN-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_EN-CoT.txt
new file mode 100644
index 00000000..3ce816cc
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_EN-CoT.txt
@@ -0,0 +1,28 @@
+请根据题目中两句话的关系选择正确答案。
+
+Q:语句一:小明刚刚去什刹海滑冰
+语句二:北京现在是冬季
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+A: Let's think step by step.
+The first sentence mentions that Xiaoming went to Shichahai for ice skating, which usually takes place in winter. Moreover, Shichahai is located in Beijing, so the first sentence entails the information in the second sentence, namely that it is currently winter. So the answer is (A).
+
+Q:语句一:下周,一股强降水将影响整个河北省
+语句二:下周,上海天气很好
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+A: Let's think step by step.
+These two sentences describe the weather conditions in two geographical locations, one in Hebei Province and the other in Shanghai. Hebei Province and Shanghai are geographically far apart, so the weather conditions in these two places may not necessarily be directly related. So, the relationship between these two sentences is irrelevant. So the answer is (C).
+
+Q:语句一:昨天,小明在山上看落日,感叹道:"夕阳无限好,只是近黄昏"
+语句二:昨天下雨,小明没有出门
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+A: Let's think step by step.
+The first sentence states that Xiaoming saw the sunset on the mountain yesterday, while the second sentence states that it rained yesterday and Xiaoming did not go out. There is a contradiction between these two sentences, because if Xiaoming had not gone out, he could not have seen the sunset on the mountain. So, the relationship between these two sentences is contradictory. So the answer is (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_XLT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_XLT.txt
new file mode 100644
index 00000000..ccf8c4de
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_XLT.txt
@@ -0,0 +1,67 @@
+请根据题目中两句话的关系选择正确答案。
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:语句一:小明刚刚去什刹海滑冰
+语句二:北京现在是冬季
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:Sentence 1: Xiaoming has just gone ice-skating in Shichahai
+Sentence 2: It's winter in Beijing
+What is the relationship between these two statements?
+(A) Entailment
+(B) Contradiction
+(C) Irrelevant
+Step-by-step answer:
+1.The first sentence mentions that Xiaoming goes to Shichahai for ice skating, which usually takes place in winter.
+2.Moreover, Shichahai is located in Beijing, so the first sentence entails the information in the second sentence, namely that it is currently winter.
+So the answer is (A).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:语句一:下周,一股强降水将影响整个河北省
+语句二:下周,上海天气很好
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:Sentence 1:Next week, a heavy rainfall will affect the whole Hebei province
+Sentence 2: Next week, the weather in Shanghai will be fine.
+What is the relationship between these two statements?
+(A) Entailment
+(B) Contradiction
+(C) Irrelevant
+Step-by-step answer:
+1.These two sentences describe the weather conditions in two geographical locations, one in Hebei Province and the other in Shanghai.
+2.Hebei Province and Shanghai are geographically far apart, so the weather conditions in these two places may not necessarily be directly related. So, the relationship between these two sentences is irrelevant.
+So the answer is (C).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:语句一:昨天,小明在山上看落日,感叹道:"夕阳无限好,只是近黄昏"
+语句二:昨天下雨,小明没有出门
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:Sentence 1: Yesterday, Xiao Ming watched the sunset on a hill and exclaimed, "The sunset is infinite, but it's just near dusk"
+Sentence 2: Yesterday it rained and Xiaoming didn't go out
+What is the relationship between these two statements?
+(A) Entailment
+(B) Contradiction
+(C) Irrelevant
+Step-by-step answer:
+1.The first sentence states that Xiaoming saw the sunset on the mountain yesterday, while the second sentence states that it rained yesterday and Xiaoming did not go out.
+2.There is a contradiction between these two sentences, because if Xiaoming had not gone out, he could not have seen the sunset on the mountain. So, the relationship between these two sentences is contradictory.
+So the answer is (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_ZH-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_ZH-CoT.txt
new file mode 100644
index 00000000..de824780
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_ZH-CoT.txt
@@ -0,0 +1,28 @@
+请根据题目中两句话的关系选择正确答案。
+
+Q:语句一:小明刚刚去什刹海滑冰
+语句二:北京现在是冬季
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+A:让我们一步一步来思考。
+第一句话提到小明去什刹海滑冰,而滑冰通常在冬季进行,而且什刹海位于北京,这蕴含了第二句话的信息,即当前是冬季。所以答案是(A)。
+
+Q:语句一:下周,一股强降水将影响整个河北省
+语句二:下周,上海天气很好
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+A:让我们一步一步来思考。
+这两句话描述的是两个地理位置的天气情况,一个是河北省,一个是上海。河北省和上海在地理位置上相距较远,因此,这两个地方的天气情况并不一定有直接关联。所以,这两句话之间的关系是无关的。所以答案是(C)。
+
+Q:语句一:昨天,小明在山上看落日,感叹道:"夕阳无限好,只是近黄昏"
+语句二:昨天下雨,小明没有出门
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+A:让我们一步一步来思考。
+第一句话说小明昨天在山上看到了落日,而第二句话说昨天下雨,小明没有出门。这两句话之间存在矛盾,因为如果小明没有出门,那么他就不可能在山上看到落日。所以,这两句话之间的关系是矛盾的。所以答案是(B)。
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_Direct.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_Direct.txt
new file mode 100644
index 00000000..5f8d5381
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_Direct.txt
@@ -0,0 +1,23 @@
+请理解题目含义并选择正确答案。
+
+Q:有些广东人不爱吃辣椒.因此,有些南方人不爱吃辣椒. 以下哪项能保证上述论证的成立?
+(A) 有些广东人爱吃辣椒
+(B) 爱吃辣椒的有些是南方人
+(C) 所有的广东人都是南方人
+(D) 有些广东人不爱吃辣椒也不爱吃甜食
+A:(C)
+
+
+Q:唐卡是极富藏族文化特色的一种绘画形式,自吐蕃王朝兴起至今已有1300多年的历史,是雪域高原的文化瑰宝.它的题材除宗教外,还有历史和民俗内容,故又被称为了解西藏的“百科全书”.所以,想要了解西藏的历史,除了正襟危坐地阅读严谨但略显呆板的史书外,你还可以选择一种惬意和愉悦的方式--欣赏唐卡,与众多的古人对话,想象曾经的历史事件,体味藏族人丰富的精神世界,了解独特的藏族民俗,这是一个让历史变得立体可感的过程. 这段文字意在说明:
+(A) 唐卡可以给大家提供一种惬意轻松的了解西藏的方式
+(B) 唐卡中记录了独特的藏族民俗和曾经的历史事件
+(C) 唐卡是了解西藏文化和历史的“百科全书”式的绘画形式
+(D) 唐卡是极富藏族文化特色且历史悠久的一种绘画形式
+A:(A)
+
+Q:“知人论世”作为一种文学批评的原则和方法,最早由战国时期的思想家孟子提出.孟子认为,后人要交结古人,只是读其诗书是不行的,还必须了解他们的为人行事以及他们的生活的时代,这样,才能读懂古人的诗书,才能和古人心契神交,成为知音. 对这段话的理解,不正确的是?
+(A) 人的心灵是可以互通和共鸣的
+(B) “知人论世”作为一种文学评论发沿用至今并显现了强大的生命力
+(C) “知人论世”可以帮助后人交结古人和古人成为知音
+(D) 了解古人和他所处的时代,有助于理解他的作品
+A: (B)
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_EN-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_EN-CoT.txt
new file mode 100644
index 00000000..b4a2e9ad
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_EN-CoT.txt
@@ -0,0 +1,25 @@
+请理解题目含义并选择正确答案。
+
+Q:有些广东人不爱吃辣椒.因此,有些南方人不爱吃辣椒. 以下哪项能保证上述论证的成立?
+(A) 有些广东人爱吃辣椒
+(B) 爱吃辣椒的有些是南方人
+(C) 所有的广东人都是南方人
+(D) 有些广东人不爱吃辣椒也不爱吃甜食
+A: Let's think step by step.
+In this argument, we infer from "some Cantonese people do not like to eat chili peppers" that "some southerners do not like to eat chili peppers". The establishment of this reasoning depends on the relationship between Cantonese and Southerners. In order for this reasoning to be valid, we need to ensure that at least a portion of Cantonese people are from the south. Therefore, option (C) "All Cantonese are southerners" can ensure the validity of this argument. So the answer is (C).
+
+Q:唐卡是极富藏族文化特色的一种绘画形式,自吐蕃王朝兴起至今已有1300多年的历史,是雪域高原的文化瑰宝.它的题材除宗教外,还有历史和民俗内容,故又被称为了解西藏的“百科全书”.所以,想要了解西藏的历史,除了正襟危坐地阅读严谨但略显呆板的史书外,你还可以选择一种惬意和愉悦的方式--欣赏唐卡,与众多的古人对话,想象曾经的历史事件,体味藏族人丰富的精神世界,了解独特的藏族民俗,这是一个让历史变得立体可感的过程. 这段文字意在说明:
+(A) 唐卡可以给大家提供一种惬意轻松的了解西藏的方式
+(B) 唐卡中记录了独特的藏族民俗和曾经的历史事件
+(C) 唐卡是了解西藏文化和历史的“百科全书”式的绘画形式
+(D) 唐卡是极富藏族文化特色且历史悠久的一种绘画形式
+A: Let's think step by step.
+It is explicitly mentioned in the article that besides reading rigorous but somewhat rigid historical books, appreciating thangkas is a comfortable and enjoyable way for people to converse with numerous ancient people, imagine past historical events, appreciate the rich spiritual world of Tibetans, and understand unique Tibetan customs. So the main purpose of this passage is (A) "Thangka can provide a comfortable and easy way for everyone to understand Tibet". So the answer is (A).
+
+Q:“知人论世”作为一种文学批评的原则和方法,最早由战国时期的思想家孟子提出.孟子认为,后人要交结古人,只是读其诗书是不行的,还必须了解他们的为人行事以及他们的生活的时代,这样,才能读懂古人的诗书,才能和古人心契神交,成为知音. 对这段话的理解,不正确的是?
+(A) 人的心灵是可以互通和共鸣的
+(B) “知人论世”作为一种文学评论发沿用至今并显现了强大的生命力
+(C) “知人论世”可以帮助后人交结古人和古人成为知音
+(D) 了解古人和他所处的时代,有助于理解他的作品
+A: Let's think step by step.
+From this passage, we cannot conclude (B), that "knowing people and discussing the world" as a method of literary criticism has been used to this day and has shown strong vitality. Although "knowing people and discussing the world" was indeed proposed by the thinker Mencius during the Warring States period as a principle and method of literary criticism, this passage does not mention that it is still in use today, nor that it has shown strong vitality. Therefore, option (B) is an incorrect understanding. So the answer is (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_XLT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_XLT.txt
new file mode 100644
index 00000000..34351fcc
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_XLT.txt
@@ -0,0 +1,62 @@
+请理解题目含义并选择正确答案。
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:有些广东人不爱吃辣椒.因此,有些南方人不爱吃辣椒. 以下哪项能保证上述论证的成立?
+(A) 有些广东人爱吃辣椒
+(B) 爱吃辣椒的有些是南方人
+(C) 所有的广东人都是南方人
+(D) 有些广东人不爱吃辣椒也不爱吃甜食
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: Some Cantonese people don't like chili peppers. Therefore, some southerners don't like chili peppers. Which of the following ensures the validity of the above argument?
+(A) Some Cantonese people love chili peppers
+(B) Some of those who love chili peppers are southerners.
+(C) All Cantonese are Southerners.
+(D) Some Cantonese people do not love chili or sweets.
+Step-by-step answer:
+1.In this argument, we infer from "some Cantonese people do not like to eat chili peppers" that "some southerners do not like to eat chili peppers".
+2.The establishment of this reasoning depends on the relationship between Cantonese and Southerners. In order for this reasoning to be valid, we need to ensure that at least a portion of Cantonese people are from the south. Therefore, option (C) "All Cantonese are southerners" can ensure the validity of this argument.
+So the answer is (C).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:唐卡是极富藏族文化特色的一种绘画形式,自吐蕃王朝兴起至今已有1300多年的历史,是雪域高原的文化瑰宝.它的题材除宗教外,还有历史和民俗内容,故又被称为了解西藏的“百科全书”.所以,想要了解西藏的历史,除了正襟危坐地阅读严谨但略显呆板的史书外,你还可以选择一种惬意和愉悦的方式--欣赏唐卡,与众多的古人对话,想象曾经的历史事件,体味藏族人丰富的精神世界,了解独特的藏族民俗,这是一个让历史变得立体可感的过程. 这段文字意在说明:
+(A) 唐卡可以给大家提供一种惬意轻松的了解西藏的方式
+(B) 唐卡中记录了独特的藏族民俗和曾经的历史事件
+(C) 唐卡是了解西藏文化和历史的“百科全书”式的绘画形式
+(D) 唐卡是极富藏族文化特色且历史悠久的一种绘画形式
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: Thangka is a form of painting rich in Tibetan cultural characteristics; it has a history of more than 1,300 years since the rise of the Tubo Dynasty and is a cultural treasure of the Snowy Plateau. Its subject matter is not only religious but also historical and folkloric, so it is also known as an "encyclopedia" for understanding Tibet. Therefore, if you want to understand the history of Tibet, in addition to sitting down and reading rigorous but slightly dull history books, you can also choose a pleasant and enjoyable way: enjoying thangkas, conversing with many ancient people, imagining historical events, savoring the rich spiritual world of Tibetans, and understanding unique Tibetan folklore. This is a process that makes history three-dimensional and palpable. This passage is intended to illustrate:
+(A) Thangkas can provide a cozy and relaxing way to learn about Tibet.
+(B) The thangkas are a unique record of Tibetan folklore and historical events.
+(C) The thangka is an "encyclopedic" form of painting for understanding Tibetan culture and history.
+(D) The thangka is a form of painting that is rich in Tibetan cultural characteristics and has a long history.
+Step-by-step answer:
+1.It is explicitly mentioned in the article that besides reading rigorous but somewhat rigid historical books, appreciating thangkas is a comfortable and enjoyable way for people to converse with numerous ancient people, imagine past historical events, appreciate the rich spiritual world of Tibetans, and understand unique Tibetan customs.
+2.So the main purpose of this passage is (A) "Thangka can provide a comfortable and easy way for everyone to understand Tibet".
+So the answer is (A).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:“知人论世”作为一种文学批评的原则和方法,最早由战国时期的思想家孟子提出.孟子认为,后人要交结古人,只是读其诗书是不行的,还必须了解他们的为人行事以及他们的生活的时代,这样,才能读懂古人的诗书,才能和古人心契神交,成为知音. 对这段话的理解,不正确的是?
+(A) 人的心灵是可以互通和共鸣的
+(B) “知人论世”作为一种文学评论发沿用至今并显现了强大的生命力
+(C) “知人论世”可以帮助后人交结古人和古人成为知音
+(D) 了解古人和他所处的时代,有助于理解他的作品
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: As a principle and method of literary criticism, the concept of "knowing people and discussing the world" was first proposed by Mencius, a thinker of the Warring States period. According to Mencius, in order to make friends with the ancients, it is not enough just to read their poems and books, but also to understand their behavior and the times in which they lived, so as to read and understand their poems and books, and to make friends with them and become their soulmates. What is the incorrect understanding of this passage?
+(A) People's hearts can communicate and resonate with each other.
+(B) "Knowing people and discussing the world" has been used as a kind of literary criticism and has shown a strong vitality.
+(C) "Knowing people and discussing the world" can help the descendants to make friends with the ancients and the ancients to become their soulmates.
+(D) Knowing about the ancients and their times helps to understand their works.
+Step-by-step answer:
+1.From this passage, we cannot conclude (B), that "knowing people and discussing the world" as a method of literary criticism has been used to this day and has shown strong vitality.
+2.Although "knowing people and discussing the world" was indeed proposed by the thinker Mencius during the Warring States period as a principle and method of literary criticism, this passage does not mention that it is still in use today, nor that it has shown strong vitality.
+3.Therefore, option (B) is an incorrect understanding.
+So the answer is (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_ZH-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_ZH-CoT.txt
new file mode 100644
index 00000000..0bc47b68
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_ZH-CoT.txt
@@ -0,0 +1,26 @@
+请理解题目含义并选择正确答案。
+
+Q:有些广东人不爱吃辣椒.因此,有些南方人不爱吃辣椒. 以下哪项能保证上述论证的成立?
+(A) 有些广东人爱吃辣椒
+(B) 爱吃辣椒的有些是南方人
+(C) 所有的广东人都是南方人
+(D) 有些广东人不爱吃辣椒也不爱吃甜食
+A:让我们一步一步来思考。
+在这个论证中,我们从"有些广东人不爱吃辣椒"推断出"有些南方人不爱吃辣椒"。这个推理的成立依赖于广东人和南方人的关系。为了使这个推理有效,我们需要保证至少一部分广东人是南方人。因此,选项 (C) "所有的广东人都是南方人" 是能保证这个论证成立的。所以答案是(C)。
+
+
+Q:唐卡是极富藏族文化特色的一种绘画形式,自吐蕃王朝兴起至今已有1300多年的历史,是雪域高原的文化瑰宝.它的题材除宗教外,还有历史和民俗内容,故又被称为了解西藏的“百科全书”.所以,想要了解西藏的历史,除了正襟危坐地阅读严谨但略显呆板的史书外,你还可以选择一种惬意和愉悦的方式--欣赏唐卡,与众多的古人对话,想象曾经的历史事件,体味藏族人丰富的精神世界,了解独特的藏族民俗,这是一个让历史变得立体可感的过程. 这段文字意在说明:
+(A) 唐卡可以给大家提供一种惬意轻松的了解西藏的方式
+(B) 唐卡中记录了独特的藏族民俗和曾经的历史事件
+(C) 唐卡是了解西藏文化和历史的“百科全书”式的绘画形式
+(D) 唐卡是极富藏族文化特色且历史悠久的一种绘画形式
+A:让我们一步一步来思考。
+文中明确提到了,除了阅读严谨但略显呆板的史书外,欣赏唐卡是一种惬意和愉悦的方式,可以让人与众多的古人对话,想象曾经的历史事件,体味藏族人丰富的精神世界,了解独特的藏族民俗。所以这段文字的主要意图是 (A) "唐卡可以给大家提供一种惬意轻松的了解西藏的方式"。所以答案是(A)。
+
+Q:“知人论世”作为一种文学批评的原则和方法,最早由战国时期的思想家孟子提出.孟子认为,后人要交结古人,只是读其诗书是不行的,还必须了解他们的为人行事以及他们的生活的时代,这样,才能读懂古人的诗书,才能和古人心契神交,成为知音. 对这段话的理解,不正确的是?
+(A) 人的心灵是可以互通和共鸣的
+(B) “知人论世”作为一种文学评论发沿用至今并显现了强大的生命力
+(C) “知人论世”可以帮助后人交结古人和古人成为知音
+(D) 了解古人和他所处的时代,有助于理解他的作品
+A:让我们一步一步来思考。
+从这段话中,我们看不到(B)“知人论世”作为一种文学批评已经沿用至今,并显示出强大的生命力。虽然“知人论世”确实是战国时期哲学家孟子提出的一种文学批评的原则和方法,但这段话并没有提到“知人论世”在今天仍在使用,也没有提到它已经显示出强大的生命力。因此,选项(B)是一种错误的理解。所以答案是(B)。
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_Direct.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_Direct.txt
new file mode 100644
index 00000000..df407ea6
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_Direct.txt
@@ -0,0 +1,22 @@
+根据上下文选择正确答案
+
+Q: 下列人物按时间先后顺序排序正确的是?选项:
+(A) 秦始皇、诸葛亮、刘邦、白居易
+(B) 诸葛亮、秦始皇、刘邦、白居易
+(C) 秦始皇、刘邦、诸葛亮、白居易
+(D) 白居易、诸葛亮、刘邦、秦始皇
+A:(C)
+
+Q:下列描述年龄的词语按照年龄从小到大的顺序排序正确的是?选项:
+(A) 不惑、而立、知天命、花甲
+(B) 而立、不惑、知天命、花甲
+(C) 花甲、知天命、而立、不惑
+(D) 而立、花甲、不惑、知天命
+A:(B)
+
+Q:下列制作老式棒棒糖的步骤正确的是?选项:
+(A) 准备材料、将糖浆倒入模具、制作糖浆、冷却定型
+(B) 准备材料、制作糖浆、将糖浆倒入模具、冷却定型
+(C) 准备材料、将糖浆倒入模具、冷却定型、制作糖浆
+(D) 准备材料、冷却定型、制作糖浆、将糖浆倒入模具
+A:(B)
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_EN-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_EN-CoT.txt
new file mode 100644
index 00000000..6494630a
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_EN-CoT.txt
@@ -0,0 +1,25 @@
+根据上下文选择正确答案
+
+Q: 下列人物按时间先后顺序排序正确的是?选项:
+(A) 秦始皇、诸葛亮、刘邦、白居易
+(B) 诸葛亮、秦始皇、刘邦、白居易
+(C) 秦始皇、刘邦、诸葛亮、白居易
+(D) 白居易、诸葛亮、刘邦、秦始皇
+A: Let's think step by step.
+There are four characters mentioned in the options, among which Qin Shi Huang is from the Qin Dynasty, Zhuge Liang is from the Three Kingdoms period, Liu Bang is from the Han Dynasty period, and Bai Juyi is from the Tang Dynasty period. They are sorted in chronological order as Qin Dynasty, Han Dynasty, Three Kingdoms period, and Tang Dynasty. Therefore, the characters are sorted in chronological order as Qin Shi Huang, Liu Bang, Zhuge Liang, and Bai Juyi. So the answer is (C).
+
+Q:下列描述年龄的词语按照年龄从小到大的顺序排序正确的是?选项:
+(A) 不惑、而立、知天命、花甲
+(B) 而立、不惑、知天命、花甲
+(C) 花甲、知天命、而立、不惑
+(D) 而立、花甲、不惑、知天命
+A: Let's think step by step.
+The options mention four words that describe age: "Erli" refers to 30 years old, "Buhuo" to 40, "Zhitianming" to 50, and "Huajia" to 60. Therefore, in order from youngest to oldest, they are Erli, Buhuo, Zhitianming, Huajia. So the answer is (B).
+
+Q:下列制作老式棒棒糖的步骤正确的是?选项:
+(A) 准备材料、将糖浆倒入模具、制作糖浆、冷却定型
+(B) 准备材料、制作糖浆、将糖浆倒入模具、冷却定型
+(C) 准备材料、将糖浆倒入模具、冷却定型、制作糖浆
+(D) 准备材料、冷却定型、制作糖浆、将糖浆倒入模具
+A: Let's think step by step.
+The title mentions the steps to make old-fashioned lollipops, and the options include "preparing materials", "pouring syrup into the mold", "making syrup", and "cooling and shaping". According to the steps to make old-fashioned lollipops, the first step should be to prepare the materials, then make syrup, pour syrup into the mold, and finally cool and shape. So the answer is (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_XLT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_XLT.txt
new file mode 100644
index 00000000..11ed9c31
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_XLT.txt
@@ -0,0 +1,62 @@
+根据上下文选择正确答案
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request: 下列人物按时间先后顺序排序正确的是?选项:
+(A) 秦始皇、诸葛亮、刘邦、白居易
+(B) 诸葛亮、秦始皇、刘邦、白居易
+(C) 秦始皇、刘邦、诸葛亮、白居易
+(D) 白居易、诸葛亮、刘邦、秦始皇
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: Which of the following lists of historical figures is correctly ordered in chronological order? Options:
+(A) Qin Shi Huang, Zhuge Liang, Liu Bang, Bai Juyi
+(B) Zhuge Liang, Qin Shi Huang, Liu Bang, Bai Juyi
+(C) Qin Shi Huang, Liu Bang, Zhuge Liang, Bai Juyi
+(D) Bai Juyi, Zhuge Liang, Liu Bang, Qin Shi Huang
+Step-by-step answer:
+1.There are four characters mentioned in the options, among which Qin Shi Huang is from the Qin Dynasty, Zhuge Liang is from the Three Kingdoms period, Liu Bang is from the Han Dynasty period, and Bai Juyi is from the Tang Dynasty period.
+2.They are sorted in chronological order as Qin Dynasty, Han Dynasty, Three Kingdoms period, and Tang Dynasty.
+3.Therefore, the characters are sorted in chronological order as Qin Shi Huang, Liu Bang, Zhuge Liang, and Bai Juyi.
+So the answer is (C).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request: 下列描述年龄的词语按照年龄从小到大的顺序排序正确的是?选项:
+(A) 不惑、而立、知天命、花甲
+(B) 而立、不惑、知天命、花甲
+(C) 花甲、知天命、而立、不惑
+(D) 而立、花甲、不惑、知天命
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: Which of the following words describing age are correctly ordered from youngest to oldest? Options:
+(A) Buhuo, Erli, Zhitianming, Huajia
+(B) Erli, Buhuo, Zhitianming, Huajia
+(C) Huajia, Zhitianming, Erli, Buhuo
+(D) Erli, Huajia, Buhuo, Zhitianming
+Step-by-step answer:
+1.The options mention four words that describe age: "Erli" refers to 30 years old, "Buhuo" to 40, "Zhitianming" to 50, and "Huajia" to 60.
+2.Therefore, in order from youngest to oldest, they are Erli, Buhuo, Zhitianming, Huajia.
+So the answer is (B).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request: 下列制作老式棒棒糖的步骤正确的是?选项:
+(A) 准备材料、将糖浆倒入模具、制作糖浆、冷却定型
+(B) 准备材料、制作糖浆、将糖浆倒入模具、冷却定型
+(C) 准备材料、将糖浆倒入模具、冷却定型、制作糖浆
+(D) 准备材料、冷却定型、制作糖浆、将糖浆倒入模具
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: Which of the following steps is correct for making old-fashioned lollipops? Options:
+(A) Preparing the ingredients, pouring the syrup into the molds, making the syrup, cooling to set the shape
+(B) Prepare ingredients, make syrup, pour syrup into molds, cool to set
+(C) Prepare ingredients, pour syrup into mold, cool and set, make syrup
+(D) Prepare ingredients, cool and set, make syrup, pour syrup into molds
+Step-by-step answer:
+1.The title mentions the steps to make old-fashioned lollipops, and the options include "preparing materials", "pouring syrup into the mold", "making syrup", and "cooling and shaping".
+2.According to the steps to make old-fashioned lollipops, the first step should be to prepare the materials, then make syrup, pour syrup into the mold, and finally cool and shape.
+So the answer is (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_ZH-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_ZH-CoT.txt
new file mode 100644
index 00000000..c30ac670
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_ZH-CoT.txt
@@ -0,0 +1,25 @@
+根据上下文选择正确答案
+
+Q: 下列人物按时间先后顺序排序正确的是?选项:
+(A) 秦始皇、诸葛亮、刘邦、白居易
+(B) 诸葛亮、秦始皇、刘邦、白居易
+(C) 秦始皇、刘邦、诸葛亮、白居易
+(D) 白居易、诸葛亮、刘邦、秦始皇
+A:让我们一步一步来思考。
+选项中提到了4个人物,其中秦始皇是秦朝的,诸葛亮是三国时期的,刘邦是汉朝时期的,白居易是唐朝时期的,按照时间先后排序为秦朝,汉朝,三国时期,唐朝,因此人物先后排序为秦始皇、刘邦、诸葛亮、白居易。所以答案是(C)。
+
+Q:下列描述年龄的词语按照年龄从小到大的顺序排序正确的是?选项:
+(A) 不惑、而立、知天命、花甲
+(B) 而立、不惑、知天命、花甲
+(C) 花甲、知天命、而立、不惑
+(D) 而立、花甲、不惑、知天命
+A:让我们一步一步来思考。
+选项中提到了四个描述年龄的词语,其中“而立”指30岁,“不惑”指40岁,“知天命”指50岁,“花甲”指60岁,因此按年龄从小到大排序是而立、不惑、知天命、花甲。所以答案是(B)。
+
+Q:下列制作老式棒棒糖的步骤正确的是?选项:
+(A) 准备材料、将糖浆倒入模具、制作糖浆、冷却定型
+(B) 准备材料、制作糖浆、将糖浆倒入模具、冷却定型
+(C) 准备材料、将糖浆倒入模具、冷却定型、制作糖浆
+(D) 准备材料、冷却定型、制作糖浆、将糖浆倒入模具
+A:让我们一步一步来思考。
+题目中提到了制作老式棒棒糖的步骤,选项中包含“准备材料”、“将糖浆倒入模具”、“制作糖浆”、“冷却定型”四个步骤,按照制作老式棒棒糖的步骤第一步应该是准备材料,然后需要制作糖浆,再把糖浆倒入模具,最后冷却定型。所以答案是(B)。
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_Direct.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_Direct.txt
new file mode 100644
index 00000000..344eb9f2
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_Direct.txt
@@ -0,0 +1,19 @@
+判断一句人为构造的关于体育的句子是否可信。
+
+Q: 下面的句子可信吗? "运动员张培萌一传稳定如山,使得进攻端占据着主动"
+选项:
+(A)可信
+(B)不可信
+A: (B)
+
+Q: 下面的句子可信吗? "运动员郭艾伦打入任意球"
+选项:
+(A)可信
+(B)不可信
+A: (B)
+
+Q: 下面的句子可信吗? "运动员范智毅射门得分"
+选项:
+(A)可信
+(B)不可信
+A: (A)
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_EN-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_EN-CoT.txt
new file mode 100644
index 00000000..2f8f3611
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_EN-CoT.txt
@@ -0,0 +1,22 @@
+判断一句人为构造的关于体育的句子是否可信。
+
+Q: 下面的句子可信吗? "运动员张培萌一传稳定如山,使得进攻端占据着主动"
+选项:
+(A)可信
+(B)不可信
+A: Let's think step by step.
+Zhang Peimeng is a Chinese track-and-field athlete, and the "first pass" is a volleyball term. Since he is not a volleyball player, he would not be involved in the passing phase of a volleyball game. So the answer is (B).
+
+Q: 下面的句子可信吗? "运动员郭艾伦打入任意球"
+选项:
+(A)可信
+(B)不可信
+A: Let's think step by step.
+Guo Ailun is a Chinese basketball player, and the "free kick" is a soccer term. Since he is not a soccer player, he would not be involved in the free-kick phase of a soccer match. So the answer is (B).
+
+Q: 下面的句子可信吗? "运动员范智毅射门得分"
+选项:
+(A)可信
+(B)不可信
+A: Let's think step by step.
+Fan Zhiyi is a Chinese football player, and "scoring a goal" is a term used in football. Since he is a professional football player, he is very likely to score a goal. So the answer is (A).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_XLT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_XLT.txt
new file mode 100644
index 00000000..efae4772
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_XLT.txt
@@ -0,0 +1,56 @@
+判断一句人为构造的关于体育的句子是否可信。
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:下面的句子可信吗? "运动员张培萌一传稳定如山,使得进攻端占据着主动"
+选项:
+(A)可信
+(B)不可信
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:Is the following sentence credible? "Zhang Peimeng's pass was as stable as a mountain, allowing the attacking end to take the initiative."
+Option:
+(A) Credible
+(B) Not credible
+Step-by-step answer:
+1.Zhang Peimeng is a Chinese track-and-field athlete, and the "first pass" is a volleyball term.
+2.Since he is not a volleyball player, he would not be involved in the passing phase of a volleyball match.
+So the answer is (B).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:下面的句子可信吗? "运动员郭艾伦打入任意球"
+选项:
+(A)可信
+(B)不可信
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:Is the following sentence credible? "Guo Ailun scored a free kick"
+Option:
+(A) Credible
+(B) Not credible
+Step-by-step answer:
+1.Guo Ailun is a Chinese basketball player, and "free kick" is a term used in soccer.
+2.Since he is not a soccer player, he won't be involved in the free kick phase of a soccer match.
+So the answer is (B).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:下面的句子可信吗? "运动员范智毅射门得分"
+选项:
+(A)可信
+(B)不可信
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:Is the following sentence credible? "Fan Zhiyi's shooting score"
+Option:
+(A) Credible
+(B) Not credible
+Step-by-step answer:
+1.Fan Zhiyi is a Chinese football player, and "scoring a goal" is a term used in football.
+2.Since he is a professional football player, he is very likely to score a goal.
+So the answer is (A).
+
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_ZH-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_ZH-CoT.txt
new file mode 100644
index 00000000..7db7b53f
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_ZH-CoT.txt
@@ -0,0 +1,22 @@
+判断一句人为构造的关于体育的句子是否可信。
+
+Q: 下面的句子可信吗? "运动员张培萌一传稳定如山,使得进攻端占据着主动"
+选项:
+(A)可信
+(B)不可信
+A: 让我们一步一步来思考。
+张培萌是一位中国的田径运动员,一传是排球运动术语。由于他并不是排球运动员,因此他不会参与到排球比赛中的传球环节。所以答案是(B)。
+
+Q: 下面的句子可信吗? "运动员郭艾伦打入任意球"
+选项:
+(A)可信
+(B)不可信
+A: 让我们一步一步来思考。
+郭艾伦是一位中国的篮球运动员,任意球是足球运动术语。由于他并不是足球运动员,因此他不会参与到足球比赛的任意球环节。所以答案是(B)。
+
+Q: 下面的句子可信吗? "运动员范智毅射门得分"
+选项:
+(A)可信
+(B)不可信
+A: 让我们一步一步来思考。
+范智毅是一位中国的足球运动员,射门得分是足球运动术语。由于他是一名专业的足球运动员,因此他很可能射门得分,所以答案是(A)。
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_Direct.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_Direct.txt
new file mode 100644
index 00000000..cf058dc8
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_Direct.txt
@@ -0,0 +1,25 @@
+根据上下文选择正确答案。
+
+Q:小华在丙申年出生,他的哥哥比他大6岁,所以他的哥哥出生在哪一年?
+选项:
+(A) 己卯年
+(B) 庚寅年
+(C) 丙申年
+(D) 乙丑年
+A:(B)
+
+Q:如果今年是甲辰年,李华的爷爷说:“今年是我的知天命之年”,请问赵婷爷爷的属相是什么?
+选项:
+(A) 狗
+(B) 虎
+(C) 鸡
+(D) 鼠
+A:(B)
+
+Q:小李在亥时三刻开始画画,他知道他需要10个小时才能完成。那么他最早在什么时辰可以完成?
+选项:
+(A) 辰时
+(B) 卯时
+(C) 午时
+(D) 未时
+A:(A)
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_EN-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_EN-CoT.txt
new file mode 100644
index 00000000..9f5efa0d
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_EN-CoT.txt
@@ -0,0 +1,28 @@
+根据上下文选择正确答案。
+
+Q:小华在丙申年出生,他的哥哥比他大6岁,所以他的哥哥出生在哪一年?
+选项:
+(A) 己卯年
+(B) 庚寅年
+(C) 丙申年
+(D) 乙丑年
+A:Let's think step by step.
+Xiaohua was born in the year of Bingshen, and his older brother is 6 years older than him. By the Chinese Tiangan-Dizhi chronology, six years before Bingshen is the year of Gengyin, so his older brother was born in the year of Gengyin. So the answer is (B).
+
+Q:如果今年是甲辰年,李华的爷爷说:“今年是我的知天命之年”,请问赵婷爷爷的属相是什么?
+选项:
+(A) 狗
+(B) 虎
+(C) 鸡
+(D) 鼠
+A:Let's think step by step.
+The question says Grandpa is in his "year of knowing the destiny of heaven," which in ancient China referred to the age of 50, so Grandpa is 50 this year. Since this year is the year of Jiachen, counting back 50 years by the Chinese Tiangan-Dizhi chronology puts his birth in the year of Jiayin, which is a year of the Tiger. Therefore Grandpa's zodiac sign is the Tiger. So the answer is (B).
+
+Q:小李在亥时三刻开始画画,他知道他需要10个小时才能完成。那么他最早在什么时辰可以完成?
+选项:
+(A) 辰时
+(B) 卯时
+(C) 午时
+(D) 未时
+A:Let's think step by step.
+According to the ancient Chinese timekeeping system, the third quarter of the Hai hour is approximately 21:45; ten hours later it is 7:45, which falls in the Chen (Dragon) hour. So the answer is (A).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_XLT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_XLT.txt
new file mode 100644
index 00000000..b3fa5635
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_XLT.txt
@@ -0,0 +1,68 @@
+根据上下文选择正确答案。
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:小华在丙申年出生,他的哥哥比他大6岁,所以他的哥哥出生在哪一年?
+选项:
+(A) 己卯年
+(B) 庚寅年
+(C) 丙申年
+(D) 乙丑年
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: Xiaohua was born in the year of Bingshen, and his brother is 6 years older than him, so in which year was his brother born?
+Option:
+(A) Year of Jimao
+(B) Year of Gengyin
+(C) Year of Bingshen
+(D) Year of Yichou
+Step-by-step answer:
+1.Xiaohua was born in the year of Bingshen, and his older brother is 6 years older than him. According to the Chinese Tiangan-Dizhi chronology, six years before Bingshen is the year of Gengyin.
+2.So his brother was born in the year of Gengyin.
+So the answer is (B).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:如果今年是甲辰年,李华的爷爷说:“今年是我的知天命之年”,请问赵婷爷爷的属相是什么?
+选项:
+(A) 狗
+(B) 虎
+(C) 鸡
+(D) 鼠
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: If this year is the year of Jiachen and Li Hua's grandfather says, "This year is my year of knowing fate," what is the zodiac sign of Zhao Ting's grandfather?
+Option:
+(A) Dog
+(B) Tiger
+(C) Rooster
+(D) Rat
+Step-by-step answer:
+1.The question says Grandpa is in his "year of knowing destiny"; in ancient China this referred to the age of 50, so Grandpa is 50 years old this year.
+2.This year is the year of Jiachen. Counting back 50 years by the Chinese Tiangan-Dizhi chronology, Grandpa was born in the year of Jiayin.
+3.The year of Jiayin is a year of the Tiger, so Grandpa's zodiac sign is the Tiger.
+So the answer is (B).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:小李在亥时三刻开始画画,他知道他需要10个小时才能完成。那么他最早在什么时辰可以完成?
+选项:
+(A) 辰时
+(B) 卯时
+(C) 午时
+(D) 未时
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:Xiao Li started painting at the third quarter of the Hai hour, and he knew it would take him 10 hours to complete. At what hour can he finish at the earliest?
+Option:
+(A) Dragon Hour
+(B) Rabbit Hour
+(C) Horse Hour
+(D) Sheep Hour
+Step-by-step answer:
+1.According to the ancient Chinese timing method, the third quarter of the hour of Hai refers to approximately 21:45,
+2.Ten hours later, it will be 7:45, which falls in the Dragon hour.
+So the answer is (A).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_ZH-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_ZH-CoT.txt
new file mode 100644
index 00000000..f24019d1
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_ZH-CoT.txt
@@ -0,0 +1,28 @@
+根据上下文选择正确答案。
+
+Q:小华在丙申年出生,他的哥哥比他大6岁,所以他的哥哥出生在哪一年?
+选项:
+(A) 己卯年
+(B) 庚寅年
+(C) 丙申年
+(D) 乙丑年
+A:让我们一步一步来思考。
+小华在丙申年出生,他的哥哥比他大6岁,按照中国天干地支纪年法,丙申年之前的第六年是庚寅年,所以他的哥哥在庚寅年出生。所以答案是(B)。
+
+Q:如果今年是甲辰年,李华的爷爷说:“今年是我的知天命之年”,请问赵婷爷爷的属相是什么?
+选项:
+(A) 狗
+(B) 虎
+(C) 鸡
+(D) 鼠
+A:让我们一步一步来思考。
+题目中提到爷爷是“知天命之年”,中国古代“知天命之年”代指50岁,所以爷爷今年50岁,今年是甲辰年,根据中国天干地支纪年法推算,爷爷出生那年是甲寅年,甲寅年是虎年,所以爷爷属虎。所以答案是(B)。
+
+Q:小李在亥时三刻开始画画,他知道他需要10个小时才能完成。那么他最早在什么时辰可以完成?
+选项:
+(A) 辰时
+(B) 卯时
+(C) 午时
+(D) 未时
+A:让我们一步一步来思考。
+按照中国古代计时方法,亥时三刻大约指的是21点45分,10个小时后是7点45分,是辰时。所以答案是(A)。
\ No newline at end of file
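The time-understanding answers above all lean on two fixed systems: the 60-name Tiangan-Dizhi (sexagenary) year cycle and the 12 two-hour shichen. Below is a minimal Python sketch, not part of the CHARM data files, that mechanically checks the three calculations used in these examples (the stem/branch tables are standard; the helper names are illustrative):

```python
# Sketch only; not part of the dataset. Verifies the cycle arithmetic above.
STEMS = "甲乙丙丁戊己庚辛壬癸"          # 10 heavenly stems
BRANCHES = "子丑寅卯辰巳午未申酉戌亥"    # 12 earthly branches

def shift_year(name: str, delta: int) -> str:
    """Sexagenary name `delta` years after `name` (negative = earlier)."""
    stem = STEMS[(STEMS.index(name[0]) + delta) % 10]
    branch = BRANCHES[(BRANCHES.index(name[1]) + delta) % 12]
    return stem + branch

def shichen(hour: int) -> str:
    """Two-hour period for a 24h clock hour; 子时 spans 23:00-01:00."""
    return BRANCHES[((hour + 1) // 2) % 12] + "时"

assert shift_year("丙申", -6) == "庚寅"    # the brother, 6 years older
assert shift_year("甲辰", -50) == "甲寅"   # age 50 in Jiachen -> born in Jiayin (Tiger)
assert shichen((21 + 10) % 24) == "辰时"  # 亥时三刻 ≈ 21:45, +10h ≈ 7:45 -> Chen hour
```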
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_Direct.txt b/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_Direct.txt
new file mode 100644
index 00000000..f388d385
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_Direct.txt
@@ -0,0 +1,22 @@
+识别给定陈述是否包含时代错误。
+
+Q:以下陈述是否包含时代错误,一个接受了义务教育、具备基本常识的人会如何回答?
+在硫磺岛登陆作战期间,拉尔夫大声对着收音机说话。
+选项:
+(A) 是
+(B) 否
+A:(B)
+
+Q:以下陈述是否包含时代错误,一个接受了义务教育、具备基本常识的人会如何回答?
+在硫磺岛登陆作战期间,拉尔夫大声对着他的 iPhone 说话。
+选项:
+(A) 是
+(B) 否
+A:(A)
+
+Q:以下陈述是否包含时代错误,一个接受了义务教育、具备基本常识的人会如何回答?
+没有什么比萨莉·海明斯边看 HBO 的《真探》边织毛衣更令人满足。
+选项:
+(A) 是
+(B) 否
+A:(A)
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_EN-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_EN-CoT.txt
new file mode 100644
index 00000000..791bd909
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_EN-CoT.txt
@@ -0,0 +1,25 @@
+识别给定陈述是否包含时代错误。
+
+Q:以下陈述是否包含时代错误,一个接受了义务教育、具备基本常识的人会如何回答?
+在硫磺岛登陆作战期间,拉尔夫大声对着收音机说话。
+选项:
+(A) 是
+(B) 否
+A: Let's think step by step.
+The statement mentions the landing operations on Iwo Jima, a historical event during World War II. The use of radios for communication among military personnel at that time is accurate and appropriate. So the answer is (B).
+
+Q:以下陈述是否包含时代错误,一个接受了义务教育、具备基本常识的人会如何回答?
+在硫磺岛登陆作战期间,拉尔夫大声对着他的 iPhone 说话。
+选项:
+(A) 是
+(B) 否
+A: Let's think step by step.
+The statement again refers to the landing operations on Iwo Jima during World War II. However, the mention of Ralph speaking loudly into his iPhone introduces an anachronism: the iPhone is a modern smartphone that did not exist in 1945. So the answer is (A).
+
+Q:以下陈述是否包含时代错误,一个接受了义务教育、具备基本常识的人会如何回答?
+没有什么比萨莉·海明斯边看 HBO 的《真探》边织毛衣更令人满足。
+选项:
+(A) 是
+(B) 否
+A: Let's think step by step.
+The statement mentions Sally Hemings, who was an enslaved woman in the United States during the late 18th and early 19th centuries. However, the mention of watching HBO’s True Detective, which is a modern television show, introduces an anachronism. During Sally Hemings’ time, television did not exist, and the specific mention of watching a specific show like True Detective is clearly out of place for that historical period. So the answer is (A).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_XLT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_XLT.txt
new file mode 100644
index 00000000..bd4e898b
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_XLT.txt
@@ -0,0 +1,61 @@
+识别给定陈述是否包含时代错误。
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:以下陈述是否包含时代错误,一个接受了义务教育、具备基本常识的人会如何回答?
+在硫磺岛登陆作战期间,拉尔夫大声对着收音机说话。
+选项:
+(A) 是
+(B) 否
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:How would a person with compulsory education and basic common sense answer whether the following statement contains an anachronism?
+During the landing operations on Iwo Jima, Ralph spoke loudly into the radio.
+Options:
+(A) Yes
+(B) No
+Step-by-step answer:
+1.The statement mentions the landing operations on Iwo Jima, a historical event during World War II.
+2.The use of radios for communication among military personnel during that time is accurate and appropriate.
+So the answer is (B).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:以下陈述是否包含时代错误,一个接受了义务教育、具备基本常识的人会如何回答?
+在硫磺岛登陆作战期间,拉尔夫大声对着他的 iPhone 说话。
+选项:
+(A) 是
+(B) 否
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:How would a person with compulsory education and basic common sense answer whether the following statement contains an anachronism?
+During the landing operations on Iwo Jima, Ralph spoke loudly into his iPhone.
+Options:
+(A) Yes
+(B) No
+Step-by-step answer:
+1.The statement mentions the landing operations on Iwo Jima, a historical event during World War II.
+2.However, the mention of Ralph speaking loudly into his iPhone introduces an anachronism. The iPhone is a modern-day smartphone that was not available during the landing operations on Iwo Jima in 1945.
+So the answer is (A).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:以下陈述是否包含时代错误,一个接受了义务教育、具备基本常识的人会如何回答?
+没有什么比萨莉·海明斯边看 HBO 的《真探》边织毛衣更令人满足。
+选项:
+(A) 是
+(B) 否
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:How would a person with compulsory education and basic common sense answer whether the following statement contains an anachronism?
+Nothing is more satisfying than Sally Hemings knitting a sweater while watching HBO's True Detective.
+Options:
+(A) Yes
+(B) No
+Step-by-step answer:
+1.The statement mentions Sally Hemings, who was an enslaved woman in the United States during the late 18th and early 19th centuries.
+2.However, the mention of watching HBO’s True Detective, which is a modern television show, introduces an anachronism. During Sally Hemings’ time, television did not exist, and the specific mention of watching a specific show like True Detective is clearly out of place for that historical period.
+So the answer is (A).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_ZH-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_ZH-CoT.txt
new file mode 100644
index 00000000..2401b0ea
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_ZH-CoT.txt
@@ -0,0 +1,25 @@
+识别给定陈述是否包含时代错误。
+
+Q:以下陈述是否包含时代错误,一个接受了义务教育、具备基本常识的人会如何回答?
+在硫磺岛登陆作战期间,拉尔夫大声对着收音机说话。
+选项:
+(A) 是
+(B) 否
+A:让我们一步一步来思考。
+这个陈述提到了“硫磺岛登陆作战期间”,这是指二战期间的一个历史事件。在当时,军事人员之间使用收音机进行通信是准确和恰当的。所以答案是(B)。
+
+Q:以下陈述是否包含时代错误,一个接受了义务教育、具备基本常识的人会如何回答?
+在硫磺岛登陆作战期间,拉尔夫大声对着他的 iPhone 说话。
+选项:
+(A) 是
+(B) 否
+A:让我们一步一步来思考。
+这个陈述提到了“硫磺岛登陆作战期间”,这是指二战期间的一个历史事件。然而,提到拉尔夫大声对着他的 iPhone 说话引入了一个时代错误。iPhone 是现代的智能手机,在 1945 年硫磺岛登陆作战期间是不存在的。所以答案是(A)。
+
+Q:以下陈述是否包含时代错误,一个接受了义务教育、具备基本常识的人会如何回答?
+没有什么比萨莉·海明斯边看 HBO 的《真探》边织毛衣更令人满足。
+选项:
+(A) 是
+(B) 否
+A:让我们一步一步来思考。
+这个陈述提到了萨莉·海明斯,她是 18 世纪末到 19 世纪初美国的一个被奴役的女性。然而,提到她边看 HBO 的《真探》边织毛衣引入了一个时代错误。在萨莉·海明斯所处的时代,电视是不存在的,而且具体提到观看像《真探》这样的特定节目在那个历史时期显然是不合适的。所以答案是(A)。
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_Direct.txt b/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_Direct.txt
new file mode 100644
index 00000000..a38c8e33
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_Direct.txt
@@ -0,0 +1,25 @@
+根据给定艺术作品清单,找出最类似的。
+
+Q: 寻找一部与《勇敢的心》、《风月俏佳人》、《辛德勒的名单》、《阿波罗13号》类似的电影:
+选项:
+(A)《星际迷航》
+(B)《我盛大的希腊婚礼2》
+(C)《圣诞老人2》
+(D)《与狼共舞》
+A: (D)
+
+Q: 寻找一部与《勇敢的心》、《风月俏佳人》、《阿波罗13号》、《与狼共舞》类似的电影:
+选项:
+(A)《蝙蝠侠:突袭阿卡姆》
+(B)《肖申克的救赎》
+(C)《玩具总动员》
+(D)《狮子王》
+A: (B)
+
+Q: 寻找一部与《惊世骇案》、《勇敢的心》、《低俗小说》、《辛德勒的名单》类似的电影:
+选项:
+(A)《卡里加里博士的小屋》
+(B)《肖申克的救赎》
+(C)《蜘蛛侠2》
+(D)《出租车》
+A: (B)
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_EN-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_EN-CoT.txt
new file mode 100644
index 00000000..b842e86b
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_EN-CoT.txt
@@ -0,0 +1,40 @@
+根据给定艺术作品清单,找出最类似的。
+
+Q: 寻找一部与《勇敢的心》、《风月俏佳人》、《辛德勒的名单》、《阿波罗13号》类似的电影:
+选项:
+(A)《星际迷航》
+(B)《我盛大的希腊婚礼2》
+(C)《圣诞老人2》
+(D)《与狼共舞》
+A: Let's think step by step.
+Star Trek is a science fiction film that, despite its depth and complexity, has significant differences in theme and style from the four aforementioned films.
+My Big Fat Greek Wedding 2 is a light-hearted comedy film that differs significantly from the themes and styles of the four aforementioned films.
+The Santa Clause 2 is a Christmas-themed family movie, which also differs significantly from the themes and styles of the four aforementioned movies.
+"Dances with Wolves" is a film that depicts the relationship between Native Americans and settlers during the pioneering of the American West.
+The theme and style of "Dances with Wolves" are consistent with the four films mentioned above. This movie, like Braveheart, Pretty Woman, Schindler's List, and Apollo 13, is a historical film with depth and seriousness. So the answer is (D).
+
+Q: 寻找一部与《勇敢的心》、《风月俏佳人》、《阿波罗13号》、《与狼共舞》类似的电影:
+选项:
+(A)《蝙蝠侠:突袭阿卡姆》
+(B)《肖申克的救赎》
+(C)《玩具总动员》
+(D)《狮子王》
+A: Let's think step by step.
+Batman: Assault on Arkham is a superhero film with significant differences in theme and style from the four aforementioned films.
+The Shawshank Redemption is a 1994 American drama film directed by Frank Darabont and starring Tim Robbins and Morgan Freeman. It is a film about hope and perseverance.
+Toy Story is an animated film; although it touches on adventure and friendship, its theme and style differ significantly from the four aforementioned films.
+Although The Lion King is a classic animated film about courage and growth, its theme and style also differ significantly from the four aforementioned films.
+The Shawshank Redemption, like Braveheart, Pretty Woman, Apollo 13, and Dances with Wolves, is a film with depth and seriousness, and its theme and style are closest to those four films. So the answer is (B).
+
+Q: 寻找一部与《惊世骇案》、《勇敢的心》、《低俗小说》、《辛德勒的名单》类似的电影:
+选项:
+(A)《卡里加里博士的小屋》
+(B)《肖申克的救赎》
+(C)《蜘蛛侠2》
+(D)《出租车》
+A: Let's think step by step.
+"Dr. Caligary's Cabin" is a 1920 German expressionist silent film directed by Robert Wiener. This film is often considered a milestone in German expressionist cinema and one of the earliest horror films.
+Shawshank Redemption is a 1994 American drama film directed by Frank Delabond and starring Tim Robbins and Morgan Freeman. It is a film about hope and perseverance.
+Spider Man 2 is a 2004 American superhero film directed by Sam Remy and starring Toby Maguire. It is the second installment of the Spider Man trilogy.
+"Taxi" is a 2004 American comedy action film directed by Tim Storey, starring Jimmy Flanders and Quentin Latafa. This movie is an American remake of a 1998 French film.
+And the titles of "The Amazing Case", "Brave Heart", "Pulp Fiction", and "Schindler's List" are all very profound, plot rich, and have strong human themes in movies. They have all won high praise from audiences and critics for their excellent scripts, brilliant performances, and profound themes. The Shawshank Redemption tells the story of a wrongly accused banker who maintains hope in prison and ultimately escapes. The plot of this movie is deeply ingrained in people's hearts, with a profound portrayal of human nature, and there are many similarities with the movie in the title. So the answer is (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_XLT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_XLT.txt
new file mode 100644
index 00000000..cdcaf0e8
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_XLT.txt
@@ -0,0 +1,76 @@
+根据给定艺术作品清单,找出最类似的。
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:寻找一部与《勇敢的心》、《风月俏佳人》、《辛德勒的名单》、《阿波罗13号》类似的电影:
+选项:
+(A)《星际迷航》
+(B)《我盛大的希腊婚礼2》
+(C)《圣诞老人2》
+(D)《与狼共舞》
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:Find a movie similar to Braveheart, Pretty Woman, Schindler's List, Apollo 13:
+Options:
+(A) Star Trek
+(B) My Big Fat Greek Wedding 2
+(C) The Santa Clause 2
+(D) Dances with Wolves
+Step-by-step answer:
+1.Star Trek is a science fiction film that, despite its depth and complexity, has significant differences in theme and style from the four aforementioned films.
+2.My Big Fat Greek Wedding 2 is a light-hearted comedy film that differs significantly from the themes and styles of the four aforementioned films.
+3.The Santa Clause 2 is a Christmas-themed family movie, which also differs significantly from the themes and styles of the four aforementioned movies.
+4.Dances with Wolves is a film that depicts the relationship between Native Americans and settlers during the pioneering of the American West.
+5.The theme and style of "Dances with Wolves" are consistent with the four films mentioned above. This movie, like Braveheart, Pretty Woman, Schindler's List, and Apollo 13, is a historical film with depth and seriousness.
+So the answer is (D).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:寻找一部与《勇敢的心》、《风月俏佳人》、《阿波罗13号》、《与狼共舞》类似的电影:
+选项:
+(A)《蝙蝠侠:突袭阿卡姆》
+(B)《肖申克的救赎》
+(C)《玩具总动员》
+(D)《狮子王》
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:Find a movie similar to Braveheart, Pretty Woman, Apollo 13, Dances with Wolves:
+Options:
+(A) Batman Assault on Arkham
+(B) The Shawshank Redemption
+(C) Toy Story
+(D) The Lion King
+Step-by-step answer:
+1.Batman: Assault on Arkham is a superhero film with significant differences in theme and style from the four aforementioned films.
+2.The Shawshank Redemption is a 1994 American drama film directed by Frank Darabont and starring Tim Robbins and Morgan Freeman. It is a film about hope and perseverance.
+3.Toy Story is an animated film; although it touches on adventure and friendship, its theme and style differ significantly from the four aforementioned films.
+4.Although The Lion King is a classic animated film about courage and growth, its theme and style also differ significantly from the four aforementioned films.
+5.The Shawshank Redemption, like Braveheart, Pretty Woman, Apollo 13, and Dances with Wolves, is a film with depth and seriousness, and its theme and style are closest to those four films.
+So the answer is (B).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:寻找一部与《惊世骇案》、《勇敢的心》、《低俗小说》、《辛德勒的名单》类似的电影:
+选项:
+(A)《卡里加里博士的小屋》
+(B)《肖申克的救赎》
+(C)《蜘蛛侠2》
+(D)《出租车》
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:Find a movie similar to The Usual Suspects, Braveheart, Pulp Fiction, Schindler's List:
+Options:
+(A) The Cabinet of Dr Caligari
+(B) The Shawshank Redemption
+(C) Spider-Man 2
+(D) Taxi
+Step-by-step answer:
+1."Dr. Caligary's Cabin" is a 1920 German expressionist silent film directed by Robert Wiener. This film is often considered a milestone in German expressionist cinema and one of the earliest horror films.
+2.Shawshank Redemption is a 1994 American drama film directed by Frank Delabond and starring Tim Robbins and Morgan Freeman. It is a film about hope and perseverance.
+3.Spider Man 2 is a 2004 American superhero film directed by Sam Remy and starring Toby Maguire. It is the second installment of the Spider Man trilogy.
+4."Taxi" is a 2004 American comedy action film directed by Tim Storey, starring Jimmy Flanders and Quentin Latafa. This movie is an American remake of a 1998 French film.
+5.And the titles of "The Amazing Case", "Brave Heart", "Pulp Fiction", and "Schindler's List" are all very profound, plot rich, and have strong human themes in movies. They have all won high praise from audiences and critics for their excellent scripts, brilliant performances, and profound themes. The Shawshank Redemption tells the story of a wrongly accused banker who maintains hope in prison and ultimately escapes. The plot of this movie is deeply ingrained in people's hearts, with a profound portrayal of human nature, and there are many similarities with the movie in the title.
+So the answer is (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_ZH-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_ZH-CoT.txt
new file mode 100644
index 00000000..70abbad2
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_ZH-CoT.txt
@@ -0,0 +1,40 @@
+根据给定艺术作品清单,找出最类似的。
+
+Q: 寻找一部与《勇敢的心》、《风月俏佳人》、《辛德勒的名单》、《阿波罗13号》类似的电影:
+选项:
+(A)《星际迷航》
+(B)《我盛大的希腊婚礼2》
+(C)《圣诞老人2》
+(D)《与狼共舞》
+A: 让我们一步一步来思考。
+《星际迷航》是一部科幻电影,虽然它具有深度和复杂性,但它的主题和风格与上述四部电影有较大的差异。
+《我盛大的希腊婚礼2》是一部轻松的喜剧电影,与上述四部电影的主题和风格有较大的差异。
+《圣诞老人2》是一部圣诞主题的家庭电影,与上述四部电影的主题和风格也有较大的差异。
+《与狼共舞》是一部描绘美国西部开拓时期与原住民关系的电影.
+其中《与狼共舞》的主题和风格与上述四部电影相符。这部电影与《勇敢的心》、《风月俏佳人》、《辛德勒的名单》、《阿波罗13号》一样,都是历史题材的电影,具有深度和严肃性。所以答案是(D)。
+
+Q: 寻找一部与《勇敢的心》、《风月俏佳人》、《阿波罗13号》、《与狼共舞》类似的电影:
+选项:
+(A)《蝙蝠侠:突袭阿卡姆》
+(B)《肖申克的救赎》
+(C)《玩具总动员》
+(D)《狮子王》
+A: 让我们一步一步来思考。
+《蝙蝠侠:突袭阿卡姆》是一部超级英雄电影,它的主题和风格与上述四部电影有较大的差异。
+《肖申克的救赎》是一部1994年的美国剧情电影,由弗兰克·德拉邦特执导,蒂姆·罗宾斯和摩根·弗里曼主演,这是一部讲述希望和毅力的电影。
+《玩具总动员》是一部动画电影,虽然它可能具有一些冒险和友谊的主题,但其主题和风格与上述四部电影有较大的差异。
+《狮子王》虽然是一部经典的动画电影,涵盖了勇气和成长的主题,但其主题和风格与上述四部电影有较大的差异。
+其中《肖申克的救赎》这部电影与《勇敢的心》、《风月俏佳人》、《阿波罗13号》、《与狼共舞》一样,都是具有深度和严肃性的电影,并且主题和风格与其他三部电影相似。所以答案是(B)。
+
+Q: 寻找一部与《惊世骇案》、《勇敢的心》、《低俗小说》、《辛德勒的名单》类似的电影:
+选项:
+(A)《卡里加里博士的小屋》
+(B)《肖申克的救赎》
+(C)《蜘蛛侠2》
+(D)《出租车》
+A: 让我们一步一步来思考。
+《卡里加里博士的小屋》是一部1920年的德国表现主义默片,由罗伯特·维内执导。这部电影通常被认为是德国表现主义电影的一部里程碑式作品,也是最早的恐怖电影之一。
+《肖申克的救赎》是一部1994年的美国剧情电影,由弗兰克·德拉邦特执导,蒂姆·罗宾斯和摩根·弗里曼主演,这是一部讲述希望和毅力的电影。
+《蜘蛛侠2》是一部2004年的美国超级英雄电影,由萨姆·雷米执导,托比·马奎尔主演,是《蜘蛛侠》三部曲的第二部。
+《出租车》这是一部2004年的美国喜剧动作片,由蒂姆·斯托瑞执导,吉米·福兰和昆汀·拉塔法主演。这部电影是1998年法国电影的美国翻拍版。
+而题目中《惊世骇案》、《勇敢的心》、《低俗小说》和《辛德勒的名单》都是一些非常深刻、情节丰富且具有强烈人性主题的电影。它们都以其出色的剧本、精彩的表演和深刻的主题赢得了观众和评论家的高度赞誉。选项中《肖申克的救赎》讲述了一名被冤枉的银行家如何在监狱中保持希望,并最终逃脱的故事。这部电影的情节深入人心,人性描绘深刻,与题目中的电影有许多相似之处。所以答案是(B)。
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_Direct.txt b/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_Direct.txt
new file mode 100644
index 00000000..dcac4a19
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_Direct.txt
@@ -0,0 +1,25 @@
+请根据题目中两句话的关系选择正确答案。
+
+Q:语句一:可是老人小心翼翼将蛇挑开,让它爬向草丛,嘴里念念有词:罪过,罪过,这本来是你的家呀
+语句二:老人心里十分难过。
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+A:(A)
+
+Q:语句一:她是一个有着丰满的脸、丰满的嘴唇和大牙齿的黑色爆炸头女人。
+语句二:她喜欢抹红色的口红,穿红色的衣服。
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+A:(C)
+
+Q:语句一:你不确定你已经清楚你站在谁的一面。
+语句二:你支持谁,这一点显而易见。
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+A:(B)
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_EN-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_EN-CoT.txt
new file mode 100644
index 00000000..202421bc
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_EN-CoT.txt
@@ -0,0 +1,28 @@
+请根据题目中两句话的关系选择正确答案。
+
+Q:语句一:可是老人小心翼翼将蛇挑开,让它爬向草丛,嘴里念念有词:罪过,罪过,这本来是你的家呀
+语句二:老人心里十分难过。
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+A: Let's think step by step.
+The first sentence describes an old man carefully moving a snake aside; he murmurs "sin, sin," showing that he feels guilty and sad for intruding on the snake's home. The second sentence, that the old man is very sad, can therefore be inferred from the first, so the relationship is entailment. So the answer is (A).
+
+Q:语句一:她是一个有着丰满的脸、丰满的嘴唇和大牙齿的黑色爆炸头女人。
+语句二:她喜欢抹红色的口红,穿红色的衣服。
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+A: Let's think step by step.
+These two sentences both describe the same woman, but they focus on different characteristics. The first sentence describes her physical characteristics, including face, lips, teeth, and hairstyle. The second sentence describes her aesthetic preferences, including lipstick color and clothing color. These two sentences do not have any obvious implication or contradictory relationship, so we can say that they are unrelated. So the answer is (C).
+
+Q:语句一:你不确定你已经清楚你站在谁的一面。
+语句二:你支持谁,这一点显而易见。
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+A: Let's think step by step.
+The first sentence indicates that you are not sure who you support, while the second sentence clearly indicates that your position is obvious, which means you are clear about who you support. Therefore, the content of these two sentences is contradictory to each other. So the answer is (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_XLT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_XLT.txt
new file mode 100644
index 00000000..16b45350
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_XLT.txt
@@ -0,0 +1,69 @@
+请根据题目中两句话的关系选择正确答案。
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:语句一:可是老人小心翼翼将蛇挑开,让它爬向草丛,嘴里念念有词:罪过,罪过,这本来是你的家呀
+语句二:老人心里十分难过。
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:Sentence 1: But the old man carefully moved the snake aside, letting it crawl toward the grass, murmuring: sin, sin, this was originally your home!
+Sentence 2: The old man was very sad in his heart.
+What is the relationship between these two statements?
+(A) Entailment
+(B) Contradiction
+(C) Unrelated
+Step-by-step answer:
+1.The first sentence describes an old man carefully moving a snake aside toward the grass.
+2.The old man murmurs "sin, sin," showing that he feels guilty and sad for intruding on the snake's home.
+3.The second sentence, that the old man is very sad, can therefore be inferred from the first, so the relationship is entailment.
+So the answer is (A).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:语句一:她是一个有着丰满的脸、丰满的嘴唇和大牙齿的黑色爆炸头女人。
+语句二:她喜欢抹红色的口红,穿红色的衣服。
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:Sentence 1: She was a woman with a black afro, a plump face, full lips, and big teeth.
+Sentence 2: She likes to wear red lipstick and red clothes.
+What is the relationship between these two statements?
+(A) Entailment
+(B) Contradiction
+(C) Unrelated
+Step-by-step answer:
+1.These two sentences both describe the same woman, but they focus on different characteristics.
+2.The first sentence describes her physical characteristics, including face, lips, teeth, and hairstyle.
+3.The second sentence describes her aesthetic preferences, including lipstick color and clothing color. These two sentences do not have any obvious implication or contradictory relationship, so we can say that they are unrelated.
+So the answer is (C).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:语句一:你不确定你已经清楚你站在谁的一面。
+语句二:你支持谁,这一点显而易见。
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:Sentence 1: You're not sure you've made it clear whose side you're on.
+Sentence 2: It is obvious whose side you are on.
+What is the relationship between these two statements?
+(A) Entailment
+(B) Contradiction
+(C) Unrelated
+Step-by-step answer:
+1.The first sentence indicates that you are not sure who you support, while the second sentence clearly indicates that your position is obvious, which means you are clear about who you support.
+2.Therefore, the content of these two sentences is contradictory to each other.
+So the answer is (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_ZH-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_ZH-CoT.txt
new file mode 100644
index 00000000..4a01f45e
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_ZH-CoT.txt
@@ -0,0 +1,28 @@
+请根据题目中两句话的关系选择正确答案。
+
+Q:语句一:可是老人小心翼翼将蛇挑开,让它爬向草丛,嘴里念念有词:罪过,罪过,这本来是你的家呀
+语句二:老人心里十分难过。
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+A:让我们一步一步来思考。
+第一句话描述了一个老人小心地将蛇挑开的过程,老人说道"罪过,罪过",说明他因为侵犯了蛇的领地而感到内疚和难过。可以推出第二句话,老人心里十分难过。因此两句话是蕴含关系。所以答案是(A)。
+
+Q:语句一:她是一个有着丰满的脸、丰满的嘴唇和大牙齿的黑色爆炸头女人。
+语句二:她喜欢抹红色的口红,穿红色的衣服。
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+A:让我们一步一步来思考。
+这两句话都在描述同一个女人,但是它们关注的特征是不同的。第一句话描述了她的身体特征,包括脸、嘴唇、牙齿和发型。第二句话描述了她的审美喜好,包括口红颜色和衣服颜色。这两句话并没有明显的蕴含或矛盾关系,因此,我们可以说这两句话是无关的。所以答案是(C)。
+
+Q:语句一:你不确定你已经清楚你站在谁的一面。
+语句二:你支持谁,这一点显而易见。
+请问这两句话什么关系?
+(A) 蕴含
+(B) 矛盾
+(C) 无关
+A:让我们一步一步来思考。
+第一句话表示你不确定你支持谁,而第二句话则明确表明你的立场是显而易见的,也就是说你清楚自己支持谁。因此,这两句话的内容是相互矛盾的。所以答案是(B)。
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_Direct.txt b/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_Direct.txt
new file mode 100644
index 00000000..22e01f5c
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_Direct.txt
@@ -0,0 +1,22 @@
+请理解题目含义并选择正确答案。
+
+Q:一个人摄入的精制糖和在消化过程中分解成糖的食物几乎是该人血液中所有葡萄糖(一种糖)的饮食来源。然而,咖啡在消化过程中本身不会分解成糖,有时却会导致人的血糖水平急剧上升,即使咖啡没有加奶油或任何甜味剂。以下哪项如果为真,最有助于解释咖啡对如上所述的血糖水平的影响?选项:
+(A) 久坐办公室工作的人更有可能喝大量咖啡和比从事需要持续体育锻炼的工作的人血糖水平更高。
+(B) 人们经常在饭后喝咖啡,这些食物由几种食物组成,这些食物在消化过程中会迅速分解成糖分。
+(C) 对许多人来说,吃一块巧克力蛋糕与喝一杯普通咖啡有相同的刺激效果。
+(D) 一小时内喝两杯以上的咖啡会增加人的压力水平,身体通过将储存的葡萄糖释放到血液中来应对压力。
+A:(D)
+
+Q:某人因为心理疾病尝试了几种不同的心理疗法:精神分析疗法、认知行为疗法以及沙盘游戏疗法。他说:"心理治疗过程让我非常不快乐,因此,这些疗法是无效的。"以下哪项如果为真,将最有力质疑上述的结论?选项:
+(A) 几种不同心理疗法所针对的心理疾病是不同的
+(B) 尝试多种心理疗法的人要比只尝试一种疗法的人快乐
+(C) 同时尝试不同心理疗法能够更容易找到可以起作用的方法
+(D) 治疗效果好的人在治疗过程中往往感觉不快乐
+A:(D)
+
+Q:一本书要成为畅销书,必须有可读性或者经过精心的包装。如果上述断定成立,则以下哪项一定为真?选项:
+(A) 没有可读性又没有精心包装的书一定不是畅销书
+(B) 没经过精心包装的书一定不是畅销书
+(C) 有可读性的书一定是畅销书
+(D) 大多数人喜欢有可读性的畅销书
+A:(A)
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_EN-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_EN-CoT.txt
new file mode 100644
index 00000000..25120028
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_EN-CoT.txt
@@ -0,0 +1,25 @@
+请理解题目含义并选择正确答案。
+
+Q:一个人摄入的精制糖和在消化过程中分解成糖的食物几乎是该人血液中所有葡萄糖(一种糖)的饮食来源。然而,咖啡在消化过程中本身不会分解成糖,有时却会导致人的血糖水平急剧上升,即使咖啡没有加奶油或任何甜味剂。以下哪项如果为真,最有助于解释咖啡对如上所述的血糖水平的影响?选项:
+(A) 久坐办公室工作的人更有可能喝大量咖啡和比从事需要持续体育锻炼的工作的人血糖水平更高。
+(B) 人们经常在饭后喝咖啡,这些食物由几种食物组成,这些食物在消化过程中会迅速分解成糖分。
+(C) 对许多人来说,吃一块巧克力蛋糕与喝一杯普通咖啡有相同的刺激效果。
+(D) 一小时内喝两杯以上的咖啡会增加人的压力水平,身体通过将储存的葡萄糖释放到血液中来应对压力。
+A: Let's think step by step
+The key is to identify a factor that explains how coffee raises blood sugar even though coffee itself does not break down into sugar during digestion; we need a mechanism by which coffee indirectly affects blood sugar. Option (D), "Drinking more than two cups of coffee within an hour increases a person's stress level, and the body responds to stress by releasing stored glucose into the bloodstream," provides exactly such a mechanism: the stress response caused by coffee leads the body to release stored glucose, raising blood sugar levels. This explains why coffee, even without cream or sweeteners, can raise blood sugar levels despite not breaking down into sugar during digestion. So the answer is (D).
+
+Q:某人因为心理疾病尝试了几种不同的心理疗法:精神分析疗法、认知行为疗法以及沙盘游戏疗法。他说:"心理治疗过程让我非常不快乐,因此,这些疗法是无效的。"以下哪项如果为真,将最有力质疑上述的结论?选项:
+(A) 几种不同心理疗法所针对的心理疾病是不同的
+(B) 尝试多种心理疗法的人要比只尝试一种疗法的人快乐
+(C) 同时尝试不同心理疗法能够更容易找到可以起作用的方法
+(D) 治疗效果好的人在治疗过程中往往感觉不快乐
+A: Let's think step by step
+If unhappiness during psychotherapy is actually a sign of good treatment outcomes, then the person's conclusion that the therapies are ineffective because he felt unhappy may well be wrong: he may have mistaken the difficulty and discomfort of treatment for ineffectiveness, when in reality they can be a sign that the treatment is working. Option (D), "People who respond well to treatment often feel unhappy during the treatment process," if true, most strongly challenges the conclusion. So the answer is (D).
+
+Q:一本书要成为畅销书,必须有可读性或者经过精心的包装。如果上述断定成立,则以下哪项一定为真?选项:
+(A) 没有可读性又没有精心包装的书一定不是畅销书
+(B) 没经过精心包装的书一定不是畅销书
+(C) 有可读性的书一定是畅销书
+(D) 大多数人喜欢有可读性的畅销书
+A: Let's think step by step
+According to the assertion, "To become a bestseller, a book must be readable or carefully packaged," option (A), "A book that is neither readable nor carefully packaged is definitely not a bestseller," must be true: a book that is neither readable nor carefully packaged fails the necessary condition for being a bestseller. So the answer is (A).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_XLT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_XLT.txt
new file mode 100644
index 00000000..cab3f02a
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_XLT.txt
@@ -0,0 +1,62 @@
+请理解题目含义并选择正确答案。
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:一个人摄入的精制糖和在消化过程中分解成糖的食物几乎是该人血液中所有葡萄糖(一种糖)的饮食来源。然而,咖啡在消化过程中本身不会分解成糖,有时却会导致人的血糖水平急剧上升,即使咖啡没有加奶油或任何甜味剂。以下哪项如果为真,最有助于解释咖啡对如上所述的血糖水平的影响?选项:
+(A) 久坐办公室工作的人更有可能喝大量咖啡和比从事需要持续体育锻炼的工作的人血糖水平更高。
+(B) 人们经常在饭后喝咖啡,这些食物由几种食物组成,这些食物在消化过程中会迅速分解成糖分。
+(C) 对许多人来说,吃一块巧克力蛋糕与喝一杯普通咖啡有相同的刺激效果。
+(D) 一小时内喝两杯以上的咖啡会增加人的压力水平,身体通过将储存的葡萄糖释放到血液中来应对压力。
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: A person's intake of refined sugar, and of foods that break down into sugar during digestion, accounts for almost all of the dietary sources of glucose (a type of sugar) in that person's bloodstream. However, coffee, which does not itself break down into sugar during digestion, sometimes causes a person's blood sugar levels to rise sharply, even if the coffee has no cream or any sweetener. Which of the following, if true, would best help explain the effect of coffee on blood sugar levels described above? Options:
+(A) People with sedentary office jobs are more likely to drink large amounts of coffee and to have higher blood sugar levels than people with jobs that require constant physical activity.
+(B) People often drink coffee with meals that consist of several foods that break down quickly into sugar during digestion.
+(C) For many people, eating a slice of chocolate cake has the same stimulating effect as drinking a regular cup of coffee.
+(D) Drinking more than two cups of coffee in an hour increases a person's stress level, and the body responds to stress by releasing stored glucose into the bloodstream.
+Step-by-step answer:
+1.The key to the question is to identify a factor that can explain how coffee causes an increase in blood sugar levels, even though coffee itself does not break down into sugar during digestion.
+2.We need a mechanism by which coffee indirectly affects blood sugar. Option (D), "Drinking more than two cups of coffee within an hour increases a person's stress level, and the body responds to stress by releasing stored glucose into the bloodstream," provides exactly such a mechanism: the stress response caused by coffee leads the body to release stored glucose, raising blood sugar levels.
+3.This explains why coffee (even without cream or sweeteners) can lead to an increase in blood sugar levels, even if it does not break down into sugar during digestion.
+So the answer is (D).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:某人因为心理疾病尝试了几种不同的心理疗法:精神分析疗法、认知行为疗法以及沙盘游戏疗法。他说:"心理治疗过程让我非常不快乐,因此,这些疗法是无效的。"以下哪项如果为真,将最有力质疑上述的结论?选项:
+(A) 几种不同心理疗法所针对的心理疾病是不同的
+(B) 尝试多种心理疗法的人要比只尝试一种疗法的人快乐
+(C) 同时尝试不同心理疗法能够更容易找到可以起作用的方法
+(D) 治疗效果好的人在治疗过程中往往感觉不快乐
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: A person has tried several different psychotherapies for a mental illness:psychoanalytic therapy, cognitive behavioral therapy, and sandplay therapy. He says: "The psychotherapy process made me very unhappy and, therefore, these therapies are ineffective." Which of the following, if true, would most strongly challenge the above conclusion? Options:
+(A) Several different psychotherapies target different mental illnesses.
+(B) People who try multiple psychotherapies are happier than those who try only one.
+(C) Trying different psychotherapies at the same time makes it easier to find the one that works
+(D) People who do well in therapy tend to feel unhappy during the therapy process
+Step-by-step answer:
+1.By understanding the question, it can be inferred that if unhappiness during psychotherapy is a sign of good treatment outcomes, then the person's conclusion that the therapy is ineffective because they feel unhappy during the treatment may be incorrect.
+2.He may have mistaken the difficulty and discomfort of treatment for ineffectiveness, when in reality they can be a sign that the treatment is working. Option (D), "People who respond well to treatment often feel unhappy during the treatment process," if true, most strongly challenges the conclusion.
+So the answer is (D).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:一本书要成为畅销书,必须有可读性或者经过精心的包装。如果上述断定成立,则以下哪项一定为真?选项:
+(A) 没有可读性又没有精心包装的书一定不是畅销书
+(B) 没经过精心包装的书一定不是畅销书
+(C) 有可读性的书一定是畅销书
+(D) 大多数人喜欢有可读性的畅销书
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: For a book to be a bestseller,it must be readable or well packaged. If the above assertion holds true, which of the following must be true? Options:
+(A) A book that is not readable and not well-packaged is not a bestseller.
+(B) A book that is not well-packaged must not be a bestseller.
+(C) A book that is readable must be a bestseller
+(D) Most people like bestsellers that are readable.
+Step-by-step answer:
+1.According to the assertion, "To become a bestseller, a book must be readable or carefully packaged," option (A), "A book that is neither readable nor carefully packaged is definitely not a bestseller," must be true.
+2.Because if a book is neither readable nor carefully packaged, it does not meet the conditions to become a bestseller.
+So the answer is (A).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_ZH-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_ZH-CoT.txt
new file mode 100644
index 00000000..ef234967
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_ZH-CoT.txt
@@ -0,0 +1,25 @@
+请理解题目含义并选择正确答案。
+
+Q:一个人摄入的精制糖和在消化过程中分解成糖的食物几乎是该人血液中所有葡萄糖(一种糖)的饮食来源。然而,咖啡在消化过程中本身不会分解成糖,有时却会导致人的血糖水平急剧上升,即使咖啡没有加奶油或任何甜味剂。以下哪项如果为真,最有助于解释咖啡对如上所述的血糖水平的影响?选项:
+(A) 久坐办公室工作的人更有可能喝大量咖啡和比从事需要持续体育锻炼的工作的人血糖水平更高。
+(B) 人们经常在饭后喝咖啡,这些食物由几种食物组成,这些食物在消化过程中会迅速分解成糖分。
+(C) 对许多人来说,吃一块巧克力蛋糕与喝一杯普通咖啡有相同的刺激效果。
+(D) 一小时内喝两杯以上的咖啡会增加人的压力水平,身体通过将储存的葡萄糖释放到血液中来应对压力。
+A:让我们一步一步来思考。
+题目的关键是要找出一个能够解释咖啡如何导致血糖水平上升的因素,尽管咖啡在消化过程中本身不会分解成糖。这就需要我们找到一个可以解释咖啡如何间接影响血糖水平的机制。选项 (D) "一小时内喝两杯以上的咖啡会增加人的压力水平,身体通过将储存的葡萄糖释放到血液中来应对压力" 如果为真,最有助于解释咖啡对血糖水平的影响。这个选项提供了一个可能的机制,即咖啡引起的压力反应可能导致身体释放储存的葡萄糖,从而导致血糖水平上升。这解释了为什么咖啡(即使没有加奶油或甜味剂)会导致血糖水平上升,即使它在消化过程中不会分解成糖。所以答案是(D)。
+
+Q:某人因为心理疾病尝试了几种不同的心理疗法:精神分析疗法、认知行为疗法以及沙盘游戏疗法。他说:"心理治疗过程让我非常不快乐,因此,这些疗法是无效的。"以下哪项如果为真,将最有力质疑上述的结论?选项:
+(A) 几种不同心理疗法所针对的心理疾病是不同的
+(B) 尝试多种心理疗法的人要比只尝试一种疗法的人快乐
+(C) 同时尝试不同心理疗法能够更容易找到可以起作用的方法
+(D) 治疗效果好的人在治疗过程中往往感觉不快乐
+A:让我们一步一步来思考。
+理解题目可知,如果心理治疗过程中的不快乐是治疗效果好的一个标志,那么这个人的结论——认为疗法无效,因为他在治疗过程中感到不快乐——就可能是错误的。他可能误解了治疗过程的困难和不适是治疗无效的表现,而实际上这可能是治疗正在起作用的标志。(D) "治疗效果好的人在治疗过程中往往感觉不快乐" 如果为真,将最有力质疑上述的结论。所以答案是(D)。
+
+Q:一本书要成为畅销书,必须有可读性或者经过精心的包装。如果上述断定成立,则以下哪项一定为真?选项:
+(A) 没有可读性又没有精心包装的书一定不是畅销书
+(B) 没经过精心包装的书一定不是畅销书
+(C) 有可读性的书一定是畅销书
+(D) 大多数人喜欢有可读性的畅销书
+A:让我们一步一步来思考。
+根据上述断定,“一本书要成为畅销书,必须有可读性或者经过精心的包装”,所以 (A) "没有可读性又没有精心包装的书一定不是畅销书" 是一定为真的。因为如果一本书既没有可读性,也没有经过精心的包装,那么它就不满足成为畅销书的条件。所以答案是(A)。
\ No newline at end of file
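The bestseller item above is a plain necessary-condition argument: from "bestseller implies readable or packaged", the contrapositive yields option (A). A small brute-force truth-table check in Python (an illustrative sketch, not part of the dataset):

```python
from itertools import product

# Sketch only: check that option (A) holds in every model of the premise.
def implies(p: bool, q: bool) -> bool:
    return (not p) or q

for bestseller, readable, packaged in product((False, True), repeat=3):
    premise = implies(bestseller, readable or packaged)
    option_a = implies((not readable) and (not packaged), not bestseller)
    if premise:
        assert option_a  # (A) is true in every case where the premise is true
```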
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_Direct.txt b/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_Direct.txt
new file mode 100644
index 00000000..36638b45
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_Direct.txt
@@ -0,0 +1,22 @@
+根据上下文选择正确答案
+
+Q:以下哪个列表按照时间顺序排列正确?选项:
+(A) 大学,硕士,博士,博士后
+(B) 大学,博士,硕士,博士后
+(C) 大学,高中,博士后,博士
+(D) 博士后,大学,硕士,博士
+A:(A)
+
+Q:以下哪个列表按照时间顺序排列正确?选项:
+(A) 五月一日,七月十五日,八月十二日,六月十四日
+(B) 七月十五日,八月十二日,五月一日,六月十四日
+(C) 八月十二日,五月一日,七月十五日,六月十四日
+(D) 五月一日,六月十四日,七月十五日,八月十二日
+A:(D)
+
+Q:以下哪个列表按时间顺序排列正确?选项:
+(A) 通往天国的倒计时,世纪末的魔术师,贝克街的亡灵,侦探们的镇魂歌
+(B) 世纪末的魔术师,贝克街的亡灵,通往天国的倒计时,侦探们的镇魂歌
+(C) 世纪末的魔术师,通往天国的倒计时,贝克街的亡灵,侦探们的镇魂歌
+(D) 世纪末的魔术师,贝克街的亡灵,侦探们的镇魂歌,通往天国的倒计时
+A:(C)
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_EN-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_EN-CoT.txt
new file mode 100644
index 00000000..9b09ea7f
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_EN-CoT.txt
@@ -0,0 +1,25 @@
+根据上下文选择正确答案
+
+Q:以下哪个列表按照时间顺序排列正确?选项:
+(A) 大学,硕士,博士,博士后
+(B) 大学,博士,硕士,博士后
+(C) 大学,高中,博士后,博士
+(D) 博士后,大学,硕士,博士
+A: Let's think step by step.
+The options list four stages of education: university, master's, doctorate, and postdoc. Ranked in the order in which they are undertaken, a university degree comes first, followed by a master's, then a doctorate, and finally a postdoc. So the answer is (A).
+
+Q:以下哪个列表按照时间顺序排列正确?选项:
+(A) 五月一日,七月十五日,八月十二日,六月十四日
+(B) 七月十五日,八月十二日,五月一日,六月十四日
+(C) 八月十二日,五月一日,七月十五日,六月十四日
+(D) 五月一日,六月十四日,七月十五日,八月十二日
+A: Let's think step by step.
+The options list four dates; in chronological order within a year they are May 1, June 14, July 15, and August 12. So the answer is (D).
+
+Q:以下哪个列表按时间顺序排列正确?选项:
+(A) 通往天国的倒计时,世纪末的魔术师,贝克街的亡灵,侦探们的镇魂歌
+(B) 世纪末的魔术师,贝克街的亡灵,通往天国的倒计时,侦探们的镇魂歌
+(C) 世纪末的魔术师,通往天国的倒计时,贝克街的亡灵,侦探们的镇魂歌
+(D) 世纪末的魔术师,贝克街的亡灵,侦探们的镇魂歌,通往天国的倒计时
+A: Let's think step by step.
+The options name four Detective Conan movies: The Magician at the End of the Century was released in 1999, The Undead of Baker Street in 2002, Countdown to Heaven in 2001, and The Detectives' Requiem in 2006. In chronological order they are The Magician at the End of the Century, Countdown to Heaven, The Undead of Baker Street, and The Detectives' Requiem. So the answer is (C).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_XLT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_XLT.txt
new file mode 100644
index 00000000..e26e293c
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_XLT.txt
@@ -0,0 +1,60 @@
+根据上下文选择正确答案
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:以下哪个列表按照时间顺序排列正确?选项:
+(A) 大学,硕士,博士,博士后
+(B) 大学,博士,硕士,博士后
+(C) 大学,高中,博士后,博士
+(D) 博士后,大学,硕士,博士
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:Which of the following lists is correct in chronological order? Options:
+(A) University, Master's, Ph.D., Post-doc
+(B) University, Ph.D., Master's, Post-doc
+(C) University, High School, Post-doc, Ph.D.
+(D) Post-doc, University, Master's, Ph.D.
+Step-by-step answer:
+1.The options list four stages of education: university, master's, doctorate, and postdoc.
+2.Ranked in the order in which they are undertaken, a university degree comes first, followed by a master's, then a doctorate, and finally a postdoc.
+So the answer is (A).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:以下哪个列表按照时间顺序排列正确?选项:
+(A) 五月一日,七月十五日,八月十二日,六月十四日
+(B) 七月十五日,八月十二日,五月一日,六月十四日
+(C) 八月十二日,五月一日,七月十五日,六月十四日
+(D) 五月一日,六月十四日,七月十五日,八月十二日
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:Which of the following list is correct in chronological order? Options:
+(A) May 1, July 15, August 12, June 14
+(B) July 15, August 12, May 1, June 14
+(C) August 12, May 1, July 15, June 14
+(D) May 1, June 14, July 15, August 12
+Step-by-step answer:
+1.The options list four dates; in chronological order within a year they are May 1, June 14, July 15, and August 12.
+So the answer is (D).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:以下哪个列表按时间顺序排列正确?选项:
+(A) 通往天国的倒计时,世纪末的魔术师,贝克街的亡灵,侦探们的镇魂歌
+(B) 世纪末的魔术师,贝克街的亡灵,通往天国的倒计时,侦探们的镇魂歌
+(C) 世纪末的魔术师,通往天国的倒计时,贝克街的亡灵,侦探们的镇魂歌
+(D) 世纪末的魔术师,贝克街的亡灵,侦探们的镇魂歌,通往天国的倒计时
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:Which of the following lists is correct in chronological order? Options:
+(A) Countdown to Heaven, The Magician at the End of the Century, The Undead of Baker Street, The Detectives' Requiem
+(B) The Magician at the End of the Century, The Undead of Baker Street, Countdown to Heaven, The Detectives' Requiem
+(C) The Magician at the End of the Century, Countdown to Heaven, The Undead of Baker Street, The Detectives' Requiem
+(D) The Magician at the End of the Century, The Undead of Baker Street, The Detectives' Requiem, The Countdown to Heaven
+Step-by-step answer:
+1.The options name four Detective Conan movies: The Magician at the End of the Century was released in 1999, The Undead of Baker Street in 2002, Countdown to Heaven in 2001, and The Detectives' Requiem in 2006.
+2.In chronological order they are The Magician at the End of the Century, Countdown to Heaven, The Undead of Baker Street, and The Detectives' Requiem.
+So the answer is (C).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_ZH-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_ZH-CoT.txt
new file mode 100644
index 00000000..b7f052a0
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_ZH-CoT.txt
@@ -0,0 +1,25 @@
+根据上下文选择正确答案
+
+Q:以下哪个列表按照时间顺序排列正确?选项:
+(A) 大学,硕士,博士,博士后
+(B) 大学,博士,硕士,博士后
+(C) 大学,高中,博士后,博士
+(D) 博士后,大学,硕士,博士
+A:让我们一步一步来思考。
+题目选项中引入了四个学历概念“大学,硕士,博士,博士后”,按照取得这些学历的顺序进行排序,首先获得的是大学学历,其次是硕士,然后是博士,最后是博士后。所以答案是(A)。
+
+Q:以下哪个列表按照时间顺序排列正确?选项:
+(A) 五月一日,七月十五日,八月十二日,六月十四日
+(B) 七月十五日,八月十二日,五月一日,六月十四日
+(C) 八月十二日,五月一日,七月十五日,六月十四日
+(D) 五月一日,六月十四日,七月十五日,八月十二日
+A:让我们一步一步来思考。
+题目选项中引入了四个日期,按照一年中的时间顺序排序为五月一日,六月十四日,七月十五日,八月十二日。所以答案是(D)。
+
+Q:以下哪个列表按时间顺序排列正确?选项:
+(A) 通往天国的倒计时,世纪末的魔术师,贝克街的亡灵,侦探们的镇魂歌
+(B) 世纪末的魔术师,贝克街的亡灵,通往天国的倒计时,侦探们的镇魂歌
+(C) 世纪末的魔术师,通往天国的倒计时,贝克街的亡灵,侦探们的镇魂歌
+(D) 世纪末的魔术师,贝克街的亡灵,侦探们的镇魂歌,通往天国的倒计时
+A:让我们一步一步来思考。
+选项中提到了四个柯南剧场版的名称,其中世纪末的魔术师是1999年上映的,贝克街的亡灵是2002年上映的,通往天国的倒计时是2001年上映的,侦探们的镇魂歌是2006年上映的,因此按时间排序是世纪末的魔术师,通往天国的倒计时,贝克街的亡灵,侦探们的镇魂歌。所以答案是(C)。
\ No newline at end of file
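The movie-ordering answer above is just a sort by release year; a quick Python check using the years cited in the CoT text (sketch only, not part of the dataset):

```python
# Release years as stated in the CoT answers above.
films = {
    "世纪末的魔术师": 1999,
    "通往天国的倒计时": 2001,
    "贝克街的亡灵": 2002,
    "侦探们的镇魂歌": 2006,
}
# Sorting by year reproduces option (C)'s ordering.
assert sorted(films, key=films.get) == [
    "世纪末的魔术师", "通往天国的倒计时", "贝克街的亡灵", "侦探们的镇魂歌",
]
```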
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_Direct.txt b/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_Direct.txt
new file mode 100644
index 00000000..1169a793
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_Direct.txt
@@ -0,0 +1,19 @@
+判断一句人为构造的关于体育的句子是否可信。
+
+Q: 下面的句子可信吗? "泰森打出右直拳"
+选项:
+(A) 可信
+(B) 不可信
+A: (A)
+
+Q: 下面的句子可信吗? "贝克汉姆完成了一次扣篮"
+选项:
+(A) 可信
+(B) 不可信
+A: (B)
+
+Q: 下面的句子可信吗? "勒布朗詹姆斯打进老鹰球"
+选项:
+(A) 可信
+(B) 不可信
+A: (B)
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_EN-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_EN-CoT.txt
new file mode 100644
index 00000000..b76e297b
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_EN-CoT.txt
@@ -0,0 +1,22 @@
+判断一句人为构造的关于体育的句子是否可信。
+
+Q: 下面的句子可信吗? "泰森打出右直拳"
+选项:
+(A) 可信
+(B) 不可信
+A: Let's think step by step.
+Tyson is an American professional boxer, and the straight right is a common punch in boxing. It is quite normal for a boxer to throw a straight right, so this sentence is credible. So the answer is (A).
+
+Q: 下面的句子可信吗? "贝克汉姆完成了一次扣篮"
+选项:
+(A) 可信
+(B) 不可信
+A: Let's think step by step.
+Beckham is an English professional footballer, and a slam dunk is a basketball move. Therefore, this sentence is not credible. So the answer is (B).
+
+Q: 下面的句子可信吗? "勒布朗詹姆斯打进老鹰球"
+选项:
+(A) 可信
+(B) 不可信
+A: Let's think step by step.
+LeBron James is a well-known American professional basketball player, and an eagle is a golf term meaning two strokes under par. Therefore, this sentence is not credible. So the answer is (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_XLT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_XLT.txt
new file mode 100644
index 00000000..c8b26c29
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_XLT.txt
@@ -0,0 +1,57 @@
+判断一句人为构造的关于体育的句子是否可信。
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:下面的句子可信吗? "泰森打出右直拳"
+选项:
+(A) 可信
+(B) 不可信
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: Is the following sentence plausible? "Tyson throws a straight right."
+Options:
+(A) Credible
+(B) Not credible
+Step-by-step answer:
+1.Tyson is a professional boxer from the United States. The right straight punch is a common move in boxing.
+2.It's normal for a boxer to throw a right straight punch, so this sentence is credible.
+So the answer is (A).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:下面的句子可信吗? "贝克汉姆完成了一次扣篮"
+选项:
+(A) 可信
+(B) 不可信
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: Is the following sentence plausible? "Beckham completed a slam dunk."
+Options:
+(A) Credible
+(B) Not credible
+Step-by-step answer:
+1.Beckham is a professional footballer from England.
+2.A slam dunk is a technical move in basketball.
+3.Therefore, this sentence is not credible.
+So the answer is (B).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:下面的句子可信吗? "勒布朗詹姆斯打进老鹰球"
+选项:
+(A) 可信
+(B) 不可信
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request: Is the following sentence plausible? "LeBron James made an eagle."
+Options:
+(A) Credible
+(B) Not credible
+Step-by-step answer:
+1.LeBron James is a well-known professional basketball player from the United States.
+2.An eagle is a term in golf, meaning two strokes under par.
+3.Therefore, this sentence is not credible.
+So the answer is (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_ZH-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_ZH-CoT.txt
new file mode 100644
index 00000000..451b3e02
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_ZH-CoT.txt
@@ -0,0 +1,22 @@
+判断一句人为构造的关于体育的句子是否可信。
+
+Q: 下面的句子可信吗? "泰森打出右直拳"
+选项:
+(A) 可信
+(B) 不可信
+A: 让我们一步一步来思考。
+泰森是一位美国职业拳击手。右直拳是拳击运动中的常用招式。作为拳击手使出右直拳这样的拳击动作是很正常的,所以这个句子是可信的,正确答案应该是(A)。
+
+Q: 下面的句子可信吗? "贝克汉姆完成了一次扣篮"
+选项:
+(A) 可信
+(B) 不可信
+A: 让我们一步一步来思考。
+贝克汉姆是一位英格兰的职业足球运动员。扣篮是篮球运动的技术动作。所以这个句子是不可信的,正确答案应该是(B)。
+
+Q: 下面的句子可信吗? "勒布朗詹姆斯打进老鹰球"
+选项:
+(A) 可信
+(B) 不可信
+A: 让我们一步一步来思考。
+勒布朗詹姆斯是一位知名的美国职业篮球运动员,老鹰球是高尔夫球术语,就是比标准杆少了两杆的意思。所以这个句子是不可信的,正确答案应该是(B)。
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_Direct.txt b/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_Direct.txt
new file mode 100644
index 00000000..e48a7359
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_Direct.txt
@@ -0,0 +1,27 @@
+从上下文中推断日期。
+
+Q: 请根据题目选择正确答案。今天是1937年圣诞节前夜。请问10天前的日期是多少?选项:
+(A)1988年12月14日
+(B)2026年12月14日
+(C)1950年12月14日
+(D)1938年07月14日
+(E)1937年12月14日
+(F)2007年12月14日
+A: (E)
+
+Q: 请根据题目选择正确答案。明天是2019年11月12日。请问一年前的日期是多少?选项:
+(A)2018年08月25日
+(B)2018年11月04日
+(C)2018年11月02日
+(D)2018年11月11日
+(E)2018年09月04日
+A: (D)
+
+Q: 请根据题目选择正确答案。李静和王伟在1958年1月2日结婚。今天是他们的5周年纪念日。请问明天的日期是多少?选项:
+(A)1961年01月18日
+(B)1982年01月03日
+(C)1961年01月11日
+(D)1960年10月14日
+(E)1963年01月03日
+(F)1960年12月03日
+A: (E)
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_EN-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_EN-CoT.txt
new file mode 100644
index 00000000..4182c81c
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_EN-CoT.txt
@@ -0,0 +1,30 @@
+从上下文中推断日期。
+
+Q: 请根据题目选择正确答案。今天是1937年圣诞节前夜。请问10天前的日期是多少?选项:
+(A)1988年12月14日
+(B)2026年12月14日
+(C)1950年12月14日
+(D)1938年07月14日
+(E)1937年12月14日
+(F)2007年12月14日
+A: Let's think step by step.
+If today is Christmas Eve of 1937, then today's date is December 24, 1937. 10 days before today is December 14, 1937. So the answer is (E).
+
+Q: 请根据题目选择正确答案。明天是2019年11月12日。请问一年前的日期是多少?选项:
+(A)2018年08月25日
+(B)2018年11月04日
+(C)2018年11月02日
+(D)2018年11月11日
+(E)2018年09月04日
+A: Let's think step by step.
+If tomorrow is November 12, 2019, then today is November 11, 2019. The date one year ago from today is November 11, 2018. So the answer is (D).
+
+Q: 请根据题目选择正确答案。李静和王伟在1958年1月2日结婚。今天是他们的5周年纪念日。请问明天的日期是多少?选项:
+(A)1961年01月18日
+(B)1982年01月03日
+(C)1961年01月11日
+(D)1960年10月14日
+(E)1963年01月03日
+(F)1960年12月03日
+A: Let's think step by step.
+If Li Jing and Wang Wei married on Jan 2, 1958, and today is their 5-year anniversary, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963. So the answer is (E).
\ No newline at end of file
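The three date questions above are plain calendar arithmetic. As a quick sanity check of the answers (an illustrative sketch using Python's standard library, not part of the dataset or of OpenCompass):

```python
from datetime import date, timedelta

# Q1: 10 days before Christmas Eve 1937.
print(date(1937, 12, 24) - timedelta(days=10))      # 1937-12-14 -> (E)

# Q2: one year before the day preceding 2019-11-12.
today = date(2019, 11, 12) - timedelta(days=1)
print(today.replace(year=today.year - 1))           # 2018-11-11 -> (D)

# Q3: the day after the 5th anniversary of a 1958-01-02 wedding.
anniversary = date(1958, 1, 2).replace(year=1963)
print(anniversary + timedelta(days=1))              # 1963-01-03 -> (E)
```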
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_XLT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_XLT.txt
new file mode 100644
index 00000000..f7ecad3f
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_XLT.txt
@@ -0,0 +1,71 @@
+从上下文中推断日期。
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:请根据题目选择正确答案。今天是1937年圣诞节前夜。请问10天前的日期是多少?选项:
+(A)1988年12月14日
+(B)2026年12月14日
+(C)1950年12月14日
+(D)1938年07月14日
+(E)1937年12月14日
+(F)2007年12月14日
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:Choose the correct answer according to the question. Today is Christmas Eve 1937. What is the date 10 days ago? Options:
+(A) December 14, 1988
+(B) December 14, 2026
+(C) December 14, 1950
+(D) July 14, 1938
+(E) December 14, 1937
+(F) December 14, 2007
+Step-by-step answer:
+1.If today is Christmas Eve of 1937, then today's date is December 24, 1937.
+2.10 days before today is December 14, 1937.
+So the answer is (E).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:请根据题目选择正确答案。明天是2019年11月12日。请问一年前的日期是多少?选项:
+(A)2018年08月25日
+(B)2018年11月04日
+(C)2018年11月02日
+(D)2018年11月11日
+(E)2018年09月04日
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:Select the correct answer as per the question. Tomorrow is November 12, 2019. What is the date one year ago? Options:
+(A) August 25, 2018
+(B) November 04, 2018
+(C) November 02, 2018
+(D) November 11, 2018
+(E) September 04, 2018
+Step-by-step answer:
+1.If tomorrow is November 12, 2019, then today is November 11, 2019.
+2.The date one year ago from today is November 11, 2018.
+So the answer is (D).
+
+I want you to act as a commonsense reasoning expert for Chinese.
+Request:请根据题目选择正确答案。李静和王伟在1958年1月2日结婚。今天是他们的5周年纪念日。请问明天的日期是多少?选项:
+(A)1961年01月18日
+(B)1982年01月03日
+(C)1961年01月11日
+(D)1960年10月14日
+(E)1963年01月03日
+(F)1960年12月03日
+You should retell the request in English.
+You should do the answer step by step to choose the right answer.
+You should step-by-step answer the request.
+You should tell me the answer in this format 'So the answer is'.
+Request:Select the correct answer according to the question. Li Jing and Wang Wei were married on January 2, 1958. Today is their 5th anniversary. What is tomorrow's date? Options:
+(A) January 18, 1961
+(B) January 03, 1982
+(C) January 11, 1961
+(D) October 14, 1960
+(E) January 03, 1963
+(F) December 03, 1960
+Step-by-step answer:
+1.If Li Jing and Wang Wei married on Jan 2, 1958, and today is their 5-year anniversary, then today's date is Jan 2, 1963.
+2.The date tomorrow is Jan 3, 1963.
+So the answer is (E).
\ No newline at end of file
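All of the XLT few-shot files above share one scaffold: the Chinese task instruction, the original Chinese Request, a fixed block of English meta-instructions, an English restatement of the request, and a numbered step-by-step answer ending in "So the answer is". As a minimal sketch of how a new test question could be wrapped in that scaffold and appended to the few-shot examples (illustrative only; OpenCompass assembles these prompts through its own dataset configs, and the helper below is hypothetical):

```python
# Hypothetical helper: prepend a CHARM XLT few-shot file to a new question
# wrapped in the same scaffold as the examples above. Illustrative only --
# this is not OpenCompass's prompt builder.
XLT_SCAFFOLD = """I want you to act as a commonsense reasoning expert for Chinese.
Request:{request}
You should retell the request in English.
You should do the answer step by step to choose the right answer.
You should step-by-step answer the request.
You should tell me the answer in this format 'So the answer is'."""

def build_xlt_prompt(few_shot_path: str, request: str) -> str:
    """Read the few-shot examples and append the new, scaffolded request."""
    with open(few_shot_path, encoding="utf-8") as f:
        examples = f.read().rstrip()
    return examples + "\n\n" + XLT_SCAFFOLD.format(request=request)
```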
diff --git a/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_ZH-CoT.txt b/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_ZH-CoT.txt
new file mode 100644
index 00000000..e5ff431c
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_ZH-CoT.txt
@@ -0,0 +1,30 @@
+从上下文中推断日期。
+
+Q: 请根据题目选择正确答案。今天是1937年圣诞节前夜。请问10天前的日期是多少?选项:
+(A)1988年12月14日
+(B)2026年12月14日
+(C)1950年12月14日
+(D)1938年07月14日
+(E)1937年12月14日
+(F)2007年12月14日
+A: 让我们一步一步来思考。
+如果今天是1937年的圣诞节前夕,那么今天的日期就是1937年12月24日。今天之前的10天是1937年12月14日,也就是12/14/1937。所以答案是(E)。
+
+Q: 请根据题目选择正确答案。明天是2019年11月12日。请问一年前的日期是多少?选项:
+(A)2018年08月25日
+(B)2018年11月04日
+(C)2018年11月02日
+(D)2018年11月11日
+(E)2018年09月04日
+A: 让我们一步一步来思考。
+如果明天是2019年11月12日,那么今天就是2019年11月11日。从今天起算去年的这一天是2018年11月11日。所以答案是(D)。
+
+Q: 请根据题目选择正确答案。李静和王伟在1958年1月2日结婚。今天是他们的5周年纪念日。请问明天的日期是多少?选项:
+(A)1961年01月18日
+(B)1982年01月03日
+(C)1961年01月11日
+(D)1960年10月14日
+(E)1963年01月03日
+(F)1960年12月03日
+A: 让我们一步一步来思考。
+如果李静和王伟在1958年1月2日结婚,而今天是他们结婚五周年纪念日,那么今天的日期就是1963年1月2日。明天的日期是1963年1月3日,也就是1963年01月03日。所以答案是(E)。
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Anachronisms_Judgment_Translate-EN.txt b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Anachronisms_Judgment_Translate-EN.txt
new file mode 100644
index 00000000..3012dc76
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Anachronisms_Judgment_Translate-EN.txt
@@ -0,0 +1,25 @@
+Identify whether a given statement contains an anachronism.
+
+Q: Does the following statement contain an anachronism? How would a person with compulsory education and basic common sense choose?
+Li Bai wrote poetry with a fountain pen.
+Options:
+(A) Yes
+(B) No
+A: Let's think step by step.
+The statement mentions "Li Bai", a Chinese poet from the Tang Dynasty. The "fountain pen" mentioned in the statement is a modern device, so Li Bai could not have used a fountain pen to write his poems, and the statement contains an anachronism. The answer is (A).
+
+Q: Does the following statement contain an anachronism? How would a person with compulsory education and basic common sense choose?
+During the Tang Dynasty, people could buy Dream of Red Mansions on the streets.
+Options:
+(A) Yes
+(B) No
+A: Let's think step by step.
+This statement mentions Dream of Red Mansions, which was written by Cao Xueqin, a writer of the Qing Dynasty, so Dream of Red Mansions did not exist during the Tang Dynasty and the statement contains an anachronism. So the answer is (A).
+
+Q: Does the following statement contain an anachronism? How would a person with compulsory education and basic common sense choose?
+On October 1, 1998, people held the five-star red flag in their hands and celebrated the National Day in Tiananmen Square.
+Options:
+(A) Yes
+(B) No
+A: Let's think step by step.
+The statement mentions the year 1998. New China was founded in 1949, and the five-star red flag was adopted as the national flag of China that same year, so it is reasonable that on October 1, 1998 people celebrated National Day in Tiananmen Square, and the statement does not contain an anachronism. So the answer is (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Movie_and_Music_Recommendation_Translate-EN.txt b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Movie_and_Music_Recommendation_Translate-EN.txt
new file mode 100644
index 00000000..92b08fb1
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Movie_and_Music_Recommendation_Translate-EN.txt
@@ -0,0 +1,40 @@
+Given a list of artworks, identify the most similar one.
+
+Q: The movie that has something in common with the movies Crazy Aliens, Bullfight, Killing Life, and Crazy Stone is:
+Options:
+(A) Lost in Thailand
+(B) The Young and the Restless
+(C) The Children of the River and the Lake
+(D) The Mekong Operation
+A: Let's think step by step.
+Crazy Aliens is a comedy sci-fi film directed by Ning Hao, written by Liu Cixin and Sun Xiaohang, and starring Huang Bo, Shen Teng and Xu Zheng, released in 2019.
+Bullfight is a black comedy film directed by Guan Hu and starring Huang Bo and Yan Ni, released in 2009.
+Killing Life is an absurd suspense comedy film directed by Guan Hu and co-starring Huang Bo, Ren Dahua, Su Youpeng and Yu Nan, released in 2012.
+Crazy Stone is a black comedy film directed by Ning Hao and starring Guo Tao, Liu Hua, Lian Jin, Liu Gang, Xu Zheng and Huang Bo, released in 2006.
+These are all famous classic Chinese comedy movies with Huang Bo in them, and out of all the options, the only movie that seems to have something in common with these movies is Lost in Thailand (a comedy film directed by Xu Zheng and starring Huang Bo, Xu Zheng and Wang Baoqiang, released in 2012). So the answer is (A).
+
+Q: The movie that has something in common with the movies Red Sorghum, Alive, Raise the Red Lantern, and Hero is:
+Options:
+(A) Not One Less
+(B) Let the Bullets Fly
+(C) The Legend of Ah Fei
+(D) East is East, West is West
+A: Let's think step by step.
+Red Sorghum, a war literary film directed by Zhang Yimou and starring Jiang Wen, Gong Li and Teng Rujun, was released in China in 1987.
+Alive, a drama film directed by Zhang Yimou and starring Ge You and Gong Li, was released in China in 1994.
+Raise the Red Lantern is a drama film directed by Zhang Yimou and starring Gong Li, He Saifei, Ma Jingwu, Cao Cuifen, Kong Lin, and Jin Shuyuan, released in China in 1991.
+Hero is a martial arts film directed by Zhang Yimou and starring Jet Li, Tony Leung, Maggie Cheung, Chen Daoming, Zhang Ziyi and Donnie Yen, released in China in 2002.
+These are all famous classic Chinese movies directed by Zhang Yimou. Among all the choices, the only movie that seems to have something in common with these movies is Not One Less (a drama movie directed by Zhang Yimou and starring Wei Minzhi and Zhang Huike, released in 1999). So the answer is (A).
+
+Q: The song that has something in common with the songs Nocturne, Materia Medica, Listen to Your Mother, and Seven Miles is:
+Options:
+(A) Nunchaku
+(B) The Young and the Restless
+(C) Pomp and Circumstance
+(D) Three's a Crowd
+A: Let's think step by step.
+Nocturne is a song performed by Jay Chou, with lyrics by Fang Wenshan, music by Jay Chou, and arrangement by Lin Maike, included in Jay Chou's 2005 album November's Chopin.
+Materia Medica is a song performed by Jay Chou, with lyrics by Fang Wenshan, music by Jay Chou, and arrangement by Lin Maike, included in Jay Chou's 2006 album Still Fantasy.
+Listen to Your Mother is a song performed by Jay Chou, with music by Jay Chou and arrangement by Lin Maike and Hong Jingyao, included in Jay Chou's 2006 album Still Fantasy.
+Seven Miles is a song performed by Jay Chou, with lyrics by Fang Wenshan, music by Jay Chou, and arrangement by Chung Hsing-min, included in Jay Chou's 2004 album of the same name, Seven Miles.
+These are all famous Chinese pop songs sung by Jay Chou. Among all the options, the only song that seems to have something in common with these songs is Nunchaku (performed by Jay Chou, with lyrics by Fang Wenshan, music by Jay Chou, and arrangement by Chung Hsing-min, included in Jay Chou's 2001 album Fantasy). So the answer is (A).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Natural_Language_Inference_Translate-EN.txt b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Natural_Language_Inference_Translate-EN.txt
new file mode 100644
index 00000000..a0cc8333
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Natural_Language_Inference_Translate-EN.txt
@@ -0,0 +1,28 @@
+Choose the correct answer based on the relationship between the two sentences in the question.
+
+Q:Statement 1: Xiaoming has just gone ice-skating in Shichahai
+Statement 2: It's winter in Beijing
+What is the relationship between these two statements?
+(A) Implied
+(B) Contradictory
+(C) Irrelevant
+A: Let's think step by step.
+The first sentence mentions that Xiaoming went ice-skating in Shichahai; ice-skating is usually done in winter, and Shichahai is located in Beijing, so the first sentence implies the information in the second, namely that it is currently winter in Beijing. So the answer is (A).
+
+Q:Statement 1: Next week, heavy rainfall will affect the whole of Hebei Province.
+Statement 2: Next week, the weather in Shanghai will be fine.
+What is the relationship between these two statements?
+(A) Implied
+(B) Contradictory
+(C) Irrelevant
+A: Let's think step by step.
+These two sentences describe the weather in two geographical locations, one in Hebei Province and the other in Shanghai. Hebei Province and Shanghai are geographically far apart, so the weather conditions in these two places are not necessarily directly related. Therefore, the relationship between the two sentences is irrelevant. So the answer is (C).
+
+Q:Statement 1: Yesterday, Xiao Ming watched the sunset on a hill and exclaimed, "The sunset is infinite, but it's just near dusk"
+Statement 2: Yesterday it rained and Ming didn't go out
+What is the relationship between these two statements?
+(A) Implied
+(B) Contradictory
+(C) Irrelevant
+A: Let's think step by step.
+The first sentence says that Ming saw the sunset on the mountain yesterday, while the second sentence says that it rained yesterday and Ming didn't go out. If Ming didn't go out, then he couldn't have seen the sunset on the mountain, so the two sentences contradict each other. So the answer is (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Reading_Comprehension_Translate-EN.txt b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Reading_Comprehension_Translate-EN.txt
new file mode 100644
index 00000000..2cd6aab0
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Reading_Comprehension_Translate-EN.txt
@@ -0,0 +1,26 @@
+Please understand the meaning of the question and choose the correct answer.
+
+Q:Some Cantonese people don't like chili peppers. Therefore, some southerners don't like chili peppers. Which of the following ensures the validity of the above argument?
+(A) Some Cantonese people love chili peppers
+(B) Some Southerners love chili peppers.
+(C) All Cantonese are Southerners.
+(D) Some Cantonese people do not love chili or sweets.
+A: Let's think step by step.
+In this argument, we infer from "some Cantonese don't like chili peppers" that "some Southerners don't like chili peppers". This inference depends on the relationship between Cantonese and Southerners. In order for this inference to be valid, we need to make sure that at least some Cantonese are Southerners. Therefore, option (C) "All Cantonese are southerners" is the one that ensures that this argument is valid. So the answer is (C).
+
+Q:Thangka is a form of painting rich in Tibetan cultural characteristics, which has a history of more than 1,300 years since the rise of the Tubo Dynasty and is a cultural treasure of the Snowy Plateau. Its subject matter is not only religious, but also historical and folkloric, so it is also known as an "encyclopedia" for understanding Tibet. Therefore, if you want to understand the history of Tibet, in addition to sitting down and reading the rigorous but slightly dull history books, you can also choose a pleasant and enjoyable way: enjoying thangkas, conversing with the ancients, imagining historical events, savoring the rich spiritual world of the Tibetans, and understanding their unique folklore, a process that makes history three-dimensional and palpable. Which of the following best states the main point of this passage?
+(A) Thangkas can provide a cozy and relaxing way to learn about Tibet.
+(B) The thangkas are a unique record of Tibetan folklore and historical events.
+(C) The thangka is an "encyclopedic" form of painting for understanding Tibetan culture and history.
+(D) The thangka is a form of painting that is rich in Tibetan cultural characteristics and has a long history.
+A: Let's think step by step.
+It is clearly mentioned in the text that apart from reading rigorous but dull history books, appreciating thangkas is a pleasant and enjoyable way to talk with many ancient people, imagine historical events, experience the rich spiritual world of the Tibetans, and learn about the unique Tibetan folklore. So the main intention of the passage is (A) "Thangkas can provide a cozy and relaxing way to learn about Tibet." So the answer is (A).
+
+Q:As a principle and method of literary criticism, the concept of "knowing people and discussing the world" was first proposed by Mencius, a thinker of the Warring States period. According to Mencius, in order to make friends with the ancients, it is not enough just to read their poems and books, but also to understand their behavior and the times in which they lived, so as to read and understand their poems and books, and to make friends with them and become their soulmates. What is the incorrect understanding of this passage?
+(A) People's hearts can communicate and resonate with each other.
+(B) "Knowing people and discussing the world" has been used as a kind of literary criticism and has shown a strong vitality.
+(C) "Knowing people and discussing the world" can help the descendants to make friends with the ancients and the ancients to become their soulmates.
+(D) Knowing about the ancients and their times helps to understand their works.
+A: Let's think step by step.
+From the passage we cannot conclude (B), that "knowing people and discussing the world" has been used as a kind of literary criticism and has shown a strong vitality. Although "knowing people and discussing the world" is indeed a principle and method of literary criticism put forward by the thinker Mencius during the Warring States period, the passage mentions neither that it is still in use today nor that it has shown great vitality. Therefore, option (B) is an incorrect understanding. So the answer is (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Sequence_Understanding_Translate-EN.txt b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Sequence_Understanding_Translate-EN.txt
new file mode 100644
index 00000000..cfa2236a
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Sequence_Understanding_Translate-EN.txt
@@ -0,0 +1,25 @@
+Choose the correct answer based on the context
+
+Q: Which of the following lists the figures in correct chronological order? Options:
+(A) Qin Shi Huang, Zhuge Liang, Liu Bang, Bai Juyi
+(B) Zhuge Liang, Qin Shi Huang, Liu Bang, Bai Juyi
+(C) Qin Shi Huang, Liu Bang, Zhuge Liang, Bai Juyi
+(D) Bai Juyi, Zhuge Liang, Liu Bang, Qin Shi Huang
+A: Let's think step by step.
+The options mention four figures: Qin Shi Huang is from the Qin Dynasty, Liu Bang from the Han Dynasty, Zhuge Liang from the Three Kingdoms period, and Bai Juyi from the Tang Dynasty. In chronological order these are the Qin Dynasty, the Han Dynasty, the Three Kingdoms period, and the Tang Dynasty, so the figures in order are Qin Shi Huang, Liu Bang, Zhuge Liang, and Bai Juyi. So the answer is (C).
+
+Q:Which of the following lists the words describing age correctly ordered from youngest to oldest? Options:
+(A) Unperplexed, Established, Knowing one's destiny, Huajia
+(B) Established, Unperplexed, Knowing one's destiny, Huajia
+(C) Huajia, Knowing one's destiny, Established, Unperplexed
+(D) Established, Huajia, Unperplexed, Knowing one's destiny
+A: Let's think step by step.
+The options mention four words describing age. "Established" refers to age 30, "unperplexed" to age 40, "knowing one's destiny" to age 50, and "huajia" to age 60. Ordered from youngest to oldest, they are "established", "unperplexed", "knowing one's destiny", and "huajia". So the answer is (B).
+
+Q:Which of the following lists the steps for making old-fashioned lollipops in the correct order? Options:
+(A) Prepare ingredients, pour syrup into molds, make syrup, cool and set
+(B) Prepare ingredients, make syrup, pour syrup into molds, cool and set
+(C) Prepare ingredients, pour syrup into molds, cool and set, make syrup
+(D) Prepare ingredients, cool and set, make syrup, pour syrup into molds
+A: Let's think step by step.
+The question concerns the steps for making old-fashioned lollipops, and the options involve "prepare ingredients", "make syrup", "pour syrup into molds", and "cool and set". Following the actual process, the first step is to prepare the ingredients, then make the syrup, then pour the syrup into the molds, and finally let the lollipops cool and set. So the answer is (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Sport_Understanding_Translate-EN.txt b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Sport_Understanding_Translate-EN.txt
new file mode 100644
index 00000000..830b38cb
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Sport_Understanding_Translate-EN.txt
@@ -0,0 +1,22 @@
+Determine whether a human-constructed sentence about sports is plausible.
+
+Q: Is the following sentence plausible? "Athlete Zhang Peimeng's consistent passes made the offensive end take the initiative"
+Options:
+(A) Credible
+(B) Not credible
+A: Let's think step by step.
+Zhang Peimeng is a Chinese track and field athlete, and first pass is a volleyball term. Since he is not a volleyball player, he would not be involved in the passing part of a volleyball game. So the answer is (B).
+
+Q: Is the following sentence plausible? "Athlete Guo Allen scored a free kick."
+Options:
+(A) Credible
+(B) Not credible
+A: Let's think step by step.
+Guo Ailun is a Chinese basketball player, and the free kick is a soccer term. Since he is not a soccer player, he would not be involved in the free-kick part of a soccer game, so the answer is (B).
+
+Q: Is the following sentence plausible? "The athlete Fan Zhiyi scored a goal."
+Options:
+(A) Credible
+(B) Not credible
+A: Let's think step by step.
+Fan Zhiyi is a Chinese soccer player, and scoring a goal is a soccer term. Since he is a professional soccer player, he could well score a goal, so the answer is (A).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Time_Understanding_Translate-EN.txt b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Time_Understanding_Translate-EN.txt
new file mode 100644
index 00000000..ad20ce8d
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Time_Understanding_Translate-EN.txt
@@ -0,0 +1,28 @@
+Choose the correct answer based on the context.
+
+Q: Xiaohua was born in the year of Bingshen, and his brother is 6 years older than him. In which year was his brother born?
+Options:
+(A) The year of Jimao
+(B) The year of Gengyin
+(C) The year of Bingshen
+(D) The year of Yichou
+A: Let's think step by step.
+Xiaohua was born in the year of Bingshen. His brother is six years older than him, and according to the Chinese Heavenly Stems and Earthly Branches dating system, six years before the year of Bingshen is the year of Gengyin. So the answer is (B).
+
+Q: If this year is the year of Jiachen, and Li Hua's grandfather says, "This is my year of knowing one's destiny," what is his grandfather's zodiac sign?
+Options:
+(A) Dog
+(B) Tiger
+(C) Rooster
+(D) Rat
+A: Let's think step by step.
+The question mentions that grandpa is in his "year of knowing one's destiny". In ancient China this refers to the age of 50, so grandpa is 50 years old this year, and this year is the year of Jiachen. According to the Chinese Heavenly Stems and Earthly Branches dating system, grandpa was therefore born in the year of Jiayin, and the Yin branch corresponds to the Tiger, so grandpa's zodiac sign is the Tiger. So the answer is (B).
+
+Q: Li started making a drawing at three quarters past the start of the Hai hour in the evening, and he knew it would take him 10 hours to finish. What is the earliest hour at which he can finish it?
+Options:
+(A) The Chen hour
+(B) The Wei hour
+(C) The Wu hour
+(D) The Zi hour
+A: Let's think step by step.
+According to the ancient Chinese method of timekeeping, three quarters past the start of the Hai hour is approximately 21:45. Ten hours later it is 7:45, which falls within the Chen hour. So the answer is (A).
\ No newline at end of file
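The two ganzhi questions in the file above reduce to modular arithmetic on the sexagenary cycle: the ten Heavenly Stems repeat every 10 years and the twelve Earthly Branches every 12. A small sketch of that calculation (a hypothetical helper for illustration, not part of the dataset or of OpenCompass):

```python
# Sexagenary-cycle arithmetic behind the ganzhi examples above: the ten
# Heavenly Stems repeat every 10 years, the twelve Earthly Branches every 12.
STEMS = "甲乙丙丁戊己庚辛壬癸"
BRANCHES = "子丑寅卯辰巳午未申酉戌亥"

def shift_ganzhi(year: str, offset: int) -> str:
    """Return the ganzhi name `offset` years after `year` (negative = earlier)."""
    stem = STEMS[(STEMS.index(year[0]) + offset) % 10]
    branch = BRANCHES[(BRANCHES.index(year[1]) + offset) % 12]
    return stem + branch

print(shift_ganzhi("丙申", -6))   # 庚寅 (Gengyin): six years before Bingshen
print(shift_ganzhi("甲辰", -50))  # 甲寅 (Jiayin): the Yin branch is the Tiger
```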
diff --git a/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Anachronisms_Judgment_Translate-EN.txt b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Anachronisms_Judgment_Translate-EN.txt
new file mode 100644
index 00000000..97148105
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Anachronisms_Judgment_Translate-EN.txt
@@ -0,0 +1,25 @@
+Identify whether a given statement contains an anachronism.
+
+Q: How would a person with compulsory education and basic common sense answer whether the following statement contains an anachronism?
+During the landing operations on Iwo Jima, Ralph spoke loudly into the radio.
+Options:
+(A) Yes
+(B) No
+A: Let's think step by step.
+The statement refers to "during the Iwo Jima landings," a historical event during World War II. At that time it was entirely normal for military personnel to use radios to communicate with each other, so the statement contains no anachronism. So the answer is (B).
+
+Q: How would a person with compulsory education and basic common sense answer whether the following statement contains an anachronism?
+During the landing operations on Iwo Jima, Ralph spoke loudly into his iPhone.
+Options:
+(A) Yes
+(B) No
+A: Let's think step by step.
+The statement mentions "during the Iwo Jima landings," which refers to a historical event during World War II. However, the reference to Ralph speaking loudly into his iPhone introduces an anachronism; the iPhone is a modern smartphone that did not exist during the Iwo Jima landings in 1945. So the answer is (A).
+
+Q: How would a person with compulsory education and basic common sense answer whether the following statement contains an anachronism?
+Nothing is more satisfying than Sally Hemings knitting a sweater while watching HBO's True Detective.
+Options:
+(A) Yes
+(B) No
+A: Let's think step by step.
+This statement mentions Sally Hemings, an enslaved woman in late 18th and early 19th century America. However, the reference to her knitting while watching HBO's True Detective introduces an anachronism. Television did not exist in the time period in which Sally Hemings lived, and the specific reference to watching a particular program like True Detective is clearly inappropriate for that time period in history. So the answer is (A).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Movie_and_Music_Recommendation_Translate-EN.txt b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Movie_and_Music_Recommendation_Translate-EN.txt
new file mode 100644
index 00000000..33426fe3
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Movie_and_Music_Recommendation_Translate-EN.txt
@@ -0,0 +1,40 @@
+Given a list of artworks, identify the most similar one.
+
+Q: Find a movie similar to Braveheart, Pretty Woman, Schindler's List, Apollo 13:
+Options:
+(A) Star Trek
+(B) My Big Fat Greek Wedding 2
+(C) The Santa Clause 2
+(D) Dances with Wolves
+A: Let's think step by step.
+Star Trek is a science fiction movie, and while it has depth and complexity, its themes and style are quite different from the four movies mentioned above.
+My Big Fat Greek Wedding 2 is a light-hearted comedy movie that differs considerably in theme and style from the four movies mentioned above.
+The Santa Clause 2 is a Christmas-themed family movie that also differs considerably in theme and style from the four movies mentioned above.
+Dances with Wolves is a movie depicting the relationship with Native Americans during the pioneering period of the American West.
+Among the options, Dances with Wolves matches the theme and style of the four movies mentioned above: like Braveheart, Pretty Woman, Schindler's List, and Apollo 13, it is a historical movie with depth and seriousness. So the answer is (D).
+
+Q: Find a movie similar to Braveheart, Pretty Woman, Apollo 13, Dances with Wolves:
+Options:
+(A) Batman: Assault on Arkham
+(B) The Shawshank Redemption
+(C) Toy Story
+(D) The Lion King
+A: Let's think step by step.
+Batman: Assault on Arkham is a superhero movie whose theme and style are quite different from the four movies mentioned above.
+The Shawshank Redemption is a 1994 American drama film directed by Frank Darabont and starring Tim Robbins and Morgan Freeman; it's a movie about hope and perseverance.
+Toy Story is an animated movie, and while it may have some themes of adventure and friendship, its themes and style differ considerably from the four movies mentioned above.
+The Lion King, while it is a classic animated movie that covers themes of courage and growing up, has themes and styles that differ considerably from the four movies mentioned above.
+Among the options, The Shawshank Redemption, like Braveheart, Pretty Woman, Apollo 13, and Dances with Wolves, is a movie with depth and seriousness, and its theme and style are similar to those of the other four. So the answer is (B).
+
+Q: Find a movie similar to Astonishing, Braveheart, Lowlife, Schindler's List:
+Options:
+(A) The Cabinet of Dr. Caligari
+(B) The Shawshank Redemption
+(C) Spider-Man 2
+(D) Taxi
+A: Let's think step by step.
+The Cabinet of Dr. Caligari is a 1920 German Expressionist silent film directed by Robert Wiene. It is often considered a landmark work of German Expressionist cinema and one of the earliest horror films.
+The Shawshank Redemption is a 1994 American drama film directed by Frank Darabont and starring Tim Robbins and Morgan Freeman; it is a movie about hope and perseverance.
+Spider-Man 2 is a 2004 American superhero film directed by Sam Raimi and starring Tobey Maguire, the second installment of the Spider-Man trilogy.
+Taxi is a 2004 American comedy-action film directed by Tim Story and starring Jimmy Fallon and Queen Latifah. The movie is an American remake of the 1998 French film of the same name.
+Astonishing, Braveheart, Lowlife, and Schindler's List are all very deep, richly plotted movies with strong human themes; they have won high praise from audiences and critics for their excellent scripts, brilliant performances and profound themes. Among the options, The Shawshank Redemption tells the story of how a wrongly accused banker keeps hope alive in prison and eventually escapes; with its deeply moving plot and profound depiction of human nature, it has many similarities with the movies in the question. So the answer is (B).
diff --git a/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Natural_Language_Inference_Translate-EN.txt b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Natural_Language_Inference_Translate-EN.txt
new file mode 100644
index 00000000..bdf47e5e
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Natural_Language_Inference_Translate-EN.txt
@@ -0,0 +1,28 @@
+Choose the correct answer based on the relationship between the two sentences in the question.
+
+Q:Statement 1: But the old man carefully picked the snake up and let it crawl towards the grass, mumbling: sin, sin, this was your home!
+Statement 2: The old man was very sad in his heart.
+What is the relationship between these two statements?
+(A) Implied
+(B) Contradictory
+(C) Irrelevant
+A: Let's think step by step.
+The first sentence describes an old man carefully moving a snake aside. The old man says, "Sin, sin," indicating that he feels guilty and sad because he has disturbed the snake's home. From this, the second sentence, that the old man was very sad in his heart, can be inferred. Therefore, the first sentence implies the second. So the answer is (A).
+
+Q:Statement 1: She was a Black woman with an afro, a plump face, full lips and big teeth.
+Statement 2: She likes to wear red lipstick and red clothes.
+What is the relationship between these two statements?
+(A) Implied
+(B) Contradictory
+(C) Irrelevant
+A: Let's think step by step.
+Both sentences describe the same woman, but they focus on different features. The first sentence describes her physical features, including her face, lips, teeth, and hair. The second sentence describes her aesthetic preferences, including lipstick color and dress color. There is no obvious implication or contradiction between these two sentences, so we can say that they are unrelated. So the answer is (C).
+
+Q:Statement 1: You're not sure you've made it clear whose side you're on.
+Statement 2: It is obvious whose side you are on.
+What is the relationship between these two statements?
+(A) Implied
+(B) Contradictory
+(C) Irrelevant
+A: Let's think step by step.
+The first sentence indicates that you are not sure who you support, while the second sentence makes it clear that your position is obvious, i.e. you know exactly who you support. Therefore, the two sentences contradict each other. So the answer is (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Reading_Comprehension_Translate-EN.txt b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Reading_Comprehension_Translate-EN.txt
new file mode 100644
index 00000000..777de423
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Reading_Comprehension_Translate-EN.txt
@@ -0,0 +1,25 @@
+Please understand the meaning of the question and choose the correct answer.
+
+Q:A person's intake of refined sugar, and of foods that break down into sugar during digestion, accounts for almost all of that person's dietary source of glucose (a type of sugar) in the bloodstream. However, coffee, which does not itself break down into sugar during digestion, can sometimes cause a person's blood sugar levels to rise sharply, even if the coffee is not sweetened with cream or any sweetener. Which of the following, if true, would best help explain the effect of coffee on blood sugar levels described above? Options:
+(A) People with sedentary office jobs are more likely to drink large amounts of coffee and to have higher blood sugar levels than people with jobs that require constant physical activity.
+(B) People often drink coffee with meals that consist of several foods that break down quickly into sugar during digestion.
+(C) For many people, a slice of chocolate cake has the same stimulating effect as a regular cup of coffee.
+(D) Drinking more than two cups of coffee in an hour increases a person's stress level, and the body responds to stress by releasing stored glucose into the bloodstream.
+A: Let's think step by step.
+The key to the question is to find a factor that explains how coffee causes blood sugar levels to rise, even though coffee itself does not break down into sugar during digestion. This requires us to find a mechanism that can explain how coffee indirectly affects blood sugar levels. Option (D) "Drinking more than two cups of coffee in an hour increases a person's stress levels, and the body responds to stress by releasing stored glucose into the bloodstream" would, if true, best help to explain the effect of coffee on blood sugar levels. This option provides a possible mechanism by which the coffee-induced stress response may cause the body to release stored glucose, leading to an increase in blood sugar levels. This explains why coffee (even without creamer or sweetener) causes blood sugar levels to rise, even though it does not break down into sugar during digestion. So the answer is (D).
+
+Q:A person has tried several different psychotherapies for a mental illness: psychoanalytic therapy, cognitive behavioral therapy and sandplay therapy. He says: "The psychotherapy process made me very unhappy and, therefore, these therapies are ineffective." Which of the following, if true, would most strongly challenge the above conclusion? Options:
+(A) Several different psychotherapies target different mental illnesses.
+(B) People who try multiple psychotherapies are happier than those who try only one.
+(C) Trying different psychotherapies at the same time makes it easier to find the one that works.
+(D) People who do well in therapy tend to feel unhappy during the therapy process.
+A: Let's think step by step.
+Understanding the question shows that if unhappiness during psychotherapy is a sign of a good therapeutic outcome, then the person's conclusion, that the therapy is ineffective because he feels unhappy during the process, may be wrong. He may have misinterpreted the difficulty and discomfort of the therapy process as a sign that the therapy is ineffective, when in fact it may be a sign that the therapy is working. Option (D), "People who do well in therapy tend to feel unhappy during the therapy process", if true, would most strongly challenge the above conclusion. So the answer is (D).
+
+Q:For a book to be a bestseller, it must be readable or well packaged. If the above assertion holds true, which of the following must be true? Options:
+(A) A book that is not readable and not well-packaged must not be a bestseller.
+(B) A book that is not carefully packaged must not be a bestseller.
+(C) A book that is readable must be a bestseller
+(D) Most people like bestsellers that are readable.
+A: Let's think step by step.
+According to the assertion "For a book to be a bestseller, it must be readable or well packaged", option (A) "A book that is not readable and not well-packaged must not be a bestseller" must be true: a book that is neither readable nor well packaged fulfills neither condition for being a bestseller. So the answer is (A).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Sequence_Understanding_Translate-EN.txt b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Sequence_Understanding_Translate-EN.txt
new file mode 100644
index 00000000..b8193c85
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Sequence_Understanding_Translate-EN.txt
@@ -0,0 +1,25 @@
+Choose the correct answer based on the context
+
+Q:Which of the following lists is correct in chronological order? Options:
+(A) University, Master's, PhD, Postdoc
+(B) University, PhD, Master's, Postdoc
+(C) University, High School, Postdoc, PhD
+(D) Postdoc, University, Master's, PhD
+A: Let's think step by step.
+The options introduce four academic stages: university, master's, PhD, and postdoc. In the order in which they are reached, university comes first, followed by the master's, then the PhD, and finally the postdoc. So the answer is (A).
+
+Q:Which of the following lists is correct in chronological order? Options:
+(A) May 1, July 15, August 12, June 14
+(B) July 15, August 12, May 1, June 14
+(C) August 12, May 1, July 15, June 14
+(D) May 1, June 14, July 15, August 12
+A: Let's think step by step.
+The options introduce four dates; in chronological order within the year they are May 1, June 14, July 15, and August 12. So the answer is (D).
+
+Q:Which of the following lists is correct in chronological order? Options:
+(A) Countdown to Heaven, The Magician at the End of the Century, The Undead of Baker Street, The Detectives' Requiem
+(B) The Magician at the End of the Century, The Undead of Baker Street, Countdown to Heaven, The Detectives' Requiem
+(C) The Magician at the End of the Century, Countdown to Heaven, The Undead of Baker Street, The Detectives' Requiem
+(D) The Magician at the End of the Century, The Undead of Baker Street, The Detectives' Requiem, Countdown to Heaven
+A: Let's think step by step.
+The options mention the names of four Conan theatrical films. The Magician at the End of the Century was released in 1999, The Undead of Baker Street in 2002, Countdown to Heaven in 2001, and The Detectives' Requiem in 2006, so in chronological order they are The Magician at the End of the Century, Countdown to Heaven, The Undead of Baker Street, and The Detectives' Requiem. So the answer is (C).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Sport_Understanding_Translate-EN.txt b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Sport_Understanding_Translate-EN.txt
new file mode 100644
index 00000000..ef56e501
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Sport_Understanding_Translate-EN.txt
@@ -0,0 +1,22 @@
+Determine whether a human-constructed sentence about sports is plausible.
+
+Q: Is the following sentence credible? "Tyson threw a right straight punch."
+Options:
+(A) Credible
+(B) Not credible
+A: Let's think step by step.
+Tyson is an American professional boxer. The right straight punch is a common move in boxing. It's quite normal for a boxer to throw a right straight punch, so this sentence is credible. The correct answer should be (A).
+
+Q: Is the following sentence credible? "Beckham completed a slam dunk."
+Options:
+(A) Credible
+(B) Not credible
+A: Let's think step by step.
+Beckham is an English professional footballer. A slam dunk is a technical move in basketball. Therefore, this sentence is not credible. The correct answer should be (B).
+
+Q: Is the following sentence credible? "LeBron James made an eagle."
+Options:
+(A) Credible
+(B) Not credible
+A: Let's think step by step.
+LeBron James is a well-known American professional basketball player. An eagle is a term in golf, meaning two strokes under par. Therefore, this sentence is not credible. The correct answer should be (B).
\ No newline at end of file
diff --git a/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Time_Understanding_Translate-EN.txt b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Time_Understanding_Translate-EN.txt
new file mode 100644
index 00000000..f22f8dd5
--- /dev/null
+++ b/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Time_Understanding_Translate-EN.txt
@@ -0,0 +1,30 @@
+Infer dates from context.
+
+Q: Choose the correct answer according to the question. Today is Christmas Eve 1937. What is the date 10 days ago? Options:
+(A) December 14, 1988
+(B) December 14, 2026
+(C) December 14, 1950
+(D) July 14, 1938
+(E) December 14, 1937
+(F) December 14, 2007
+A: Let's think step by step.
+If today is Christmas Eve, 1937, then today's date is December 24, 1937. 10 days before today is December 14, 1937, which is 12/14/1937. So the answer is (E).
+
+Q: Select the correct answer as per the question. Tomorrow is November 12, 2019. What is the date one year ago? Options:
+(A) August 25, 2018
+(B) November 04, 2018
+(C) November 02, 2018
+(D) November 11, 2018
+(E) September 04, 2018
+A: Let's think step by step.
+If tomorrow is November 12, 2019, then today is November 11, 2019. The date one year ago from today is November 11, 2018. So the answer is (D).
+
+Q: Select the correct answer according to the question. Li Jing and Wang Wei were married on January 2, 1958. Today is their 5th anniversary. What is tomorrow's date? Options:
+(A) January 18, 1961
+(B) January 03, 1982
+(C) January 11, 1961
+(D) October 14, 1960
+(E) January 03, 1963
+(F) December 03, 1960
+A: Let's think step by step.
+If Li Jing and Wang Wei were married on January 2, 1958, and today is their fifth wedding anniversary, then today's date is January 2, 1963. Tomorrow's date will be January 3, 1963, which is January 03, 1963. So the answer is (E).
\ No newline at end of file
diff --git a/configs/datasets/GaokaoBench/README.md b/configs/datasets/GaokaoBench/README.md
new file mode 100644
index 00000000..71b0ffac
--- /dev/null
+++ b/configs/datasets/GaokaoBench/README.md
@@ -0,0 +1,191 @@
+# GaokaoBench
+
+Example commands for evaluating a base model and a chat model on GaokaoBench (note that base and chat models use different dataset configs):
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets GaokaoBench_no_subjective_gen_d21e37 --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets GaokaoBench_no_subjective_gen_4c31db --debug
+```
+
+## Base Models
+
+| model | GaokaoBench |
+|:------------------------:|--------------:|
+| llama-7b-turbomind | 14.55 |
+| llama-13b-turbomind | 16.20 |
+| llama-30b-turbomind | 16.14 |
+| llama-65b-turbomind | 13.31 |
+| llama-2-7b-turbomind | 15.02 |
+| llama-2-13b-turbomind | 14.86 |
+| llama-2-70b-turbomind | 16.36 |
+| llama-3-8b-turbomind | 20.88 |
+| llama-3-70b-turbomind | 19.98 |
+| internlm2-1.8b-turbomind | 23.78 |
+| internlm2-7b-turbomind | 41.41 |
+| internlm2-20b-turbomind | 58.99 |
+| qwen-1.8b-turbomind | 22.11 |
+| qwen-7b-turbomind | 35.32 |
+| qwen-14b-turbomind | 54.07 |
+| qwen-72b-turbomind | 77.56 |
+| qwen1.5-0.5b-hf | 30.67 |
+| qwen1.5-1.8b-hf | 35.66 |
+| qwen1.5-4b-hf | 54.31 |
+| qwen1.5-7b-hf | 65.99 |
+| qwen1.5-14b-hf | 66.60 |
+| qwen1.5-32b-hf | 79.01 |
+| qwen1.5-72b-hf | 80.26 |
+| qwen1.5-moe-a2-7b-hf | 52.79 |
+| mistral-7b-v0.1-hf | 14.35 |
+| mistral-7b-v0.2-hf | 11.10 |
+| mixtral-8x7b-v0.1-hf | 8.40 |
+| mixtral-8x22b-v0.1-hf | 16.23 |
+| yi-6b-hf | 31.70 |
+| yi-34b-hf | 30.51 |
+| deepseek-7b-base-hf | 17.02 |
+| deepseek-67b-base-hf | 10.14 |
+
+### Details
+
+| model | 2010-2022_Math_II_MCQs | 2010-2022_Math_I_MCQs | 2010-2022_History_MCQs | 2010-2022_Biology_MCQs | 2010-2022_Political_Science_MCQs | 2010-2022_Physics_MCQs | 2010-2022_Chemistry_MCQs |
+|:------------------------:|-------------------------:|------------------------:|-------------------------:|-------------------------:|-----------------------------------:|-------------------------:|---------------------------:|
+| llama-7b-turbomind | 14.22 | 13.55 | 12.54 | 18.67 | 19.06 | 2.34 | 17.74 |
+| llama-13b-turbomind | 18.81 | 15.89 | 21.25 | 22.67 | 15.62 | 1.56 | 25.81 |
+| llama-30b-turbomind | 20.64 | 19.16 | 27.18 | 16.67 | 16.56 | 2.34 | 12.10 |
+| llama-65b-turbomind | 21.10 | 15.89 | 11.50 | 20.00 | 5.94 | 1.56 | 21.77 |
+| llama-2-7b-turbomind | 16.97 | 16.36 | 20.91 | 22.00 | 18.75 | 2.34 | 11.29 |
+| llama-2-13b-turbomind | 14.68 | 11.68 | 26.13 | 16.00 | 17.81 | 2.34 | 20.97 |
+| llama-2-70b-turbomind | 18.81 | 12.15 | 26.13 | 16.00 | 20.31 | 4.69 | 16.13 |
+| llama-3-8b-turbomind | 4.13 | 7.94 | 37.63 | 24.67 | 26.25 | 5.47 | 21.77 |
+| llama-3-70b-turbomind | 4.59 | 3.12 | 20.83 | 10.94 | 18.00 | 6.25 | 15.62 |
+| internlm2-1.8b-turbomind | 20.64 | 22.90 | 39.72 | 30.00 | 25.94 | 10.94 | 31.45 |
+| internlm2-7b-turbomind | 33.94 | 35.51 | 38.33 | 59.33 | 61.56 | 2.34 | 11.29 |
+| internlm2-20b-turbomind | 59.17 | 51.40 | 65.16 | 74.00 | 82.19 | 28.91 | 54.03 |
+| qwen-1.8b-turbomind | 29.36 | 30.84 | 19.51 | 26.00 | 22.19 | 5.47 | 27.42 |
+| qwen-7b-turbomind | 22.48 | 28.04 | 45.64 | 43.33 | 62.19 | 3.91 | 33.87 |
+| qwen-14b-turbomind | 54.13 | 56.25 | 82.93 | 72.00 | 85.00 | 4.69 | 65.62 |
+| qwen-72b-turbomind | 73.12 | 64.49 | 91.67 | 90.62 | 58.75 | 44.53 | 79.03 |
+| qwen1.5-0.5b-hf | 26.61 | 32.71 | 32.40 | 34.67 | 53.44 | 10.94 | 28.23 |
+| qwen1.5-1.8b-hf | 36.24 | 33.18 | 56.45 | 36.00 | 49.38 | 6.25 | 33.06 |
+| qwen1.5-4b-hf | 45.41 | 37.85 | 68.29 | 62.00 | 87.81 | 5.47 | 47.58 |
+| qwen1.5-7b-hf | 56.42 | 53.74 | 85.02 | 69.33 | 86.88 | 28.12 | 70.16 |
+| qwen1.5-14b-hf | 69.27 | 63.08 | 54.01 | 79.33 | 76.56 | 40.62 | 79.84 |
+| qwen1.5-32b-hf | 71.10 | 61.68 | 92.68 | 93.33 | 95.94 | 45.31 | 83.06 |
+| qwen1.5-72b-hf | 71.15 | 68.22 | 94.44 | 96.67 | 95.00 | 38.28 | 75.00 |
+| qwen1.5-moe-a2-7b-hf | 35.32 | 29.44 | 68.64 | 44.67 | 75.00 | 17.97 | 59.68 |
+| mistral-7b-v0.1-hf | 13.76 | 12.15 | 9.76 | 8.00 | 5.94 | 0.00 | 17.74 |
+| mistral-7b-v0.2-hf | 6.88 | 5.61 | 10.45 | 12.00 | 4.06 | 0.78 | 14.52 |
+| mixtral-8x7b-v0.1-hf | 3.67 | 1.87 | 0.35 | 0.00 | 0.00 | 0.78 | 0.81 |
+| mixtral-8x22b-v0.1-hf | 16.51 | 15.89 | 1.39 | 3.33 | 9.69 | 0.00 | 13.71 |
+| yi-6b-hf | 6.25 | 3.12 | 40.74 | 43.75 | 35.94 | 8.59 | 31.25 |
+| yi-34b-hf | 12.50 | 4.17 | 31.11 | 5.00 | 20.62 | 2.34 | 0.89 |
+| deepseek-7b-base-hf | 14.22 | 13.08 | 25.78 | 20.67 | 20.31 | 5.47 | 18.55 |
+| deepseek-67b-base-hf | 3.67 | 4.21 | 8.36 | 7.33 | 4.69 | 1.56 | 4.84 |
+
+| model | 2010-2013_English_MCQs | 2010-2022_Chinese_Modern_Lit | 2010-2022_English_Fill_in_Blanks | 2012-2022_English_Cloze_Test | 2010-2022_Geography_MCQs | 2010-2022_English_Reading_Comp | 2010-2022_Chinese_Lang_and_Usage_MCQs |
+|:------------------------:|-------------------------:|-------------------------------:|-----------------------------------:|-------------------------------:|---------------------------:|---------------------------------:|----------------------------------------:|
+| llama-7b-turbomind | 19.05 | 0.00 | 15.00 | 16.15 | 22.11 | 10.43 | 15.00 |
+| llama-13b-turbomind | 22.86 | 0.00 | 8.50 | 8.46 | 24.21 | 9.36 | 20.00 |
+| llama-30b-turbomind | 28.57 | 0.00 | 6.33 | 13.85 | 23.16 | 12.98 | 12.50 |
+| llama-65b-turbomind | 21.90 | 0.00 | 8.00 | 13.85 | 16.84 | 12.34 | 10.00 |
+| llama-2-7b-turbomind | 20.95 | 0.00 | 6.17 | 12.31 | 22.11 | 11.28 | 11.25 |
+| llama-2-13b-turbomind | 16.19 | 0.00 | 9.83 | 13.08 | 22.11 | 7.66 | 10.00 |
+| llama-2-70b-turbomind | 31.43 | 0.00 | 4.17 | 13.08 | 25.26 | 20.43 | 7.50 |
+| llama-3-8b-turbomind | 1.90 | 1.15 | 42.00 | 7.69 | 29.47 | 17.66 | 17.50 |
+| llama-3-70b-turbomind | 18.75 | 3.45 | 53.67 | 76.15 | 18.60 | 36.76 | 8.75 |
+| internlm2-1.8b-turbomind | 33.33 | 3.45 | 15.67 | 13.85 | 32.63 | 10.43 | 25.00 |
+| internlm2-7b-turbomind | 61.90 | 20.69 | 57.33 | 20.77 | 61.05 | 40.21 | 47.50 |
+| internlm2-20b-turbomind | 72.38 | 37.93 | 62.33 | 19.23 | 74.74 | 38.51 | 48.75 |
+| qwen-1.8b-turbomind | 47.62 | 9.20 | 13.50 | 12.31 | 25.26 | 16.38 | 21.25 |
+| qwen-7b-turbomind | 42.86 | 12.64 | 35.83 | 26.15 | 51.58 | 17.87 | 30.00 |
+| qwen-14b-turbomind | 89.58 | 3.45 | 5.00 | 23.85 | 93.02 | 21.10 | 40.62 |
+| qwen-72b-turbomind | 71.43 | 81.25 | 88.17 | 96.25 | 95.79 | 79.57 | 90.00 |
+| qwen1.5-0.5b-hf | 40.95 | 22.99 | 21.67 | 21.54 | 38.95 | 17.02 | 22.50 |
+| qwen1.5-1.8b-hf | 85.71 | 29.89 | 22.17 | 30.00 | 34.74 | 20.43 | 27.50 |
+| qwen1.5-4b-hf | 88.57 | 35.63 | 41.00 | 67.69 | 64.21 | 41.28 | 68.75 |
+| qwen1.5-7b-hf | 93.33 | 14.94 | 59.33 | 70.00 | 61.05 | 67.87 | 61.25 |
+| qwen1.5-14b-hf | 94.29 | 16.09 | 59.67 | 76.92 | 90.53 | 59.57 | 77.50 |
+| qwen1.5-32b-hf | 94.29 | 43.68 | 82.83 | 38.46 | 97.89 | 75.96 | 67.50 |
+| qwen1.5-72b-hf | 99.05 | 28.74 | 85.62 | 77.69 | 94.74 | 72.77 | 87.50 |
+| qwen1.5-moe-a2-7b-hf | 65.71 | 36.78 | 51.67 | 75.38 | 72.63 | 61.28 | 33.75 |
+| mistral-7b-v0.1-hf | 17.14 | 8.05 | 28.33 | 6.92 | 24.21 | 30.43 | 12.50 |
+| mistral-7b-v0.2-hf | 7.62 | 9.20 | 23.17 | 6.15 | 25.26 | 19.15 | 7.50 |
+| mixtral-8x7b-v0.1-hf | 0.00 | 4.60 | 33.83 | 10.77 | 37.89 | 25.96 | 3.75 |
+| mixtral-8x22b-v0.1-hf | 7.62 | 4.17 | 51.33 | 14.62 | 53.68 | 21.91 | 10.00 |
+| yi-6b-hf | 17.14 | 52.87 | 50.83 | 36.25 | 36.84 | 48.09 | 36.25 |
+| yi-34b-hf | 0.00 | 59.77 | 76.67 | 86.92 | 67.44 | 61.06 | 81.25 |
+| deepseek-7b-base-hf | 20.95 | 2.30 | 17.83 | 12.31 | 25.26 | 12.55 | 8.75 |
+| deepseek-67b-base-hf | 1.90 | 9.20 | 27.33 | 30.00 | 40.00 | 13.19 | 3.75 |
+
+## Chat Models
+
+| model | GaokaoBench |
+|:-----------------------------:|--------------:|
+| qwen1.5-0.5b-chat-hf | 21.51 |
+| qwen1.5-1.8b-chat-hf | 46.19 |
+| qwen1.5-4b-chat-hf | 59.11 |
+| qwen1.5-7b-chat-hf | 70.55 |
+| qwen1.5-14b-chat-hf | 80.39 |
+| qwen1.5-32b-chat-hf | 86.15 |
+| qwen1.5-72b-chat-hf | 88.58 |
+| qwen1.5-110b-chat-hf | 89.59 |
+| internlm2-chat-1.8b-hf | 29.73 |
+| internlm2-chat-1.8b-sft-hf | 28.79 |
+| internlm2-chat-7b-hf | 54.54 |
+| internlm2-chat-7b-sft-hf | 55.39 |
+| internlm2-chat-20b-hf | 57.95 |
+| internlm2-chat-20b-sft-hf | 57.62 |
+| llama-3-8b-instruct-hf | 45.48 |
+| llama-3-70b-instruct-hf | 65.91 |
+| llama-3-8b-instruct-lmdeploy | 44.48 |
+| llama-3-70b-instruct-lmdeploy | 67.06 |
+| mistral-7b-instruct-v0.1-hf | 26.21 |
+| mistral-7b-instruct-v0.2-hf | 32.17 |
+| mixtral-8x7b-instruct-v0.1-hf | 42.46 |
+
+### Details
+
+| model | 2010-2022_Math_II_MCQs | 2010-2022_Math_I_MCQs | 2010-2022_History_MCQs | 2010-2022_Biology_MCQs | 2010-2022_Political_Science_MCQs | 2010-2022_Physics_MCQs | 2010-2022_Chemistry_MCQs |
+|:-----------------------------:|-------------------------:|------------------------:|-------------------------:|-------------------------:|-----------------------------------:|-------------------------:|---------------------------:|
+| qwen1.5-0.5b-chat-hf | 25.23 | 25.70 | 39.02 | 24.67 | 25.00 | 0.78 | 25.00 |
+| qwen1.5-1.8b-chat-hf | 30.28 | 26.64 | 61.32 | 55.33 | 77.81 | 11.72 | 40.32 |
+| qwen1.5-4b-chat-hf | 38.53 | 35.05 | 70.73 | 70.00 | 83.44 | 25.00 | 41.13 |
+| qwen1.5-7b-chat-hf | 49.54 | 39.72 | 81.88 | 82.67 | 90.62 | 46.88 | 61.29 |
+| qwen1.5-14b-chat-hf | 64.68 | 54.21 | 87.80 | 90.67 | 94.69 | 44.53 | 69.35 |
+| qwen1.5-32b-chat-hf | 70.92 | 66.14 | 98.02 | 97.74 | 96.07 | 57.81 | 72.92 |
+| qwen1.5-72b-chat-hf | 76.61 | 68.22 | 95.47 | 96.00 | 97.19 | 64.06 | 86.29 |
+| qwen1.5-110b-chat-hf | 80.36 | 66.67 | 100.00 | 100.00 | 96.25 | 65.62 | 75.00 |
+| internlm2-chat-1.8b-hf | 28.44 | 28.50 | 46.69 | 39.33 | 44.38 | 10.16 | 26.61 |
+| internlm2-chat-1.8b-sft-hf | 23.85 | 20.09 | 55.75 | 40.67 | 53.12 | 14.84 | 30.65 |
+| internlm2-chat-7b-hf | 45.87 | 42.52 | 77.70 | 75.33 | 76.56 | 16.41 | 38.71 |
+| internlm2-chat-7b-sft-hf | 49.08 | 39.72 | 80.84 | 68.67 | 81.25 | 29.69 | 42.74 |
+| internlm2-chat-20b-hf | 53.21 | 46.73 | 80.49 | 74.00 | 85.00 | 31.25 | 37.10 |
+| internlm2-chat-20b-sft-hf | 51.83 | 47.20 | 86.06 | 78.00 | 88.12 | 35.16 | 45.16 |
+| llama-3-8b-instruct-hf | 37.16 | 31.31 | 60.98 | 48.67 | 51.25 | 11.72 | 39.52 |
+| llama-3-70b-instruct-hf | 58.26 | 52.34 | 63.76 | 75.33 | 75.31 | 36.72 | 53.23 |
+| llama-3-8b-instruct-lmdeploy | 37.61 | 35.51 | 55.05 | 53.33 | 52.19 | 7.81 | 34.68 |
+| llama-3-70b-instruct-lmdeploy | 75.00 | 55.56 | 61.11 | 73.68 | 70.00 | 40.62 | 43.75 |
+| mistral-7b-instruct-v0.1-hf | 23.39 | 21.03 | 35.19 | 18.00 | 26.56 | 5.47 | 30.65 |
+| mistral-7b-instruct-v0.2-hf | 31.19 | 19.63 | 38.33 | 40.00 | 35.94 | 20.31 | 34.68 |
+| mixtral-8x7b-instruct-v0.1-hf | 41.28 | 37.85 | 52.26 | 47.33 | 50.00 | 25.78 | 43.55 |
+
+| model | 2010-2013_English_MCQs | 2010-2022_Chinese_Modern_Lit | 2010-2022_English_Fill_in_Blanks | 2012-2022_English_Cloze_Test | 2010-2022_Geography_MCQs | 2010-2022_English_Reading_Comp | 2010-2022_Chinese_Lang_and_Usage_MCQs |
+|:-----------------------------:|-------------------------:|-------------------------------:|-----------------------------------:|-------------------------------:|---------------------------:|---------------------------------:|----------------------------------------:|
+| qwen1.5-0.5b-chat-hf | 32.38 | 10.34 | 0.00 | 2.31 | 27.37 | 15.11 | 18.75 |
+| qwen1.5-1.8b-chat-hf | 69.52 | 42.53 | 56.33 | 2.31 | 61.05 | 32.98 | 35.00 |
+| qwen1.5-4b-chat-hf | 70.48 | 58.62 | 82.33 | 16.15 | 68.42 | 68.51 | 47.50 |
+| qwen1.5-7b-chat-hf | 83.81 | 71.26 | 85.17 | 57.69 | 81.05 | 78.94 | 66.25 |
+| qwen1.5-14b-chat-hf | 93.33 | 78.16 | 97.17 | 71.54 | 91.58 | 94.26 | 81.25 |
+| qwen1.5-32b-chat-hf | 100.00 | 81.61 | 95.83 | 90.00 | 97.89 | 92.43 | 92.86 |
+| qwen1.5-72b-chat-hf | 98.10 | 83.91 | 98.00 | 90.77 | 94.74 | 96.38 | 96.25 |
+| qwen1.5-110b-chat-hf | 100.00 | 91.95 | 98.50 | 97.69 | 95.35 | 98.44 | 100.00 |
+| internlm2-chat-1.8b-hf | 38.10 | 6.90 | 0.67 | 1.54 | 56.84 | 23.19 | 30.00 |
+| internlm2-chat-1.8b-sft-hf | 50.48 | 0.00 | 0.00 | 0.00 | 27.37 | 11.91 | 32.50 |
+| internlm2-chat-7b-hf | 60.95 | 67.82 | 7.00 | 7.69 | 70.53 | 79.79 | 38.75 |
+| internlm2-chat-7b-sft-hf | 60.00 | 71.26 | 6.50 | 0.77 | 68.42 | 77.02 | 42.50 |
+| internlm2-chat-20b-hf | 60.95 | 43.68 | 34.83 | 4.62 | 71.58 | 62.55 | 43.75 |
+| internlm2-chat-20b-sft-hf | 75.24 | 47.13 | 1.00 | 2.31 | 80.00 | 65.96 | 37.50 |
+| llama-3-8b-instruct-hf | 50.48 | 36.78 | 30.83 | 21.54 | 57.89 | 81.70 | 28.75 |
+| llama-3-70b-instruct-hf | 73.33 | 59.77 | 82.83 | 24.62 | 73.68 | 91.28 | 45.00 |
+| llama-3-8b-instruct-lmdeploy | 52.38 | 42.53 | 21.33 | 18.46 | 58.95 | 81.28 | 26.25 |
+| llama-3-70b-instruct-lmdeploy | 87.50 | 62.07 | 84.38 | 26.92 | 72.63 | 91.20 | 56.25 |
+| mistral-7b-instruct-v0.1-hf | 38.10 | 18.39 | 30.50 | 6.15 | 31.58 | 38.72 | 18.75 |
+| mistral-7b-instruct-v0.2-hf | 41.90 | 31.03 | 28.00 | 20.77 | 29.47 | 42.13 | 15.00 |
+| mixtral-8x7b-instruct-v0.1-hf | 49.52 | 39.08 | 41.33 | 9.23 | 44.21 | 43.19 | 21.25 |
diff --git a/configs/datasets/IFEval/README.md b/configs/datasets/IFEval/README.md
new file mode 100644
index 00000000..3753262c
--- /dev/null
+++ b/configs/datasets/IFEval/README.md
@@ -0,0 +1,31 @@
+# IFEval
+
+```bash
+python3 run.py --models hf_internlm2_chat_7b --datasets IFEval_gen_3321a3 --debug
+```
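+
+The four metric columns follow IFEval's standard definitions: prompt-level accuracy counts a prompt as correct only if every instruction in it is satisfied, instruction-level accuracy averages over individual instructions, and the "loose" variants re-check after relaxed normalization of the response (e.g. stripping surrounding markdown or boundary lines). A minimal sketch of the two aggregations (the `results` layout is illustrative, not OpenCompass's internal API):
+
+```python
+# Pass/fail flags for the instructions of each prompt (one inner list per prompt).
+results = [[True, True], [True, False], [True]]
+
+# Prompt-level accuracy: a prompt counts only if all of its instructions pass.
+prompt_level = sum(all(flags) for flags in results) / len(results)
+
+# Instruction-level accuracy: average over every individual instruction.
+flat = [flag for flags in results for flag in flags]
+inst_level = sum(flat) / len(flat)
+
+print(f'{prompt_level:.2%} {inst_level:.2%}')  # 66.67% 80.00%
+```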
+
+## Chat Models
+
+| model | Prompt-level-strict-accuracy | Inst-level-strict-accuracy | Prompt-level-loose-accuracy | Inst-level-loose-accuracy |
+|:-----------------------------:|-------------------------------:|-----------------------------:|------------------------------:|----------------------------:|
+| qwen1.5-0.5b-chat-hf | 13.12 | 23.26 | 15.71 | 26.38 |
+| qwen1.5-1.8b-chat-hf | 16.08 | 26.26 | 18.30 | 29.02 |
+| qwen1.5-4b-chat-hf | 25.51 | 35.97 | 28.84 | 39.81 |
+| qwen1.5-7b-chat-hf | 38.82 | 50.00 | 42.70 | 53.48 |
+| qwen1.5-14b-chat-hf | 42.51 | 54.20 | 49.17 | 59.95 |
+| qwen1.5-32b-chat-hf | 49.54 | 60.43 | 53.97 | 64.39 |
+| qwen1.5-72b-chat-hf | 51.02 | 61.99 | 57.12 | 67.27 |
+| qwen1.5-110b-chat-hf | 55.08 | 65.59 | 61.18 | 70.86 |
+| internlm2-chat-1.8b-hf | 18.30 | 28.78 | 21.44 | 32.01 |
+| internlm2-chat-1.8b-sft-hf | 18.67 | 31.18 | 19.78 | 32.85 |
+| internlm2-chat-7b-hf | 34.75 | 46.28 | 40.48 | 51.44 |
+| internlm2-chat-7b-sft-hf | 39.19 | 50.12 | 42.33 | 52.76 |
+| internlm2-chat-20b-hf | 36.41 | 48.68 | 40.67 | 53.24 |
+| internlm2-chat-20b-sft-hf | 44.55 | 55.64 | 46.77 | 58.03 |
+| llama-3-8b-instruct-hf | 68.02 | 76.74 | 75.42 | 82.85 |
+| llama-3-70b-instruct-hf | 78.00 | 84.65 | 84.29 | 89.21 |
+| llama-3-8b-instruct-lmdeploy | 69.13 | 77.46 | 77.26 | 83.93 |
+| llama-3-70b-instruct-lmdeploy | 75.97 | 82.97 | 83.18 | 88.37 |
+| mistral-7b-instruct-v0.1-hf | 40.30 | 50.96 | 41.96 | 53.48 |
+| mistral-7b-instruct-v0.2-hf | 49.17 | 60.43 | 51.94 | 64.03 |
+| mixtral-8x7b-instruct-v0.1-hf | 50.09 | 60.67 | 55.64 | 65.83 |
diff --git a/configs/datasets/MathBench/mathbench_agent_gen_48ec47.py b/configs/datasets/MathBench/deprecated_mathbench_agent_gen_48ec47.py
similarity index 100%
rename from configs/datasets/MathBench/mathbench_agent_gen_48ec47.py
rename to configs/datasets/MathBench/deprecated_mathbench_agent_gen_48ec47.py
diff --git a/configs/datasets/MathBench/mathbench_arith_gen_ccd638.py b/configs/datasets/MathBench/deprecated_mathbench_arith_gen_ccd638.py
similarity index 100%
rename from configs/datasets/MathBench/mathbench_arith_gen_ccd638.py
rename to configs/datasets/MathBench/deprecated_mathbench_arith_gen_ccd638.py
diff --git a/configs/datasets/MathBench/mathbench_cot_gen_66f329.py b/configs/datasets/MathBench/deprecated_mathbench_cot_gen_66f329.py
similarity index 100%
rename from configs/datasets/MathBench/mathbench_cot_gen_66f329.py
rename to configs/datasets/MathBench/deprecated_mathbench_cot_gen_66f329.py
diff --git a/configs/datasets/MathBench/mathbench_gen_7b734b.py b/configs/datasets/MathBench/deprecated_mathbench_gen_7b734b.py
similarity index 100%
rename from configs/datasets/MathBench/mathbench_gen_7b734b.py
rename to configs/datasets/MathBench/deprecated_mathbench_gen_7b734b.py
diff --git a/configs/datasets/MathBench/mathbench_2024_gen_19e486.py b/configs/datasets/MathBench/mathbench_2024_gen_19e486.py
deleted file mode 100644
index 6cde6c69..00000000
--- a/configs/datasets/MathBench/mathbench_2024_gen_19e486.py
+++ /dev/null
@@ -1,114 +0,0 @@
-from opencompass.openicl.icl_prompt_template import PromptTemplate
-from opencompass.openicl.icl_retriever import ZeroRetriever
-from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
-from opencompass.datasets import MathBenchDataset, mathbench_postprocess
-from opencompass.utils.text_postprocessors import first_option_postprocess
-
-single_choice_prompts = {
- 'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
- 'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:',
- 'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:",
- 'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:',
-}
-
-cloze_prompts = {
- 'cloze_cn': [
- dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?'),
- dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
- dict(role='HUMAN', prompt='Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?'),
- dict(role='BOT', prompt='A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n'),
- dict(role='HUMAN', prompt='Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?'),
- dict(role='BOT', prompt='A: 黎恩有32块巧克力,Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n'),
- dict(role='HUMAN', prompt='Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?'),
- dict(role='BOT', prompt='A: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n'),
- dict(role='HUMAN', prompt='Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'),
- dict(role='BOT', prompt='A: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9\n'),
- dict(role='HUMAN', prompt='Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
- dict(role='BOT', prompt='A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29\n'),
- dict(role='HUMAN', prompt='Q: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'),
- dict(role='BOT', prompt='A: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33\n'),
- dict(role='HUMAN', prompt='Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'),
- dict(role='BOT', prompt='A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8\n'),
- dict(role='HUMAN', prompt='Q: {question}'),
- dict(role='BOT', prompt='A: {answer}'),
- ],
- 'cloze_en': [
- dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
- dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
- dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
- dict(role='BOT', prompt='A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n'),
- dict(role='HUMAN', prompt='Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?'),
- dict(role='BOT', prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"),
- dict(role='HUMAN', prompt='Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?'),
- dict(role='BOT', prompt='A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\n'),
- dict(role='HUMAN', prompt='Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
- dict(role='BOT', prompt='A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\n'),
- dict(role='HUMAN', prompt='Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
- dict(role='BOT', prompt='A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n'),
- dict(role='HUMAN', prompt='Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
- dict(role='BOT', prompt='A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n'),
- dict(role='HUMAN', prompt='Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
- dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
- dict(role='HUMAN', prompt='Q: {question}'),
- dict(role='BOT', prompt='A: {answer}\n'),
-]}
-
-mathbench_sets = {
- # Practice Part
- 'college': ['single_choice_cn', 'single_choice_en'],
- 'high': ['single_choice_cn', 'single_choice_en'],
- 'middle': ['single_choice_cn', 'single_choice_en'],
- 'primary': ['cloze_cn', 'cloze_en'],
- 'arithmetic': ['cloze_en'],
- # Theory part
- 'college_knowledge': ['single_choice_cn','single_choice_en'],
- 'high_knowledge': ['single_choice_cn','single_choice_en'],
- 'middle_knowledge': ['single_choice_cn','single_choice_en'],
- 'primary_knowledge': ['single_choice_cn','single_choice_en'],
-}
-
-# Generate reasoning path or not, only for single choice
-with_reasoning = True
-
-# Use circular evaluation or not
-with_circular_eval = True
-
-mathbench_datasets = []
-
-for _split in list(mathbench_sets.keys()):
- for _name in mathbench_sets[_split]:
- mathbench_infer_cfg = dict(
- prompt_template=dict(
- type=PromptTemplate,
- template=dict(
- round=[
- dict(
- role='HUMAN',
- prompt=single_choice_prompts[_name + '_with_reasoning'] if with_reasoning else single_choice_prompts[_name],
- ),
- dict(role='BOT', prompt='{answer}')] if 'choice' in _name else cloze_prompts[_name],
- ),
- ),
- retriever=dict(type=ZeroRetriever),
- inferencer=dict(type=GenInferencer, max_out_len=2048),
- )
-
- mathbench_eval_cfg = dict(
- evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
- pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
-
- mathbench_datasets.append(
- dict(
- abbr='mathbench-' + _split + '-' + _name,
- type=MathBenchDataset,
- path=f'./data/mathbench_v1/{_split}',
- name=_name,
- with_circular=with_circular_eval,
- reader_cfg=dict(
- input_columns=['question'],
- output_column='answer'
- ),
- infer_cfg=mathbench_infer_cfg,
- eval_cfg=mathbench_eval_cfg,
- ))
diff --git a/configs/datasets/MathBench/mathbench_2024_gen_1dc21d.py b/configs/datasets/MathBench/mathbench_2024_gen_1dc21d.py
new file mode 100644
index 00000000..af8e4cd8
--- /dev/null
+++ b/configs/datasets/MathBench/mathbench_2024_gen_1dc21d.py
@@ -0,0 +1,81 @@
+from mmengine.config import read_base
+from copy import deepcopy
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
+from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
+from opencompass.datasets import MathBenchDataset, mathbench_postprocess
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+with read_base():
+ from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
+
+# The maximum number of few-shot examples for this dataset is 4
+num_shot = 4
+# Whether to generate a reasoning path (single-choice questions only)
+with_reasoning = False
+# Whether to use circular evaluation
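+# (CircularEvaluator evaluates each single-choice question four times with the
+# options circularly shifted; the strictest metric counts a question as correct
+# only if all four rotations are answered correctly.)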
+with_circular_eval = True
+# Whether to use PPL mode for single-choice questions
+use_ppl_single_choice = False
+
+assert 0 <= num_shot <= 4
+if num_shot == 0:
+ prompts = zero_shot_prompts
+else:
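+    # Each few-shot example is one (HUMAN, BOT) message pair, and every
+    # few-shot prompt list ends with the final {question}/{answer} pair,
+    # so keep the last 2 * num_shot + 2 messages.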
+    prompts = {name: p[-2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
+
+mathbench_datasets = []
+for _split in mathbench_sets:
+ for _name in mathbench_sets[_split]:
+ if 'single_choice' in _name:
+ if with_reasoning:
+ template_round = prompts[_name + '_with_reasoning']
+ else:
+ template_round = prompts[_name]
+ else:
+ template_round = prompts[_name]
+
+ if 'single_choice' in _name:
+ pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
+ else:
+ pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
+
+ if 'single_choice' in _name and with_circular_eval:
+ evaluator = dict(type=CircularEvaluator)
+ else:
+ evaluator = dict(type=AccEvaluator)
+
+ # assemble the final config
+ mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
+ if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
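+            # Build one complete prompt per candidate option by substituting it
+            # for {answer}; PPLInferencer then scores each filled-in prompt and
+            # the option with the lowest perplexity is taken as the prediction.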
+ template = {}
+ for answer in ['A', 'B', 'C', 'D']:
+ one_template_round = deepcopy(template_round)
+                one_template_round[-1]['prompt'] = one_template_round[-1]['prompt'].format(answer=answer)
+ template[answer] = dict(round=one_template_round)
+ mathbench_infer_cfg = dict(
+ prompt_template=dict(type=PromptTemplate, template=template),
+ retriever=dict(type=ZeroRetriever),
+ inferencer=dict(type=PPLInferencer),
+ )
+ else:
+ mathbench_infer_cfg = dict(
+ prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
+ retriever=dict(type=ZeroRetriever),
+ inferencer=dict(type=GenInferencer, max_out_len=2048),
+ )
+ mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
+
+ mathbench_datasets.append(
+ dict(
+ abbr='mathbench-' + _split + '-' + _name,
+ type=MathBenchDataset,
+ path=f'data/mathbench_v1/{_split}',
+ name=_name,
+ with_circular=with_circular_eval,
+ reader_cfg=mathbench_reader_cfg,
+ infer_cfg=mathbench_infer_cfg,
+ eval_cfg=mathbench_eval_cfg,
+ )
+ )
diff --git a/configs/datasets/MathBench/mathbench_2024_wocircular_gen_1dc21d.py b/configs/datasets/MathBench/mathbench_2024_wocircular_gen_1dc21d.py
new file mode 100644
index 00000000..67b7847c
--- /dev/null
+++ b/configs/datasets/MathBench/mathbench_2024_wocircular_gen_1dc21d.py
@@ -0,0 +1,81 @@
+from mmengine.config import read_base
+from copy import deepcopy
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
+from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
+from opencompass.datasets import MathBenchDataset, mathbench_postprocess
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+with read_base():
+ from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
+
+# The maximum number of few-shot examples for this dataset is 4
+num_shot = 4
+# Whether to generate a reasoning path (single-choice questions only)
+with_reasoning = False
+# Whether to use circular evaluation
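+# (CircularEvaluator evaluates each single-choice question four times with the
+# options circularly shifted; the strictest metric counts a question as correct
+# only if all four rotations are answered correctly.)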
+with_circular_eval = False
+# Whether to use PPL mode for single-choice questions
+use_ppl_single_choice = False
+
+assert 0 <= num_shot <= 4
+if num_shot == 0:
+ prompts = zero_shot_prompts
+else:
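+    # Each few-shot example is one (HUMAN, BOT) message pair, and every
+    # few-shot prompt list ends with the final {question}/{answer} pair,
+    # so keep the last 2 * num_shot + 2 messages.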
+    prompts = {name: p[-2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
+
+mathbench_datasets = []
+for _split in mathbench_sets:
+ for _name in mathbench_sets[_split]:
+ if 'single_choice' in _name:
+ if with_reasoning:
+ template_round = prompts[_name + '_with_reasoning']
+ else:
+ template_round = prompts[_name]
+ else:
+ template_round = prompts[_name]
+
+ if 'single_choice' in _name:
+ pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
+ else:
+ pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
+
+ if 'single_choice' in _name and with_circular_eval:
+ evaluator = dict(type=CircularEvaluator)
+ else:
+ evaluator = dict(type=AccEvaluator)
+
+ # assemble the final config
+ mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
+ if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
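+            # Build one complete prompt per candidate option by substituting it
+            # for {answer}; PPLInferencer then scores each filled-in prompt and
+            # the option with the lowest perplexity is taken as the prediction.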
+ template = {}
+ for answer in ['A', 'B', 'C', 'D']:
+ one_template_round = deepcopy(template_round)
+                one_template_round[-1]['prompt'] = one_template_round[-1]['prompt'].format(answer=answer)
+ template[answer] = dict(round=one_template_round)
+ mathbench_infer_cfg = dict(
+ prompt_template=dict(type=PromptTemplate, template=template),
+ retriever=dict(type=ZeroRetriever),
+ inferencer=dict(type=PPLInferencer),
+ )
+ else:
+ mathbench_infer_cfg = dict(
+ prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
+ retriever=dict(type=ZeroRetriever),
+ inferencer=dict(type=GenInferencer, max_out_len=2048),
+ )
+ mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
+
+ mathbench_datasets.append(
+ dict(
+ abbr='mathbench-wocircular-' + _split + '-' + _name,
+ type=MathBenchDataset,
+ path=f'data/mathbench_v1/{_split}',
+ name=_name,
+ with_circular=with_circular_eval,
+ reader_cfg=mathbench_reader_cfg,
+ infer_cfg=mathbench_infer_cfg,
+ eval_cfg=mathbench_eval_cfg,
+ )
+ )
diff --git a/configs/datasets/MathBench/mathbench_2024_wocircular_gen_649349.py b/configs/datasets/MathBench/mathbench_2024_wocircular_gen_649349.py
deleted file mode 100644
index 7cd6790c..00000000
--- a/configs/datasets/MathBench/mathbench_2024_wocircular_gen_649349.py
+++ /dev/null
@@ -1,124 +0,0 @@
-from opencompass.openicl.icl_prompt_template import PromptTemplate
-from opencompass.openicl.icl_retriever import ZeroRetriever
-from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
-from opencompass.datasets import MathBenchDataset, mathbench_postprocess
-from opencompass.utils.text_postprocessors import first_option_postprocess
-
-single_choice_prompts = {
- 'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
- 'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:',
- 'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:",
- 'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:',
-}
-
-cloze_prompts = {
- 'cloze_cn': [
- dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?'),
- dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
- dict(role='HUMAN', prompt='Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?'),
- dict(role='BOT', prompt='A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n'),
- dict(role='HUMAN', prompt='Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?'),
- dict(role='BOT', prompt='A: 黎恩有32块巧克力,Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n'),
- dict(role='HUMAN', prompt='Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?'),
- dict(role='BOT', prompt='A: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n'),
- dict(role='HUMAN', prompt='Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'),
- dict(role='BOT', prompt='A: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9\n'),
- dict(role='HUMAN', prompt='Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
- dict(role='BOT', prompt='A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29\n'),
- dict(role='HUMAN', prompt='Q: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'),
- dict(role='BOT', prompt='A: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33\n'),
- dict(role='HUMAN', prompt='Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'),
- dict(role='BOT', prompt='A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8\n'),
- dict(role='HUMAN', prompt='Q: {question}'),
- dict(role='BOT', prompt='A: {answer}'),
- ],
- 'cloze_en': [
- dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
- dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
- dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
- dict(role='BOT', prompt='A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n'),
- dict(role='HUMAN', prompt='Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?'),
- dict(role='BOT', prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"),
- dict(role='HUMAN', prompt='Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?'),
- dict(role='BOT', prompt='A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\n'),
- dict(role='HUMAN', prompt='Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
- dict(role='BOT', prompt='A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\n'),
- dict(role='HUMAN', prompt='Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
- dict(role='BOT', prompt='A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n'),
- dict(role='HUMAN', prompt='Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
- dict(role='BOT', prompt='A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n'),
- dict(role='HUMAN', prompt='Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
- dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
- dict(role='HUMAN', prompt='Q: {question}'),
- dict(role='BOT', prompt='A: {answer}\n'),
- ],
-}
-
-mathbench_sets = {
- # Practice Part
- 'college': ['single_choice_cn', 'single_choice_en'],
- 'high': ['single_choice_cn', 'single_choice_en'],
- 'middle': ['single_choice_cn', 'single_choice_en'],
- 'primary': ['cloze_cn', 'cloze_en'],
- 'calculate': ['cloze_en'],
- # Theory part
- 'college_knowledge': ['single_choice_cn', 'single_choice_en'],
- 'high_knowledge': ['single_choice_cn', 'single_choice_en'],
- 'middle_knowledge': ['single_choice_cn', 'single_choice_en'],
- 'primary_knowledge': ['single_choice_cn', 'single_choice_en'],
-}
-
-# Generate reasoning path or not, only for single choice
-with_reasoning = True
-
-# Use circular evaluation or not
-with_circular_eval = False
-
-mathbench_datasets = []
-
-for _split in list(mathbench_sets.keys()):
- for _name in mathbench_sets[_split]:
- mathbench_reader_cfg = dict(
- input_columns=['question'],
- output_column='answer',
- )
-
- if 'single_choice' in _name:
- template_round = [
- dict(role='HUMAN', prompt=(single_choice_prompts[_name + '_with_reasoning'] if with_reasoning else single_choice_prompts[_name])),
- dict(role='BOT', prompt='{answer}')
- ]
- else:
- template_round = cloze_prompts[_name]
- mathbench_infer_cfg = dict(
- prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
- retriever=dict(type=ZeroRetriever),
- inferencer=dict(type=GenInferencer, max_out_len=512),
- )
-
- if 'single_choice' in _name:
- pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
- else:
- pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
- if 'single_choice' in _name and with_circular_eval:
- evaluator = dict(type=CircularEvaluator)
- else:
- evaluator = dict(type=AccEvaluator)
- mathbench_eval_cfg = dict(
- evaluator=evaluator,
- pred_postprocessor=pred_postprocessor,
- )
-
- mathbench_datasets.append(
- dict(
- abbr='mathbench-' + _split + '-' + _name,
- type=MathBenchDataset,
- path=f'./data/mathbench_v1_ori/{_split}',
- name=_name,
- with_circular=with_circular_eval,
- reader_cfg=mathbench_reader_cfg,
- infer_cfg=mathbench_infer_cfg,
- eval_cfg=mathbench_eval_cfg,
- )
- )
diff --git a/configs/datasets/MathBench/mathbench_2024_wocircular_mixed_8eb12b.py b/configs/datasets/MathBench/mathbench_2024_wocircular_mixed_8eb12b.py
new file mode 100644
index 00000000..ac698c20
--- /dev/null
+++ b/configs/datasets/MathBench/mathbench_2024_wocircular_mixed_8eb12b.py
@@ -0,0 +1,81 @@
+from mmengine.config import read_base
+from copy import deepcopy
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
+from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
+from opencompass.datasets import MathBenchDataset, mathbench_postprocess
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+with read_base():
+ from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
+
+# The maximum number of few-shot examples for this dataset is 4
+num_shot = 4
+# Whether to generate a reasoning path (single-choice questions only)
+with_reasoning = False
+# Whether to use circular evaluation
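+# (CircularEvaluator evaluates each single-choice question four times with the
+# options circularly shifted; the strictest metric counts a question as correct
+# only if all four rotations are answered correctly.)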
+with_circular_eval = False
+# Whether to use PPL mode for single-choice questions
+use_ppl_single_choice = True
+
+assert 0 <= num_shot <= 4
+if num_shot == 0:
+ prompts = zero_shot_prompts
+else:
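+    # Each few-shot example is one (HUMAN, BOT) message pair, and every
+    # few-shot prompt list ends with the final {question}/{answer} pair,
+    # so keep the last 2 * num_shot + 2 messages.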
+    prompts = {name: p[-2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
+
+mathbench_datasets = []
+for _split in mathbench_sets:
+ for _name in mathbench_sets[_split]:
+ if 'single_choice' in _name:
+ if with_reasoning:
+ template_round = prompts[_name + '_with_reasoning']
+ else:
+ template_round = prompts[_name]
+ else:
+ template_round = prompts[_name]
+
+ if 'single_choice' in _name:
+ pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
+ else:
+ pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
+
+ if 'single_choice' in _name and with_circular_eval:
+ evaluator = dict(type=CircularEvaluator)
+ else:
+ evaluator = dict(type=AccEvaluator)
+
+ # assemble the final config
+ mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
+ if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
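+            # Build one complete prompt per candidate option by substituting it
+            # for {answer}; PPLInferencer then scores each filled-in prompt and
+            # the option with the lowest perplexity is taken as the prediction.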
+ template = {}
+ for answer in ['A', 'B', 'C', 'D']:
+ one_template_round = deepcopy(template_round)
+ one_template_round[-1]['prompt'] = one_template_round[-1]['prompt'].format(answer=answer)
+ template[answer] = dict(round=one_template_round)
+ mathbench_infer_cfg = dict(
+ prompt_template=dict(type=PromptTemplate, template=template),
+ retriever=dict(type=ZeroRetriever),
+ inferencer=dict(type=PPLInferencer),
+ )
+ else:
+ mathbench_infer_cfg = dict(
+ prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
+ retriever=dict(type=ZeroRetriever),
+ inferencer=dict(type=GenInferencer, max_out_len=2048),
+ )
+ mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
+
+ mathbench_datasets.append(
+ dict(
+ abbr='mathbench-wocircular-' + _split + '-' + _name,
+ type=MathBenchDataset,
+ path=f'data/mathbench_v1/{_split}',
+ name=_name,
+ with_circular=with_circular_eval,
+ reader_cfg=mathbench_reader_cfg,
+ infer_cfg=mathbench_infer_cfg,
+ eval_cfg=mathbench_eval_cfg,
+ )
+ )
diff --git a/configs/datasets/MathBench/mathbench_2024_wocircular_mixed_dc0207.py b/configs/datasets/MathBench/mathbench_2024_wocircular_mixed_dc0207.py
deleted file mode 100644
index 9f6d08c2..00000000
--- a/configs/datasets/MathBench/mathbench_2024_wocircular_mixed_dc0207.py
+++ /dev/null
@@ -1,114 +0,0 @@
-from opencompass.openicl.icl_prompt_template import PromptTemplate
-from opencompass.openicl.icl_retriever import ZeroRetriever
-from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
-from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
-from opencompass.datasets import MathBenchDataset, mathbench_postprocess
-from opencompass.utils.text_postprocessors import first_option_postprocess
-
-single_choice_prompts = {
- 'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
- 'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:\n',
- 'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:\n",
- 'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:\n',
-}
-
-cloze_prompts = {
- 'cloze_cn': [
- 'Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?\nA: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6',
- 'Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?\nA: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5',
- 'Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?\nA: 黎恩有32块巧克力,Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39',
- 'Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?\nA: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8',
- 'Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?\nA: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9',
- 'Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?\nA: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29',
- 'Q: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?\nA: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33',
- 'Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?\nA: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8',
- 'Q: {question}\nA: {answer}',
- ],
- 'cloze_en': [
- 'Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\nA: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.',
- 'Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\nA: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.',
- "Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\nA: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.",
- 'Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?\nA: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.',
- 'Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?\nA: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.',
- 'Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?\nA: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.',
- 'Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\nA: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.',
- 'Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\nA: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.',
- 'Q: {question}\nA: {answer}',
- ],
-}
-
-mathbench_sets = {
- # Practice Part
- 'college': ['single_choice_cn', 'single_choice_en'],
- 'high': ['single_choice_cn', 'single_choice_en'],
- 'middle': ['single_choice_cn', 'single_choice_en'],
- 'primary': ['cloze_cn', 'cloze_en'],
- 'calculate': ['cloze_en'],
- # Theory part
- 'college_knowledge': ['single_choice_cn', 'single_choice_en'],
- 'high_knowledge': ['single_choice_cn', 'single_choice_en'],
- 'middle_knowledge': ['single_choice_cn', 'single_choice_en'],
- 'primary_knowledge': ['single_choice_cn', 'single_choice_en'],
-}
-
-# Generate reasoning path or not, only for single choice
-with_reasoning = False
-
-# Use circular evaluation or not
-with_circular_eval = False
-
-mathbench_datasets = []
-
-for _split in list(mathbench_sets.keys()):
- for _name in mathbench_sets[_split]:
- mathbench_reader_cfg = dict(
- input_columns=['question'],
- output_column='answer',
- )
-
- if 'single_choice' in _name:
- if with_reasoning:
- mathbench_infer_cfg = dict(
- prompt_template=dict(type=PromptTemplate, template=single_choice_prompts[_name + '_with_reasoning']),
- retriever=dict(type=ZeroRetriever),
- inferencer=dict(type=GenInferencer, max_out_len=512),
- )
- else:
- mathbench_infer_cfg = dict(
- prompt_template=dict(type=PromptTemplate, template={answer: f'{single_choice_prompts[_name]}{answer}' for answer in ['A', 'B', 'C', 'D']}),
- retriever=dict(type=ZeroRetriever),
- inferencer=dict(type=PPLInferencer),
- )
- else:
- mathbench_infer_cfg = dict(
- prompt_template=dict(type=PromptTemplate, template='\n'.join(cloze_prompts[_name])),
- retriever=dict(type=ZeroRetriever),
- inferencer=dict(type=GenInferencer, max_out_len=512),
- )
-
-
- if 'single_choice' in _name:
- pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
- else:
- pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
- if 'single_choice' in _name and with_circular_eval:
- evaluator = dict(type=CircularEvaluator)
- else:
- evaluator = dict(type=AccEvaluator)
- mathbench_eval_cfg = dict(
- evaluator=evaluator,
- pred_postprocessor=pred_postprocessor,
- )
-
- mathbench_datasets.append(
- dict(
- abbr='mathbench-' + _split + '-' + _name,
- type=MathBenchDataset,
- path=f'./data/mathbench_v1_ori/{_split}',
- name=_name,
- with_circular=with_circular_eval,
- reader_cfg=mathbench_reader_cfg,
- infer_cfg=mathbench_infer_cfg,
- eval_cfg=mathbench_eval_cfg,
- )
- )
diff --git a/configs/datasets/MathBench/mathbench_gen.py b/configs/datasets/MathBench/mathbench_gen.py
index 63478a60..90e8e5fe 100644
--- a/configs/datasets/MathBench/mathbench_gen.py
+++ b/configs/datasets/MathBench/mathbench_gen.py
@@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
- from .mathbench_2024_gen_19e486 import mathbench_datasets # noqa: F401, F403
+ from .mathbench_2024_gen_1dc21d import mathbench_datasets # noqa: F401, F403
diff --git a/configs/datasets/MathBench/mathbench_prompt.py b/configs/datasets/MathBench/mathbench_prompt.py
new file mode 100644
index 00000000..069528ee
--- /dev/null
+++ b/configs/datasets/MathBench/mathbench_prompt.py
@@ -0,0 +1,103 @@
+zero_shot_prompts = {
+ 'single_choice_cn_with_reasoning': [
+ dict(role='HUMAN', prompt='问题: 以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:'),
+ ],
+ 'single_choice_cn': [
+ dict(role='HUMAN', prompt='问题: 以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n所以答案是:'),
+ ],
+ 'single_choice_en_with_reasoning': [
+        dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with "Therefore, the correct answer is option X", where "X" is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet\'s think step by step:'),
+ ],
+ 'single_choice_en': [
+ dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nThe answer is:'),
+ ],
+}
+
+few_shot_prompts = {
+ 'single_choice_cn': [
+ dict(role='HUMAN', prompt='问题: 已知i是虚数单位,z为复数,$2+\\frac{1}{i}=z(3+i)$,则在复平面内z对应的点位于____?\nA. 第一象限\nB. 第二象限\nC. 第三象限\nD. 第四象限'),
+ dict(role='BOT', prompt='回答: D'),
+        dict(role='HUMAN', prompt='问题: 将函数$y=\\tan(\\omega x-1)(\\omega>0)$的图像向左平移2个单位长度后,与函数$y=\\tan(\\omega x+3)$的图象重合,则$\\omega$的最小值等于____\nA. $2-\\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
+ dict(role='BOT', prompt='回答: D'),
+ dict(role='HUMAN', prompt='问题: 设$(1+2i)a+b=2i$,其中a,b为实数,则( )\nA. $a=1,b=-1$\nB. $a=1,b=1$\nC. $a=-1,b=1$\nD. $a=-1,b=-1$'),
+ dict(role='BOT', prompt='回答: A'),
+ dict(role='HUMAN', prompt='问题: 圆锥的底面半径为2,高为4.一个圆柱的下底面在圆锥的底面上,上底面的圆周在圆锥的侧面上,当圆柱侧面积为$4 \\pi$时,该圆柱的体积为____\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'),
+ dict(role='BOT', prompt='回答: B'),
+ dict(role='HUMAN', prompt='问题: {question}'),
+ dict(role='BOT', prompt='回答: {answer}'),
+ ],
+ 'single_choice_cn_with_reasoning' : [
+ dict(role='HUMAN', prompt='问题: 已知i是虚数单位,z为复数,$2+\\frac{1}{i}=z(3+i)$,则在复平面内z对应的点位于____\nA. 第一象限\nB. 第二象限\nC. 第三象限\nD. 第四象限'),
+        dict(role='BOT', prompt='回答: 因为首先,我们将等式两边同时乘以$(3 + i)$的共轭复数$(3 - i)$,以便消去分母中的虚数部分:$z = \\frac{2 - i}{3 + i} \\cdot \\frac{3 - i}{3 - i}$,这样做的目的是利用复数乘法的性质,从而消去分母中的虚数部分。我们进行乘法运算:$z = \\frac{(2 - i)(3 - i)}{(3 + i)(3 - i)}$,$z = \\frac{6 - 2i - 3i + i^2}{9 - 3i + 3i - i^2}$,由于$i^2 = -1$,我们可以进一步简化:$z = \\frac{6 - 5i - 1}{9 + 1}$ $z = \\frac{1}{2} - \\frac{1}{2}i$,在复平面上,这个点位于第四象限,因为它的实部是正的($\\frac{1}{2}$),而虚部是负的($-\\frac{1}{2}$)\n答案选项:D'),
+        dict(role='HUMAN', prompt='问题: 将函数$y=\\tan(\\omega x-1)(\\omega>0)$的图像向左平移2个单位长度后,与函数$y=\\tan(\\omega x+3)$的图象重合,则$\\omega$的最小值等于____\nA. $2-\\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
+ dict(role='BOT', prompt='回答: 将函数$y=\\tan(\\omega x-1)$(其中$\\omega>0$)的图像向左平移2个单位长度后,得到的新函数为$y=\\tan(\\omega(x+2)-1)$。为了使这个新函数的图像与$y=\\tan(\\omega x+3)$的图像重合,我们需要满足以下条件:\n$\\tan(\\omega(x+2)-1) = \\tan(\\omega x+3)$\n由于正切函数的周期性,我们知道$\\tan(\\theta) = \\tan(\\theta + k\\pi)$,其中$k$是任意整数。因此,我们可以写出:\n$\\omega(x+2)-1 = \\omega x+3 + k\\pi$\n解这个方程,我们得到:$\\omega x + 2\\omega - 1 = \\omega x + 3 + k\\pi$\n$2\\omega - 1 = 3 + k\\pi$\n$\\omega = 2 + \\frac{k\\pi}{2}$\n由于$\\omega>0$,我们需要找到满足条件的最小正整数$k$。当$k=0$时,我们得到$\\omega=2$,这是满足条件的最小正值。因此,最小值等于2。答案选项:D'),
+ dict(role='HUMAN', prompt='问题: 设$(1+2i)a+b=2i$,其中a,b为实数,则( )\nA. $a=1,b=-1$\nB. $a=1,b=1$\nC. $a=-1,b=1$\nD. $a=-1,b=-1$'),
+ dict(role='BOT', prompt='回答: 我们可以将这个方程写成:$a + 2ai + b = 2i$,然后将实部和虚部分开:\n$a + b = 0$\n$2a = 2$\n由于$a + b = 0$,我们可以得到$a = -b$。将这个值代入$2a = 2$,我们得到$2(-b) = 2$,因此$b = -1$。因此,$a = 1$。答案选项:A'),
+ dict(role='HUMAN', prompt='问题: 一直角三角形的两边长分别为3和4.则第三边的长为____\nA. 5\nB. $\\sqrt 7$\nC. $\\sqrt 5$\nD. 5或者$\\sqrt 7$'),
+        dict(role='BOT', prompt='回答: 若3和4都是直角边,则第三边是斜边,长为$c = \\sqrt{3^2 + 4^2} = \\sqrt{9 + 16} = \\sqrt{25} = 5$;若4是斜边,3是直角边,则第三边也是直角边,长为$\\sqrt{4^2 - 3^2} = \\sqrt{16 - 9} = \\sqrt 7$。因此第三边的长为5或者$\\sqrt 7$。答案选项:D'),
+ dict(role='HUMAN', prompt='问题: {question}'),
+ dict(role='BOT', prompt='回答: {answer}'),
+ ],
+ 'single_choice_en': [
+ dict(role='HUMAN', prompt='Question: Given that $i$ is the imaginary unit and $z$ is a complex number, $2 + \\frac{1}{i} = z(3 + i)$, the point corresponding to $z$ in the complex plane is located in the ______\nA. First quadrant\nB. Second quadrant\nC. Third quadrant\nD. Fourth quadrant'),
+ dict(role='BOT', prompt='Response: D'),
+ dict(role='HUMAN', prompt='Question: The graph of the function $y = \\tan(\\omega x - 1)$ is shifted 2 units to the left and coincides with the graph of the function $y = \\tan(\\omega x + 3)$. The minimum value of $\\omega$ is ______\nA. $2 - \\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
+ dict(role='BOT', prompt='Response: D'),
+ dict(role='HUMAN', prompt='Question: If $(1 + 2i)a + b = 2i$, where $a$ and $b$ are real numbers, then ______\nA. $a = 1, b = -1$\nB. $a = 1, b = 1$\nC. $a = -1, b = 1$\nD. $a = -1, b = -1$'),
+ dict(role='BOT', prompt='Response: A'),
+ dict(role='HUMAN', prompt='Question: The radius of a cone is 2 and its height is 4. A cylinder has its lower base on the base of the cone and its upper base on the lateral surface of the cone. When the lateral surface area of the cylinder is $4\\pi$, the volume of the cylinder is ______\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'),
+ dict(role='BOT', prompt='Response: B'),
+ dict(role='HUMAN', prompt='Question: {question}'),
+ dict(role='BOT', prompt='Response: {answer}'),
+ ],
+ 'single_choice_en_with_reasoning': [
+ dict(role='HUMAN', prompt='Question: Given that $i$ is the imaginary unit and $z$ is a complex number, $2 + \\frac{1}{i} = z(3 + i)$, the point corresponding to $z$ in the complex plane is located in the ______\nA. First quadrant\nB. Second quadrant\nC. Third quadrant\nD. Fourth quadrant'),
+ dict(role='BOT', prompt='Response: First, we multiply both sides of the equation by the conjugate of $(3 + i)$: $z = \\frac{2 - i}{3 + i} \\cdot \\frac{3 - i}{3 - i}$. We perform the multiplication: $z = \\frac{(2 - i)(3 - i)}{(3 + i)(3 - i)}$, $z = \\frac{6 - 2i - 3i + i^2}{9 - 3i + 3i - i^2}$. Since $i^2 = -1$, we can simplify this further: $z = \\frac{6 - 5i - 1}{9 + 1}$ $z = \\frac{1}{2} - \\frac{1}{2}i$. In the complex plane, this point is located in the fourth quadrant, because its real part is positive ($\\frac{1}{2}$) and its imaginary part is negative ($-\\frac{1}{2}$)\nAnswer option: D'),
+ dict(role='HUMAN', prompt='Question: The graph of the function $y = \\tan(\\omega x - 1)$ is shifted 2 units to the left and coincides with the graph of the function $y = \\tan(\\omega x + 3)$. The minimum value of $\\omega$ is ______\nA. $2 - \\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
+        dict(role='BOT', prompt='Response: Shifting the graph of $y = \\tan(\\omega x - 1)$ two units to the left gives the new function $y = \\tan(\\omega(x + 2) - 1)$. In order for the graph of this new function to coincide with the graph of $y = \\tan(\\omega x + 3)$, we need to satisfy the following condition: $\\tan(\\omega(x + 2) - 1) = \\tan(\\omega x + 3)$. Therefore, we can write: $\\omega(x + 2) - 1 = \\omega x + 3 + k\\pi$. Solving this equation, we get: $\\omega x + 2\\omega - 1 = \\omega x + 3 + k\\pi$. $2\\omega - 1 = 3 + k\\pi$. $\\omega = 2 + \\frac{k\\pi}{2}$. Since $\\omega > 0$, we need to find the smallest positive integer $k$ that satisfies the condition. When $k = 0$, we get $\\omega = 2$, which is the smallest positive value that satisfies the condition. Therefore, the minimum value is 2. Answer option: D'),
+ dict(role='HUMAN', prompt='Question: If $(1 + 2i)a + b = 2i$, where $a$ and $b$ are real numbers, then ______\nA. $a = 1, b = -1$\nB. $a = 1, b = 1$\nC. $a = -1, b = 1$\nD. $a = -1, b = -1$'),
+ dict(role='BOT', prompt='Response: We can write this equation as: $a + 2ai + b = 2i$, and then separate the real and imaginary parts: $a + b = 0$. $2a = 2$. Since $a + b = 0$, we can get $a = -b$. Substituting this value into $2a = 2$, we get $2(-b) = 2$, so $b = -1$. Therefore, $a = 1$. Answer option: A'),
+ dict(role='HUMAN', prompt='Question: The radius of a cone is 2 and its height is 4. A cylinder has its lower base on the base of the cone and its upper base on the lateral surface of the cone. When the lateral surface area of the cylinder is $4\\pi$, the volume of the cylinder is ______\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'),
+        dict(role='BOT', prompt='Response: Let the cylinder have radius $r$ and height $h$. The upper base of the cylinder lies on the lateral surface of the cone, so by similar triangles $r = 2 - \\frac{h}{2}$, i.e. $h = 4 - 2r$. The lateral surface area of the cylinder is $2\\pi r h = 2\\pi r(4 - 2r) = 4\\pi$, so $r(4 - 2r) = 2$, which simplifies to $r^2 - 2r + 1 = 0$ and gives $r = 1$, $h = 2$. The volume of the cylinder is $\\pi r^2 h = \\pi \\cdot 1^2 \\cdot 2 = 2\\pi$. Answer option: B'),
+ dict(role='HUMAN', prompt='Question: {question}'),
+ dict(role='BOT', prompt='Response: {answer}'),
+ ],
+ 'cloze_cn': [
+ dict(role='HUMAN', prompt='问题: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'),
+ dict(role='BOT', prompt='回答: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9'),
+ dict(role='HUMAN', prompt='问题: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
+ dict(role='BOT', prompt='回答: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29'),
+ dict(role='HUMAN', prompt='问题: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'),
+ dict(role='BOT', prompt='回答: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33'),
+ dict(role='HUMAN', prompt='问题: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'),
+ dict(role='BOT', prompt='回答: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8'),
+ dict(role='HUMAN', prompt='问题: {question}'),
+ dict(role='BOT', prompt='回答: {answer}'),
+ ],
+ 'cloze_en': [
+ dict(role='HUMAN', prompt='Question: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
+ dict(role='BOT', prompt='Response: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.'),
+ dict(role='HUMAN', prompt='Question: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
+ dict(role='BOT', prompt='Response: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.'),
+ dict(role='HUMAN', prompt='Question: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
+ dict(role='BOT', prompt='Response: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.'),
+ dict(role='HUMAN', prompt='Question: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
+ dict(role='BOT', prompt='Response: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.'),
+ dict(role='HUMAN', prompt='Question: {question}'),
+ dict(role='BOT', prompt='Response: {answer}'),
+ ],
+}
+
+mathbench_sets = {
+ # Practice Part
+ 'college': ['single_choice_cn', 'single_choice_en'],
+ 'high': ['single_choice_cn', 'single_choice_en'],
+ 'middle': ['single_choice_cn', 'single_choice_en'],
+ 'primary': ['cloze_cn', 'cloze_en'],
+ 'arithmetic': ['cloze_en'],
+ # Theory part
+ 'college_knowledge': ['single_choice_cn', 'single_choice_en'],
+ 'high_knowledge': ['single_choice_cn', 'single_choice_en'],
+ 'middle_knowledge': ['single_choice_cn', 'single_choice_en'],
+ 'primary_knowledge': ['single_choice_cn', 'single_choice_en'],
+}
diff --git a/configs/datasets/TheoremQA/README.md b/configs/datasets/TheoremQA/README.md
new file mode 100644
index 00000000..ab5e57f1
--- /dev/null
+++ b/configs/datasets/TheoremQA/README.md
@@ -0,0 +1,69 @@
+# TheoremQA
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug
+```
+
+## Base Models
+
+| model | TheoremQA |
+|:------------------------:|------------:|
+| llama-7b-turbomind | 10.25 |
+| llama-13b-turbomind | 11.25 |
+| llama-30b-turbomind | 14.25 |
+| llama-65b-turbomind | 15.62 |
+| llama-2-7b-turbomind | 12.62 |
+| llama-2-13b-turbomind | 11.88 |
+| llama-2-70b-turbomind | 15.62 |
+| llama-3-8b-turbomind | 20.25 |
+| llama-3-70b-turbomind | 33.62 |
+| internlm2-1.8b-turbomind | 10.50 |
+| internlm2-7b-turbomind | 21.88 |
+| internlm2-20b-turbomind | 26.00 |
+| qwen-1.8b-turbomind | 9.38 |
+| qwen-7b-turbomind | 15.00 |
+| qwen-14b-turbomind | 21.62 |
+| qwen-72b-turbomind | 27.12 |
+| qwen1.5-0.5b-hf | 5.88 |
+| qwen1.5-1.8b-hf | 12.00 |
+| qwen1.5-4b-hf | 13.75 |
+| qwen1.5-7b-hf | 4.25 |
+| qwen1.5-14b-hf | 12.62 |
+| qwen1.5-32b-hf | 26.62 |
+| qwen1.5-72b-hf | 26.62 |
+| qwen1.5-moe-a2-7b-hf | 7.50 |
+| mistral-7b-v0.1-hf | 17.00 |
+| mistral-7b-v0.2-hf | 16.25 |
+| mixtral-8x7b-v0.1-hf | 24.12 |
+| mixtral-8x22b-v0.1-hf | 36.75 |
+| yi-6b-hf | 13.88 |
+| yi-34b-hf | 24.75 |
+| deepseek-7b-base-hf | 12.38 |
+| deepseek-67b-base-hf | 21.25 |
+
+## Chat Models
+
+| model | TheoremQA |
+|:-----------------------------:|------------:|
+| qwen1.5-0.5b-chat-hf | 9.00 |
+| qwen1.5-1.8b-chat-hf | 9.25 |
+| qwen1.5-4b-chat-hf | 13.88 |
+| qwen1.5-7b-chat-hf | 12.25 |
+| qwen1.5-14b-chat-hf | 13.63 |
+| qwen1.5-32b-chat-hf | 19.25 |
+| qwen1.5-72b-chat-hf | 22.75 |
+| qwen1.5-110b-chat-hf | 17.50 |
+| internlm2-chat-1.8b-hf | 13.63 |
+| internlm2-chat-1.8b-sft-hf | 12.88 |
+| internlm2-chat-7b-hf | 18.50 |
+| internlm2-chat-7b-sft-hf | 18.75 |
+| internlm2-chat-20b-hf | 23.00 |
+| internlm2-chat-20b-sft-hf | 25.12 |
+| llama-3-8b-instruct-hf | 19.38 |
+| llama-3-70b-instruct-hf | 36.25 |
+| llama-3-8b-instruct-lmdeploy | 19.62 |
+| llama-3-70b-instruct-lmdeploy | 34.50 |
+| mistral-7b-instruct-v0.1-hf | 12.62 |
+| mistral-7b-instruct-v0.2-hf | 11.38 |
+| mixtral-8x7b-instruct-v0.1-hf | 26.00 |
diff --git a/configs/datasets/bbh/README.md b/configs/datasets/bbh/README.md
new file mode 100644
index 00000000..34aee487
--- /dev/null
+++ b/configs/datasets/bbh/README.md
@@ -0,0 +1,250 @@
+# BBH
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets bbh_gen_98fba6 --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets bbh_gen_5b92b0 --debug
+```
+
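+Note that the base and chat runs use different prompt configs (`bbh_gen_98fba6` vs `bbh_gen_5b92b0`). The aggregate `bbh` column is, to our understanding, the unweighted mean of the 27 per-subtask accuracies listed under "Details"; a minimal sketch is below (the two scores shown are excerpts from the llama-7b-turbomind row, so the printed value only matches the table once all 27 subtasks are filled in):
+
+```python
+# Hedged sketch: aggregate BBH score as the plain mean of per-subtask
+# accuracies (assumption: no per-subtask weighting is applied).
+subtask_scores = {
+    'temporal_sequences': 23.60,
+    'disambiguation_qa': 46.00,
+    # ... the remaining 25 subtasks from the Details tables ...
+}
+bbh = sum(subtask_scores.values()) / len(subtask_scores)
+print(f'bbh = {bbh:.2f}')
+```
+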
+## Base Models
+
+| model | bbh |
+|:------------------------:|------:|
+| llama-7b-turbomind | 33.34 |
+| llama-13b-turbomind | 37.99 |
+| llama-30b-turbomind | 49.86 |
+| llama-65b-turbomind | 58.26 |
+| llama-2-7b-turbomind | 38.27 |
+| llama-2-13b-turbomind | 45.68 |
+| llama-2-70b-turbomind | 64.78 |
+| llama-3-8b-turbomind | 59.69 |
+| llama-3-70b-turbomind | 79.16 |
+| internlm2-1.8b-turbomind | 36.03 |
+| internlm2-7b-turbomind | 63.56 |
+| internlm2-20b-turbomind | 71.29 |
+| qwen-1.8b-turbomind | 22.53 |
+| qwen-7b-turbomind | 45.89 |
+| qwen-14b-turbomind | 56.75 |
+| qwen-72b-turbomind | 63.35 |
+| qwen1.5-0.5b-hf | 20.54 |
+| qwen1.5-1.8b-hf | 27.01 |
+| qwen1.5-4b-hf | 34.81 |
+| qwen1.5-7b-hf | 39.87 |
+| qwen1.5-14b-hf | 50.38 |
+| qwen1.5-32b-hf | 67.47 |
+| qwen1.5-72b-hf | 58.81 |
+| qwen1.5-moe-a2-7b-hf | 39.46 |
+| mistral-7b-v0.1-hf | 56.71 |
+| mistral-7b-v0.2-hf | 57.32 |
+| mixtral-8x7b-v0.1-hf | 68.46 |
+| mixtral-8x22b-v0.1-hf | 79.48 |
+| yi-6b-hf | 44.82 |
+| yi-34b-hf | 66.37 |
+| deepseek-7b-base-hf | 42.88 |
+| deepseek-67b-base-hf | 71.86 |
+
+### Details
+
+| model | temporal_sequences | disambiguation_qa | date_understanding | tracking_shuffled_objects_three_objects | penguins_in_a_table | geometric_shapes | snarks | ruin_names | tracking_shuffled_objects_seven_objects |
+|:------------------------:|---------------------:|--------------------:|---------------------:|------------------------------------------:|----------------------:|-------------------:|---------:|-------------:|------------------------------------------:|
+| llama-7b-turbomind | 23.60 | 46.00 | 44.80 | 36.40 | 30.14 | 0.00 | 46.07 | 21.60 | 15.20 |
+| llama-13b-turbomind | 16.80 | 50.00 | 56.80 | 36.40 | 43.15 | 0.00 | 60.67 | 29.20 | 15.20 |
+| llama-30b-turbomind | 33.60 | 60.00 | 76.40 | 29.20 | 57.53 | 0.00 | 59.55 | 62.40 | 17.20 |
+| llama-65b-turbomind | 84.00 | 76.00 | 84.40 | 50.00 | 65.75 | 0.00 | 62.92 | 69.60 | 31.60 |
+| llama-2-7b-turbomind | 12.00 | 46.80 | 60.00 | 34.00 | 32.19 | 0.00 | 49.44 | 32.80 | 18.40 |
+| llama-2-13b-turbomind | 24.00 | 40.80 | 73.20 | 36.00 | 45.89 | 0.00 | 55.06 | 37.60 | 22.40 |
+| llama-2-70b-turbomind | 75.60 | 66.80 | 88.80 | 73.60 | 69.86 | 0.00 | 73.60 | 60.80 | 57.60 |
+| llama-3-8b-turbomind | 65.60 | 42.00 | 78.80 | 56.80 | 69.86 | 0.00 | 56.18 | 66.00 | 30.80 |
+| llama-3-70b-turbomind | 100.00 | 82.80 | 91.60 | 100.00 | 86.30 | 0.00 | 81.46 | 77.20 | 94.40 |
+| internlm2-1.8b-turbomind | 31.20 | 44.00 | 60.00 | 36.00 | 35.62 | 0.00 | 44.94 | 27.20 | 12.80 |
+| internlm2-7b-turbomind | 94.80 | 75.60 | 86.40 | 53.60 | 69.18 | 0.00 | 59.55 | 68.00 | 46.00 |
+| internlm2-20b-turbomind | 98.40 | 83.60 | 84.00 | 72.00 | 71.92 | 0.00 | 81.46 | 78.40 | 74.40 |
+| qwen-1.8b-turbomind | 26.40 | 39.60 | 33.20 | 28.40 | 28.08 | 0.00 | 44.94 | 21.60 | 12.40 |
+| qwen-7b-turbomind | 38.80 | 42.80 | 64.40 | 30.80 | 45.89 | 0.00 | 55.62 | 44.00 | 14.40 |
+| qwen-14b-turbomind | 57.60 | 59.20 | 67.20 | 46.40 | 67.12 | 0.00 | 51.12 | 63.60 | 30.40 |
+| qwen-72b-turbomind | 72.00 | 66.80 | 77.60 | 81.20 | 84.93 | 0.00 | 78.09 | 67.20 | 63.60 |
+| qwen1.5-0.5b-hf | 15.20 | 37.20 | 20.40 | 30.40 | 18.49 | 8.40 | 44.94 | 11.20 | 14.00 |
+| qwen1.5-1.8b-hf | 27.60 | 40.80 | 36.00 | 24.40 | 32.19 | 0.00 | 50.56 | 20.80 | 11.20 |
+| qwen1.5-4b-hf | 10.40 | 44.40 | 47.20 | 36.80 | 44.52 | 24.80 | 46.63 | 20.80 | 14.80 |
+| qwen1.5-7b-hf | 37.20 | 42.40 | 52.00 | 52.40 | 56.85 | 6.80 | 48.31 | 23.60 | 18.40 |
+| qwen1.5-14b-hf | 38.80 | 62.80 | 73.60 | 24.80 | 69.86 | 26.80 | 66.29 | 52.80 | 2.00 |
+| qwen1.5-32b-hf | 93.60 | 77.20 | 68.40 | 70.00 | 82.88 | 36.80 | 47.75 | 70.40 | 71.20 |
+| qwen1.5-72b-hf | 75.60 | 66.00 | 78.80 | 72.80 | 80.82 | 0.00 | 75.84 | 64.80 | 44.40 |
+| qwen1.5-moe-a2-7b-hf | 23.20 | 59.60 | 43.20 | 27.60 | 46.58 | 25.20 | 48.88 | 16.80 | 13.20 |
+| mistral-7b-v0.1-hf | 73.60 | 53.60 | 76.40 | 45.20 | 56.85 | 28.00 | 64.04 | 66.00 | 21.60 |
+| mistral-7b-v0.2-hf | 76.80 | 42.00 | 73.20 | 47.20 | 60.27 | 26.00 | 66.85 | 60.80 | 26.40 |
+| mixtral-8x7b-v0.1-hf | 89.60 | 70.80 | 84.80 | 81.20 | 70.55 | 25.60 | 66.29 | 71.20 | 58.80 |
+| mixtral-8x22b-v0.1-hf | 98.80 | 77.60 | 92.00 | 98.80 | 83.56 | 35.60 | 80.34 | 79.20 | 82.00 |
+| yi-6b-hf | 32.80 | 46.40 | 64.40 | 34.40 | 47.26 | 28.80 | 60.11 | 45.60 | 14.00 |
+| yi-34b-hf | 86.00 | 76.00 | 84.80 | 54.80 | 67.81 | 24.80 | 73.60 | 66.00 | 65.60 |
+| deepseek-7b-base-hf | 27.60 | 42.00 | 64.40 | 31.20 | 40.41 | 33.60 | 52.25 | 46.00 | 13.20 |
+| deepseek-67b-base-hf | 95.60 | 75.60 | 86.40 | 86.40 | 76.71 | 39.20 | 76.40 | 77.20 | 82.00 |
+
+| model | tracking_shuffled_objects_five_objects | logical_deduction_three_objects | hyperbaton | logical_deduction_five_objects | logical_deduction_seven_objects | movie_recommendation | salient_translation_error_detection | reasoning_about_colored_objects | multistep_arithmetic_two |
+|:------------------------:|-----------------------------------------:|----------------------------------:|-------------:|---------------------------------:|----------------------------------:|-----------------------:|--------------------------------------:|----------------------------------:|---------------------------:|
+| llama-7b-turbomind | 18.40 | 42.80 | 58.00 | 23.20 | 13.20 | 40.00 | 16.40 | 30.40 | 0.00 |
+| llama-13b-turbomind | 16.00 | 48.80 | 53.60 | 30.40 | 16.40 | 61.60 | 11.20 | 44.80 | 0.80 |
+| llama-30b-turbomind | 22.40 | 66.40 | 73.20 | 43.60 | 31.60 | 84.40 | 43.60 | 57.60 | 2.80 |
+| llama-65b-turbomind | 41.60 | 79.20 | 74.40 | 48.40 | 39.20 | 91.20 | 40.40 | 67.20 | 20.00 |
+| llama-2-7b-turbomind | 17.20 | 54.80 | 51.60 | 32.80 | 23.60 | 74.40 | 19.60 | 45.60 | 1.20 |
+| llama-2-13b-turbomind | 23.20 | 63.60 | 52.40 | 46.00 | 42.00 | 68.00 | 21.60 | 62.00 | 2.00 |
+| llama-2-70b-turbomind | 72.40 | 86.40 | 84.40 | 55.20 | 43.20 | 95.60 | 50.80 | 76.80 | 20.80 |
+| llama-3-8b-turbomind | 40.80 | 76.40 | 93.20 | 45.20 | 36.80 | 88.80 | 53.60 | 72.80 | 30.80 |
+| llama-3-70b-turbomind | 99.20 | 94.00 | 98.00 | 58.40 | 42.80 | 93.60 | 63.60 | 88.40 | 79.20 |
+| internlm2-1.8b-turbomind | 16.80 | 47.60 | 63.60 | 21.60 | 12.00 | 69.20 | 16.80 | 45.20 | 5.60 |
+| internlm2-7b-turbomind | 51.20 | 78.80 | 90.40 | 52.00 | 41.20 | 95.60 | 58.80 | 74.40 | 44.40 |
+| internlm2-20b-turbomind | 81.20 | 95.60 | 83.60 | 62.40 | 48.00 | 94.80 | 57.60 | 75.60 | 72.80 |
+| qwen-1.8b-turbomind | 14.80 | 35.60 | 51.20 | 22.40 | 15.20 | 31.20 | 12.40 | 22.00 | 3.20 |
+| qwen-7b-turbomind | 20.80 | 54.80 | 76.00 | 37.60 | 27.60 | 74.80 | 41.20 | 57.60 | 23.60 |
+| qwen-14b-turbomind | 35.60 | 81.20 | 78.40 | 45.20 | 40.80 | 80.00 | 44.80 | 70.40 | 65.60 |
+| qwen-72b-turbomind | 66.40 | 89.20 | 90.40 | 60.00 | 50.80 | 81.60 | 56.40 | 88.00 | 70.40 |
+| qwen1.5-0.5b-hf | 20.00 | 34.80 | 46.80 | 18.80 | 15.60 | 24.40 | 15.20 | 16.00 | 1.20 |
+| qwen1.5-1.8b-hf | 18.00 | 32.80 | 66.00 | 18.80 | 11.20 | 24.80 | 13.60 | 27.60 | 4.80 |
+| qwen1.5-4b-hf | 18.40 | 56.40 | 56.80 | 30.00 | 20.80 | 40.80 | 46.80 | 44.80 | 41.20 |
+| qwen1.5-7b-hf | 32.40 | 58.40 | 67.20 | 36.00 | 28.00 | 62.80 | 49.20 | 60.40 | 48.00 |
+| qwen1.5-14b-hf | 7.20 | 78.40 | 75.20 | 41.20 | 27.60 | 74.40 | 46.00 | 81.60 | 8.00 |
+| qwen1.5-32b-hf | 71.60 | 88.40 | 97.60 | 58.80 | 46.40 | 68.00 | 51.60 | 88.40 | 66.80 |
+| qwen1.5-72b-hf | 61.20 | 88.40 | 96.00 | 60.40 | 49.20 | 86.40 | 34.80 | 86.80 | 53.60 |
+| qwen1.5-moe-a2-7b-hf | 22.80 | 49.20 | 68.00 | 28.40 | 22.40 | 58.40 | 40.80 | 42.00 | 33.60 |
+| mistral-7b-v0.1-hf | 30.40 | 79.60 | 70.80 | 54.40 | 42.80 | 77.60 | 47.20 | 70.00 | 30.40 |
+| mistral-7b-v0.2-hf | 32.80 | 74.00 | 77.60 | 48.00 | 40.40 | 84.00 | 49.20 | 76.00 | 35.20 |
+| mixtral-8x7b-v0.1-hf | 66.80 | 86.00 | 94.80 | 50.40 | 40.40 | 86.40 | 53.20 | 82.80 | 60.80 |
+| mixtral-8x22b-v0.1-hf | 87.60 | 95.20 | 99.60 | 70.00 | 54.00 | 95.20 | 58.40 | 95.20 | 82.00 |
+| yi-6b-hf | 17.20 | 49.20 | 72.40 | 34.40 | 28.00 | 76.80 | 32.40 | 56.80 | 9.20 |
+| yi-34b-hf | 67.20 | 85.60 | 79.60 | 49.20 | 39.60 | 86.80 | 56.00 | 81.20 | 33.20 |
+| deepseek-7b-base-hf | 17.60 | 51.20 | 72.40 | 28.80 | 20.00 | 78.40 | 28.80 | 46.80 | 1.60 |
+| deepseek-67b-base-hf | 82.40 | 90.00 | 78.80 | 60.40 | 44.80 | 88.80 | 56.80 | 86.40 | 38.00 |
+
+| model | navigate | dyck_languages | word_sorting | sports_understanding | boolean_expressions | object_counting | formal_fallacies | causal_judgement | web_of_lies |
+|:------------------------:|-----------:|-----------------:|---------------:|-----------------------:|----------------------:|------------------:|-------------------:|-------------------:|--------------:|
+| llama-7b-turbomind | 45.20 | 1.60 | 8.40 | 81.60 | 66.00 | 47.20 | 46.00 | 40.64 | 57.20 |
+| llama-13b-turbomind | 59.20 | 0.80 | 14.40 | 76.40 | 69.20 | 46.40 | 47.20 | 53.48 | 66.80 |
+| llama-30b-turbomind | 64.80 | 2.40 | 17.20 | 93.60 | 78.40 | 71.20 | 43.20 | 55.61 | 98.40 |
+| llama-65b-turbomind | 72.40 | 6.80 | 21.60 | 98.80 | 81.60 | 70.00 | 40.80 | 55.61 | 99.60 |
+| llama-2-7b-turbomind | 54.40 | 1.20 | 10.80 | 88.80 | 68.40 | 49.20 | 48.40 | 52.41 | 53.20 |
+| llama-2-13b-turbomind | 74.40 | 2.80 | 18.80 | 97.60 | 74.40 | 52.80 | 46.40 | 54.55 | 96.00 |
+| llama-2-70b-turbomind | 82.40 | 13.60 | 30.40 | 98.40 | 81.60 | 83.20 | 43.60 | 63.64 | 100.00 |
+| llama-3-8b-turbomind | 90.00 | 9.20 | 38.80 | 95.20 | 87.60 | 84.80 | 51.20 | 50.27 | 100.00 |
+| llama-3-70b-turbomind | 96.80 | 48.40 | 48.80 | 99.60 | 92.40 | 99.60 | 62.40 | 58.29 | 100.00 |
+| internlm2-1.8b-turbomind | 64.40 | 0.40 | 3.20 | 66.40 | 54.00 | 50.00 | 49.20 | 48.13 | 46.80 |
+| internlm2-7b-turbomind | 78.80 | 2.40 | 35.20 | 95.60 | 85.60 | 75.60 | 48.00 | 63.10 | 92.00 |
+| internlm2-20b-turbomind | 88.80 | 15.60 | 36.00 | 96.80 | 88.80 | 76.00 | 50.40 | 56.68 | 100.00 |
+| qwen-1.8b-turbomind | 50.00 | 0.00 | 0.80 | 62.80 | 29.20 | 2.40 | 6.00 | 12.83 | 1.60 |
+| qwen-7b-turbomind | 62.80 | 1.60 | 18.00 | 81.60 | 75.20 | 68.80 | 50.00 | 63.64 | 66.80 |
+| qwen-14b-turbomind | 75.60 | 1.20 | 26.80 | 88.80 | 80.40 | 74.40 | 50.00 | 53.48 | 96.80 |
+| qwen-72b-turbomind | 56.00 | 14.40 | 35.20 | 87.60 | 91.60 | 81.60 | 5.60 | 31.55 | 62.40 |
+| qwen1.5-0.5b-hf | 25.60 | 0.00 | 0.40 | 41.60 | 51.60 | 16.80 | 4.40 | 1.07 | 20.00 |
+| qwen1.5-1.8b-hf | 55.60 | 0.00 | 1.60 | 63.60 | 55.20 | 47.60 | 4.40 | 28.88 | 11.20 |
+| qwen1.5-4b-hf | 61.60 | 0.40 | 8.80 | 0.80 | 76.00 | 54.40 | 0.80 | 28.34 | 62.40 |
+| qwen1.5-7b-hf | 63.60 | 2.40 | 20.80 | 72.40 | 69.60 | 26.80 | 0.00 | 40.64 | 0.00 |
+| qwen1.5-14b-hf | 82.40 | 1.20 | 27.60 | 78.40 | 87.20 | 48.00 | 54.00 | 24.06 | 100.00 |
+| qwen1.5-32b-hf | 86.80 | 5.60 | 36.80 | 90.00 | 86.40 | 66.40 | 35.60 | 62.57 | 95.60 |
+| qwen1.5-72b-hf | 48.40 | 13.20 | 34.40 | 87.60 | 8.00 | 67.60 | 13.60 | 39.57 | 99.60 |
+| qwen1.5-moe-a2-7b-hf | 56.80 | 2.00 | 8.80 | 79.60 | 73.60 | 66.80 | 4.00 | 53.48 | 50.40 |
+| mistral-7b-v0.1-hf | 73.60 | 4.00 | 26.40 | 97.20 | 82.00 | 67.60 | 43.20 | 48.66 | 100.00 |
+| mistral-7b-v0.2-hf | 72.80 | 4.00 | 30.40 | 97.20 | 81.20 | 66.80 | 46.00 | 52.41 | 100.00 |
+| mixtral-8x7b-v0.1-hf | 85.60 | 18.80 | 33.60 | 98.00 | 90.80 | 85.20 | 49.60 | 55.61 | 90.80 |
+| mixtral-8x22b-v0.1-hf | 92.80 | 51.60 | 40.00 | 98.40 | 91.60 | 95.60 | 54.80 | 56.15 | 100.00 |
+| yi-6b-hf | 66.40 | 1.20 | 16.00 | 92.80 | 59.60 | 53.20 | 53.20 | 52.41 | 65.20 |
+| yi-34b-hf | 81.20 | 18.80 | 36.40 | 97.60 | 85.60 | 84.00 | 51.20 | 59.89 | 99.60 |
+| deepseek-7b-base-hf | 59.20 | 3.20 | 6.40 | 92.00 | 73.20 | 49.60 | 50.80 | 52.41 | 74.80 |
+| deepseek-67b-base-hf | 85.20 | 30.00 | 33.20 | 99.60 | 84.80 | 82.40 | 46.80 | 56.68 | 99.60 |
+
+## Chat Models
+
+| model | bbh |
+|:-----------------------------:|------:|
+| qwen1.5-0.5b-chat-hf | 24.12 |
+| qwen1.5-1.8b-chat-hf | 26.82 |
+| qwen1.5-4b-chat-hf | 43.15 |
+| qwen1.5-7b-chat-hf | 38.12 |
+| qwen1.5-14b-chat-hf | 55.38 |
+| qwen1.5-32b-chat-hf | 69.28 |
+| qwen1.5-72b-chat-hf | 72.97 |
+| qwen1.5-110b-chat-hf | 71.04 |
+| internlm2-chat-1.8b-hf | 37.69 |
+| internlm2-chat-1.8b-sft-hf | 37.12 |
+| internlm2-chat-7b-hf | 57.83 |
+| internlm2-chat-7b-sft-hf | 57.19 |
+| internlm2-chat-20b-hf | 68.24 |
+| internlm2-chat-20b-sft-hf | 69.38 |
+| llama-3-8b-instruct-hf | 52.85 |
+| llama-3-70b-instruct-hf | 82.42 |
+| llama-3-8b-instruct-lmdeploy | 53.54 |
+| llama-3-70b-instruct-lmdeploy | 82.58 |
+| mistral-7b-instruct-v0.1-hf | 32.88 |
+| mistral-7b-instruct-v0.2-hf | 48.84 |
+| mixtral-8x7b-instruct-v0.1-hf | 59.64 |
+
+### Details
+
+| model | temporal_sequences | disambiguation_qa | date_understanding | tracking_shuffled_objects_three_objects | penguins_in_a_table | geometric_shapes | snarks | ruin_names | tracking_shuffled_objects_seven_objects |
+|:-----------------------------:|---------------------:|--------------------:|---------------------:|------------------------------------------:|----------------------:|-------------------:|---------:|-------------:|------------------------------------------:|
+| qwen1.5-0.5b-chat-hf | 25.60 | 42.00 | 20.00 | 31.20 | 15.07 | 14.40 | 46.07 | 24.80 | 13.20 |
+| qwen1.5-1.8b-chat-hf | 28.80 | 36.00 | 30.40 | 35.20 | 19.18 | 7.60 | 46.63 | 24.00 | 9.60 |
+| qwen1.5-4b-chat-hf | 8.00 | 56.00 | 64.80 | 28.40 | 48.63 | 19.60 | 60.67 | 34.00 | 14.40 |
+| qwen1.5-7b-chat-hf | 39.60 | 37.60 | 62.40 | 36.80 | 60.96 | 30.80 | 54.49 | 38.00 | 20.00 |
+| qwen1.5-14b-chat-hf | 61.60 | 63.60 | 70.00 | 54.00 | 74.66 | 33.60 | 67.42 | 61.20 | 35.60 |
+| qwen1.5-32b-chat-hf | 94.40 | 77.60 | 78.00 | 66.00 | 93.84 | 46.00 | 82.58 | 73.60 | 61.60 |
+| qwen1.5-72b-chat-hf | 70.40 | 72.40 | 84.40 | 67.20 | 89.73 | 52.00 | 79.21 | 86.40 | 68.80 |
+| qwen1.5-110b-chat-hf | 74.80 | 71.20 | 82.80 | 74.80 | 89.04 | 48.00 | 90.45 | 87.60 | 73.60 |
+| internlm2-chat-1.8b-hf | 35.60 | 52.40 | 48.80 | 29.60 | 39.73 | 24.40 | 51.69 | 27.20 | 13.20 |
+| internlm2-chat-1.8b-sft-hf | 37.20 | 53.60 | 44.00 | 30.00 | 34.93 | 22.40 | 56.74 | 28.00 | 12.00 |
+| internlm2-chat-7b-hf | 72.00 | 66.40 | 73.60 | 65.20 | 60.27 | 50.00 | 62.92 | 52.40 | 44.40 |
+| internlm2-chat-7b-sft-hf | 67.20 | 66.80 | 58.00 | 63.20 | 48.63 | 45.60 | 64.04 | 59.60 | 42.80 |
+| internlm2-chat-20b-hf | 80.40 | 76.00 | 77.60 | 88.80 | 78.08 | 36.40 | 71.91 | 71.60 | 77.20 |
+| internlm2-chat-20b-sft-hf | 80.00 | 70.80 | 78.00 | 87.60 | 82.88 | 41.20 | 76.40 | 72.80 | 71.60 |
+| llama-3-8b-instruct-hf | 70.40 | 42.80 | 28.40 | 81.20 | 13.01 | 49.20 | 44.94 | 73.20 | 42.40 |
+| llama-3-70b-instruct-hf | 100.00 | 84.00 | 91.60 | 95.60 | 78.08 | 52.40 | 87.08 | 89.60 | 97.60 |
+| llama-3-8b-instruct-lmdeploy | 73.20 | 45.60 | 34.00 | 79.60 | 31.51 | 48.40 | 47.75 | 76.80 | 47.60 |
+| llama-3-70b-instruct-lmdeploy | 100.00 | 84.00 | 90.00 | 96.80 | 83.56 | 56.00 | 87.08 | 89.20 | 97.20 |
+| mistral-7b-instruct-v0.1-hf | 32.00 | 22.40 | 52.40 | 35.20 | 30.82 | 23.20 | 38.76 | 46.00 | 18.40 |
+| mistral-7b-instruct-v0.2-hf | 66.00 | 58.40 | 50.40 | 48.40 | 48.63 | 37.20 | 65.73 | 40.40 | 29.20 |
+| mixtral-8x7b-instruct-v0.1-hf | 63.20 | 68.40 | 65.20 | 60.00 | 78.08 | 40.40 | 74.16 | 64.00 | 46.00 |
+
+| model | tracking_shuffled_objects_five_objects | logical_deduction_three_objects | hyperbaton | logical_deduction_five_objects | logical_deduction_seven_objects | movie_recommendation | salient_translation_error_detection | reasoning_about_colored_objects | multistep_arithmetic_two |
+|:-----------------------------:|-----------------------------------------:|----------------------------------:|-------------:|---------------------------------:|----------------------------------:|-----------------------:|--------------------------------------:|----------------------------------:|---------------------------:|
+| qwen1.5-0.5b-chat-hf | 20.40 | 34.40 | 51.60 | 21.20 | 13.20 | 26.00 | 20.80 | 17.20 | 1.20 |
+| qwen1.5-1.8b-chat-hf | 18.00 | 34.80 | 48.40 | 21.20 | 16.40 | 34.80 | 24.00 | 28.80 | 4.40 |
+| qwen1.5-4b-chat-hf | 19.20 | 56.80 | 65.20 | 36.40 | 35.60 | 51.60 | 40.40 | 55.20 | 29.20 |
+| qwen1.5-7b-chat-hf | 31.60 | 58.80 | 53.20 | 35.60 | 27.20 | 56.00 | 44.80 | 62.00 | 50.00 |
+| qwen1.5-14b-chat-hf | 43.20 | 75.20 | 52.80 | 52.40 | 50.80 | 76.40 | 48.80 | 83.60 | 65.20 |
+| qwen1.5-32b-chat-hf | 68.40 | 84.00 | 81.20 | 57.20 | 46.00 | 78.80 | 54.40 | 86.00 | 86.00 |
+| qwen1.5-72b-chat-hf | 76.80 | 94.40 | 85.20 | 62.80 | 54.00 | 78.40 | 63.60 | 86.40 | 82.80 |
+| qwen1.5-110b-chat-hf | 79.20 | 91.60 | 88.80 | 61.20 | 50.00 | 82.40 | 59.60 | 88.80 | 78.00 |
+| internlm2-chat-1.8b-hf | 20.00 | 48.40 | 56.00 | 24.40 | 26.80 | 65.20 | 18.00 | 39.60 | 7.60 |
+| internlm2-chat-1.8b-sft-hf | 18.40 | 48.00 | 51.20 | 20.40 | 25.20 | 63.20 | 22.00 | 38.80 | 6.00 |
+| internlm2-chat-7b-hf | 48.40 | 75.20 | 84.80 | 42.00 | 36.80 | 79.60 | 53.20 | 65.60 | 26.40 |
+| internlm2-chat-7b-sft-hf | 44.00 | 72.40 | 85.60 | 41.60 | 37.20 | 82.40 | 55.60 | 52.80 | 32.00 |
+| internlm2-chat-20b-hf | 88.00 | 88.80 | 88.80 | 52.80 | 50.40 | 85.20 | 56.80 | 79.60 | 40.00 |
+| internlm2-chat-20b-sft-hf | 83.20 | 90.00 | 90.40 | 55.60 | 48.80 | 84.40 | 57.60 | 79.20 | 38.40 |
+| llama-3-8b-instruct-hf | 49.60 | 85.60 | 76.00 | 54.00 | 29.20 | 57.60 | 46.00 | 44.80 | 52.00 |
+| llama-3-70b-instruct-hf | 99.20 | 96.80 | 95.20 | 77.20 | 65.20 | 80.00 | 69.60 | 94.80 | 84.00 |
+| llama-3-8b-instruct-lmdeploy | 57.20 | 78.00 | 75.60 | 36.00 | 13.20 | 59.20 | 53.60 | 54.80 | 52.80 |
+| llama-3-70b-instruct-lmdeploy | 98.80 | 96.40 | 96.80 | 75.20 | 68.80 | 79.60 | 67.60 | 94.00 | 84.80 |
+| mistral-7b-instruct-v0.1-hf | 26.00 | 46.00 | 60.00 | 38.00 | 24.00 | 59.20 | 1.20 | 6.00 | 12.40 |
+| mistral-7b-instruct-v0.2-hf | 39.60 | 63.60 | 64.00 | 44.00 | 33.20 | 56.00 | 42.40 | 68.40 | 14.00 |
+| mixtral-8x7b-instruct-v0.1-hf | 46.40 | 71.60 | 88.80 | 48.00 | 36.80 | 60.00 | 50.00 | 81.20 | 59.20 |
+
+| model | navigate | dyck_languages | word_sorting | sports_understanding | boolean_expressions | object_counting | formal_fallacies | causal_judgement | web_of_lies |
+|:-----------------------------:|-----------:|-----------------:|---------------:|-----------------------:|----------------------:|------------------:|-------------------:|-------------------:|--------------:|
+| qwen1.5-0.5b-chat-hf | 45.60 | 0.00 | 1.20 | 17.20 | 50.40 | 16.40 | 11.60 | 42.78 | 27.60 |
+| qwen1.5-1.8b-chat-hf | 58.40 | 0.00 | 2.00 | 34.00 | 44.80 | 30.40 | 11.60 | 24.60 | 50.00 |
+| qwen1.5-4b-chat-hf | 64.00 | 3.20 | 6.80 | 80.40 | 77.60 | 48.80 | 41.20 | 55.61 | 63.20 |
+| qwen1.5-7b-chat-hf | 54.40 | 0.40 | 8.00 | 55.60 | 47.60 | 31.20 | 0.00 | 2.14 | 30.00 |
+| qwen1.5-14b-chat-hf | 74.40 | 6.40 | 26.40 | 72.40 | 76.40 | 61.60 | 0.80 | 25.67 | 81.20 |
+| qwen1.5-32b-chat-hf | 90.00 | 10.40 | 28.40 | 82.40 | 92.80 | 76.80 | 32.40 | 41.71 | 100.00 |
+| qwen1.5-72b-chat-hf | 81.20 | 18.40 | 37.60 | 95.20 | 92.80 | 76.00 | 50.40 | 63.64 | 100.00 |
+| qwen1.5-110b-chat-hf | 91.60 | 18.00 | 39.60 | 82.80 | 80.80 | 75.20 | 22.40 | 35.83 | 100.00 |
+| internlm2-chat-1.8b-hf | 63.20 | 0.00 | 6.00 | 58.00 | 56.80 | 48.80 | 54.80 | 52.94 | 48.40 |
+| internlm2-chat-1.8b-sft-hf | 63.20 | 0.00 | 5.60 | 58.00 | 56.80 | 50.00 | 52.40 | 56.68 | 47.60 |
+| internlm2-chat-7b-hf | 73.60 | 3.60 | 18.00 | 55.20 | 83.60 | 62.80 | 50.00 | 58.29 | 97.20 |
+| internlm2-chat-7b-sft-hf | 71.60 | 4.40 | 20.00 | 82.00 | 84.00 | 60.00 | 51.60 | 52.94 | 98.00 |
+| internlm2-chat-20b-hf | 82.40 | 8.00 | 36.00 | 55.60 | 84.40 | 78.00 | 50.40 | 59.36 | 100.00 |
+| internlm2-chat-20b-sft-hf | 81.60 | 10.40 | 36.40 | 89.20 | 82.40 | 80.40 | 48.40 | 55.61 | 100.00 |
+| llama-3-8b-instruct-hf | 82.80 | 8.80 | 37.20 | 94.40 | 78.80 | 89.60 | 45.20 | 24.06 | 25.60 |
+| llama-3-70b-instruct-hf | 95.20 | 18.80 | 49.20 | 98.00 | 94.00 | 90.00 | 73.20 | 68.98 | 100.00 |
+| llama-3-8b-instruct-lmdeploy | 83.60 | 10.00 | 40.40 | 96.00 | 77.20 | 89.20 | 43.60 | 37.43 | 3.20 |
+| llama-3-70b-instruct-lmdeploy | 95.60 | 22.40 | 48.80 | 96.80 | 91.60 | 87.20 | 72.00 | 69.52 | 100.00 |
+| mistral-7b-instruct-v0.1-hf | 70.80 | 0.80 | 5.20 | 68.80 | 69.60 | 51.60 | 3.20 | 12.30 | 33.60 |
+| mistral-7b-instruct-v0.2-hf | 62.40 | 4.00 | 15.60 | 81.20 | 70.40 | 50.40 | 32.00 | 34.76 | 98.40 |
+| mixtral-8x7b-instruct-v0.1-hf | 76.40 | 12.80 | 23.20 | 55.20 | 85.60 | 83.60 | 40.00 | 43.32 | 88.80 |
diff --git a/configs/datasets/ceval/README.md b/configs/datasets/ceval/README.md
new file mode 100644
index 00000000..6932b73b
--- /dev/null
+++ b/configs/datasets/ceval/README.md
@@ -0,0 +1,372 @@
+# C-Eval
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets ceval_internal_ppl_93e5ce --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets ceval_internal_gen_2daf24 --debug
+```
+
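+The `ceval-*-hard` columns average a small set of challenging STEM subjects; a minimal sketch, assuming the eight-subject "C-Eval Hard" grouping from the C-Eval paper and an unweighted mean (the subject scores shown are excerpts from the llama-7b-turbomind row in the Details tables):
+
+```python
+# Hedged sketch: "hard" score as the plain mean over the C-Eval Hard
+# subjects (assumption: this matches the paper's eight-subject grouping).
+hard_subjects = [
+    'advanced_mathematics', 'discrete_mathematics',
+    'probability_and_statistics', 'college_chemistry', 'college_physics',
+    'high_school_mathematics', 'high_school_chemistry', 'high_school_physics',
+]
+scores = {  # excerpt; fill in the remaining hard subjects from the tables
+    'advanced_mathematics': 26.01,
+    'discrete_mathematics': 27.45,
+}
+present = [s for s in hard_subjects if s in scores]
+hard = sum(scores[s] for s in present) / len(present)
+print(f'ceval-test-hard = {hard:.2f}')
+```
+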
+## Base Models
+
+| model | ceval-test | ceval-test-hard | ceval-test-stem | ceval-test-social-science | ceval-test-humanities | ceval-test-other | ceval-dev | ceval-dev-hard | ceval-dev-stem | ceval-dev-social-science | ceval-dev-humanities | ceval-dev-other |
+|:------------------------:|-------------:|------------------:|------------------:|----------------------------:|------------------------:|-------------------:|------------:|-----------------:|-----------------:|---------------------------:|-----------------------:|------------------:|
+| llama-7b-turbomind | 26.61 | 27.75 | 27.20 | 26.31 | 25.90 | 26.52 | 27.44 | 27.68 | 27.16 | 29.49 | 24.18 | 29.36 |
+| llama-13b-turbomind | 29.18 | 25.59 | 27.66 | 33.86 | 28.29 | 28.58 | 31.75 | 30.32 | 31.39 | 35.22 | 30.16 | 30.82 |
+| llama-30b-turbomind | 35.09 | 31.68 | 34.56 | 39.89 | 33.02 | 33.76 | 37.70 | 31.97 | 34.80 | 42.72 | 41.19 | 34.93 |
+| llama-65b-turbomind | 37.98 | 29.47 | 36.03 | 45.03 | 36.51 | 36.56 | 40.46 | 33.76 | 36.37 | 46.47 | 42.26 | 40.63 |
+| llama-2-7b-turbomind | 30.13 | 26.26 | 29.29 | 33.02 | 31.02 | 28.15 | 32.70 | 25.85 | 28.75 | 39.75 | 37.04 | 29.13 |
+| llama-2-13b-turbomind | 37.38 | 30.81 | 35.85 | 43.98 | 36.81 | 34.75 | 40.43 | 31.34 | 35.67 | 45.75 | 45.32 | 39.36 |
+| llama-2-70b-turbomind | 49.53 | 33.48 | 44.73 | 60.19 | 50.93 | 47.17 | 50.26 | 32.53 | 44.83 | 59.44 | 54.45 | 47.58 |
+| llama-3-8b-turbomind | 48.83 | 34.47 | 46.02 | 56.48 | 49.15 | 46.69 | 50.45 | 33.76 | 45.94 | 58.08 | 50.93 | 51.25 |
+| llama-3-70b-turbomind | 66.56 | 54.09 | 64.08 | 76.43 | 64.38 | 64.25 | 67.30 | 52.35 | 62.67 | 77.89 | 69.76 | 63.65 |
+| internlm2-1.8b-turbomind | 44.79 | 33.93 | 41.19 | 54.26 | 47.15 | 40.35 | 46.64 | 33.00 | 38.62 | 57.28 | 51.30 | 46.89 |
+| internlm2-7b-turbomind | 63.54 | 45.32 | 58.10 | 76.40 | 66.94 | 58.32 | 64.23 | 40.09 | 54.37 | 76.88 | 70.11 | 64.77 |
+| internlm2-20b-turbomind | 67.28 | 50.15 | 62.33 | 79.59 | 70.55 | 61.82 | 66.73 | 42.50 | 59.25 | 79.98 | 73.43 | 61.56 |
+| qwen-1.8b-turbomind | 54.24 | 38.60 | 50.02 | 68.18 | 55.33 | 48.13 | 53.78 | 33.38 | 46.36 | 68.40 | 57.57 | 50.17 |
+| qwen-7b-turbomind | 62.06 | 42.73 | 56.21 | 77.12 | 65.28 | 55.76 | 63.23 | 36.99 | 54.74 | 78.55 | 68.94 | 59.02 |
+| qwen-14b-turbomind | 70.33 | 53.61 | 65.25 | 83.19 | 72.85 | 65.37 | 72.05 | 55.03 | 66.07 | 85.59 | 74.91 | 67.78 |
+| qwen-72b-turbomind | 83.25 | 66.78 | 78.44 | 91.75 | 83.86 | 83.63 | 83.60 | 63.68 | 78.05 | 90.25 | 87.13 | 84.13 |
+| qwen1.5-0.5b-hf | 48.36 | 35.55 | 44.72 | 62.00 | 48.51 | 42.41 | 50.43 | 37.00 | 46.28 | 62.64 | 48.11 | 49.18 |
+| qwen1.5-1.8b-hf | 58.67 | 40.98 | 53.91 | 74.52 | 58.51 | 53.06 | 59.38 | 43.02 | 53.45 | 75.88 | 60.06 | 54.47 |
+| qwen1.5-4b-hf | 66.55 | 48.50 | 61.45 | 81.12 | 67.90 | 61.22 | 66.46 | 43.12 | 56.76 | 82.89 | 67.61 | 68.03 |
+| qwen1.5-7b-hf | 72.49 | 52.90 | 66.77 | 85.50 | 74.37 | 69.19 | 73.57 | 49.16 | 66.32 | 84.23 | 77.30 | 73.34 |
+| qwen1.5-14b-hf | 76.93 | 60.50 | 72.08 | 88.81 | 77.95 | 73.94 | 77.86 | 54.81 | 71.55 | 86.79 | 82.86 | 76.23 |
+| qwen1.5-32b-hf | 82.50 | 66.67 | 77.97 | 90.93 | 83.66 | 81.88 | 82.79 | 71.06 | 80.01 | 89.02 | 83.36 | 81.62 |
+| qwen1.5-72b-hf | 83.03 | 65.09 | 77.90 | 91.47 | 83.85 | 83.86 | 83.72 | 64.09 | 77.26 | 91.87 | 87.64 | 84.14 |
+| qwen1.5-moe-a2-7b-hf | 76.67 | 51.37 | 68.89 | 88.33 | 77.15 | 79.73 | 77.90 | 51.25 | 67.27 | 89.28 | 83.16 | 81.60 |
+| mistral-7b-v0.1-hf | 43.76 | 33.85 | 42.23 | 49.97 | 41.10 | 43.54 | 47.54 | 33.97 | 44.74 | 54.80 | 51.52 | 42.06 |
+| mistral-7b-v0.2-hf | 42.81 | 32.84 | 41.00 | 50.19 | 39.45 | 42.77 | 46.44 | 31.67 | 42.89 | 54.50 | 48.75 | 43.23 |
+| mixtral-8x7b-v0.1-hf | 51.15 | 41.46 | 50.93 | 59.19 | 46.69 | 48.72 | 55.31 | 42.04 | 52.78 | 62.00 | 56.44 | 52.71 |
+| mixtral-8x22b-v0.1-hf | 58.13 | 48.31 | 58.01 | 66.94 | 53.60 | 54.86 | 60.50 | 45.67 | 57.44 | 71.27 | 61.31 | 55.47 |
+| yi-6b-hf | 70.78 | 43.72 | 60.54 | 83.29 | 75.39 | 73.40 | 73.13 | 46.87 | 63.14 | 85.52 | 78.70 | 74.45 |
+| yi-34b-hf | 80.93 | 58.51 | 73.48 | 89.24 | 83.65 | 84.18 | 81.62 | 56.95 | 71.64 | 89.73 | 87.49 | 86.53 |
+| deepseek-7b-base-hf | 43.68 | 28.90 | 37.03 | 53.55 | 50.14 | 40.34 | 45.07 | 31.94 | 38.81 | 56.68 | 47.10 | 43.85 |
+| deepseek-67b-base-hf | 66.66 | 44.25 | 57.89 | 79.02 | 72.36 | 65.66 | 66.65 | 38.62 | 56.65 | 79.56 | 73.72 | 66.01 |
+
+### Details on Test Split
+
+| model | computer_network | operating_system | computer_architecture | college_programming | college_physics | college_chemistry | advanced_mathematics | probability_and_statistics | discrete_mathematics | electrical_engineer | metrology_engineer | high_school_mathematics |
+|:------------------------:|-------------------:|-------------------:|------------------------:|----------------------:|------------------:|--------------------:|-----------------------:|-----------------------------:|-----------------------:|----------------------:|---------------------:|--------------------------:|
+| llama-7b-turbomind | 29.82 | 25.70 | 26.94 | 30.99 | 32.95 | 23.66 | 26.01 | 22.89 | 27.45 | 30.09 | 26.48 | 33.13 |
+| llama-13b-turbomind | 33.33 | 37.99 | 31.09 | 29.82 | 22.16 | 27.23 | 31.79 | 27.11 | 24.84 | 28.02 | 33.33 | 30.72 |
+| llama-30b-turbomind | 40.94 | 48.60 | 40.41 | 34.21 | 32.95 | 35.71 | 36.42 | 32.53 | 27.45 | 31.56 | 36.07 | 30.12 |
+| llama-65b-turbomind | 41.52 | 50.84 | 44.04 | 40.94 | 27.84 | 29.46 | 28.32 | 30.72 | 29.41 | 35.10 | 42.47 | 30.12 |
+| llama-2-7b-turbomind | 33.92 | 37.99 | 34.72 | 30.99 | 26.70 | 21.88 | 31.79 | 25.30 | 24.18 | 31.56 | 39.73 | 30.12 |
+| llama-2-13b-turbomind | 40.94 | 46.93 | 37.82 | 36.26 | 30.68 | 29.46 | 35.84 | 30.72 | 24.84 | 32.74 | 42.92 | 34.94 |
+| llama-2-70b-turbomind | 55.56 | 58.66 | 53.89 | 47.95 | 34.09 | 33.48 | 32.95 | 27.11 | 34.64 | 37.76 | 57.99 | 29.52 |
+| llama-3-8b-turbomind | 55.56 | 58.66 | 55.96 | 51.17 | 27.27 | 35.27 | 36.42 | 31.33 | 34.64 | 40.12 | 50.68 | 30.72 |
+| llama-3-70b-turbomind | 69.59 | 75.98 | 69.95 | 71.64 | 49.43 | 58.04 | 52.02 | 53.01 | 58.82 | 45.72 | 68.95 | 40.96 |
+| internlm2-1.8b-turbomind | 40.35 | 40.78 | 39.38 | 32.16 | 34.66 | 34.38 | 31.21 | 31.33 | 35.95 | 35.10 | 51.60 | 27.71 |
+| internlm2-7b-turbomind | 56.14 | 57.54 | 62.69 | 49.42 | 43.75 | 48.21 | 34.68 | 32.53 | 33.33 | 41.00 | 60.27 | 40.36 |
+| internlm2-20b-turbomind | 62.57 | 65.36 | 66.84 | 58.77 | 43.18 | 51.79 | 39.31 | 40.36 | 35.95 | 42.77 | 66.67 | 47.59 |
+| qwen-1.8b-turbomind | 46.20 | 41.90 | 46.63 | 36.84 | 40.34 | 36.61 | 27.75 | 28.92 | 32.68 | 36.58 | 57.08 | 30.12 |
+| qwen-7b-turbomind | 52.63 | 54.75 | 54.40 | 46.20 | 35.80 | 44.20 | 36.99 | 27.71 | 26.80 | 38.35 | 57.99 | 33.13 |
+| qwen-14b-turbomind | 58.48 | 64.80 | 59.07 | 54.68 | 45.45 | 57.59 | 45.09 | 33.73 | 39.22 | 49.26 | 67.58 | 45.78 |
+| qwen-72b-turbomind | 83.04 | 73.74 | 79.27 | 76.61 | 75.00 | 64.29 | 49.13 | 44.58 | 46.41 | 66.37 | 85.84 | 68.07 |
+| qwen1.5-0.5b-hf | 37.43 | 40.22 | 41.45 | 35.09 | 40.91 | 34.82 | 30.06 | 27.11 | 26.80 | 29.79 | 54.34 | 31.93 |
+| qwen1.5-1.8b-hf | 47.37 | 50.84 | 47.67 | 38.30 | 43.18 | 35.27 | 29.48 | 30.12 | 33.99 | 39.53 | 58.90 | 28.92 |
+| qwen1.5-4b-hf | 62.57 | 56.98 | 56.99 | 46.78 | 48.30 | 45.98 | 40.46 | 34.34 | 31.37 | 46.61 | 62.10 | 43.37 |
+| qwen1.5-7b-hf | 66.08 | 62.57 | 66.32 | 55.56 | 54.55 | 47.77 | 41.62 | 31.93 | 35.95 | 49.85 | 74.43 | 49.40 |
+| qwen1.5-14b-hf | 71.35 | 66.48 | 68.39 | 64.91 | 57.95 | 65.62 | 41.62 | 40.36 | 47.71 | 56.64 | 79.45 | 56.63 |
+| qwen1.5-32b-hf | 84.80 | 73.18 | 74.61 | 70.18 | 71.59 | 61.61 | 49.13 | 45.78 | 49.02 | 61.95 | 87.67 | 72.89 |
+| qwen1.5-72b-hf | 85.38 | 73.74 | 78.24 | 78.36 | 72.73 | 63.39 | 43.35 | 40.96 | 49.02 | 65.78 | 85.84 | 66.27 |
+| qwen1.5-moe-a2-7b-hf | 77.78 | 73.74 | 68.91 | 64.91 | 66.48 | 49.11 | 33.53 | 36.75 | 35.95 | 61.06 | 91.32 | 40.96 |
+| mistral-7b-v0.1-hf | 55.56 | 55.31 | 56.99 | 48.25 | 39.77 | 39.29 | 33.53 | 25.90 | 31.37 | 35.99 | 45.21 | 27.11 |
+| mistral-7b-v0.2-hf | 56.14 | 53.63 | 55.44 | 47.66 | 36.36 | 34.38 | 32.37 | 25.30 | 33.33 | 31.86 | 45.21 | 29.52 |
+| mixtral-8x7b-v0.1-hf | 62.57 | 64.80 | 60.10 | 60.53 | 38.64 | 42.41 | 40.46 | 37.35 | 45.75 | 35.99 | 60.27 | 34.94 |
+| mixtral-8x22b-v0.1-hf | 65.50 | 74.86 | 63.73 | 65.79 | 46.59 | 52.68 | 52.02 | 45.78 | 52.94 | 42.77 | 62.56 | 39.16 |
+| yi-6b-hf | 68.42 | 63.13 | 69.43 | 57.89 | 42.05 | 48.66 | 31.79 | 33.13 | 28.76 | 49.85 | 74.89 | 37.35 |
+| yi-34b-hf | 83.63 | 80.45 | 74.09 | 68.42 | 62.50 | 60.27 | 45.09 | 38.55 | 50.33 | 65.19 | 88.58 | 49.40 |
+| deepseek-7b-base-hf | 44.44 | 44.13 | 44.56 | 36.26 | 30.68 | 29.02 | 32.37 | 24.70 | 26.14 | 35.99 | 48.86 | 28.31 |
+| deepseek-67b-base-hf | 63.16 | 70.39 | 65.80 | 59.36 | 42.61 | 45.54 | 35.84 | 38.55 | 42.48 | 44.54 | 68.95 | 33.73 |
+
+| model | high_school_physics | high_school_chemistry | high_school_biology | middle_school_mathematics | middle_school_biology | middle_school_physics | middle_school_chemistry | veterinary_medicine | college_economics | business_administration | marxism | mao_zedong_thought |
+|:------------------------:|----------------------:|------------------------:|----------------------:|----------------------------:|------------------------:|------------------------:|--------------------------:|----------------------:|--------------------:|--------------------------:|----------:|---------------------:|
+| llama-7b-turbomind | 29.14 | 26.74 | 24.57 | 29.94 | 22.92 | 23.60 | 20.00 | 30.95 | 29.98 | 24.58 | 25.70 | 25.11 |
+| llama-13b-turbomind | 22.29 | 18.60 | 28.00 | 26.55 | 26.56 | 25.28 | 19.46 | 29.05 | 28.77 | 28.57 | 39.66 | 43.38 |
+| llama-30b-turbomind | 25.14 | 33.14 | 36.00 | 31.07 | 39.06 | 28.09 | 33.51 | 38.10 | 35.21 | 35.88 | 48.04 | 33.33 |
+| llama-65b-turbomind | 33.71 | 26.16 | 38.29 | 33.90 | 44.27 | 36.52 | 38.92 | 38.10 | 37.42 | 42.19 | 59.22 | 48.40 |
+| llama-2-7b-turbomind | 26.86 | 23.26 | 26.86 | 28.81 | 28.12 | 29.78 | 22.70 | 30.48 | 31.79 | 30.56 | 33.52 | 36.07 |
+| llama-2-13b-turbomind | 28.00 | 31.98 | 36.57 | 36.72 | 38.54 | 36.52 | 37.84 | 46.67 | 37.02 | 36.54 | 57.54 | 41.10 |
+| llama-2-70b-turbomind | 40.00 | 36.05 | 48.00 | 36.72 | 66.67 | 55.06 | 55.68 | 52.86 | 51.91 | 48.50 | 68.16 | 60.73 |
+| llama-3-8b-turbomind | 41.71 | 38.37 | 50.86 | 36.16 | 61.98 | 63.48 | 63.78 | 56.19 | 41.65 | 49.17 | 69.27 | 54.34 |
+| llama-3-70b-turbomind | 63.43 | 56.98 | 69.14 | 59.32 | 84.90 | 75.28 | 78.92 | 79.52 | 68.81 | 59.80 | 86.59 | 79.91 |
+| internlm2-1.8b-turbomind | 30.29 | 45.93 | 46.29 | 33.33 | 63.02 | 60.11 | 62.70 | 47.62 | 35.61 | 37.87 | 69.27 | 61.64 |
+| internlm2-7b-turbomind | 64.57 | 65.12 | 76.00 | 54.80 | 91.15 | 85.96 | 90.27 | 74.29 | 57.34 | 50.50 | 86.59 | 83.56 |
+| internlm2-20b-turbomind | 68.57 | 74.42 | 78.86 | 58.76 | 91.67 | 90.45 | 90.27 | 72.38 | 57.95 | 55.81 | 88.83 | 88.58 |
+| qwen-1.8b-turbomind | 55.43 | 56.98 | 61.14 | 54.80 | 85.42 | 84.83 | 85.41 | 54.76 | 43.06 | 44.19 | 83.80 | 79.91 |
+| qwen-7b-turbomind | 68.00 | 69.19 | 82.86 | 57.63 | 93.75 | 87.64 | 92.43 | 63.81 | 47.28 | 57.48 | 86.59 | 82.65 |
+| qwen-14b-turbomind | 78.86 | 83.14 | 92.57 | 67.23 | 96.88 | 95.51 | 96.76 | 73.33 | 56.94 | 64.45 | 91.62 | 86.76 |
+| qwen-72b-turbomind | 93.14 | 93.60 | 95.43 | 88.70 | 98.44 | 97.75 | 99.46 | 90.00 | 75.45 | 80.73 | 96.09 | 99.54 |
+| qwen1.5-0.5b-hf | 48.57 | 44.19 | 60.00 | 40.68 | 73.44 | 69.66 | 78.92 | 49.05 | 34.41 | 40.20 | 79.89 | 74.43 |
+| qwen1.5-1.8b-hf | 58.86 | 68.02 | 76.00 | 59.32 | 91.15 | 90.45 | 87.03 | 63.81 | 44.87 | 48.50 | 86.03 | 90.41 |
+| qwen1.5-4b-hf | 66.86 | 77.33 | 82.86 | 68.93 | 95.31 | 92.70 | 97.30 | 71.90 | 51.31 | 61.13 | 91.62 | 94.52 |
+| qwen1.5-7b-hf | 79.43 | 82.56 | 91.43 | 77.40 | 96.88 | 95.51 | 96.22 | 80.00 | 62.37 | 69.77 | 93.30 | 97.26 |
+| qwen1.5-14b-hf | 86.29 | 87.79 | 93.14 | 83.05 | 97.92 | 95.51 | 97.84 | 82.86 | 63.78 | 77.08 | 95.53 | 96.35 |
+| qwen1.5-32b-hf | 88.00 | 95.35 | 94.86 | 91.53 | 97.92 | 99.44 | 100.00 | 90.00 | 73.44 | 78.74 | 94.97 | 98.63 |
+| qwen1.5-72b-hf | 91.43 | 93.60 | 95.43 | 88.70 | 97.92 | 98.31 | 99.46 | 90.00 | 74.25 | 80.40 | 94.41 | 98.63 |
+| qwen1.5-moe-a2-7b-hf | 70.86 | 77.33 | 82.86 | 68.36 | 97.92 | 93.26 | 97.30 | 89.52 | 70.22 | 74.75 | 96.09 | 98.17 |
+| mistral-7b-v0.1-hf | 33.14 | 40.70 | 40.57 | 40.11 | 47.92 | 49.44 | 50.81 | 47.62 | 44.87 | 37.87 | 58.10 | 48.40 |
+| mistral-7b-v0.2-hf | 34.86 | 36.63 | 45.71 | 36.72 | 46.35 | 46.07 | 48.65 | 43.81 | 43.46 | 39.53 | 57.54 | 48.86 |
+| mixtral-8x7b-v0.1-hf | 49.71 | 42.44 | 53.71 | 47.46 | 62.50 | 61.24 | 60.00 | 57.62 | 52.52 | 44.52 | 68.72 | 57.99 |
+| mixtral-8x22b-v0.1-hf | 54.29 | 43.02 | 58.29 | 55.93 | 76.04 | 66.29 | 75.68 | 66.19 | 60.97 | 51.83 | 74.30 | 70.78 |
+| yi-6b-hf | 58.86 | 69.19 | 78.29 | 43.50 | 92.19 | 89.33 | 90.27 | 83.81 | 59.56 | 70.10 | 93.85 | 97.72 |
+| yi-34b-hf | 80.00 | 81.98 | 93.14 | 65.54 | 97.40 | 95.51 | 96.76 | 92.86 | 74.04 | 76.08 | 94.97 | 97.26 |
+| deepseek-7b-base-hf | 29.14 | 30.81 | 33.14 | 24.29 | 53.12 | 45.51 | 48.65 | 50.48 | 38.23 | 44.19 | 62.01 | 65.30 |
+| deepseek-67b-base-hf | 60.00 | 55.23 | 64.00 | 46.33 | 84.90 | 79.78 | 83.24 | 73.33 | 57.75 | 63.79 | 89.94 | 88.58 |
+
+| model | education_science | teacher_qualification | high_school_politics | high_school_geography | middle_school_politics | middle_school_geography | modern_chinese_history | ideological_and_moral_cultivation | logic | law | chinese_language_and_literature | art_studies |
+|:------------------------:|--------------------:|------------------------:|-----------------------:|------------------------:|-------------------------:|--------------------------:|-------------------------:|------------------------------------:|--------:|------:|----------------------------------:|--------------:|
+| llama-7b-turbomind | 22.96 | 31.58 | 25.57 | 29.78 | 22.80 | 25.00 | 21.70 | 21.51 | 25.00 | 26.24 | 22.49 | 25.84 |
+| llama-13b-turbomind | 29.26 | 30.83 | 33.52 | 36.52 | 34.72 | 33.33 | 24.06 | 40.12 | 26.47 | 33.48 | 30.14 | 29.87 |
+| llama-30b-turbomind | 37.41 | 46.37 | 32.95 | 38.20 | 50.78 | 40.74 | 28.77 | 45.93 | 33.33 | 32.13 | 39.23 | 22.82 |
+| llama-65b-turbomind | 39.63 | 51.13 | 31.82 | 39.89 | 58.03 | 42.59 | 34.91 | 55.23 | 39.71 | 30.32 | 37.80 | 32.89 |
+| llama-2-7b-turbomind | 27.78 | 34.34 | 31.82 | 34.83 | 35.23 | 34.26 | 28.77 | 38.95 | 32.35 | 33.94 | 27.27 | 30.87 |
+| llama-2-13b-turbomind | 41.48 | 47.37 | 37.50 | 37.64 | 50.78 | 52.78 | 43.40 | 48.84 | 32.35 | 38.46 | 36.36 | 30.20 |
+| llama-2-70b-turbomind | 57.78 | 69.17 | 50.57 | 58.43 | 69.95 | 66.67 | 50.94 | 72.09 | 50.98 | 42.53 | 44.98 | 52.01 |
+| llama-3-8b-turbomind | 56.30 | 65.41 | 47.16 | 56.18 | 64.25 | 61.11 | 55.66 | 67.44 | 41.67 | 40.27 | 45.45 | 50.34 |
+| llama-3-70b-turbomind | 72.22 | 85.46 | 75.00 | 74.72 | 84.97 | 76.85 | 75.00 | 76.16 | 59.31 | 52.94 | 62.68 | 68.46 |
+| internlm2-1.8b-turbomind | 47.41 | 61.40 | 55.11 | 47.75 | 61.66 | 64.81 | 61.79 | 63.95 | 32.35 | 32.58 | 48.33 | 36.58 |
+| internlm2-7b-turbomind | 66.67 | 85.96 | 78.98 | 74.72 | 91.71 | 87.96 | 80.66 | 80.23 | 42.16 | 50.23 | 64.11 | 70.13 |
+| internlm2-20b-turbomind | 69.26 | 89.22 | 83.52 | 80.34 | 90.67 | 91.67 | 83.02 | 85.47 | 49.02 | 54.30 | 72.25 | 73.15 |
+| qwen-1.8b-turbomind | 51.11 | 70.68 | 71.02 | 62.36 | 88.60 | 87.04 | 69.81 | 73.26 | 29.90 | 46.15 | 50.24 | 47.32 |
+| qwen-7b-turbomind | 57.41 | 83.71 | 88.64 | 79.78 | 93.26 | 94.44 | 75.47 | 79.07 | 42.16 | 47.96 | 59.33 | 65.10 |
+| qwen-14b-turbomind | 72.96 | 89.97 | 93.75 | 83.71 | 96.37 | 95.37 | 86.32 | 87.21 | 50.00 | 60.63 | 66.99 | 72.48 |
+| qwen-72b-turbomind | 85.56 | 96.24 | 95.45 | 93.26 | 97.93 | 97.22 | 92.45 | 91.86 | 67.65 | 76.92 | 75.12 | 83.89 |
+| qwen1.5-0.5b-hf | 43.33 | 63.16 | 65.91 | 56.18 | 82.90 | 79.63 | 68.87 | 70.35 | 28.43 | 37.56 | 39.23 | 32.21 |
+| qwen1.5-1.8b-hf | 57.41 | 76.44 | 81.25 | 75.84 | 92.75 | 91.67 | 79.72 | 81.98 | 34.31 | 47.96 | 47.85 | 43.62 |
+| qwen1.5-4b-hf | 65.93 | 87.47 | 86.93 | 82.58 | 94.30 | 95.37 | 84.91 | 84.30 | 40.20 | 62.90 | 58.85 | 58.72 |
+| qwen1.5-7b-hf | 69.26 | 91.98 | 90.91 | 89.89 | 95.85 | 94.44 | 89.15 | 87.21 | 48.04 | 67.87 | 63.16 | 68.12 |
+| qwen1.5-14b-hf | 78.89 | 94.99 | 94.89 | 91.57 | 96.89 | 98.15 | 91.04 | 88.37 | 57.84 | 69.68 | 66.99 | 73.83 |
+| qwen1.5-32b-hf | 83.70 | 95.99 | 93.75 | 94.38 | 98.45 | 97.22 | 90.57 | 91.28 | 70.10 | 76.92 | 76.56 | 80.87 |
+| qwen1.5-72b-hf | 84.44 | 96.49 | 96.59 | 93.82 | 98.45 | 97.22 | 92.92 | 91.28 | 66.67 | 76.92 | 74.16 | 85.23 |
+| qwen1.5-moe-a2-7b-hf | 80.74 | 95.49 | 89.20 | 89.33 | 94.82 | 94.44 | 92.45 | 91.28 | 52.45 | 75.57 | 67.94 | 79.87 |
+| mistral-7b-v0.1-hf | 45.19 | 59.15 | 43.75 | 49.44 | 56.48 | 56.48 | 45.28 | 58.14 | 37.75 | 38.91 | 40.67 | 34.56 |
+| mistral-7b-v0.2-hf | 45.93 | 58.65 | 38.07 | 48.31 | 63.21 | 58.33 | 41.98 | 54.07 | 35.78 | 40.27 | 38.28 | 32.21 |
+| mixtral-8x7b-v0.1-hf | 57.04 | 67.92 | 53.41 | 55.06 | 69.95 | 64.81 | 47.64 | 70.93 | 42.16 | 38.01 | 46.41 | 36.58 |
+| mixtral-8x22b-v0.1-hf | 60.37 | 72.68 | 64.77 | 65.17 | 77.20 | 71.30 | 57.08 | 75.00 | 49.51 | 43.44 | 52.63 | 49.33 |
+| yi-6b-hf | 79.26 | 92.48 | 77.27 | 76.40 | 92.75 | 93.52 | 89.15 | 90.12 | 60.78 | 74.66 | 61.24 | 74.16 |
+| yi-34b-hf | 84.81 | 96.24 | 88.07 | 88.20 | 96.37 | 96.30 | 91.98 | 91.28 | 75.00 | 78.73 | 80.38 | 82.89 |
+| deepseek-7b-base-hf | 52.22 | 70.18 | 47.16 | 51.12 | 60.62 | 44.44 | 58.49 | 66.86 | 31.86 | 37.56 | 53.11 | 61.07 |
+| deepseek-67b-base-hf | 76.67 | 89.22 | 77.27 | 78.65 | 89.64 | 78.70 | 85.85 | 84.30 | 50.00 | 64.25 | 69.38 | 84.23 |
+
+| model | professional_tour_guide | legal_professional | high_school_chinese | high_school_history | middle_school_history | civil_servant | sports_science | plant_protection | basic_medicine | clinical_medicine | urban_and_rural_planner | accountant |
+|:------------------------:|--------------------------:|---------------------:|----------------------:|----------------------:|------------------------:|----------------:|-----------------:|-------------------:|-----------------:|--------------------:|--------------------------:|-------------:|
+| llama-7b-turbomind | 29.70 | 23.72 | 27.53 | 30.22 | 30.92 | 27.04 | 22.78 | 28.64 | 28.00 | 25.00 | 26.32 | 29.80 |
+| llama-13b-turbomind | 25.94 | 20.93 | 25.84 | 29.67 | 24.64 | 29.60 | 26.67 | 29.15 | 33.71 | 25.50 | 28.47 | 28.44 |
+| llama-30b-turbomind | 29.32 | 27.91 | 30.34 | 36.26 | 37.20 | 36.13 | 36.11 | 38.69 | 34.29 | 29.50 | 38.52 | 29.35 |
+| llama-65b-turbomind | 28.95 | 30.70 | 30.90 | 44.51 | 35.75 | 36.60 | 45.56 | 39.20 | 37.71 | 30.00 | 39.47 | 37.02 |
+| llama-2-7b-turbomind | 29.70 | 30.23 | 24.72 | 29.67 | 34.78 | 30.07 | 31.11 | 31.16 | 30.29 | 25.50 | 31.34 | 27.31 |
+| llama-2-13b-turbomind | 30.83 | 32.56 | 24.16 | 42.31 | 45.41 | 32.87 | 36.67 | 45.23 | 38.29 | 33.50 | 35.17 | 34.31 |
+| llama-2-70b-turbomind | 53.76 | 38.14 | 30.34 | 58.79 | 65.70 | 43.82 | 51.11 | 58.29 | 49.71 | 42.00 | 49.76 | 46.28 |
+| llama-3-8b-turbomind | 52.63 | 42.33 | 27.53 | 51.65 | 65.70 | 44.52 | 54.44 | 51.26 | 46.86 | 43.00 | 46.41 | 45.15 |
+| llama-3-70b-turbomind | 72.93 | 52.56 | 32.58 | 71.98 | 83.57 | 56.88 | 69.44 | 78.89 | 76.00 | 67.50 | 57.89 | 59.14 |
+| internlm2-1.8b-turbomind | 51.50 | 38.14 | 25.84 | 56.04 | 71.50 | 47.32 | 35.00 | 43.72 | 42.29 | 39.00 | 41.15 | 36.57 |
+| internlm2-7b-turbomind | 72.56 | 53.49 | 52.25 | 79.67 | 90.82 | 62.00 | 62.78 | 64.32 | 66.86 | 59.50 | 55.74 | 53.50 |
+| internlm2-20b-turbomind | 74.06 | 54.42 | 56.18 | 81.87 | 92.27 | 61.77 | 68.33 | 69.85 | 68.00 | 63.50 | 60.77 | 58.92 |
+| qwen-1.8b-turbomind | 54.14 | 43.72 | 39.89 | 69.23 | 85.02 | 49.88 | 45.56 | 48.74 | 48.57 | 51.50 | 46.89 | 45.82 |
+| qwen-7b-turbomind | 71.05 | 48.37 | 53.93 | 81.87 | 93.72 | 59.67 | 54.44 | 62.31 | 58.29 | 57.50 | 50.24 | 56.66 |
+| qwen-14b-turbomind | 79.70 | 53.02 | 63.48 | 87.36 | 94.20 | 71.33 | 63.33 | 71.36 | 73.14 | 68.00 | 59.09 | 67.95 |
+| qwen-72b-turbomind | 90.23 | 77.21 | 79.21 | 91.76 | 96.14 | 77.86 | 86.11 | 85.43 | 91.43 | 90.50 | 76.08 | 86.68 |
+| qwen1.5-0.5b-hf | 44.36 | 36.74 | 39.33 | 58.24 | 78.26 | 43.36 | 40.00 | 45.23 | 41.71 | 42.50 | 43.54 | 43.12 |
+| qwen1.5-1.8b-hf | 59.40 | 47.91 | 37.08 | 72.53 | 91.30 | 53.61 | 53.33 | 51.26 | 49.71 | 58.00 | 51.20 | 56.21 |
+| qwen1.5-4b-hf | 65.04 | 58.60 | 55.62 | 83.52 | 94.20 | 62.00 | 63.89 | 65.33 | 65.71 | 64.00 | 55.26 | 61.40 |
+| qwen1.5-7b-hf | 78.57 | 66.51 | 66.85 | 87.91 | 94.69 | 68.07 | 65.00 | 64.82 | 77.14 | 77.50 | 60.77 | 74.49 |
+| qwen1.5-14b-hf | 83.08 | 72.09 | 70.22 | 90.11 | 94.20 | 69.46 | 73.89 | 70.35 | 82.29 | 83.00 | 65.31 | 78.33 |
+| qwen1.5-32b-hf | 87.59 | 78.14 | 79.78 | 92.86 | 95.65 | 78.32 | 80.56 | 79.90 | 90.29 | 89.00 | 77.27 | 86.68 |
+| qwen1.5-72b-hf | 91.35 | 76.74 | 79.21 | 91.76 | 96.14 | 79.25 | 85.56 | 86.93 | 92.00 | 90.00 | 75.84 | 86.91 |
+| qwen1.5-moe-a2-7b-hf | 88.35 | 75.81 | 51.12 | 79.12 | 94.69 | 67.37 | 80.56 | 73.37 | 87.43 | 84.00 | 78.23 | 82.39 |
+| mistral-7b-v0.1-hf | 40.23 | 39.07 | 24.16 | 41.21 | 52.17 | 41.49 | 45.00 | 52.26 | 45.14 | 42.00 | 42.58 | 44.02 |
+| mistral-7b-v0.2-hf | 36.84 | 34.88 | 23.03 | 43.96 | 52.66 | 40.79 | 50.00 | 50.75 | 45.14 | 40.50 | 42.58 | 40.86 |
+| mixtral-8x7b-v0.1-hf | 47.74 | 40.00 | 28.09 | 57.14 | 58.94 | 44.29 | 58.33 | 53.77 | 48.57 | 46.00 | 51.20 | 46.50 |
+| mixtral-8x22b-v0.1-hf | 59.02 | 41.86 | 29.78 | 60.99 | 71.01 | 50.82 | 57.78 | 67.34 | 62.29 | 52.00 | 53.35 | 55.98 |
+| yi-6b-hf | 85.34 | 67.91 | 53.93 | 80.22 | 91.79 | 65.97 | 72.22 | 72.36 | 82.29 | 84.50 | 69.86 | 71.56 |
+| yi-34b-hf | 94.36 | 76.74 | 65.73 | 87.91 | 95.17 | 79.25 | 85.56 | 90.95 | 90.86 | 92.00 | 76.79 | 82.39 |
+| deepseek-7b-base-hf | 65.79 | 29.30 | 32.58 | 47.80 | 67.15 | 37.76 | 44.44 | 52.26 | 43.43 | 36.50 | 41.15 | 37.02 |
+| deepseek-67b-base-hf | 83.83 | 58.60 | 45.51 | 79.67 | 90.34 | 62.47 | 70.56 | 70.85 | 81.14 | 71.50 | 61.72 | 60.05 |
+
+| model | fire_engineer | environmental_impact_assessment_engineer | tax_accountant | physician |
+|:------------------------:|----------------:|-------------------------------------------:|-----------------:|------------:|
+| llama-7b-turbomind | 22.34 | 24.91 | 29.12 | 27.77 |
+| llama-13b-turbomind | 24.11 | 30.25 | 27.77 | 30.70 |
+| llama-30b-turbomind | 28.72 | 31.67 | 31.83 | 36.57 |
+| llama-65b-turbomind | 28.37 | 39.15 | 33.63 | 35.44 |
+| llama-2-7b-turbomind | 22.70 | 24.91 | 25.51 | 29.80 |
+| llama-2-13b-turbomind | 25.53 | 35.94 | 29.35 | 35.44 |
+| llama-2-70b-turbomind | 36.52 | 52.67 | 36.12 | 52.60 |
+| llama-3-8b-turbomind | 35.46 | 49.82 | 41.31 | 55.30 |
+| llama-3-70b-turbomind | 48.58 | 64.41 | 52.60 | 75.40 |
+| internlm2-1.8b-turbomind | 32.27 | 42.35 | 39.05 | 45.15 |
+| internlm2-7b-turbomind | 46.81 | 55.16 | 47.63 | 67.27 |
+| internlm2-20b-turbomind | 45.04 | 62.63 | 51.47 | 69.75 |
+| qwen-1.8b-turbomind | 41.84 | 47.69 | 45.60 | 57.34 |
+| qwen-7b-turbomind | 41.84 | 54.80 | 48.08 | 69.53 |
+| qwen-14b-turbomind | 45.74 | 64.77 | 56.43 | 77.88 |
+| qwen-72b-turbomind | 80.50 | 74.73 | 81.04 | 89.62 |
+| qwen1.5-0.5b-hf | 39.36 | 41.28 | 38.37 | 48.08 |
+| qwen1.5-1.8b-hf | 45.74 | 49.47 | 51.69 | 63.43 |
+| qwen1.5-4b-hf | 50.35 | 51.60 | 58.69 | 75.17 |
+| qwen1.5-7b-hf | 58.51 | 65.84 | 67.04 | 81.94 |
+| qwen1.5-14b-hf | 63.83 | 67.26 | 72.23 | 87.36 |
+| qwen1.5-32b-hf | 74.47 | 73.31 | 80.14 | 90.74 |
+| qwen1.5-72b-hf | 79.79 | 75.09 | 81.04 | 90.07 |
+| qwen1.5-moe-a2-7b-hf | 74.82 | 77.58 | 79.68 | 91.65 |
+| mistral-7b-v0.1-hf | 32.27 | 45.91 | 37.70 | 50.56 |
+| mistral-7b-v0.2-hf | 32.62 | 44.13 | 36.79 | 46.28 |
+| mixtral-8x7b-v0.1-hf | 35.11 | 53.02 | 46.73 | 52.37 |
+| mixtral-8x22b-v0.1-hf | 38.65 | 56.23 | 49.21 | 59.82 |
+| yi-6b-hf | 67.38 | 68.68 | 69.53 | 83.07 |
+| yi-34b-hf | 77.66 | 83.27 | 77.43 | 89.84 |
+| deepseek-7b-base-hf | 30.50 | 38.79 | 35.67 | 46.28 |
+| deepseek-67b-base-hf | 46.81 | 65.12 | 54.40 | 77.65 |
+
+### Details on Dev Split
+
+## Chat Models
+
+| model | ceval-test | ceval-test-hard | ceval-test-stem | ceval-test-social-science | ceval-test-humanities | ceval-test-other | ceval-dev | ceval-dev-hard | ceval-dev-stem | ceval-dev-social-science | ceval-dev-humanities | ceval-dev-other |
+|:-----------------------------:|-------------:|------------------:|------------------:|----------------------------:|------------------------:|-------------------:|------------:|-----------------:|-----------------:|---------------------------:|-----------------------:|------------------:|
+| qwen1.5-0.5b-chat-hf | 36.88 | 28.83 | 34.49 | 43.46 | 37.35 | 34.76 | 38.58 | 33.90 | 33.63 | 43.81 | 41.79 | 39.59 |
+| qwen1.5-1.8b-chat-hf | 55.17 | 38.21 | 50.63 | 70.26 | 56.04 | 48.82 | 55.93 | 37.60 | 50.31 | 67.59 | 60.90 | 50.59 |
+| qwen1.5-4b-chat-hf | 61.54 | 44.79 | 56.86 | 75.84 | 62.13 | 56.46 | 62.76 | 38.32 | 55.39 | 79.53 | 65.67 | 58.00 |
+| qwen1.5-7b-chat-hf | 68.71 | 51.77 | 64.27 | 81.23 | 68.22 | 65.88 | 71.10 | 50.13 | 65.42 | 83.99 | 73.77 | 67.02 |
+| qwen1.5-14b-chat-hf | 74.80 | 56.54 | 69.46 | 87.47 | 76.46 | 71.32 | 76.35 | 52.08 | 69.68 | 86.70 | 80.56 | 74.87 |
+| qwen1.5-32b-chat-hf | 80.47 | 63.17 | 75.66 | 89.58 | 81.98 | 79.43 | 81.27 | 63.51 | 76.64 | 89.39 | 82.97 | 80.59 |
+| qwen1.5-72b-chat-hf | 81.53 | 63.62 | 75.86 | 90.74 | 83.18 | 81.84 | 82.88 | 62.44 | 77.54 | 89.80 | 86.11 | 83.07 |
+| qwen1.5-110b-chat-hf | 87.33 | 67.27 | 80.70 | 93.58 | 89.67 | 91.35 | 87.59 | 73.64 | 81.94 | 91.47 | 92.12 | 89.80 |
+| internlm2-chat-1.8b-hf | 47.04 | 34.81 | 43.28 | 59.34 | 48.24 | 41.50 | 48.51 | 36.75 | 42.23 | 57.79 | 54.83 | 45.15 |
+| internlm2-chat-1.8b-sft-hf | 47.19 | 35.34 | 43.49 | 59.56 | 48.30 | 41.58 | 48.75 | 35.83 | 42.04 | 59.80 | 54.84 | 44.83 |
+| internlm2-chat-7b-hf | 58.75 | 39.61 | 52.38 | 71.46 | 61.57 | 55.96 | 61.04 | 36.56 | 51.81 | 74.01 | 69.13 | 57.92 |
+| internlm2-chat-7b-sft-hf | 58.96 | 40.09 | 52.40 | 71.49 | 62.20 | 56.26 | 61.02 | 37.29 | 52.60 | 74.01 | 68.27 | 57.27 |
+| internlm2-chat-20b-hf | 63.12 | 42.65 | 56.21 | 75.64 | 67.15 | 60.27 | 63.45 | 34.96 | 52.84 | 79.27 | 71.50 | 60.32 |
+| internlm2-chat-20b-sft-hf | 63.16 | 42.70 | 56.19 | 75.74 | 67.20 | 60.37 | 63.54 | 34.96 | 52.57 | 80.33 | 71.42 | 60.34 |
+| llama-3-8b-instruct-hf | 50.90 | 34.54 | 46.73 | 58.73 | 49.24 | 53.04 | 52.55 | 36.37 | 48.47 | 58.03 | 53.26 | 54.26 |
+| llama-3-70b-instruct-hf | 67.38 | 54.02 | 65.16 | 76.83 | 62.29 | 67.92 | 67.92 | 54.50 | 66.85 | 76.80 | 65.98 | 63.72 |
+| llama-3-8b-instruct-lmdeploy | 49.92 | 34.75 | 46.19 | 58.49 | 47.68 | 51.14 | 50.27 | 33.32 | 46.25 | 56.93 | 49.02 | 52.76 |
+| llama-3-70b-instruct-lmdeploy | 66.41 | 52.76 | 64.72 | 75.31 | 61.36 | 66.44 | 68.21 | 52.28 | 65.86 | 75.06 | 68.37 | 66.09 |
+| mistral-7b-instruct-v0.1-hf | 36.76 | 27.76 | 35.55 | 42.41 | 34.45 | 36.12 | 40.04 | 30.21 | 35.77 | 45.15 | 40.99 | 42.22 |
+| mistral-7b-instruct-v0.2-hf | 40.38 | 30.26 | 38.82 | 47.66 | 37.08 | 39.91 | 43.00 | 25.97 | 38.60 | 47.44 | 48.15 | 41.82 |
+| mixtral-8x7b-instruct-v0.1-hf | 49.61 | 37.78 | 47.86 | 58.56 | 46.40 | 47.85 | 51.68 | 37.41 | 49.14 | 59.79 | 52.97 | 47.65 |
+
+### Details on Test Split
+
+| model | computer_network | operating_system | computer_architecture | college_programming | college_physics | college_chemistry | advanced_mathematics | probability_and_statistics | discrete_mathematics | electrical_engineer | metrology_engineer | high_school_mathematics |
+|:-----------------------------:|-------------------:|-------------------:|------------------------:|----------------------:|------------------:|--------------------:|-----------------------:|-----------------------------:|-----------------------:|----------------------:|---------------------:|--------------------------:|
+| qwen1.5-0.5b-chat-hf | 35.67 | 36.87 | 33.68 | 33.92 | 35.23 | 28.12 | 27.17 | 26.51 | 24.84 | 28.91 | 40.18 | 25.90 |
+| qwen1.5-1.8b-chat-hf | 46.78 | 47.49 | 50.78 | 39.18 | 41.48 | 31.25 | 32.95 | 27.71 | 28.10 | 34.81 | 55.71 | 27.11 |
+| qwen1.5-4b-chat-hf | 54.39 | 54.75 | 54.92 | 44.74 | 46.02 | 43.30 | 39.31 | 31.33 | 28.10 | 45.13 | 58.90 | 43.98 |
+| qwen1.5-7b-chat-hf | 60.82 | 60.34 | 63.21 | 55.85 | 48.86 | 45.09 | 46.24 | 36.14 | 39.22 | 47.49 | 70.32 | 45.78 |
+| qwen1.5-14b-chat-hf | 69.59 | 62.57 | 64.77 | 64.91 | 55.68 | 57.14 | 49.13 | 32.53 | 43.14 | 55.16 | 76.71 | 46.99 |
+| qwen1.5-32b-chat-hf | 81.87 | 74.30 | 73.58 | 71.35 | 63.07 | 60.71 | 50.87 | 46.99 | 47.06 | 59.29 | 83.11 | 60.84 |
+| qwen1.5-72b-chat-hf | 77.78 | 75.42 | 76.17 | 73.39 | 63.64 | 62.50 | 45.09 | 45.78 | 48.37 | 59.00 | 81.74 | 60.84 |
+| qwen1.5-110b-chat-hf | 83.63 | 86.03 | 81.87 | 77.49 | 76.70 | 67.86 | 49.13 | 47.59 | 55.56 | 79.94 | 95.89 | 62.05 |
+| internlm2-chat-1.8b-hf | 42.11 | 43.58 | 44.56 | 35.38 | 32.95 | 34.82 | 32.95 | 28.92 | 32.68 | 34.22 | 53.42 | 31.93 |
+| internlm2-chat-1.8b-sft-hf | 42.11 | 44.13 | 43.01 | 35.09 | 34.09 | 36.16 | 32.95 | 27.11 | 33.33 | 35.10 | 51.14 | 33.13 |
+| internlm2-chat-7b-hf | 59.65 | 60.89 | 58.03 | 51.46 | 36.93 | 43.75 | 36.99 | 29.52 | 36.60 | 39.82 | 63.47 | 38.55 |
+| internlm2-chat-7b-sft-hf | 59.06 | 61.45 | 56.48 | 52.63 | 39.77 | 41.52 | 36.99 | 27.71 | 39.22 | 40.12 | 62.10 | 40.36 |
+| internlm2-chat-20b-hf | 61.99 | 70.39 | 63.73 | 54.97 | 33.52 | 47.77 | 43.93 | 40.96 | 44.44 | 44.25 | 61.64 | 34.34 |
+| internlm2-chat-20b-sft-hf | 61.40 | 70.39 | 63.21 | 54.97 | 32.95 | 47.77 | 42.20 | 42.17 | 43.14 | 44.25 | 61.64 | 32.53 |
+| llama-3-8b-instruct-hf | 57.31 | 58.10 | 57.51 | 51.17 | 28.41 | 35.27 | 39.31 | 32.53 | 35.29 | 38.05 | 55.25 | 27.11 |
+| llama-3-70b-instruct-hf | 71.93 | 74.86 | 70.98 | 67.54 | 50.57 | 57.14 | 52.60 | 53.01 | 56.21 | 47.79 | 68.95 | 43.98 |
+| llama-3-8b-instruct-lmdeploy | 55.56 | 57.54 | 55.44 | 48.25 | 30.11 | 33.04 | 35.84 | 31.33 | 33.33 | 38.94 | 53.88 | 31.93 |
+| llama-3-70b-instruct-lmdeploy | 70.76 | 77.09 | 69.95 | 67.84 | 49.43 | 54.02 | 50.87 | 54.22 | 56.21 | 47.20 | 69.86 | 42.17 |
+| mistral-7b-instruct-v0.1-hf | 49.12 | 47.49 | 43.52 | 39.18 | 32.39 | 28.57 | 29.48 | 24.10 | 28.10 | 37.46 | 44.29 | 23.49 |
+| mistral-7b-instruct-v0.2-hf | 47.95 | 53.07 | 52.85 | 42.69 | 28.41 | 26.79 | 40.46 | 30.12 | 29.41 | 33.33 | 42.92 | 24.10 |
+| mixtral-8x7b-instruct-v0.1-hf | 58.48 | 62.57 | 58.03 | 56.43 | 38.64 | 36.16 | 39.31 | 34.94 | 37.91 | 34.81 | 55.71 | 28.31 |
+
+| model | high_school_physics | high_school_chemistry | high_school_biology | middle_school_mathematics | middle_school_biology | middle_school_physics | middle_school_chemistry | veterinary_medicine | college_economics | business_administration | marxism | mao_zedong_thought |
+|:-----------------------------:|----------------------:|------------------------:|----------------------:|----------------------------:|------------------------:|------------------------:|--------------------------:|----------------------:|--------------------:|--------------------------:|----------:|---------------------:|
+| qwen1.5-0.5b-chat-hf | 30.86 | 31.98 | 44.00 | 27.68 | 47.40 | 40.45 | 55.14 | 35.24 | 32.80 | 30.56 | 58.66 | 57.53 |
+| qwen1.5-1.8b-chat-hf | 54.86 | 62.21 | 69.14 | 53.67 | 82.81 | 83.15 | 85.41 | 58.10 | 44.06 | 49.83 | 82.12 | 82.65 |
+| qwen1.5-4b-chat-hf | 58.86 | 67.44 | 80.00 | 55.93 | 89.58 | 88.20 | 88.11 | 64.29 | 47.08 | 57.48 | 86.59 | 84.93 |
+| qwen1.5-7b-chat-hf | 72.00 | 80.81 | 84.00 | 70.06 | 95.31 | 94.94 | 95.14 | 73.81 | 56.94 | 66.11 | 91.62 | 89.04 |
+| qwen1.5-14b-chat-hf | 84.00 | 83.72 | 90.29 | 80.23 | 97.92 | 94.94 | 98.38 | 81.43 | 63.18 | 74.75 | 93.30 | 96.80 |
+| qwen1.5-32b-chat-hf | 85.71 | 90.12 | 93.71 | 85.31 | 97.92 | 98.31 | 100.00 | 89.05 | 69.82 | 75.75 | 93.85 | 97.72 |
+| qwen1.5-72b-chat-hf | 88.57 | 94.19 | 94.86 | 85.31 | 97.92 | 97.75 | 98.38 | 90.48 | 71.63 | 79.73 | 93.85 | 97.72 |
+| qwen1.5-110b-chat-hf | 86.86 | 92.44 | 94.29 | 85.31 | 98.44 | 98.88 | 98.92 | 95.24 | 78.87 | 86.38 | 95.53 | 99.54 |
+| internlm2-chat-1.8b-hf | 35.43 | 48.84 | 52.00 | 35.03 | 70.31 | 67.98 | 67.03 | 41.43 | 37.83 | 36.88 | 70.95 | 60.73 |
+| internlm2-chat-1.8b-sft-hf | 37.71 | 48.26 | 53.14 | 34.46 | 71.35 | 67.98 | 67.57 | 41.90 | 38.63 | 37.54 | 72.63 | 60.27 |
+| internlm2-chat-7b-hf | 46.29 | 48.26 | 60.57 | 46.89 | 78.65 | 71.91 | 71.35 | 68.10 | 50.30 | 50.83 | 77.09 | 76.26 |
+| internlm2-chat-7b-sft-hf | 46.86 | 48.26 | 61.14 | 45.76 | 77.60 | 71.91 | 71.35 | 67.62 | 50.10 | 50.50 | 77.09 | 75.80 |
+| internlm2-chat-20b-hf | 49.71 | 46.51 | 63.43 | 55.37 | 80.73 | 74.72 | 79.46 | 72.38 | 55.73 | 59.80 | 85.47 | 76.26 |
+| internlm2-chat-20b-sft-hf | 53.71 | 47.09 | 64.00 | 55.37 | 80.73 | 73.60 | 78.92 | 73.81 | 55.53 | 60.13 | 85.47 | 75.80 |
+| llama-3-8b-instruct-hf | 38.86 | 39.53 | 50.29 | 40.11 | 65.10 | 60.11 | 63.78 | 61.43 | 47.89 | 45.85 | 69.27 | 56.16 |
+| llama-3-70b-instruct-hf | 63.43 | 55.23 | 69.71 | 68.36 | 85.42 | 80.90 | 78.38 | 86.19 | 69.01 | 65.12 | 83.24 | 82.65 |
+| llama-3-8b-instruct-lmdeploy | 41.71 | 40.70 | 52.00 | 41.24 | 61.46 | 58.43 | 65.41 | 57.62 | 45.27 | 46.18 | 69.27 | 55.71 |
+| llama-3-70b-instruct-lmdeploy | 61.71 | 53.49 | 70.86 | 64.97 | 88.02 | 83.71 | 77.30 | 84.76 | 68.21 | 60.80 | 80.45 | 79.91 |
+| mistral-7b-instruct-v0.1-hf | 27.43 | 28.49 | 36.00 | 28.25 | 40.10 | 42.70 | 43.78 | 37.14 | 32.80 | 37.87 | 41.90 | 48.86 |
+| mistral-7b-instruct-v0.2-hf | 33.14 | 29.65 | 44.00 | 31.07 | 47.92 | 44.94 | 49.19 | 44.29 | 37.02 | 40.86 | 53.63 | 48.40 |
+| mixtral-8x7b-instruct-v0.1-hf | 46.29 | 40.70 | 54.86 | 42.37 | 58.85 | 60.67 | 57.84 | 54.29 | 50.10 | 46.51 | 69.27 | 52.51 |
+
+| model | education_science | teacher_qualification | high_school_politics | high_school_geography | middle_school_politics | middle_school_geography | modern_chinese_history | ideological_and_moral_cultivation | logic | law | chinese_language_and_literature | art_studies |
+|:-----------------------------:|--------------------:|------------------------:|-----------------------:|------------------------:|-------------------------:|--------------------------:|-------------------------:|------------------------------------:|--------:|------:|----------------------------------:|--------------:|
+| qwen1.5-0.5b-chat-hf | 33.33 | 46.12 | 37.50 | 37.08 | 57.51 | 43.52 | 42.45 | 51.74 | 32.84 | 31.22 | 37.32 | 24.50 |
+| qwen1.5-1.8b-chat-hf | 54.07 | 72.43 | 74.43 | 66.85 | 89.12 | 87.04 | 77.36 | 76.16 | 38.24 | 44.34 | 46.89 | 40.94 |
+| qwen1.5-4b-chat-hf | 60.00 | 84.71 | 82.39 | 69.66 | 94.82 | 90.74 | 79.72 | 78.49 | 41.67 | 57.47 | 54.07 | 56.38 |
+| qwen1.5-7b-chat-hf | 66.30 | 90.73 | 84.66 | 80.90 | 94.30 | 91.67 | 82.55 | 84.88 | 38.73 | 60.18 | 60.77 | 63.42 |
+| qwen1.5-14b-chat-hf | 74.81 | 93.73 | 90.91 | 92.13 | 96.89 | 98.15 | 89.62 | 88.37 | 54.41 | 70.14 | 69.86 | 69.13 |
+| qwen1.5-32b-chat-hf | 80.37 | 94.49 | 93.75 | 94.94 | 97.93 | 97.22 | 90.09 | 90.70 | 68.63 | 78.73 | 73.21 | 77.52 |
+| qwen1.5-72b-chat-hf | 84.07 | 96.74 | 95.45 | 94.94 | 97.93 | 95.37 | 92.92 | 91.28 | 63.73 | 80.09 | 73.68 | 83.89 |
+| qwen1.5-110b-chat-hf | 90.37 | 96.99 | 96.02 | 95.51 | 98.45 | 98.15 | 93.87 | 94.19 | 81.37 | 86.88 | 84.69 | 90.94 |
+| internlm2-chat-1.8b-hf | 48.15 | 65.41 | 69.32 | 54.49 | 79.27 | 70.37 | 60.85 | 64.53 | 32.35 | 32.58 | 45.45 | 40.60 |
+| internlm2-chat-1.8b-sft-hf | 48.15 | 64.91 | 69.89 | 53.93 | 79.27 | 70.37 | 61.32 | 63.95 | 33.82 | 29.86 | 45.45 | 39.93 |
+| internlm2-chat-7b-hf | 66.67 | 85.21 | 73.30 | 66.85 | 91.19 | 76.85 | 70.28 | 75.58 | 42.16 | 50.68 | 60.77 | 70.47 |
+| internlm2-chat-7b-sft-hf | 67.04 | 85.21 | 73.86 | 66.85 | 90.67 | 77.78 | 71.70 | 75.00 | 42.16 | 51.13 | 60.29 | 72.15 |
+| internlm2-chat-20b-hf | 74.07 | 85.96 | 75.57 | 77.53 | 89.12 | 76.85 | 72.64 | 83.72 | 51.96 | 56.11 | 68.42 | 73.49 |
+| internlm2-chat-20b-sft-hf | 73.70 | 85.46 | 76.70 | 78.09 | 89.64 | 76.85 | 72.17 | 84.88 | 50.00 | 56.56 | 66.99 | 75.17 |
+| llama-3-8b-instruct-hf | 55.93 | 67.42 | 55.68 | 55.06 | 72.02 | 62.04 | 54.25 | 66.86 | 44.12 | 40.72 | 47.37 | 44.63 |
+| llama-3-70b-instruct-hf | 71.11 | 84.21 | 74.43 | 73.03 | 84.97 | 80.56 | 69.81 | 78.49 | 57.35 | 50.68 | 57.89 | 64.43 |
+| llama-3-8b-instruct-lmdeploy | 54.81 | 67.17 | 58.52 | 53.37 | 72.54 | 62.04 | 57.08 | 63.95 | 44.12 | 37.56 | 46.89 | 42.62 |
+| llama-3-70b-instruct-lmdeploy | 70.37 | 82.96 | 72.16 | 71.91 | 83.94 | 82.41 | 69.34 | 77.91 | 55.39 | 50.68 | 56.46 | 64.09 |
+| mistral-7b-instruct-v0.1-hf | 39.63 | 46.62 | 33.52 | 41.01 | 56.48 | 45.37 | 36.32 | 43.60 | 29.90 | 31.67 | 39.71 | 31.88 |
+| mistral-7b-instruct-v0.2-hf | 46.30 | 54.39 | 39.20 | 43.26 | 61.66 | 51.85 | 35.38 | 55.23 | 28.92 | 35.29 | 37.80 | 29.19 |
+| mixtral-8x7b-instruct-v0.1-hf | 58.52 | 66.17 | 56.82 | 57.30 | 66.32 | 62.04 | 48.11 | 66.28 | 41.67 | 37.10 | 46.41 | 35.91 |
+
+| model | professional_tour_guide | legal_professional | high_school_chinese | high_school_history | middle_school_history | civil_servant | sports_science | plant_protection | basic_medicine | clinical_medicine | urban_and_rural_planner | accountant |
+|:-----------------------------:|--------------------------:|---------------------:|----------------------:|----------------------:|------------------------:|----------------:|-----------------:|-------------------:|-----------------:|--------------------:|--------------------------:|-------------:|
+| qwen1.5-0.5b-chat-hf | 36.47 | 39.07 | 27.53 | 41.76 | 45.89 | 39.63 | 35.56 | 31.66 | 37.71 | 34.00 | 32.78 | 37.25 |
+| qwen1.5-1.8b-chat-hf | 56.02 | 45.58 | 39.33 | 67.03 | 84.54 | 49.42 | 48.89 | 51.76 | 47.43 | 50.50 | 45.69 | 52.14 |
+| qwen1.5-4b-chat-hf | 61.28 | 52.56 | 42.70 | 73.08 | 85.99 | 55.48 | 59.44 | 55.28 | 60.57 | 57.00 | 50.00 | 58.01 |
+| qwen1.5-7b-chat-hf | 73.31 | 56.28 | 58.99 | 82.97 | 88.41 | 64.57 | 66.67 | 63.82 | 77.14 | 75.50 | 57.42 | 69.07 |
+| qwen1.5-14b-chat-hf | 80.83 | 65.12 | 70.79 | 89.56 | 93.24 | 67.60 | 72.78 | 68.34 | 80.57 | 80.00 | 61.72 | 75.62 |
+| qwen1.5-32b-chat-hf | 87.59 | 72.56 | 76.40 | 90.66 | 95.65 | 74.36 | 80.00 | 80.40 | 86.86 | 84.00 | 74.88 | 85.33 |
+| qwen1.5-72b-chat-hf | 90.98 | 76.28 | 75.84 | 90.66 | 95.65 | 75.52 | 84.44 | 82.91 | 91.43 | 89.00 | 73.92 | 85.10 |
+| qwen1.5-110b-chat-hf | 95.11 | 88.37 | 82.58 | 91.76 | 96.62 | 87.65 | 91.67 | 90.95 | 93.71 | 95.00 | 87.08 | 91.87 |
+| internlm2-chat-1.8b-hf | 54.14 | 40.00 | 27.53 | 62.09 | 70.53 | 44.99 | 41.67 | 51.76 | 45.71 | 39.00 | 40.67 | 39.28 |
+| internlm2-chat-1.8b-sft-hf | 54.14 | 42.33 | 26.97 | 61.54 | 71.98 | 45.45 | 41.67 | 50.25 | 45.14 | 37.50 | 41.39 | 40.63 |
+| internlm2-chat-7b-hf | 70.68 | 44.19 | 34.83 | 73.63 | 84.06 | 51.98 | 57.22 | 68.34 | 66.86 | 57.50 | 54.55 | 50.11 |
+| internlm2-chat-7b-sft-hf | 71.80 | 44.65 | 37.64 | 73.63 | 84.06 | 51.98 | 57.78 | 67.84 | 65.71 | 60.50 | 54.55 | 50.11 |
+| internlm2-chat-20b-hf | 75.56 | 54.42 | 42.13 | 74.73 | 85.51 | 57.34 | 65.56 | 67.84 | 73.71 | 64.00 | 57.89 | 55.98 |
+| internlm2-chat-20b-sft-hf | 76.32 | 55.35 | 41.01 | 75.27 | 85.51 | 58.28 | 65.56 | 67.34 | 72.57 | 65.00 | 58.37 | 56.43 |
+| llama-3-8b-instruct-hf | 53.01 | 44.65 | 33.15 | 46.70 | 66.18 | 45.22 | 58.89 | 61.81 | 62.86 | 57.50 | 48.33 | 49.89 |
+| llama-3-70b-instruct-hf | 71.43 | 50.70 | 30.90 | 71.43 | 82.13 | 59.67 | 73.33 | 73.37 | 82.86 | 82.00 | 59.09 | 62.08 |
+| llama-3-8b-instruct-lmdeploy | 51.13 | 45.12 | 29.78 | 43.96 | 62.32 | 47.09 | 56.11 | 54.77 | 56.00 | 56.00 | 49.04 | 47.40 |
+| llama-3-70b-instruct-lmdeploy | 68.80 | 48.84 | 30.90 | 70.88 | 81.64 | 58.28 | 72.22 | 70.85 | 80.00 | 81.00 | 57.66 | 62.53 |
+| mistral-7b-instruct-v0.1-hf | 30.45 | 35.81 | 24.72 | 40.11 | 34.78 | 30.77 | 43.89 | 38.69 | 36.57 | 32.50 | 44.74 | 34.09 |
+| mistral-7b-instruct-v0.2-hf | 36.09 | 38.14 | 23.03 | 43.41 | 45.41 | 35.90 | 50.00 | 41.71 | 42.86 | 36.00 | 45.22 | 42.21 |
+| mixtral-8x7b-instruct-v0.1-hf | 47.37 | 44.65 | 30.34 | 51.65 | 60.87 | 42.19 | 53.89 | 58.29 | 52.00 | 47.00 | 48.56 | 44.02 |
+
+| model | fire_engineer | environmental_impact_assessment_engineer | tax_accountant | physician |
+|:-----------------------------:|----------------:|-------------------------------------------:|-----------------:|------------:|
+| qwen1.5-0.5b-chat-hf | 27.66 | 38.43 | 32.28 | 35.44 |
+| qwen1.5-1.8b-chat-hf | 38.65 | 46.62 | 46.73 | 59.14 |
+| qwen1.5-4b-chat-hf | 49.29 | 54.80 | 51.02 | 70.20 |
+| qwen1.5-7b-chat-hf | 53.90 | 62.28 | 57.79 | 76.52 |
+| qwen1.5-14b-chat-hf | 58.87 | 65.12 | 67.27 | 86.68 |
+| qwen1.5-32b-chat-hf | 74.11 | 70.82 | 74.94 | 88.04 |
+| qwen1.5-72b-chat-hf | 74.82 | 75.09 | 78.56 | 89.39 |
+| qwen1.5-110b-chat-hf | 88.30 | 88.97 | 94.13 | 95.49 |
+| internlm2-chat-1.8b-hf | 30.14 | 41.99 | 34.54 | 46.73 |
+| internlm2-chat-1.8b-sft-hf | 30.14 | 43.06 | 34.31 | 47.86 |
+| internlm2-chat-7b-hf | 42.20 | 52.31 | 47.63 | 66.82 |
+| internlm2-chat-7b-sft-hf | 43.26 | 52.67 | 47.86 | 66.59 |
+| internlm2-chat-20b-hf | 45.74 | 54.80 | 51.02 | 69.07 |
+| internlm2-chat-20b-sft-hf | 45.74 | 55.16 | 51.02 | 68.62 |
+| llama-3-8b-instruct-hf | 37.59 | 50.53 | 42.44 | 68.40 |
+| llama-3-70b-instruct-hf | 50.71 | 64.06 | 55.53 | 84.42 |
+| llama-3-8b-instruct-lmdeploy | 37.94 | 50.53 | 41.53 | 66.14 |
+| llama-3-70b-instruct-lmdeploy | 48.94 | 63.70 | 53.95 | 81.72 |
+| mistral-7b-instruct-v0.1-hf | 27.66 | 39.15 | 29.35 | 39.95 |
+| mistral-7b-instruct-v0.2-hf | 32.27 | 37.01 | 32.96 | 42.89 |
+| mixtral-8x7b-instruct-v0.1-hf | 36.88 | 48.75 | 41.76 | 53.05 |
+
+### Details on Dev Split
diff --git a/configs/datasets/drop/drop_gen.py b/configs/datasets/drop/drop_gen.py
index e3a45f62..473b615d 100644
--- a/configs/datasets/drop/drop_gen.py
+++ b/configs/datasets/drop/drop_gen.py
@@ -1,4 +1,4 @@
from mmengine.config import read_base

with read_base():
- from .drop_gen_822d33 import drop_datasets
+ from .drop_gen_e18aaf import drop_datasets
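+    # NOTE: drop_gen.py is a version-agnostic alias that forwards to the
+    # hash-pinned variant above; import .drop_gen_822d33 directly in your
+    # own config to keep the previous prompt version.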
diff --git a/configs/datasets/gpqa/README.md b/configs/datasets/gpqa/README.md
new file mode 100644
index 00000000..02560df5
--- /dev/null
+++ b/configs/datasets/gpqa/README.md
@@ -0,0 +1,69 @@
+# GPQA
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets gpqa_ppl_6bf57a --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets gpqa_gen_4baadb --debug
+```
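+
+The `_ppl_`/`_gen_` infix in these dataset names marks perplexity-based vs. generation-based evaluation (paired above with the base and chat model respectively), and the hex suffix pins one specific prompt/config version. As a minimal sketch, the same `read_base()` pattern used throughout this repo's configs can pin the model/dataset pair in a single file (a hypothetical `configs/eval_gpqa.py`; the import paths are assumptions based on the repo layout):
+
+```python
+from mmengine.config import read_base
+
+with read_base():
+    from .datasets.gpqa.gpqa_gen_4baadb import gpqa_datasets
+    from .models.hf_internlm.hf_internlm2_chat_7b import models
+
+datasets = gpqa_datasets
+```
+
+Such a file would run as `python3 run.py configs/eval_gpqa.py --debug`, with no `--models`/`--datasets` flags.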
+
+## Base Models
+
+| model | GPQA_diamond |
+|:------------------------:|---------------:|
+| llama-7b-turbomind | 24.24 |
+| llama-13b-turbomind | 25.25 |
+| llama-30b-turbomind | 22.73 |
+| llama-65b-turbomind | 21.72 |
+| llama-2-7b-turbomind | 25.25 |
+| llama-2-13b-turbomind | 23.74 |
+| llama-2-70b-turbomind | 28.28 |
+| llama-3-8b-turbomind | 31.82 |
+| llama-3-70b-turbomind | 40.91 |
+| internlm2-1.8b-turbomind | 24.24 |
+| internlm2-7b-turbomind | 28.28 |
+| internlm2-20b-turbomind | 31.31 |
+| qwen-1.8b-turbomind | 28.79 |
+| qwen-7b-turbomind | 24.75 |
+| qwen-14b-turbomind | 27.78 |
+| qwen-72b-turbomind | 31.31 |
+| qwen1.5-0.5b-hf | 23.74 |
+| qwen1.5-1.8b-hf | 28.79 |
+| qwen1.5-4b-hf | 23.23 |
+| qwen1.5-7b-hf | 20.71 |
+| qwen1.5-14b-hf | 32.32 |
+| qwen1.5-32b-hf | 30.81 |
+| qwen1.5-72b-hf | 31.82 |
+| qwen1.5-moe-a2-7b-hf | 28.79 |
+| mistral-7b-v0.1-hf | 24.75 |
+| mistral-7b-v0.2-hf | 23.74 |
+| mixtral-8x7b-v0.1-hf | 28.79 |
+| mixtral-8x22b-v0.1-hf | 36.36 |
+| yi-6b-hf | 28.28 |
+| yi-34b-hf | 35.86 |
+| deepseek-7b-base-hf | 20.71 |
+| deepseek-67b-base-hf | 25.25 |
+
+## Chat Models
+
+| model | GPQA_diamond |
+|:-----------------------------:|---------------:|
+| qwen1.5-0.5b-chat-hf | 19.70 |
+| qwen1.5-1.8b-chat-hf | 29.80 |
+| qwen1.5-4b-chat-hf | 25.25 |
+| qwen1.5-7b-chat-hf | 31.82 |
+| qwen1.5-14b-chat-hf | 30.30 |
+| qwen1.5-32b-chat-hf | 31.31 |
+| qwen1.5-72b-chat-hf | 32.83 |
+| qwen1.5-110b-chat-hf | 35.86 |
+| internlm2-chat-1.8b-hf | 25.76 |
+| internlm2-chat-1.8b-sft-hf | 26.26 |
+| internlm2-chat-7b-hf | 28.28 |
+| internlm2-chat-7b-sft-hf | 27.27 |
+| internlm2-chat-20b-hf | 30.30 |
+| internlm2-chat-20b-sft-hf | 29.29 |
+| llama-3-8b-instruct-hf | 25.76 |
+| llama-3-70b-instruct-hf | 37.88 |
+| llama-3-8b-instruct-lmdeploy | 25.76 |
+| llama-3-70b-instruct-lmdeploy | 37.88 |
+| mistral-7b-instruct-v0.1-hf | 30.30 |
+| mistral-7b-instruct-v0.2-hf | 25.25 |
+| mixtral-8x7b-instruct-v0.1-hf | 30.30 |
diff --git a/configs/datasets/gsm8k/README.md b/configs/datasets/gsm8k/README.md
new file mode 100644
index 00000000..e4b9f9f6
--- /dev/null
+++ b/configs/datasets/gsm8k/README.md
@@ -0,0 +1,69 @@
+# GSM8K
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets gsm8k_gen_17d0dc --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets gsm8k_gen_1d7fe4 --debug
+```
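+
+GSM8K scores are accuracy on the final numeric answer. A rough sketch of the usual extraction step (an assumption about the general recipe, not this repo's exact evaluator):
+
+```python
+import re
+
+def extract_final_number(completion: str):
+    """Pull the last number out of a completion, ignoring thousands separators."""
+    numbers = re.findall(r"-?\d+(?:\.\d+)?", completion.replace(",", ""))
+    return numbers[-1] if numbers else None
+
+assert extract_final_number("18 - 3 = 15, so the answer is 15.") == "15"
+```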
+
+## Base Models
+
+| model | gsm8k |
+|:------------------------:|--------:|
+| llama-7b-turbomind | 10.31 |
+| llama-13b-turbomind | 20.55 |
+| llama-30b-turbomind | 42.08 |
+| llama-65b-turbomind | 54.81 |
+| llama-2-7b-turbomind | 16.76 |
+| llama-2-13b-turbomind | 29.87 |
+| llama-2-70b-turbomind | 63.53 |
+| llama-3-8b-turbomind | 54.28 |
+| llama-3-70b-turbomind | 69.98 |
+| internlm2-1.8b-turbomind | 30.40 |
+| internlm2-7b-turbomind | 69.98 |
+| internlm2-20b-turbomind | 76.80 |
+| qwen-1.8b-turbomind | 23.73 |
+| qwen-7b-turbomind | 54.36 |
+| qwen-14b-turbomind | 61.64 |
+| qwen-72b-turbomind | 79.68 |
+| qwen1.5-0.5b-hf | 13.27 |
+| qwen1.5-1.8b-hf | 34.87 |
+| qwen1.5-4b-hf | 47.61 |
+| qwen1.5-7b-hf | 54.36 |
+| qwen1.5-14b-hf | 63.53 |
+| qwen1.5-32b-hf | 72.71 |
+| qwen1.5-72b-hf | 79.53 |
+| qwen1.5-moe-a2-7b-hf | 61.26 |
+| mistral-7b-v0.1-hf | 47.61 |
+| mistral-7b-v0.2-hf | 45.19 |
+| mixtral-8x7b-v0.1-hf | 66.26 |
+| mixtral-8x22b-v0.1-hf | 82.87 |
+| yi-6b-hf | 39.58 |
+| yi-34b-hf | 67.70 |
+| deepseek-7b-base-hf | 20.17 |
+| deepseek-67b-base-hf | 68.16 |
+
+## Chat Models
+
+| model | gsm8k |
+|:-----------------------------:|--------:|
+| qwen1.5-0.5b-chat-hf | 8.79 |
+| qwen1.5-1.8b-chat-hf | 27.60 |
+| qwen1.5-4b-chat-hf | 47.61 |
+| qwen1.5-7b-chat-hf | 56.25 |
+| qwen1.5-14b-chat-hf | 64.90 |
+| qwen1.5-32b-chat-hf | 79.91 |
+| qwen1.5-72b-chat-hf | 77.03 |
+| qwen1.5-110b-chat-hf | 79.53 |
+| internlm2-chat-1.8b-hf | 39.73 |
+| internlm2-chat-1.8b-sft-hf | 36.85 |
+| internlm2-chat-7b-hf | 69.90 |
+| internlm2-chat-7b-sft-hf | 69.83 |
+| internlm2-chat-20b-hf | 75.21 |
+| internlm2-chat-20b-sft-hf | 76.95 |
+| llama-3-8b-instruct-hf | 79.53 |
+| llama-3-70b-instruct-hf | 89.76 |
+| llama-3-8b-instruct-lmdeploy | 78.77 |
+| llama-3-70b-instruct-lmdeploy | 89.31 |
+| mistral-7b-instruct-v0.1-hf | 42.23 |
+| mistral-7b-instruct-v0.2-hf | 45.56 |
+| mixtral-8x7b-instruct-v0.1-hf | 65.13 |
diff --git a/configs/datasets/hellaswag/README.md b/configs/datasets/hellaswag/README.md
new file mode 100644
index 00000000..91a5e226
--- /dev/null
+++ b/configs/datasets/hellaswag/README.md
@@ -0,0 +1,69 @@
+# HellaSwag
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets hellaswag_10shot_ppl_59c85e --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets hellaswag_10shot_gen_e42710 --debug
+```
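+
+Both configs use 10 in-context examples (the `10shot` tag); they differ in scoring. `*_ppl_*` ranks the four candidate endings by model likelihood, which suits base models, while `*_gen_*` asks a chat model to generate the option label. A sketch of the ppl-style selection (an assumed general recipe, not this repo's exact implementation):
+
+```python
+def pick_ending(score, context, endings):
+    """score(context, ending) -> higher means the model finds it more likely."""
+    return max(range(len(endings)), key=lambda i: score(context, endings[i]))
+
+# Toy scorer so the sketch runs (prefers the shorter ending); a real run would
+# use the LM's summed token log-probabilities instead.
+print(pick_ending(lambda c, e: -len(e),
+                  "A man pours flour into a bowl and",
+                  [" stirs in the eggs slowly.", " sings."]))  # 1
+```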
+
+## Base Models
+
+| model | hellaswag |
+|:------------------------:|------------:|
+| llama-7b-turbomind | 26.99 |
+| llama-13b-turbomind | 34.21 |
+| llama-30b-turbomind | 35.65 |
+| llama-65b-turbomind | 44.63 |
+| llama-2-7b-turbomind | 29.29 |
+| llama-2-13b-turbomind | 45.06 |
+| llama-2-70b-turbomind | 55.91 |
+| llama-3-8b-turbomind | 50.86 |
+| llama-3-70b-turbomind | 80.60 |
+| internlm2-1.8b-turbomind | 44.86 |
+| internlm2-7b-turbomind | 89.52 |
+| internlm2-20b-turbomind | 91.41 |
+| qwen-1.8b-turbomind | 38.04 |
+| qwen-7b-turbomind | 64.62 |
+| qwen-14b-turbomind | 85.88 |
+| qwen-72b-turbomind | 90.40 |
+| qwen1.5-0.5b-hf | 29.19 |
+| qwen1.5-1.8b-hf | 42.32 |
+| qwen1.5-4b-hf | 55.89 |
+| qwen1.5-7b-hf | 68.51 |
+| qwen1.5-14b-hf | 83.86 |
+| qwen1.5-32b-hf | 87.28 |
+| qwen1.5-72b-hf | 90.41 |
+| qwen1.5-moe-a2-7b-hf | 72.42 |
+| mistral-7b-v0.1-hf | 42.04 |
+| mistral-7b-v0.2-hf | 46.24 |
+| mixtral-8x7b-v0.1-hf | 66.22 |
+| mixtral-8x22b-v0.1-hf | 79.66 |
+| yi-6b-hf | 66.83 |
+| yi-34b-hf | 83.83 |
+| deepseek-7b-base-hf | 30.42 |
+| deepseek-67b-base-hf | 70.75 |
+
+## Chat Models
+
+| model | hellaswag |
+|:-----------------------------:|------------:|
+| qwen1.5-0.5b-chat-hf | 29.60 |
+| qwen1.5-1.8b-chat-hf | 41.71 |
+| qwen1.5-4b-chat-hf | 60.45 |
+| qwen1.5-7b-chat-hf | 71.58 |
+| qwen1.5-14b-chat-hf | 79.70 |
+| qwen1.5-32b-chat-hf | 88.56 |
+| qwen1.5-72b-chat-hf | 89.37 |
+| qwen1.5-110b-chat-hf | 91.11 |
+| internlm2-chat-1.8b-hf | 60.47 |
+| internlm2-chat-1.8b-sft-hf | 61.58 |
+| internlm2-chat-7b-hf | 84.80 |
+| internlm2-chat-7b-sft-hf | 85.21 |
+| internlm2-chat-20b-hf | 88.48 |
+| internlm2-chat-20b-sft-hf | 88.95 |
+| llama-3-8b-instruct-hf | 74.39 |
+| llama-3-70b-instruct-hf | 89.07 |
+| llama-3-8b-instruct-lmdeploy | 73.31 |
+| llama-3-70b-instruct-lmdeploy | 87.28 |
+| mistral-7b-instruct-v0.1-hf | 53.00 |
+| mistral-7b-instruct-v0.2-hf | 65.72 |
+| mixtral-8x7b-instruct-v0.1-hf | 76.16 |
diff --git a/configs/datasets/humaneval/README.md b/configs/datasets/humaneval/README.md
new file mode 100644
index 00000000..c0e1e157
--- /dev/null
+++ b/configs/datasets/humaneval/README.md
@@ -0,0 +1,69 @@
+# HumanEval
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets humaneval_gen_d2537e --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug
+```
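+
+The tables report pass@1. With a single (typically greedy) sample per problem, pass@1 is simply the fraction of the 164 problems whose completion passes the unit tests; with multiple samples, the standard unbiased estimator from the HumanEval paper applies:
+
+```python
+import math
+
+def pass_at_k(n: int, c: int, k: int) -> float:
+    """Unbiased pass@k estimator (Chen et al., 2021).
+    n: samples per problem, c: samples that pass, k: evaluation budget."""
+    if n - c < k:
+        return 1.0
+    return 1.0 - math.comb(n - c, k) / math.comb(n, k)
+
+print(pass_at_k(n=10, c=3, k=1))  # 0.3
+```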
+
+## Base Models
+
+| model | pass@1 |
+|:------------------------:|---------:|
+| llama-7b-turbomind | 12.80 |
+| llama-13b-turbomind | 15.24 |
+| llama-30b-turbomind | 9.15 |
+| llama-65b-turbomind | 7.32 |
+| llama-2-7b-turbomind | 14.02 |
+| llama-2-13b-turbomind | 15.24 |
+| llama-2-70b-turbomind | 15.24 |
+| llama-3-8b-turbomind | 28.05 |
+| llama-3-70b-turbomind | 28.05 |
+| internlm2-1.8b-turbomind | 30.49 |
+| internlm2-7b-turbomind | 48.17 |
+| internlm2-20b-turbomind | 51.83 |
+| qwen-1.8b-turbomind | 16.46 |
+| qwen-7b-turbomind | 23.78 |
+| qwen-14b-turbomind | 23.78 |
+| qwen-72b-turbomind | 66.46 |
+| qwen1.5-0.5b-hf | 8.54 |
+| qwen1.5-1.8b-hf | 23.17 |
+| qwen1.5-4b-hf | 41.46 |
+| qwen1.5-7b-hf | 53.05 |
+| qwen1.5-14b-hf | 57.32 |
+| qwen1.5-32b-hf | 70.12 |
+| qwen1.5-72b-hf | 65.85 |
+| qwen1.5-moe-a2-7b-hf | 45.73 |
+| mistral-7b-v0.1-hf | 14.02 |
+| mistral-7b-v0.2-hf | 9.15 |
+| mixtral-8x7b-v0.1-hf | 24.39 |
+| mixtral-8x22b-v0.1-hf | 16.46 |
+| yi-6b-hf | 14.63 |
+| yi-34b-hf | 17.07 |
+| deepseek-7b-base-hf | 18.29 |
+| deepseek-67b-base-hf | 23.17 |
+
+## Chat Models
+
+| model | pass@1 |
+|:-----------------------------:|---------:|
+| qwen1.5-0.5b-chat-hf | 9.15 |
+| qwen1.5-1.8b-chat-hf | 15.85 |
+| qwen1.5-4b-chat-hf | 30.49 |
+| qwen1.5-7b-chat-hf | 40.85 |
+| qwen1.5-14b-chat-hf | 50.00 |
+| qwen1.5-32b-chat-hf | 57.93 |
+| qwen1.5-72b-chat-hf | 60.37 |
+| qwen1.5-110b-chat-hf | 65.24 |
+| internlm2-chat-1.8b-hf | 33.54 |
+| internlm2-chat-1.8b-sft-hf | 34.15 |
+| internlm2-chat-7b-hf | 56.71 |
+| internlm2-chat-7b-sft-hf | 61.59 |
+| internlm2-chat-20b-hf | 67.68 |
+| internlm2-chat-20b-sft-hf | 67.68 |
+| llama-3-8b-instruct-hf | 55.49 |
+| llama-3-70b-instruct-hf | 70.73 |
+| llama-3-8b-instruct-lmdeploy | 57.93 |
+| llama-3-70b-instruct-lmdeploy | 70.73 |
+| mistral-7b-instruct-v0.1-hf | 32.32 |
+| mistral-7b-instruct-v0.2-hf | 29.27 |
+| mixtral-8x7b-instruct-v0.1-hf | 34.15 |
diff --git a/configs/datasets/math/README.md b/configs/datasets/math/README.md
new file mode 100644
index 00000000..c498db34
--- /dev/null
+++ b/configs/datasets/math/README.md
@@ -0,0 +1,69 @@
+# MATH
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets math_4shot_base_gen_db136b --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets math_0shot_gen_393424 --debug
+```
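+
+MATH answers are conventionally wrapped in `\boxed{...}`, and scoring compares the model's final boxed expression against the reference after normalization. A rough sketch of the extraction step (an assumption about the general recipe, not this repo's exact evaluator):
+
+```python
+import re
+
+def last_boxed(completion: str):
+    """Return the contents of the last flat \\boxed{...} in a completion."""
+    matches = re.findall(r"\\boxed\{([^{}]*)\}", completion)
+    return matches[-1].strip() if matches else None
+
+assert last_boxed(r"first \boxed{41}, corrected: \boxed{42}") == "42"
+# Nested arguments such as \boxed{\frac{1}{2}} need a real brace matcher.
+```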
+
+## Base Models
+
+| model | math |
+|:------------------------:|-------:|
+| llama-7b-turbomind | 2.94 |
+| llama-13b-turbomind | 3.84 |
+| llama-30b-turbomind | 6.54 |
+| llama-65b-turbomind | 10.66 |
+| llama-2-7b-turbomind | 3.58 |
+| llama-2-13b-turbomind | 5.30 |
+| llama-2-70b-turbomind | 13.26 |
+| llama-3-8b-turbomind | 16.42 |
+| llama-3-70b-turbomind | 39.64 |
+| internlm2-1.8b-turbomind | 9.42 |
+| internlm2-7b-turbomind | 25.16 |
+| internlm2-20b-turbomind | 32.24 |
+| qwen-1.8b-turbomind | 6.30 |
+| qwen-7b-turbomind | 15.56 |
+| qwen-14b-turbomind | 30.38 |
+| qwen-72b-turbomind | 44.18 |
+| qwen1.5-0.5b-hf | 4.16 |
+| qwen1.5-1.8b-hf | 11.32 |
+| qwen1.5-4b-hf | 17.50 |
+| qwen1.5-7b-hf | 17.34 |
+| qwen1.5-14b-hf | 36.18 |
+| qwen1.5-32b-hf | 45.74 |
+| qwen1.5-72b-hf | 41.56 |
+| qwen1.5-moe-a2-7b-hf | 27.96 |
+| mistral-7b-v0.1-hf | 13.44 |
+| mistral-7b-v0.2-hf | 12.74 |
+| mixtral-8x7b-v0.1-hf | 29.46 |
+| mixtral-8x22b-v0.1-hf | 41.82 |
+| yi-6b-hf | 6.60 |
+| yi-34b-hf | 18.80 |
+| deepseek-7b-base-hf | 4.66 |
+| deepseek-67b-base-hf | 18.76 |
+
+## Chat Models
+
+| model | math |
+|:-----------------------------:|-------:|
+| qwen1.5-0.5b-chat-hf | 0.56 |
+| qwen1.5-1.8b-chat-hf | 4.94 |
+| qwen1.5-4b-chat-hf | 7.34 |
+| qwen1.5-7b-chat-hf | 22.14 |
+| qwen1.5-14b-chat-hf | 32.22 |
+| qwen1.5-32b-chat-hf | 41.80 |
+| qwen1.5-72b-chat-hf | 45.22 |
+| qwen1.5-110b-chat-hf | 54.38 |
+| internlm2-chat-1.8b-hf | 14.06 |
+| internlm2-chat-1.8b-sft-hf | 13.10 |
+| internlm2-chat-7b-hf | 28.08 |
+| internlm2-chat-7b-sft-hf | 27.60 |
+| internlm2-chat-20b-hf | 34.68 |
+| internlm2-chat-20b-sft-hf | 32.54 |
+| llama-3-8b-instruct-hf | 27.50 |
+| llama-3-70b-instruct-hf | 47.52 |
+| llama-3-8b-instruct-lmdeploy | 27.42 |
+| llama-3-70b-instruct-lmdeploy | 46.90 |
+| mistral-7b-instruct-v0.1-hf | 8.48 |
+| mistral-7b-instruct-v0.2-hf | 10.82 |
+| mixtral-8x7b-instruct-v0.1-hf | 27.02 |
diff --git a/configs/datasets/mbpp/README.md b/configs/datasets/mbpp/README.md
new file mode 100644
index 00000000..2f2b6111
--- /dev/null
+++ b/configs/datasets/mbpp/README.md
@@ -0,0 +1,69 @@
+# MBPP
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets sanitized_mbpp_gen_742f0c --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets sanitized_mbpp_mdblock_gen_a447ff --debug
+```
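+
+Besides pass@1 (a percentage), the tables break each run into raw outcome counts: `pass`, `timeout`, `failed` (execution error), and `wrong_answer`. The four counts cover all 257 sanitized-MBPP problems, and pass@1 is the pass count over that total:
+
+```python
+# Sanity check of the column arithmetic on the llama-7b-turbomind row.
+counts = {"pass": 65, "timeout": 8, "failed": 62, "wrong_answer": 122}
+total = sum(counts.values())
+print(total, round(100 * counts["pass"] / total, 2))  # 257 25.29
+```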
+
+## Base Models
+
+| model | pass@1 | pass | timeout | failed | wrong_answer |
+|:------------------------:|---------:|-------:|----------:|---------:|---------------:|
+| llama-7b-turbomind | 25.29 | 65 | 8 | 62 | 122 |
+| llama-13b-turbomind | 29.96 | 77 | 4 | 74 | 102 |
+| llama-30b-turbomind | 37.35 | 96 | 17 | 39 | 105 |
+| llama-65b-turbomind | 45.53 | 117 | 10 | 35 | 95 |
+| llama-2-7b-turbomind | 26.46 | 68 | 18 | 49 | 122 |
+| llama-2-13b-turbomind | 36.58 | 94 | 17 | 45 | 101 |
+| llama-2-70b-turbomind | 49.42 | 127 | 12 | 32 | 86 |
+| llama-3-8b-turbomind | 54.86 | 141 | 11 | 22 | 83 |
+| llama-3-70b-turbomind | 77.82 | 200 | 0 | 10 | 47 |
+| internlm2-1.8b-turbomind | 30.74 | 79 | 10 | 61 | 107 |
+| internlm2-7b-turbomind | 54.47 | 140 | 11 | 28 | 78 |
+| internlm2-20b-turbomind | 59.92 | 154 | 6 | 31 | 66 |
+| qwen-1.8b-turbomind | 2.72 | 7 | 16 | 222 | 12 |
+| qwen-7b-turbomind | 46.69 | 120 | 10 | 37 | 90 |
+| qwen-14b-turbomind | 55.64 | 143 | 0 | 31 | 83 |
+| qwen-72b-turbomind | 65.76 | 169 | 0 | 26 | 62 |
+| qwen1.5-0.5b-hf | 5.06 | 13 | 13 | 190 | 41 |
+| qwen1.5-1.8b-hf | 15.95 | 41 | 19 | 124 | 73 |
+| qwen1.5-4b-hf | 45.91 | 118 | 8 | 27 | 104 |
+| qwen1.5-7b-hf | 52.14 | 134 | 11 | 24 | 88 |
+| qwen1.5-14b-hf | 52.14 | 134 | 16 | 33 | 74 |
+| qwen1.5-32b-hf | 59.14 | 152 | 7 | 25 | 73 |
+| qwen1.5-72b-hf | 61.09 | 157 | 1 | 21 | 78 |
+| qwen1.5-moe-a2-7b-hf | 47.08 | 121 | 0 | 52 | 84 |
+| mistral-7b-v0.1-hf | 47.47 | 122 | 9 | 33 | 93 |
+| mistral-7b-v0.2-hf | 49.81 | 128 | 9 | 27 | 93 |
+| mixtral-8x7b-v0.1-hf | 62.65 | 161 | 10 | 13 | 73 |
+| mixtral-8x22b-v0.1-hf | 73.15 | 188 | 1 | 10 | 58 |
+| yi-6b-hf | 30.35 | 78 | 8 | 40 | 131 |
+| yi-34b-hf | 48.64 | 125 | 0 | 43 | 89 |
+| deepseek-7b-base-hf | 43.97 | 113 | 11 | 34 | 99 |
+| deepseek-67b-base-hf | 64.98 | 167 | 0 | 24 | 66 |
+
+## Chat Models
+
+| model | pass@1 | pass | timeout | failed | wrong_answer |
+|:-----------------------------:|---------:|-------:|----------:|---------:|---------------:|
+| qwen1.5-0.5b-chat-hf | 11.28 | 29 | 1 | 129 | 98 |
+| qwen1.5-1.8b-chat-hf | 22.57 | 58 | 2 | 70 | 127 |
+| qwen1.5-4b-chat-hf | 43.58 | 112 | 1 | 33 | 111 |
+| qwen1.5-7b-chat-hf | 50.58 | 130 | 0 | 35 | 92 |
+| qwen1.5-14b-chat-hf | 56.03 | 144 | 0 | 24 | 89 |
+| qwen1.5-32b-chat-hf | 65.37 | 168 | 2 | 13 | 74 |
+| qwen1.5-72b-chat-hf | 66.93 | 172 | 0 | 17 | 68 |
+| qwen1.5-110b-chat-hf | 68.48 | 176 | 0 | 16 | 65 |
+| internlm2-chat-1.8b-hf | 39.69 | 102 | 0 | 48 | 107 |
+| internlm2-chat-1.8b-sft-hf | 36.19 | 93 | 1 | 58 | 105 |
+| internlm2-chat-7b-hf | 57.59 | 148 | 0 | 21 | 88 |
+| internlm2-chat-7b-sft-hf | 55.64 | 143 | 2 | 22 | 90 |
+| internlm2-chat-20b-hf | 68.87 | 177 | 0 | 16 | 64 |
+| internlm2-chat-20b-sft-hf | 69.65 | 179 | 0 | 16 | 62 |
+| llama-3-8b-instruct-hf | 68.87 | 177 | 0 | 8 | 72 |
+| llama-3-70b-instruct-hf | 79.77 | 205 | 0 | 2 | 50 |
+| llama-3-8b-instruct-lmdeploy | 66.93 | 172 | 0 | 7 | 78 |
+| llama-3-70b-instruct-lmdeploy | 77.82 | 200 | 1 | 2 | 54 |
+| mistral-7b-instruct-v0.1-hf | 47.86 | 123 | 0 | 29 | 105 |
+| mistral-7b-instruct-v0.2-hf | 45.91 | 118 | 0 | 31 | 108 |
+| mixtral-8x7b-instruct-v0.1-hf | 61.48 | 158 | 1 | 13 | 85 |
diff --git a/configs/datasets/mmlu/README.md b/configs/datasets/mmlu/README.md
new file mode 100644
index 00000000..eaa2181e
--- /dev/null
+++ b/configs/datasets/mmlu/README.md
@@ -0,0 +1,368 @@
+# MMLU
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets mmlu_ppl_ac766d --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets mmlu_gen_4d595a --debug
+```
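+
+The four category columns aggregate the per-subject accuracies listed under "Details" below. A sketch of the aggregation (assuming a plain unweighted mean over each group's subjects; the list here is abridged):
+
+```python
+# llama-7b-turbomind STEM subjects, taken from the Details table below.
+mmlu_stem = {
+    "college_biology": 37.50,
+    "college_chemistry": 30.00,
+    "college_computer_science": 30.00,
+    # ... remaining STEM subjects
+}
+print(sum(mmlu_stem.values()) / len(mmlu_stem))
+```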
+
+## Base Models
+
+| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other |
+|:------------------------:|-------:|------------:|----------------------:|------------------:|-------------:|
+| llama-7b-turbomind | 35.66 | 31.22 | 37.70 | 38.90 | 37.01 |
+| llama-13b-turbomind | 47.76 | 37.68 | 55.36 | 52.43 | 50.83 |
+| llama-30b-turbomind | 58.55 | 46.95 | 67.35 | 65.13 | 60.78 |
+| llama-65b-turbomind | 63.78 | 52.35 | 73.68 | 70.84 | 64.29 |
+| llama-2-7b-turbomind | 46.78 | 37.81 | 52.11 | 51.69 | 50.04 |
+| llama-2-13b-turbomind | 55.76 | 44.61 | 63.86 | 62.97 | 57.35 |
+| llama-2-70b-turbomind | 69.87 | 58.30 | 79.86 | 75.84 | 71.58 |
+| llama-3-8b-turbomind | 66.43 | 55.95 | 76.11 | 70.29 | 68.96 |
+| llama-3-70b-turbomind | 79.35 | 70.66 | 87.54 | 83.43 | 80.42 |
+| internlm2-1.8b-turbomind | 45.99 | 39.63 | 51.02 | 48.65 | 47.96 |
+| internlm2-7b-turbomind | 65.84 | 56.48 | 74.43 | 69.68 | 67.75 |
+| internlm2-20b-turbomind | 67.58 | 59.01 | 76.04 | 71.20 | 68.69 |
+| qwen-1.8b-turbomind | 46.61 | 38.91 | 51.35 | 49.57 | 50.51 |
+| qwen-7b-turbomind | 59.75 | 50.16 | 67.98 | 63.48 | 62.44 |
+| qwen-14b-turbomind | 67.85 | 59.13 | 76.18 | 71.62 | 69.12 |
+| qwen-72b-turbomind | 77.36 | 68.70 | 85.28 | 80.60 | 79.45 |
+| qwen1.5-0.5b-hf | 39.98 | 33.96 | 45.08 | 41.59 | 42.48 |
+| qwen1.5-1.8b-hf | 47.14 | 39.47 | 52.70 | 49.01 | 51.33 |
+| qwen1.5-4b-hf | 57.03 | 47.80 | 64.86 | 60.10 | 60.20 |
+| qwen1.5-7b-hf | 62.15 | 53.22 | 70.25 | 65.62 | 64.26 |
+| qwen1.5-14b-hf | 69.10 | 61.46 | 77.57 | 71.25 | 70.29 |
+| qwen1.5-32b-hf | 73.88 | 65.60 | 81.41 | 77.10 | 75.79 |
+| qwen1.5-72b-hf | 77.02 | 69.00 | 84.55 | 80.60 | 78.21 |
+| qwen1.5-moe-a2-7b-hf | 62.09 | 53.27 | 70.74 | 63.80 | 65.28 |
+| mistral-7b-v0.1-hf | 64.04 | 53.21 | 73.65 | 68.04 | 67.00 |
+| mistral-7b-v0.2-hf | 63.85 | 53.21 | 72.17 | 68.40 | 67.15 |
+| mixtral-8x7b-v0.1-hf | 71.80 | 61.70 | 81.03 | 75.51 | 74.35 |
+| mixtral-8x22b-v0.1-hf | 77.67 | 68.94 | 86.81 | 81.23 | 78.43 |
+| yi-6b-hf | 64.08 | 52.61 | 74.10 | 68.58 | 67.11 |
+| yi-34b-hf | 76.26 | 66.73 | 83.74 | 81.78 | 77.77 |
+| deepseek-7b-base-hf | 49.22 | 40.17 | 56.73 | 53.46 | 51.26 |
+| deepseek-67b-base-hf | 71.95 | 60.57 | 81.69 | 77.11 | 74.42 |
+
+### Details
+
+| model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts |
+|:------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:|
+| llama-7b-turbomind | 37.50 | 30.00 | 30.00 | 33.00 | 23.53 | 23.45 | 34.87 | 37.78 | 25.00 | 27.68 | 34.34 | 31.00 |
+| llama-13b-turbomind | 46.53 | 30.00 | 42.00 | 36.00 | 18.63 | 42.76 | 46.71 | 46.67 | 30.00 | 32.14 | 45.66 | 37.00 |
+| llama-30b-turbomind | 59.03 | 45.00 | 47.00 | 35.00 | 26.47 | 53.10 | 61.18 | 51.85 | 37.00 | 41.07 | 57.36 | 38.00 |
+| llama-65b-turbomind | 68.75 | 49.00 | 47.00 | 37.00 | 35.29 | 55.17 | 73.03 | 57.78 | 30.00 | 48.21 | 66.04 | 38.00 |
+| llama-2-7b-turbomind | 46.53 | 34.00 | 33.00 | 34.00 | 22.55 | 47.59 | 40.13 | 47.41 | 29.00 | 38.39 | 46.42 | 32.00 |
+| llama-2-13b-turbomind | 59.03 | 44.00 | 48.00 | 29.00 | 26.47 | 50.34 | 53.29 | 49.63 | 35.00 | 28.57 | 60.00 | 32.00 |
+| llama-2-70b-turbomind | 84.72 | 51.00 | 60.00 | 39.00 | 37.25 | 65.52 | 81.58 | 63.70 | 32.00 | 52.68 | 72.08 | 46.00 |
+| llama-3-8b-turbomind | 77.08 | 46.00 | 51.00 | 31.00 | 51.96 | 62.76 | 67.11 | 68.15 | 34.00 | 52.68 | 74.72 | 35.00 |
+| llama-3-70b-turbomind | 93.75 | 62.00 | 72.00 | 52.00 | 50.98 | 74.48 | 92.11 | 79.26 | 48.00 | 63.39 | 86.42 | 49.00 |
+| internlm2-1.8b-turbomind | 38.89 | 37.00 | 44.00 | 35.00 | 30.39 | 49.66 | 50.66 | 44.44 | 25.00 | 35.71 | 51.32 | 32.00 |
+| internlm2-7b-turbomind | 77.08 | 48.00 | 64.00 | 33.00 | 47.06 | 63.45 | 73.68 | 57.78 | 37.00 | 45.54 | 69.81 | 35.00 |
+| internlm2-20b-turbomind | 83.33 | 51.00 | 61.00 | 36.00 | 45.10 | 64.83 | 75.00 | 59.26 | 39.00 | 53.57 | 73.58 | 32.00 |
+| qwen-1.8b-turbomind | 42.36 | 36.00 | 39.00 | 34.00 | 27.45 | 51.03 | 50.66 | 42.96 | 31.00 | 31.25 | 53.21 | 28.00 |
+| qwen-7b-turbomind | 67.36 | 48.00 | 53.00 | 28.00 | 39.22 | 59.31 | 63.82 | 49.63 | 34.00 | 38.39 | 63.02 | 37.00 |
+| qwen-14b-turbomind | 78.47 | 51.00 | 62.00 | 42.00 | 49.02 | 65.52 | 71.05 | 60.00 | 37.00 | 58.93 | 71.32 | 40.00 |
+| qwen-72b-turbomind | 93.75 | 56.00 | 66.00 | 56.00 | 50.98 | 80.69 | 85.53 | 73.33 | 41.00 | 62.50 | 83.77 | 54.00 |
+| qwen1.5-0.5b-hf | 38.89 | 25.00 | 38.00 | 32.00 | 25.49 | 45.52 | 44.74 | 33.33 | 30.00 | 39.29 | 38.11 | 39.00 |
+| qwen1.5-1.8b-hf | 43.75 | 34.00 | 45.00 | 38.00 | 28.43 | 47.59 | 47.37 | 40.74 | 32.00 | 31.25 | 53.96 | 37.00 |
+| qwen1.5-4b-hf | 50.00 | 46.00 | 41.00 | 45.00 | 31.37 | 53.10 | 61.18 | 51.85 | 35.00 | 44.64 | 60.38 | 37.00 |
+| qwen1.5-7b-hf | 66.67 | 48.00 | 55.00 | 37.00 | 41.18 | 60.69 | 65.79 | 52.59 | 39.00 | 41.07 | 68.68 | 43.00 |
+| qwen1.5-14b-hf | 75.69 | 49.00 | 58.00 | 49.00 | 49.02 | 71.72 | 73.03 | 65.93 | 39.00 | 52.68 | 73.96 | 49.00 |
+| qwen1.5-32b-hf | 85.42 | 53.00 | 59.00 | 51.00 | 53.92 | 72.41 | 82.24 | 63.70 | 43.00 | 58.04 | 78.11 | 50.00 |
+| qwen1.5-72b-hf | 90.97 | 54.00 | 65.00 | 57.00 | 52.94 | 80.00 | 87.50 | 73.33 | 43.00 | 64.29 | 81.89 | 50.00 |
+| qwen1.5-moe-a2-7b-hf | 62.50 | 44.00 | 54.00 | 41.00 | 49.02 | 58.62 | 69.74 | 57.78 | 37.00 | 38.39 | 66.79 | 38.00 |
+| mistral-7b-v0.1-hf | 72.92 | 50.00 | 51.00 | 40.00 | 39.22 | 57.93 | 65.79 | 62.96 | 29.00 | 49.11 | 69.43 | 36.00 |
+| mistral-7b-v0.2-hf | 71.53 | 49.00 | 53.00 | 40.00 | 36.27 | 57.24 | 64.47 | 60.00 | 29.00 | 53.57 | 67.92 | 39.00 |
+| mixtral-8x7b-v0.1-hf | 85.42 | 54.00 | 62.00 | 43.00 | 46.08 | 68.97 | 82.89 | 70.37 | 37.00 | 56.25 | 79.25 | 51.00 |
+| mixtral-8x22b-v0.1-hf | 89.58 | 56.00 | 69.00 | 48.00 | 52.94 | 76.55 | 86.18 | 77.04 | 53.00 | 62.50 | 82.26 | 56.00 |
+| yi-6b-hf | 66.67 | 43.00 | 51.00 | 39.00 | 35.29 | 64.83 | 65.79 | 60.00 | 29.00 | 41.96 | 66.79 | 46.00 |
+| yi-34b-hf | 88.89 | 52.00 | 66.00 | 44.00 | 48.04 | 80.00 | 89.47 | 74.81 | 44.00 | 58.04 | 78.87 | 52.00 |
+| deepseek-7b-base-hf | 52.08 | 29.00 | 44.00 | 40.00 | 31.37 | 44.83 | 51.97 | 40.74 | 27.00 | 32.14 | 53.58 | 31.00 |
+| deepseek-67b-base-hf | 84.72 | 52.00 | 62.00 | 42.00 | 42.16 | 70.34 | 80.92 | 65.19 | 39.00 | 50.00 | 78.11 | 42.00 |
+
+| model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology |
+|:------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:|
+| llama-7b-turbomind | 33.01 | 39.22 | 45.73 | 26.24 | 33.33 | 51.24 | 24.25 | 45.00 | 31.09 | 30.05 | 37.00 | 35.13 |
+| llama-13b-turbomind | 66.02 | 51.63 | 71.79 | 34.75 | 55.05 | 64.46 | 30.06 | 63.00 | 47.48 | 37.22 | 53.00 | 48.53 |
+| llama-30b-turbomind | 76.70 | 62.42 | 84.19 | 44.68 | 71.72 | 75.21 | 40.56 | 66.00 | 57.98 | 46.48 | 66.00 | 63.73 |
+| llama-65b-turbomind | 82.52 | 68.95 | 87.18 | 48.94 | 79.29 | 81.82 | 47.82 | 79.00 | 68.49 | 50.07 | 68.00 | 66.67 |
+| llama-2-7b-turbomind | 53.40 | 48.69 | 68.38 | 36.52 | 49.49 | 65.29 | 24.02 | 60.00 | 44.12 | 36.31 | 55.00 | 43.79 |
+| llama-2-13b-turbomind | 72.82 | 61.76 | 79.49 | 39.72 | 69.19 | 74.38 | 43.80 | 70.00 | 58.40 | 42.50 | 54.00 | 54.90 |
+| llama-2-70b-turbomind | 83.50 | 77.12 | 91.03 | 56.03 | 86.87 | 87.60 | 44.69 | 77.00 | 77.31 | 52.93 | 74.00 | 75.65 |
+| llama-3-8b-turbomind | 87.38 | 75.82 | 89.74 | 48.94 | 80.81 | 84.30 | 40.89 | 81.00 | 73.95 | 46.22 | 77.00 | 71.90 |
+| llama-3-70b-turbomind | 91.26 | 87.25 | 94.87 | 64.18 | 93.94 | 89.26 | 62.91 | 83.00 | 87.82 | 61.80 | 90.00 | 85.78 |
+| internlm2-1.8b-turbomind | 60.19 | 58.17 | 63.25 | 31.21 | 56.57 | 56.20 | 24.47 | 52.00 | 50.42 | 36.11 | 53.00 | 41.83 |
+| internlm2-7b-turbomind | 79.61 | 75.49 | 87.61 | 48.23 | 82.83 | 77.69 | 49.39 | 74.00 | 72.27 | 47.65 | 73.00 | 65.03 |
+| internlm2-20b-turbomind | 79.61 | 75.49 | 91.88 | 50.00 | 87.88 | 85.95 | 35.08 | 81.00 | 70.59 | 49.48 | 78.00 | 70.10 |
+| qwen-1.8b-turbomind | 66.02 | 60.46 | 73.50 | 38.30 | 56.57 | 66.94 | 23.91 | 56.00 | 42.02 | 33.96 | 51.00 | 39.54 |
+| qwen-7b-turbomind | 78.64 | 67.32 | 83.33 | 41.49 | 76.77 | 76.03 | 29.72 | 73.00 | 58.40 | 41.72 | 69.00 | 59.64 |
+| qwen-14b-turbomind | 78.64 | 73.86 | 88.89 | 48.58 | 83.84 | 84.30 | 45.47 | 77.00 | 73.95 | 50.85 | 74.00 | 69.61 |
+| qwen-72b-turbomind | 90.29 | 84.97 | 94.87 | 65.96 | 92.93 | 88.43 | 65.70 | 79.00 | 84.87 | 61.21 | 86.00 | 82.19 |
+| qwen1.5-0.5b-hf | 52.43 | 46.41 | 60.68 | 31.21 | 46.46 | 56.20 | 25.70 | 46.00 | 37.39 | 32.79 | 46.00 | 37.75 |
+| qwen1.5-1.8b-hf | 66.02 | 58.50 | 75.64 | 33.69 | 56.06 | 72.73 | 24.69 | 57.00 | 39.50 | 36.11 | 53.00 | 42.81 |
+| qwen1.5-4b-hf | 74.76 | 62.75 | 84.19 | 46.81 | 76.77 | 71.07 | 25.03 | 67.00 | 55.04 | 41.33 | 64.00 | 56.05 |
+| qwen1.5-7b-hf | 78.64 | 70.92 | 86.32 | 44.68 | 81.82 | 77.69 | 32.74 | 76.00 | 64.29 | 45.37 | 68.00 | 61.27 |
+| qwen1.5-14b-hf | 80.58 | 75.49 | 85.90 | 51.06 | 86.36 | 80.99 | 45.03 | 80.00 | 76.47 | 48.57 | 78.00 | 69.61 |
+| qwen1.5-32b-hf | 86.41 | 81.37 | 95.30 | 56.38 | 91.41 | 88.43 | 44.02 | 76.00 | 82.77 | 57.89 | 83.00 | 75.33 |
+| qwen1.5-72b-hf | 87.38 | 85.29 | 94.87 | 64.89 | 92.42 | 90.08 | 62.12 | 83.00 | 84.03 | 60.76 | 86.00 | 81.05 |
+| qwen1.5-moe-a2-7b-hf | 78.64 | 70.92 | 86.32 | 46.81 | 81.82 | 77.69 | 25.59 | 71.00 | 65.97 | 45.37 | 65.00 | 61.44 |
+| mistral-7b-v0.1-hf | 82.52 | 75.49 | 87.61 | 48.94 | 76.77 | 77.69 | 32.51 | 77.00 | 66.39 | 44.98 | 74.00 | 67.97 |
+| mistral-7b-v0.2-hf | 81.55 | 74.18 | 88.46 | 51.06 | 76.77 | 80.99 | 38.77 | 75.00 | 64.71 | 45.37 | 72.00 | 66.34 |
+| mixtral-8x7b-v0.1-hf | 87.38 | 81.70 | 91.88 | 51.77 | 85.86 | 85.95 | 40.11 | 80.00 | 79.41 | 53.32 | 77.00 | 77.94 |
+| mixtral-8x22b-v0.1-hf | 89.32 | 85.95 | 91.88 | 62.06 | 91.41 | 90.08 | 64.58 | 83.00 | 87.82 | 60.82 | 84.00 | 83.17 |
+| yi-6b-hf | 80.58 | 71.57 | 91.03 | 48.23 | 83.33 | 76.86 | 41.34 | 75.00 | 74.79 | 49.35 | 80.00 | 65.69 |
+| yi-34b-hf | 91.26 | 85.62 | 92.31 | 65.25 | 89.39 | 91.74 | 64.69 | 82.00 | 85.29 | 59.97 | 87.00 | 82.19 |
+| deepseek-7b-base-hf | 61.17 | 53.59 | 72.22 | 34.04 | 59.09 | 65.29 | 26.37 | 61.00 | 44.96 | 35.53 | 56.00 | 49.18 |
+| deepseek-67b-base-hf | 88.35 | 79.74 | 91.88 | 57.09 | 89.39 | 85.12 | 46.15 | 76.00 | 82.35 | 55.93 | 72.00 | 79.58 |
+
+| model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history |
+|:------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:|
+| llama-7b-turbomind | 41.67 | 49.12 | 40.84 | 34.94 | 29.56 | 40.00 | 34.10 | 35.11 | 26.46 | 27.81 | 34.00 | 41.82 |
+| llama-13b-turbomind | 51.85 | 67.84 | 55.31 | 43.37 | 28.57 | 60.91 | 46.15 | 57.25 | 26.98 | 29.80 | 49.00 | 61.21 |
+| llama-30b-turbomind | 71.30 | 79.53 | 66.24 | 49.40 | 40.39 | 70.00 | 56.67 | 64.89 | 37.30 | 35.10 | 60.00 | 70.91 |
+| llama-65b-turbomind | 75.00 | 81.29 | 73.63 | 53.01 | 41.38 | 74.55 | 65.90 | 77.86 | 40.21 | 35.76 | 69.00 | 76.36 |
+| llama-2-7b-turbomind | 53.70 | 69.01 | 60.13 | 41.57 | 36.95 | 54.55 | 45.90 | 55.73 | 27.25 | 31.13 | 40.00 | 59.39 |
+| llama-2-13b-turbomind | 74.07 | 76.61 | 63.99 | 45.78 | 44.83 | 62.73 | 50.77 | 62.60 | 34.13 | 36.42 | 57.00 | 63.03 |
+| llama-2-70b-turbomind | 83.33 | 85.96 | 78.46 | 53.61 | 52.22 | 69.09 | 74.87 | 87.02 | 43.39 | 43.71 | 78.00 | 84.24 |
+| llama-3-8b-turbomind | 75.00 | 83.04 | 74.28 | 56.02 | 54.68 | 71.82 | 64.87 | 79.39 | 42.06 | 45.03 | 68.00 | 76.36 |
+| llama-3-70b-turbomind | 86.11 | 91.23 | 86.50 | 57.83 | 71.92 | 74.55 | 82.56 | 88.55 | 62.70 | 56.95 | 86.00 | 86.67 |
+| internlm2-1.8b-turbomind | 55.56 | 59.65 | 51.13 | 40.96 | 43.35 | 52.73 | 43.33 | 47.33 | 30.42 | 33.11 | 47.00 | 56.36 |
+| internlm2-7b-turbomind | 79.63 | 82.46 | 73.63 | 51.20 | 55.17 | 70.00 | 66.92 | 70.99 | 46.03 | 42.38 | 70.00 | 78.79 |
+| internlm2-20b-turbomind | 75.93 | 82.46 | 73.95 | 56.02 | 57.64 | 68.18 | 70.51 | 68.70 | 49.21 | 38.41 | 75.00 | 82.42 |
+| qwen-1.8b-turbomind | 59.26 | 56.14 | 50.80 | 40.96 | 37.93 | 60.00 | 41.03 | 51.15 | 33.33 | 34.44 | 39.00 | 64.24 |
+| qwen-7b-turbomind | 73.15 | 76.61 | 67.20 | 47.59 | 51.23 | 65.45 | 60.00 | 69.47 | 43.12 | 38.41 | 67.00 | 66.67 |
+| qwen-14b-turbomind | 76.85 | 84.21 | 72.03 | 53.01 | 65.52 | 66.36 | 66.92 | 78.63 | 51.32 | 41.72 | 72.00 | 82.42 |
+| qwen-72b-turbomind | 83.33 | 88.30 | 83.28 | 58.43 | 65.52 | 74.55 | 81.54 | 89.31 | 68.52 | 58.28 | 81.00 | 84.24 |
+| qwen1.5-0.5b-hf | 40.74 | 40.94 | 41.48 | 40.96 | 28.57 | 50.91 | 36.92 | 41.98 | 28.84 | 22.52 | 37.00 | 52.73 |
+| qwen1.5-1.8b-hf | 55.56 | 57.31 | 49.84 | 40.96 | 36.45 | 56.36 | 43.59 | 56.49 | 35.19 | 27.81 | 45.00 | 61.21 |
+| qwen1.5-4b-hf | 70.37 | 70.76 | 61.74 | 44.58 | 45.32 | 65.45 | 54.62 | 64.89 | 47.88 | 32.45 | 62.00 | 70.30 |
+| qwen1.5-7b-hf | 75.93 | 77.19 | 66.24 | 50.60 | 53.20 | 62.73 | 60.00 | 71.76 | 50.26 | 38.41 | 71.00 | 74.55 |
+| qwen1.5-14b-hf | 74.07 | 83.63 | 70.74 | 46.39 | 58.62 | 64.55 | 73.59 | 76.34 | 59.26 | 49.01 | 75.00 | 83.64 |
+| qwen1.5-32b-hf | 83.33 | 85.96 | 82.96 | 56.63 | 61.58 | 63.64 | 77.95 | 83.97 | 69.31 | 50.99 | 85.00 | 86.06 |
+| qwen1.5-72b-hf | 84.26 | 88.89 | 82.32 | 57.23 | 66.01 | 72.73 | 82.05 | 87.02 | 69.31 | 56.95 | 84.00 | 84.24 |
+| qwen1.5-moe-a2-7b-hf | 70.37 | 80.12 | 66.56 | 51.20 | 47.78 | 64.55 | 62.31 | 70.99 | 46.30 | 45.03 | 59.00 | 69.70 |
+| mistral-7b-v0.1-hf | 77.78 | 83.04 | 69.45 | 54.82 | 53.20 | 67.27 | 66.15 | 78.63 | 38.10 | 31.79 | 68.00 | 78.79 |
+| mistral-7b-v0.2-hf | 73.15 | 82.46 | 72.99 | 53.01 | 55.67 | 66.36 | 62.31 | 77.10 | 40.48 | 34.44 | 66.00 | 76.36 |
+| mixtral-8x7b-v0.1-hf | 82.41 | 88.30 | 78.14 | 51.20 | 62.56 | 70.00 | 70.77 | 80.92 | 48.68 | 48.34 | 71.00 | 80.61 |
+| mixtral-8x22b-v0.1-hf | 84.26 | 89.47 | 84.57 | 59.04 | 67.49 | 78.18 | 79.23 | 88.55 | 61.64 | 52.98 | 87.00 | 86.06 |
+| yi-6b-hf | 78.70 | 81.87 | 69.77 | 46.39 | 52.71 | 73.64 | 65.13 | 74.81 | 46.30 | 38.41 | 66.00 | 71.52 |
+| yi-34b-hf | 89.81 | 86.55 | 83.92 | 57.23 | 64.04 | 73.64 | 79.49 | 85.50 | 66.40 | 52.32 | 81.00 | 86.06 |
+| deepseek-7b-base-hf | 55.56 | 73.10 | 56.59 | 46.99 | 34.98 | 62.73 | 48.21 | 58.78 | 28.57 | 29.14 | 50.00 | 61.82 |
+| deepseek-67b-base-hf | 84.26 | 85.96 | 81.03 | 56.02 | 57.64 | 72.73 | 73.85 | 82.44 | 51.59 | 45.03 | 74.00 | 81.82 |
+
+| model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | logical_fallacies | high_school_world_history | professional_medicine |
+|:------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:|
+| llama-7b-turbomind | 42.00 | 40.46 | 32.87 | 42.78 | 26.19 | 46.11 | 35.19 | 33.47 | 32.90 | 42.33 | 43.88 | 43.75 |
+| llama-13b-turbomind | 46.00 | 50.00 | 30.56 | 64.88 | 31.75 | 66.84 | 51.85 | 52.65 | 51.94 | 52.76 | 67.51 | 51.10 |
+| llama-30b-turbomind | 55.00 | 66.76 | 49.07 | 77.91 | 36.51 | 82.90 | 68.21 | 66.12 | 69.35 | 67.48 | 80.59 | 55.88 |
+| llama-65b-turbomind | 59.00 | 73.70 | 61.57 | 81.35 | 43.65 | 88.60 | 73.46 | 71.84 | 74.19 | 77.30 | 83.97 | 62.13 |
+| llama-2-7b-turbomind | 53.00 | 51.16 | 27.78 | 63.60 | 27.78 | 67.36 | 48.77 | 47.76 | 50.97 | 51.53 | 64.56 | 52.57 |
+| llama-2-13b-turbomind | 54.00 | 64.45 | 45.37 | 74.46 | 36.51 | 80.83 | 64.81 | 62.86 | 67.42 | 66.87 | 72.15 | 54.41 |
+| llama-2-70b-turbomind | 72.00 | 77.17 | 63.43 | 86.08 | 48.41 | 94.30 | 83.64 | 78.37 | 81.61 | 80.98 | 87.76 | 74.63 |
+| llama-3-8b-turbomind | 62.00 | 73.70 | 54.17 | 82.76 | 48.41 | 90.16 | 72.53 | 75.51 | 77.74 | 73.01 | 82.70 | 72.06 |
+| llama-3-70b-turbomind | 83.00 | 85.55 | 72.22 | 92.21 | 66.67 | 97.41 | 91.05 | 84.90 | 90.32 | 87.73 | 94.09 | 87.13 |
+| internlm2-1.8b-turbomind | 44.00 | 45.95 | 38.89 | 59.39 | 32.54 | 60.62 | 50.31 | 54.29 | 52.58 | 45.40 | 62.87 | 37.87 |
+| internlm2-7b-turbomind | 69.00 | 66.76 | 57.87 | 80.72 | 50.00 | 90.16 | 73.15 | 75.10 | 79.68 | 68.71 | 81.01 | 70.22 |
+| internlm2-20b-turbomind | 74.00 | 74.57 | 60.19 | 81.48 | 44.44 | 91.71 | 75.31 | 81.63 | 82.58 | 75.46 | 87.76 | 63.60 |
+| qwen-1.8b-turbomind | 52.00 | 52.31 | 34.72 | 57.98 | 29.37 | 59.07 | 47.22 | 48.57 | 52.26 | 44.17 | 61.18 | 43.38 |
+| qwen-7b-turbomind | 68.00 | 64.74 | 45.37 | 77.39 | 43.65 | 83.94 | 68.21 | 70.20 | 72.26 | 65.64 | 75.95 | 58.46 |
+| qwen-14b-turbomind | 75.00 | 74.86 | 57.87 | 84.04 | 51.59 | 91.71 | 70.99 | 77.14 | 83.55 | 73.01 | 83.12 | 67.65 |
+| qwen-72b-turbomind | 80.00 | 84.97 | 68.98 | 91.44 | 54.76 | 98.96 | 87.04 | 81.63 | 89.03 | 84.05 | 90.30 | 84.93 |
+| qwen1.5-0.5b-hf | 47.00 | 46.82 | 23.15 | 48.02 | 29.37 | 48.70 | 40.12 | 38.37 | 40.65 | 35.58 | 53.16 | 31.62 |
+| qwen1.5-1.8b-hf | 54.00 | 54.91 | 28.70 | 61.69 | 23.81 | 58.03 | 48.15 | 51.84 | 55.48 | 45.40 | 59.92 | 39.71 |
+| qwen1.5-4b-hf | 65.00 | 66.76 | 44.44 | 73.95 | 35.71 | 78.24 | 60.19 | 65.31 | 66.45 | 65.64 | 71.31 | 50.00 |
+| qwen1.5-7b-hf | 68.00 | 70.81 | 48.61 | 76.50 | 38.89 | 84.97 | 69.44 | 68.16 | 74.52 | 68.10 | 77.22 | 56.25 |
+| qwen1.5-14b-hf | 77.00 | 73.70 | 62.96 | 83.40 | 53.17 | 90.67 | 71.60 | 80.82 | 84.52 | 76.69 | 83.54 | 71.69 |
+| qwen1.5-32b-hf | 77.00 | 78.90 | 68.98 | 88.12 | 54.76 | 94.82 | 81.48 | 80.82 | 88.39 | 82.21 | 86.08 | 80.88 |
+| qwen1.5-72b-hf | 80.00 | 84.39 | 68.98 | 91.44 | 55.56 | 98.96 | 86.73 | 81.63 | 88.71 | 85.89 | 89.87 | 82.72 |
+| qwen1.5-moe-a2-7b-hf | 74.00 | 65.90 | 56.48 | 82.25 | 34.13 | 84.46 | 70.68 | 74.29 | 73.23 | 68.10 | 76.79 | 66.91 |
+| mistral-7b-v0.1-hf | 57.00 | 71.10 | 57.41 | 81.61 | 40.48 | 86.53 | 73.46 | 72.65 | 76.77 | 79.14 | 77.22 | 68.75 |
+| mistral-7b-v0.2-hf | 61.00 | 71.39 | 52.78 | 80.08 | 40.48 | 88.08 | 69.44 | 72.24 | 76.13 | 77.91 | 78.06 | 70.59 |
+| mixtral-8x7b-v0.1-hf | 77.00 | 80.06 | 63.43 | 87.87 | 54.76 | 93.26 | 83.95 | 80.00 | 84.19 | 79.14 | 88.61 | 81.25 |
+| mixtral-8x22b-v0.1-hf | 72.00 | 84.10 | 68.52 | 90.68 | 57.14 | 96.37 | 86.73 | 86.53 | 90.32 | 87.73 | 90.30 | 87.87 |
+| yi-6b-hf | 67.00 | 69.36 | 52.78 | 80.46 | 44.44 | 89.64 | 70.99 | 74.69 | 77.10 | 78.53 | 78.90 | 65.81 |
+| yi-34b-hf | 79.00 | 83.82 | 66.67 | 90.29 | 57.14 | 97.93 | 87.65 | 84.90 | 88.39 | 87.73 | 92.83 | 81.99 |
+| deepseek-7b-base-hf | 49.00 | 52.31 | 41.20 | 66.28 | 30.95 | 63.73 | 55.86 | 51.84 | 52.90 | 58.90 | 62.45 | 45.22 |
+| deepseek-67b-base-hf | 81.00 | 77.17 | 63.89 | 90.04 | 53.17 | 97.93 | 85.49 | 73.88 | 82.26 | 84.05 | 91.56 | 78.31 |
+
+| model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy |
+|:------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:|
+| llama-7b-turbomind | 24.81 | 32.95 | 38.73 | 45.77 | 27.19 | 48.07 | 38.12 | 43.00 |
+| llama-13b-turbomind | 26.30 | 42.20 | 59.80 | 61.19 | 28.95 | 61.28 | 53.36 | 78.00 |
+| llama-30b-turbomind | 27.41 | 54.91 | 76.96 | 79.10 | 35.96 | 76.15 | 67.71 | 83.00 |
+| llama-65b-turbomind | 34.44 | 54.34 | 82.84 | 81.09 | 39.47 | 82.39 | 66.37 | 88.00 |
+| llama-2-7b-turbomind | 29.63 | 43.35 | 60.29 | 62.69 | 27.19 | 62.75 | 56.05 | 64.00 |
+| llama-2-13b-turbomind | 27.04 | 52.60 | 75.49 | 73.13 | 32.46 | 76.51 | 64.57 | 82.00 |
+| llama-2-70b-turbomind | 34.07 | 64.16 | 90.69 | 90.55 | 44.74 | 87.52 | 80.27 | 92.00 |
+| llama-3-8b-turbomind | 38.15 | 64.16 | 83.33 | 86.57 | 47.37 | 84.04 | 70.85 | 87.00 |
+| llama-3-70b-turbomind | 48.89 | 79.77 | 95.10 | 94.03 | 72.81 | 94.13 | 82.51 | 94.00 |
+| internlm2-1.8b-turbomind | 30.37 | 41.04 | 55.88 | 51.74 | 28.95 | 61.47 | 51.12 | 63.00 |
+| internlm2-7b-turbomind | 39.63 | 68.21 | 76.96 | 84.58 | 44.74 | 84.59 | 72.65 | 86.00 |
+| internlm2-20b-turbomind | 39.63 | 66.47 | 82.84 | 85.07 | 47.37 | 86.79 | 70.85 | 84.00 |
+| qwen-1.8b-turbomind | 28.52 | 43.35 | 54.90 | 60.70 | 36.84 | 60.73 | 48.43 | 60.00 |
+| qwen-7b-turbomind | 30.00 | 57.23 | 75.98 | 79.10 | 32.46 | 79.27 | 63.23 | 81.00 |
+| qwen-14b-turbomind | 37.41 | 70.52 | 81.37 | 85.07 | 50.00 | 84.95 | 73.09 | 86.00 |
+| qwen-72b-turbomind | 50.00 | 75.72 | 92.16 | 90.05 | 59.65 | 92.66 | 82.51 | 95.00 |
+| qwen1.5-0.5b-hf | 29.63 | 33.53 | 45.10 | 59.70 | 28.95 | 44.77 | 37.22 | 69.00 |
+| qwen1.5-1.8b-hf | 34.07 | 39.31 | 47.55 | 63.18 | 32.46 | 59.08 | 53.81 | 73.00 |
+| qwen1.5-4b-hf | 35.93 | 55.49 | 71.08 | 73.13 | 37.72 | 72.11 | 63.68 | 79.00 |
+| qwen1.5-7b-hf | 34.81 | 61.85 | 78.92 | 82.09 | 41.23 | 80.73 | 61.88 | 84.00 |
+| qwen1.5-14b-hf | 45.93 | 68.21 | 80.88 | 83.08 | 55.26 | 86.06 | 73.09 | 88.00 |
+| qwen1.5-32b-hf | 47.04 | 76.30 | 90.20 | 86.07 | 57.89 | 90.28 | 75.78 | 92.00 |
+| qwen1.5-72b-hf | 47.78 | 75.14 | 92.65 | 88.56 | 59.65 | 92.48 | 79.82 | 94.00 |
+| qwen1.5-moe-a2-7b-hf | 46.30 | 54.91 | 78.43 | 79.10 | 38.60 | 82.39 | 66.82 | 83.00 |
+| mistral-7b-v0.1-hf | 33.70 | 65.32 | 78.92 | 83.08 | 50.00 | 82.39 | 69.51 | 86.00 |
+| mistral-7b-v0.2-hf | 38.15 | 64.16 | 81.86 | 82.09 | 43.86 | 80.18 | 69.96 | 86.00 |
+| mixtral-8x7b-v0.1-hf | 40.37 | 69.94 | 86.27 | 88.56 | 65.79 | 88.81 | 79.37 | 91.00 |
+| mixtral-8x22b-v0.1-hf | 45.93 | 79.19 | 90.20 | 93.03 | 70.18 | 92.29 | 79.37 | 95.00 |
+| yi-6b-hf | 32.59 | 61.27 | 79.90 | 82.59 | 35.96 | 82.94 | 67.26 | 86.00 |
+| yi-34b-hf | 45.19 | 71.68 | 91.18 | 88.56 | 55.26 | 91.74 | 78.48 | 91.00 |
+| deepseek-7b-base-hf | 28.89 | 41.62 | 60.29 | 70.15 | 26.32 | 69.72 | 55.61 | 76.00 |
+| deepseek-67b-base-hf | 38.89 | 72.25 | 90.69 | 90.05 | 52.63 | 90.46 | 80.72 | 95.00 |
+
+## Chat Models
+
+| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other |
+|:-----------------------------:|-------:|------------:|----------------------:|------------------:|-------------:|
+| qwen1.5-0.5b-chat-hf | 35.32 | 30.90 | 37.59 | 37.29 | 37.73 |
+| qwen1.5-1.8b-chat-hf | 45.62 | 39.20 | 49.21 | 47.67 | 49.63 |
+| qwen1.5-4b-chat-hf | 55.90 | 48.07 | 62.67 | 59.70 | 57.31 |
+| qwen1.5-7b-chat-hf | 61.79 | 52.68 | 69.41 | 66.41 | 63.45 |
+| qwen1.5-14b-chat-hf | 67.96 | 59.79 | 75.46 | 71.23 | 69.72 |
+| qwen1.5-32b-chat-hf | 75.36 | 67.04 | 82.11 | 80.44 | 76.23 |
+| qwen1.5-72b-chat-hf | 77.24 | 69.59 | 83.95 | 81.58 | 77.87 |
+| qwen1.5-110b-chat-hf | 77.95 | 71.56 | 83.77 | 81.44 | 78.41 |
+| internlm2-chat-1.8b-hf | 47.58 | 40.88 | 53.33 | 49.92 | 49.74 |
+| internlm2-chat-1.8b-sft-hf | 47.44 | 40.55 | 53.31 | 49.67 | 49.89 |
+| internlm2-chat-7b-hf | 63.05 | 53.42 | 71.47 | 67.27 | 65.13 |
+| internlm2-chat-7b-sft-hf | 63.33 | 53.95 | 71.74 | 67.62 | 65.00 |
+| internlm2-chat-20b-hf | 67.37 | 57.39 | 75.75 | 71.63 | 69.95 |
+| internlm2-chat-20b-sft-hf | 67.34 | 57.49 | 75.67 | 70.99 | 70.40 |
+| llama-3-8b-instruct-hf | 68.37 | 58.01 | 77.82 | 71.22 | 71.94 |
+| llama-3-70b-instruct-hf | 80.93 | 73.86 | 87.71 | 83.90 | 82.01 |
+| llama-3-8b-instruct-lmdeploy | 67.35 | 56.66 | 75.96 | 70.90 | 71.49 |
+| llama-3-70b-instruct-lmdeploy | 80.85 | 74.07 | 87.26 | 83.73 | 81.96 |
+| mistral-7b-instruct-v0.1-hf | 54.36 | 43.74 | 62.96 | 58.87 | 57.46 |
+| mistral-7b-instruct-v0.2-hf | 59.98 | 49.56 | 69.22 | 64.41 | 62.24 |
+| mixtral-8x7b-instruct-v0.1-hf | 70.11 | 60.29 | 79.01 | 74.08 | 72.28 |
+
+### Details
+
+| model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts |
+|:-----------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:|
+| qwen1.5-0.5b-chat-hf | 31.25 | 32.00 | 33.00 | 29.00 | 33.33 | 38.62 | 33.55 | 28.89 | 20.00 | 27.68 | 40.38 | 33.00 |
+| qwen1.5-1.8b-chat-hf | 42.36 | 28.00 | 45.00 | 33.00 | 27.45 | 44.83 | 51.97 | 42.22 | 32.00 | 38.39 | 48.30 | 30.00 |
+| qwen1.5-4b-chat-hf | 56.25 | 47.00 | 49.00 | 39.00 | 36.27 | 54.48 | 57.89 | 49.63 | 38.00 | 33.04 | 59.62 | 23.00 |
+| qwen1.5-7b-chat-hf | 64.58 | 51.00 | 59.00 | 37.00 | 41.18 | 53.79 | 66.45 | 53.33 | 43.00 | 41.07 | 67.92 | 36.00 |
+| qwen1.5-14b-chat-hf | 77.08 | 51.00 | 64.00 | 42.00 | 45.10 | 64.83 | 77.63 | 65.93 | 39.00 | 46.43 | 73.21 | 45.00 |
+| qwen1.5-32b-chat-hf | 84.72 | 53.00 | 57.00 | 48.00 | 52.94 | 74.48 | 82.24 | 67.41 | 52.00 | 61.61 | 78.11 | 48.00 |
+| qwen1.5-72b-chat-hf | 90.97 | 57.00 | 66.00 | 55.00 | 55.88 | 80.00 | 88.16 | 72.59 | 56.00 | 59.82 | 80.00 | 51.00 |
+| qwen1.5-110b-chat-hf | 88.89 | 62.00 | 66.00 | 64.00 | 58.82 | 75.86 | 89.47 | 68.15 | 59.00 | 63.39 | 79.62 | 59.00 |
+| internlm2-chat-1.8b-hf | 49.31 | 36.00 | 47.00 | 33.00 | 36.27 | 42.76 | 48.03 | 49.63 | 30.00 | 33.93 | 53.58 | 28.00 |
+| internlm2-chat-1.8b-sft-hf | 51.39 | 37.00 | 50.00 | 33.00 | 33.33 | 42.76 | 46.05 | 49.63 | 31.00 | 32.14 | 53.21 | 29.00 |
+| internlm2-chat-7b-hf | 68.75 | 47.00 | 62.00 | 32.00 | 38.24 | 57.24 | 69.74 | 58.52 | 29.00 | 53.57 | 70.19 | 41.00 |
+| internlm2-chat-7b-sft-hf | 71.53 | 47.00 | 63.00 | 34.00 | 37.25 | 57.24 | 69.74 | 57.78 | 29.00 | 52.68 | 69.43 | 34.00 |
+| internlm2-chat-20b-hf | 76.39 | 51.00 | 61.00 | 37.00 | 40.20 | 62.76 | 78.95 | 67.41 | 33.00 | 46.43 | 75.09 | 42.00 |
+| internlm2-chat-20b-sft-hf | 77.08 | 49.00 | 60.00 | 39.00 | 39.22 | 64.14 | 79.61 | 68.15 | 35.00 | 46.43 | 75.09 | 42.00 |
+| llama-3-8b-instruct-hf | 81.94 | 48.00 | 58.00 | 43.00 | 48.04 | 60.69 | 76.32 | 71.11 | 33.00 | 54.46 | 73.58 | 46.00 |
+| llama-3-70b-instruct-hf | 93.06 | 56.00 | 70.00 | 60.00 | 60.78 | 77.24 | 93.42 | 79.26 | 53.00 | 71.43 | 86.42 | 66.00 |
+| llama-3-8b-instruct-lmdeploy | 79.17 | 47.00 | 53.00 | 36.00 | 49.02 | 60.00 | 73.68 | 68.89 | 36.00 | 55.36 | 73.96 | 42.00 |
+| llama-3-70b-instruct-lmdeploy | 93.75 | 57.00 | 66.00 | 61.00 | 65.69 | 77.93 | 92.11 | 78.52 | 55.00 | 70.54 | 86.42 | 64.00 |
+| mistral-7b-instruct-v0.1-hf | 57.64 | 35.00 | 50.00 | 31.00 | 24.51 | 51.72 | 58.55 | 45.93 | 35.00 | 41.07 | 56.98 | 32.00 |
+| mistral-7b-instruct-v0.2-hf | 70.14 | 42.00 | 49.00 | 35.00 | 43.14 | 54.48 | 65.79 | 56.30 | 29.00 | 42.86 | 65.28 | 37.00 |
+| mixtral-8x7b-instruct-v0.1-hf | 81.25 | 57.00 | 57.00 | 40.00 | 50.00 | 60.69 | 80.92 | 65.93 | 45.00 | 50.89 | 76.60 | 41.00 |
+
+| model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology |
+|:-----------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:|
+| qwen1.5-0.5b-chat-hf | 41.75 | 38.89 | 49.15 | 26.60 | 48.48 | 50.41 | 24.69 | 42.00 | 32.35 | 31.75 | 31.00 | 32.35 |
+| qwen1.5-1.8b-chat-hf | 62.14 | 55.56 | 76.92 | 34.40 | 58.08 | 61.16 | 21.90 | 56.00 | 42.44 | 35.14 | 50.00 | 44.93 |
+| qwen1.5-4b-chat-hf | 73.79 | 58.50 | 82.05 | 47.16 | 74.24 | 71.90 | 32.29 | 69.00 | 58.40 | 40.74 | 58.00 | 53.76 |
+| qwen1.5-7b-chat-hf | 79.61 | 69.28 | 85.47 | 41.49 | 78.79 | 76.86 | 35.75 | 74.00 | 65.13 | 44.78 | 68.00 | 57.68 |
+| qwen1.5-14b-chat-hf | 82.52 | 70.26 | 87.18 | 51.77 | 85.86 | 82.64 | 53.74 | 81.00 | 76.05 | 47.98 | 76.00 | 67.48 |
+| qwen1.5-32b-chat-hf | 84.47 | 77.78 | 94.44 | 60.99 | 90.91 | 87.60 | 72.96 | 79.00 | 83.61 | 58.28 | 83.00 | 77.94 |
+| qwen1.5-72b-chat-hf | 89.32 | 85.95 | 93.59 | 61.35 | 90.91 | 86.78 | 75.98 | 83.00 | 84.87 | 60.30 | 83.00 | 81.05 |
+| qwen1.5-110b-chat-hf | 86.41 | 80.72 | 92.74 | 69.15 | 93.94 | 84.30 | 77.88 | 83.00 | 88.66 | 61.73 | 84.00 | 82.19 |
+| internlm2-chat-1.8b-hf | 72.82 | 50.65 | 69.23 | 35.46 | 56.06 | 56.20 | 27.82 | 60.00 | 49.16 | 33.83 | 54.00 | 43.79 |
+| internlm2-chat-1.8b-sft-hf | 71.84 | 52.61 | 68.80 | 34.75 | 55.56 | 53.72 | 27.04 | 58.00 | 48.74 | 34.09 | 54.00 | 44.61 |
+| internlm2-chat-7b-hf | 78.64 | 66.67 | 85.90 | 46.81 | 79.29 | 70.25 | 35.31 | 79.00 | 68.07 | 46.41 | 68.00 | 64.87 |
+| internlm2-chat-7b-sft-hf | 79.61 | 67.97 | 86.75 | 47.52 | 80.30 | 70.25 | 35.98 | 80.00 | 69.33 | 45.83 | 70.00 | 65.36 |
+| internlm2-chat-20b-hf | 80.58 | 75.16 | 90.17 | 52.13 | 83.84 | 80.99 | 39.33 | 80.00 | 70.59 | 49.67 | 75.00 | 70.26 |
+| internlm2-chat-20b-sft-hf | 80.58 | 76.14 | 91.03 | 53.19 | 84.34 | 80.99 | 36.31 | 77.00 | 71.85 | 49.61 | 77.00 | 70.59 |
+| llama-3-8b-instruct-hf | 82.52 | 79.41 | 91.45 | 52.48 | 80.30 | 79.34 | 46.26 | 75.00 | 76.89 | 49.61 | 85.00 | 72.22 |
+| llama-3-70b-instruct-hf | 89.32 | 87.58 | 93.16 | 66.67 | 92.42 | 90.08 | 76.20 | 83.00 | 89.50 | 64.67 | 92.00 | 87.09 |
+| llama-3-8b-instruct-lmdeploy | 87.38 | 79.41 | 90.17 | 52.48 | 79.80 | 78.51 | 44.25 | 75.00 | 74.37 | 48.76 | 84.00 | 69.61 |
+| llama-3-70b-instruct-lmdeploy | 90.29 | 88.56 | 93.59 | 65.96 | 92.93 | 89.26 | 75.75 | 83.00 | 89.92 | 63.95 | 92.00 | 86.60 |
+| mistral-7b-instruct-v0.1-hf | 69.90 | 59.80 | 85.47 | 38.65 | 69.70 | 65.29 | 37.54 | 69.00 | 51.26 | 37.81 | 65.00 | 52.45 |
+| mistral-7b-instruct-v0.2-hf | 74.76 | 66.99 | 88.89 | 43.97 | 75.25 | 76.86 | 42.01 | 73.00 | 62.61 | 42.24 | 67.00 | 62.25 |
+| mixtral-8x7b-instruct-v0.1-hf | 85.44 | 80.39 | 92.74 | 55.32 | 85.35 | 82.64 | 48.38 | 78.00 | 75.21 | 53.52 | 75.00 | 74.02 |
+
+| model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history |
+|:-----------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:|
+| qwen1.5-0.5b-chat-hf | 42.59 | 24.56 | 39.87 | 39.76 | 29.06 | 38.18 | 35.64 | 38.93 | 27.78 | 29.80 | 34.00 | 48.48 |
+| qwen1.5-1.8b-chat-hf | 50.93 | 56.73 | 44.37 | 42.77 | 35.96 | 51.82 | 38.46 | 49.62 | 35.45 | 27.15 | 47.00 | 63.03 |
+| qwen1.5-4b-chat-hf | 71.30 | 65.50 | 58.20 | 50.00 | 44.33 | 57.27 | 54.10 | 61.83 | 43.65 | 41.06 | 60.00 | 72.12 |
+| qwen1.5-7b-chat-hf | 76.85 | 76.61 | 68.49 | 48.80 | 51.72 | 64.55 | 59.23 | 68.70 | 48.94 | 37.09 | 69.00 | 79.39 |
+| qwen1.5-14b-chat-hf | 75.93 | 80.70 | 69.13 | 51.20 | 55.67 | 64.55 | 67.69 | 74.05 | 57.14 | 47.02 | 74.00 | 82.42 |
+| qwen1.5-32b-chat-hf | 83.33 | 89.47 | 82.64 | 60.84 | 62.56 | 70.00 | 76.67 | 83.21 | 67.46 | 59.60 | 85.00 | 84.85 |
+| qwen1.5-72b-chat-hf | 86.11 | 89.47 | 80.71 | 59.04 | 68.47 | 72.73 | 80.00 | 87.79 | 67.72 | 52.32 | 79.00 | 85.45 |
+| qwen1.5-110b-chat-hf | 83.33 | 87.13 | 81.03 | 54.22 | 69.95 | 73.64 | 78.21 | 87.02 | 75.93 | 57.62 | 84.00 | 88.48 |
+| internlm2-chat-1.8b-hf | 52.78 | 60.82 | 49.20 | 42.77 | 42.36 | 50.00 | 47.18 | 53.44 | 32.54 | 31.79 | 39.00 | 60.00 |
+| internlm2-chat-1.8b-sft-hf | 53.70 | 61.40 | 50.16 | 42.17 | 40.89 | 50.00 | 47.69 | 51.15 | 32.54 | 29.14 | 40.00 | 59.39 |
+| internlm2-chat-7b-hf | 73.15 | 81.87 | 67.85 | 47.59 | 49.75 | 62.73 | 61.79 | 66.41 | 44.97 | 33.77 | 71.00 | 81.82 |
+| internlm2-chat-7b-sft-hf | 73.15 | 81.87 | 66.88 | 48.19 | 48.77 | 63.64 | 62.31 | 65.65 | 45.77 | 33.77 | 72.00 | 81.82 |
+| internlm2-chat-20b-hf | 80.56 | 81.87 | 72.99 | 55.42 | 54.19 | 70.00 | 67.95 | 71.76 | 48.15 | 39.74 | 75.00 | 80.00 |
+| internlm2-chat-20b-sft-hf | 81.48 | 79.53 | 72.99 | 54.82 | 54.19 | 69.09 | 67.95 | 71.76 | 48.94 | 41.06 | 75.00 | 80.00 |
+| llama-3-8b-instruct-hf | 76.85 | 79.53 | 72.35 | 53.61 | 54.19 | 70.91 | 66.41 | 80.92 | 49.47 | 46.36 | 71.00 | 75.15 |
+| llama-3-70b-instruct-hf | 87.04 | 88.30 | 82.64 | 56.02 | 67.49 | 74.55 | 86.41 | 88.55 | 74.34 | 65.56 | 91.00 | 86.06 |
+| llama-3-8b-instruct-lmdeploy | 77.78 | 79.53 | 70.74 | 52.41 | 53.20 | 68.18 | 65.38 | 79.39 | 50.79 | 37.75 | 72.00 | 76.97 |
+| llama-3-70b-instruct-lmdeploy | 87.96 | 90.64 | 83.28 | 54.82 | 69.46 | 73.64 | 86.92 | 87.02 | 74.87 | 66.23 | 92.00 | 85.45 |
+| mistral-7b-instruct-v0.1-hf | 64.81 | 70.18 | 63.67 | 41.57 | 38.92 | 68.18 | 49.49 | 61.83 | 33.33 | 32.45 | 55.00 | 66.67 |
+| mistral-7b-instruct-v0.2-hf | 70.37 | 80.12 | 64.95 | 50.60 | 50.74 | 68.18 | 54.36 | 71.76 | 40.74 | 35.10 | 60.00 | 73.33 |
+| mixtral-8x7b-instruct-v0.1-hf | 79.63 | 87.72 | 73.63 | 54.82 | 61.58 | 67.27 | 69.49 | 83.21 | 52.91 | 47.02 | 74.00 | 80.61 |
+
+| model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | logical_fallacies | high_school_world_history | professional_medicine |
+|:-----------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:|
+| qwen1.5-0.5b-chat-hf | 45.00 | 41.04 | 30.09 | 39.21 | 24.60 | 35.23 | 33.95 | 25.31 | 36.13 | 31.29 | 49.37 | 38.24 |
+| qwen1.5-1.8b-chat-hf | 54.00 | 50.29 | 34.26 | 58.49 | 24.60 | 55.96 | 47.53 | 39.18 | 47.74 | 44.17 | 64.98 | 40.81 |
+| qwen1.5-4b-chat-hf | 61.00 | 64.16 | 46.30 | 71.01 | 39.68 | 72.02 | 54.01 | 65.31 | 63.55 | 63.80 | 71.31 | 51.10 |
+| qwen1.5-7b-chat-hf | 69.00 | 67.05 | 50.93 | 76.25 | 53.17 | 82.38 | 62.96 | 71.02 | 73.23 | 68.10 | 76.79 | 60.29 |
+| qwen1.5-14b-chat-hf | 74.00 | 75.14 | 58.33 | 82.89 | 51.59 | 88.60 | 69.44 | 77.96 | 84.19 | 73.62 | 82.70 | 71.32 |
+| qwen1.5-32b-chat-hf | 80.00 | 80.64 | 70.83 | 89.40 | 60.32 | 94.82 | 81.79 | 79.59 | 90.00 | 86.50 | 88.61 | 80.15 |
+| qwen1.5-72b-chat-hf | 80.00 | 82.95 | 68.98 | 91.83 | 57.14 | 98.45 | 86.73 | 78.78 | 89.03 | 87.12 | 91.14 | 83.82 |
+| qwen1.5-110b-chat-hf | 79.00 | 78.03 | 67.13 | 92.98 | 62.70 | 97.93 | 87.04 | 74.29 | 88.71 | 82.82 | 91.14 | 84.93 |
+| internlm2-chat-1.8b-hf | 48.00 | 49.13 | 44.91 | 57.60 | 26.98 | 61.14 | 50.62 | 51.02 | 52.58 | 57.67 | 67.51 | 37.50 |
+| internlm2-chat-1.8b-sft-hf | 50.00 | 49.13 | 44.91 | 57.73 | 28.57 | 61.66 | 49.69 | 51.02 | 49.68 | 57.67 | 66.67 | 38.60 |
+| internlm2-chat-7b-hf | 65.00 | 65.61 | 49.54 | 80.84 | 43.65 | 88.08 | 70.99 | 68.98 | 78.39 | 75.46 | 82.28 | 61.76 |
+| internlm2-chat-7b-sft-hf | 64.00 | 66.18 | 52.31 | 81.35 | 46.03 | 88.08 | 71.60 | 67.76 | 78.39 | 77.30 | 82.28 | 63.60 |
+| internlm2-chat-20b-hf | 74.00 | 73.70 | 59.72 | 81.86 | 46.83 | 89.12 | 74.69 | 75.92 | 80.65 | 79.14 | 82.70 | 70.59 |
+| internlm2-chat-20b-sft-hf | 76.00 | 73.12 | 60.19 | 81.99 | 43.65 | 88.60 | 74.38 | 73.88 | 80.32 | 80.37 | 82.70 | 70.59 |
+| llama-3-8b-instruct-hf | 72.00 | 73.12 | 55.09 | 84.55 | 50.00 | 90.67 | 77.16 | 77.55 | 81.61 | 77.91 | 84.81 | 75.00 |
+| llama-3-70b-instruct-hf | 85.00 | 85.26 | 75.00 | 92.72 | 69.05 | 97.41 | 90.43 | 82.04 | 91.61 | 87.12 | 94.09 | 89.71 |
+| llama-3-8b-instruct-lmdeploy | 72.00 | 72.83 | 52.78 | 82.12 | 51.59 | 89.64 | 76.85 | 76.73 | 80.97 | 76.69 | 84.39 | 74.63 |
+| llama-3-70b-instruct-lmdeploy | 85.00 | 84.39 | 73.61 | 92.72 | 67.46 | 97.93 | 89.81 | 81.63 | 90.65 | 87.12 | 93.25 | 89.34 |
+| mistral-7b-instruct-v0.1-hf | 55.00 | 57.51 | 39.81 | 74.07 | 39.68 | 75.65 | 57.72 | 62.04 | 59.35 | 69.33 | 67.93 | 55.88 |
+| mistral-7b-instruct-v0.2-hf | 61.00 | 66.76 | 46.76 | 78.67 | 36.51 | 84.97 | 68.83 | 70.20 | 68.39 | 69.33 | 73.00 | 58.09 |
+| mixtral-8x7b-instruct-v0.1-hf | 66.00 | 76.59 | 57.87 | 86.59 | 50.00 | 93.78 | 83.02 | 79.18 | 82.58 | 75.46 | 86.50 | 77.94 |
+
+| model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy |
+|:-----------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:|
+| qwen1.5-0.5b-chat-hf | 24.44 | 35.26 | 42.16 | 47.26 | 29.82 | 40.55 | 32.29 | 47.00 |
+| qwen1.5-1.8b-chat-hf | 32.22 | 43.35 | 54.90 | 48.26 | 28.95 | 61.83 | 48.43 | 71.00 |
+| qwen1.5-4b-chat-hf | 36.30 | 51.45 | 71.08 | 76.62 | 34.21 | 72.29 | 58.30 | 72.00 |
+| qwen1.5-7b-chat-hf | 31.11 | 61.27 | 76.47 | 79.10 | 42.11 | 81.28 | 61.43 | 83.00 |
+| qwen1.5-14b-chat-hf | 41.48 | 68.79 | 80.88 | 82.59 | 48.25 | 84.40 | 72.20 | 88.00 |
+| qwen1.5-32b-chat-hf | 48.52 | 75.72 | 88.73 | 86.07 | 57.02 | 90.46 | 78.03 | 95.00 |
+| qwen1.5-72b-chat-hf | 51.48 | 73.99 | 90.69 | 87.06 | 59.65 | 92.11 | 79.37 | 94.00 |
+| qwen1.5-110b-chat-hf | 52.22 | 76.30 | 93.14 | 87.56 | 62.28 | 91.56 | 80.27 | 88.00 |
+| internlm2-chat-1.8b-hf | 31.48 | 46.82 | 56.37 | 65.17 | 28.07 | 65.87 | 50.22 | 69.00 |
+| internlm2-chat-1.8b-sft-hf | 30.74 | 47.40 | 54.41 | 64.18 | 29.82 | 66.24 | 48.43 | 69.00 |
+| internlm2-chat-7b-hf | 33.70 | 67.05 | 79.90 | 81.09 | 48.25 | 84.04 | 67.26 | 84.00 |
+| internlm2-chat-7b-sft-hf | 35.19 | 67.05 | 79.90 | 80.60 | 48.25 | 84.59 | 65.47 | 85.00 |
+| internlm2-chat-20b-hf | 36.30 | 66.47 | 88.73 | 85.07 | 51.75 | 85.69 | 70.85 | 87.00 |
+| internlm2-chat-20b-sft-hf | 35.93 | 65.90 | 87.75 | 85.57 | 52.63 | 84.77 | 70.85 | 87.00 |
+| llama-3-8b-instruct-hf | 36.67 | 68.79 | 83.82 | 86.57 | 61.40 | 84.95 | 70.85 | 85.00 |
+| llama-3-70b-instruct-hf | 57.41 | 78.61 | 89.71 | 91.54 | 74.56 | 94.50 | 82.96 | 94.00 |
+| llama-3-8b-instruct-lmdeploy | 38.52 | 68.79 | 82.84 | 85.57 | 54.39 | 85.50 | 69.96 | 83.00 |
+| llama-3-70b-instruct-lmdeploy | 54.81 | 79.77 | 90.20 | 92.04 | 71.05 | 94.50 | 82.96 | 93.00 |
+| mistral-7b-instruct-v0.1-hf | 28.89 | 50.29 | 67.16 | 76.12 | 39.47 | 72.29 | 62.33 | 77.00 |
+| mistral-7b-instruct-v0.2-hf | 30.74 | 53.18 | 73.04 | 77.11 | 42.11 | 79.82 | 63.68 | 82.00 |
+| mixtral-8x7b-instruct-v0.1-hf | 35.56 | 73.41 | 85.29 | 87.06 | 60.53 | 86.97 | 74.44 | 86.00 |
diff --git a/configs/datasets/nq/README.md b/configs/datasets/nq/README.md
new file mode 100644
index 00000000..0cf8e714
--- /dev/null
+++ b/configs/datasets/nq/README.md
@@ -0,0 +1,69 @@
+# NQ
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets nq_open_1shot_gen_20a989 --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets nq_open_1shot_gen_01cf41 --debug
+```
+
+## Base Models
+
+| model | nq |
+|:------------------------:|------:|
+| llama-7b-turbomind | 15.10 |
+| llama-13b-turbomind | 16.43 |
+| llama-30b-turbomind | 22.11 |
+| llama-65b-turbomind | 26.09 |
+| llama-2-7b-turbomind | 14.35 |
+| llama-2-13b-turbomind | 21.69 |
+| llama-2-70b-turbomind | 23.27 |
+| llama-3-8b-turbomind | 18.78 |
+| llama-3-70b-turbomind | 31.88 |
+| internlm2-1.8b-turbomind | 20.66 |
+| internlm2-7b-turbomind | 41.05 |
+| internlm2-20b-turbomind | 43.55 |
+| qwen-1.8b-turbomind | 5.68 |
+| qwen-7b-turbomind | 17.87 |
+| qwen-14b-turbomind | 13.77 |
+| qwen-72b-turbomind | 18.20 |
+| qwen1.5-0.5b-hf | 6.01 |
+| qwen1.5-1.8b-hf | 10.28 |
+| qwen1.5-4b-hf | 15.73 |
+| qwen1.5-7b-hf | 18.61 |
+| qwen1.5-14b-hf | 16.07 |
+| qwen1.5-32b-hf | 21.75 |
+| qwen1.5-72b-hf | 20.53 |
+| qwen1.5-moe-a2-7b-hf | 16.62 |
+| mistral-7b-v0.1-hf | 20.66 |
+| mistral-7b-v0.2-hf | 20.78 |
+| mixtral-8x7b-v0.1-hf | 24.85 |
+| mixtral-8x22b-v0.1-hf | 34.43 |
+| yi-6b-hf | 10.08 |
+| yi-34b-hf | 13.96 |
+| deepseek-7b-base-hf | 8.45 |
+| deepseek-67b-base-hf | 17.59 |
+
+## Chat Models
+
+| model | nq |
+|:-----------------------------:|------:|
+| qwen1.5-0.5b-chat-hf | 7.42 |
+| qwen1.5-1.8b-chat-hf | 10.22 |
+| qwen1.5-4b-chat-hf | 19.31 |
+| qwen1.5-7b-chat-hf | 16.87 |
+| qwen1.5-14b-chat-hf | 20.53 |
+| qwen1.5-32b-chat-hf | 25.26 |
+| qwen1.5-72b-chat-hf | 35.21 |
+| qwen1.5-110b-chat-hf | 36.98 |
+| internlm2-chat-1.8b-hf | 19.09 |
+| internlm2-chat-1.8b-sft-hf | 18.14 |
+| internlm2-chat-7b-hf | 28.73 |
+| internlm2-chat-7b-sft-hf | 30.78 |
+| internlm2-chat-20b-hf | 28.75 |
+| internlm2-chat-20b-sft-hf | 34.10 |
+| llama-3-8b-instruct-hf | 30.17 |
+| llama-3-70b-instruct-hf | 40.25 |
+| llama-3-8b-instruct-lmdeploy | 28.28 |
+| llama-3-70b-instruct-lmdeploy | 39.14 |
+| mistral-7b-instruct-v0.1-hf | 22.47 |
+| mistral-7b-instruct-v0.2-hf | 25.18 |
+| mixtral-8x7b-instruct-v0.1-hf | 32.08 |
diff --git a/configs/datasets/race/README.md b/configs/datasets/race/README.md
new file mode 100644
index 00000000..916bdb80
--- /dev/null
+++ b/configs/datasets/race/README.md
@@ -0,0 +1,69 @@
+# RACE
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets race_ppl_abed12 --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets race_gen_69ee4f --debug
+```
+
+## Base Models
+
+| model | race-high | race-middle |
+|:------------------------:|------------:|--------------:|
+| llama-7b-turbomind | 31.30 | 29.53 |
+| llama-13b-turbomind | 35.56 | 40.25 |
+| llama-30b-turbomind | 57.35 | 55.78 |
+| llama-65b-turbomind | 70.21 | 75.35 |
+| llama-2-7b-turbomind | 39.74 | 46.73 |
+| llama-2-13b-turbomind | 57.06 | 60.52 |
+| llama-2-70b-turbomind | 79.02 | 82.17 |
+| llama-3-8b-turbomind | 67.75 | 73.61 |
+| llama-3-70b-turbomind | 85.79 | 90.25 |
+| internlm2-1.8b-turbomind | 64.72 | 70.40 |
+| internlm2-7b-turbomind | 72.56 | 74.16 |
+| internlm2-20b-turbomind | 72.90 | 74.03 |
+| qwen-1.8b-turbomind | 63.09 | 69.29 |
+| qwen-7b-turbomind | 80.30 | 85.38 |
+| qwen-14b-turbomind | 88.11 | 92.06 |
+| qwen-72b-turbomind | 90.62 | 93.59 |
+| qwen1.5-0.5b-hf | 54.66 | 60.38 |
+| qwen1.5-1.8b-hf | 67.27 | 73.33 |
+| qwen1.5-4b-hf | 78.50 | 83.29 |
+| qwen1.5-7b-hf | 82.73 | 86.70 |
+| qwen1.5-14b-hf | 87.99 | 91.85 |
+| qwen1.5-32b-hf | 90.57 | 93.25 |
+| qwen1.5-72b-hf | 90.45 | 93.87 |
+| qwen1.5-moe-a2-7b-hf | 79.56 | 83.57 |
+| mistral-7b-v0.1-hf | 73.58 | 76.25 |
+| mistral-7b-v0.2-hf | 73.67 | 77.09 |
+| mixtral-8x7b-v0.1-hf | 80.13 | 84.61 |
+| mixtral-8x22b-v0.1-hf | 86.56 | 89.62 |
+| yi-6b-hf | 82.93 | 85.72 |
+| yi-34b-hf | 90.94 | 92.76 |
+| deepseek-7b-base-hf | 50.91 | 56.82 |
+| deepseek-67b-base-hf | 83.53 | 88.23 |
+
+## Chat Models
+
+| model | race-high | race-middle |
+|:-----------------------------:|------------:|--------------:|
+| qwen1.5-0.5b-chat-hf | 49.03 | 52.79 |
+| qwen1.5-1.8b-chat-hf | 66.24 | 72.91 |
+| qwen1.5-4b-chat-hf | 73.53 | 80.29 |
+| qwen1.5-7b-chat-hf | 83.28 | 88.09 |
+| qwen1.5-14b-chat-hf | 87.51 | 91.36 |
+| qwen1.5-32b-chat-hf | 91.22 | 93.52 |
+| qwen1.5-72b-chat-hf | 91.11 | 93.38 |
+| qwen1.5-110b-chat-hf | 92.31 | 93.66 |
+| internlm2-chat-1.8b-hf | 73.87 | 81.13 |
+| internlm2-chat-1.8b-sft-hf | 73.81 | 81.69 |
+| internlm2-chat-7b-hf | 84.51 | 88.72 |
+| internlm2-chat-7b-sft-hf | 84.88 | 89.90 |
+| internlm2-chat-20b-hf | 88.02 | 91.43 |
+| internlm2-chat-20b-sft-hf | 88.11 | 91.57 |
+| llama-3-8b-instruct-hf | 81.22 | 86.63 |
+| llama-3-70b-instruct-hf | 89.57 | 93.45 |
+| llama-3-8b-instruct-lmdeploy | 81.02 | 86.14 |
+| llama-3-70b-instruct-lmdeploy | 89.34 | 93.25 |
+| mistral-7b-instruct-v0.1-hf | 69.75 | 74.72 |
+| mistral-7b-instruct-v0.2-hf | 73.30 | 77.58 |
+| mixtral-8x7b-instruct-v0.1-hf | 81.88 | 87.26 |
diff --git a/configs/datasets/subjective/arena_hard/arena_hard_scoring.py b/configs/datasets/subjective/arena_hard/arena_hard_compare.py
similarity index 100%
rename from configs/datasets/subjective/arena_hard/arena_hard_scoring.py
rename to configs/datasets/subjective/arena_hard/arena_hard_compare.py
diff --git a/configs/datasets/subjective/compassbench/compassbench_compare.py b/configs/datasets/subjective/compassbench/compassbench_compare.py
new file mode 100644
index 00000000..942eca7c
--- /dev/null
+++ b/configs/datasets/subjective/compassbench/compassbench_compare.py
@@ -0,0 +1,58 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import LMEvaluator
+from opencompass.datasets import CompassBenchDataset
+
+subjective_reader_cfg = dict(
+ input_columns=['question', 'judge_prompt'],
+ output_column='judge',
+ )
+
+data_path = 'data/subjective/compassbench'
+
+subjective_datasets = []
+
+versions = ['CompassbenchV1']
+
+for version_abbr in versions:
+ subjective_infer_cfg = dict(
+ prompt_template=dict(
+ type=PromptTemplate,
+ template=dict(round=[
+ dict(
+ role='HUMAN',
+ prompt='{question}'
+ ),
+ ]),
+ ),
+ retriever=dict(type=ZeroRetriever),
+ inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
+ )
+
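+    # The CompassBench data supplies a ready-made judge prompt for every sample
+    # (the 'judge_prompt' reader column); the judge template below forwards it verbatim.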
+ subjective_eval_cfg = dict(
+ evaluator=dict(
+ type=LMEvaluator,
+ prompt_template=dict(
+ type=PromptTemplate,
+ template=dict(round=[
+ dict(
+ role='HUMAN',
+                    prompt='{judge_prompt}'
+ ),
+ ]),
+ ),
+ ),
+ pred_role='BOT',
+ )
+
+ subjective_datasets.append(
+ dict(
+ abbr=version_abbr,
+ type=CompassBenchDataset,
+ path=data_path,
+ name=version_abbr,
+ reader_cfg=subjective_reader_cfg,
+ infer_cfg=subjective_infer_cfg,
+ eval_cfg=subjective_eval_cfg
+ ))
diff --git a/configs/datasets/subjective/fofo/README.md b/configs/datasets/subjective/fofo/README.md
new file mode 100644
index 00000000..cc39b20b
--- /dev/null
+++ b/configs/datasets/subjective/fofo/README.md
@@ -0,0 +1,30 @@
+# FoFo
+## Introduction
+FoFo is a pioneering benchmark for evaluating large language models' (LLMs) ability to follow complex, domain-specific formats, a crucial yet under-examined capability for their application as AI agents. Despite LLMs' advancements, existing benchmarks fail to adequately assess their format-following proficiency. FoFo fills this gap with a diverse range of real-world formats and instructions, developed through an AI-human collaborative method. The paper's evaluation across both open-source (e.g., Llama 2, WizardLM) and closed-source (e.g., GPT-4, PaLM 2, Gemini) LLMs highlights three key findings: open-source models significantly lag behind closed-source ones in format adherence; LLMs' format-following performance is independent of their content-generation quality; and LLMs' format proficiency varies across domains. These insights suggest the need for specialized tuning for format-following skills and highlight FoFo's role in guiding the selection of domain-specific AI agents.
+
+## Official link
+https://github.com/SalesforceAIResearch/FoFo/tree/main
+
+### Paper
+https://arxiv.org/abs/2402.18667
+
+## Examples
+Input example I:
+```
+Create a detailed medical diagnostic report in JSON format for a hypothetical patient based on the following clinical scenario and laboratory results. \n\n**Clinical Scenario:**\n- Patient Identifier: 12345X\n- Gender: Female\n- Age: 40 years\n- Presenting Complaint: Acute onset of sharp, right lower quadrant abdominal pain that began approximately 6 hours ago\n- Past Medical History: Hypertension, well-controlled on medication; no known allergies; nonsmoker; nulliparous\n- Recent Labs: Slight leukocytosis, normal hemoglobin, elevated C-reactive protein\n- Imaging: Ultrasound indicates a thickened wall of the appendix with peri-appendiceal fluid collection\n- Surgery: The patient underwent an emergency laparoscopic appendectomy\n- Pathology Report: Confirmed acute appendicitis with peri-appendiceal abscess formation, no malignancy noted\n\nUsing the sample data provided above, generate a JSON object with the following structure:\n\n1. Basic Patient Information:\n - PatientID\n - Gender\n - Age\n\n2. Encounter details:\n - Presentation\n - Symptoms\n - DurationOfSymptoms\n - PastMedicalHistory\n - RecentLaboratoryFindings (nested object with LabName as key and the value as another object containing Result and ReferenceRange)\n - ImagingResults (array of objects with ImagingType, Findings, and Conclusion)\n\n3. Surgical Intervention:\n - SurgeryType\n - Urgency (Elective or Emergency)\n - Outcome (Success or Complication, followed by a brief description)\n\n4. Pathology Report:\n - Diagnosis\n - Findings\n - Implications\n \n5. Recommendations:\n - Follow up actions\n - Medications (array of objects with DrugName, Dosage, Frequency, and Duration)\n - Warnings (e.g., warning signs of complications to watch for post-operation)\n\nYou are allowed to make up any additional relevant data, such as reference ranges for the lab results or specific medication names and dosages for the follow-up recommendations. The JSON must strictly adhere to proper JSON formatting rules, including the use of curly brackets for objects, square brackets for arrays, and double quotes for strings. Keep key names succinct and ensure they accurately reflect their corresponding data. The output should be directly usable as a comprehensive digital diagnostic report for the hypothetical patient described.
+```
+
+Output example I (from GPT-4):
+```
+\n \"BasicPatientInformation\": {\n \"PatientID\": \"12345X\",\n \"Gender\": \"Female\",\n \"Age\": 40\n },\n \"EncounterDetails\": {\n \"Presentation\": \"Acute onset of sharp, right lower quadrant abdominal pain\",\n \"Symptoms\": \"Sharp abdominal pain\",\n \"DurationOfSymptoms\": \"6 hours\",\n \"PastMedicalHistory\": \"Hypertension, well-controlled on medication; no known allergies; nonsmoker; nulliparous\",\n \"RecentLaboratoryFindings\": {\n \"LeukocyteCount\": {\n \"Result\": \"Slightly elevated\",\n \"ReferenceRange\": \"4.5-11.0 x 10^9\/L\"\n },\n \"Hemoglobin\": {\n \"Result\": \"Normal\",\n \"ReferenceRange\": \"12.0-16.0 g\/dL\"\n },\n \"CReactiveProtein\": {\n \"Result\": \"Elevated\",\n \"ReferenceRange\": \"0-5 mg\/L\"\n }\n },\n \"ImagingResults\": [\n {\n \"ImagingType\": \"Ultrasound\",\n \"Findings\": \"Thickened wall of the appendix with peri-appendiceal fluid collection\",\n \"Conclusion\": \"Appendix inflammation indicative of acute appendicitis\"\n }\n ]\n },\n \"SurgicalIntervention\": {\n \"SurgeryType\": \"Laparoscopic Appendectomy\",\n \"Urgency\": \"Emergency\",\n \"Outcome\": \"Success, no post-operative complications noted\"\n },\n \"PathologyReport\": {\n \"Diagnosis\": \"Acute appendicitis with peri-appendiceal abscess formation\",\n \"Findings\": \"Inflammation and abscess formation in the appendix, no malignancy\",\n \"Implications\": \"Appendix successfully removed, no malignancy found\"\n },\n \"Recommendations\": {\n \"FollowUpActions\": \"Patient should rest and recover, and attend a follow-up appointment in 2 weeks\",\n \"Medications\": [\n {\n \"DrugName\": \"Ibuprofen\",\n \"Dosage\": \"400 mg\",\n \"Frequency\": \"Every 6 hours\",\n \"Duration\": \"1 week\"\n },\n {\n \"DrugName\": \"Paracetamol\",\n \"Dosage\": \"500 mg\",\n \"Frequency\": \"Every 4 hours\",\n \"Duration\": \"1 week\"\n }\n ],\n \"Warnings\": \"Contact healthcare provider if pain persists or worsens, if fever develops, or if there are any signs of infection at the surgical site\"\n }\n
+```
+
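+## How to run
+A minimal sketch of a typical invocation, assuming the `configs/eval_subjective_fofo.py` config added alongside this dataset and a judge API key exported via `$OPENAI_API_KEY`:
+
+```bash
+python3 run.py configs/eval_subjective_fofo.py --debug
+```
+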
+## Reference
+```
+@article{xia2024fofo,
+ title={FOFO: A Benchmark to Evaluate LLMs' Format-Following Capability},
+ author={Xia, Congying and Xing, Chen and Du, Jiangshu and Yang, Xinyi and Feng, Yihao and Xu, Ran and Yin, Wenpeng and Xiong, Caiming},
+ journal={arXiv preprint arXiv:2402.18667},
+ year={2024}
+}
+```
diff --git a/configs/datasets/subjective/fofo/fofo_judge.py b/configs/datasets/subjective/fofo/fofo_judge.py
new file mode 100644
index 00000000..c66fe9ba
--- /dev/null
+++ b/configs/datasets/subjective/fofo/fofo_judge.py
@@ -0,0 +1,96 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import LMEvaluator
+from opencompass.datasets import FofoDataset
+
+subjective_reader_cfg = dict(
+ input_columns=['question'],
+ output_column='judge',
+ )
+
+subjective_all_sets = [
+ 'fofo_test_prompts', 'fofo_test_prompts_cn',
+]
+
+base_prompt = """
+I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose.
+
+Here is the prompt:
+{
+ "instruction": "{question}",
+}
+
+Here are the outputs of the models:
+[
+ {
+ "model": "model",
+ "answer": "{prediction}"
+ },
+]
+
+Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output:
+```json
+[
+ {
+ 'model': ,
+ 'format_correctness': ,
+ 'reasons':
+ }
+]
+```
+
+Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
+"""
+
+subjective_datasets = []
+
+for _name in subjective_all_sets:
+ subjective_infer_cfg = dict(
+ prompt_template=dict(
+ type=PromptTemplate,
+ template=dict(round=[
+ dict(
+ role='HUMAN',
+ prompt='{question}'
+ ),
+ ]),
+ ),
+ retriever=dict(type=ZeroRetriever),
+ inferencer=dict(type=GenInferencer, max_out_len=4096),
+ )
+
+ subjective_eval_cfg = dict(
+ evaluator=dict(
+ type=LMEvaluator,
+ prompt_template=dict(
+ type=PromptTemplate,
+ template=dict(
+ begin=[
+ dict(
+ role='SYSTEM',
+ fallback_role='HUMAN',
+ prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
+ ],
+ round=[
+ dict(
+ role='HUMAN',
+                        prompt=base_prompt
+ ),
+ ]),
+ ),
+ ),
+ pred_role='BOT',
+ )
+
+ subjective_datasets.append(
+ dict(
+ abbr=f'{_name}',
+ type=FofoDataset,
+ path='./data/subjective/fofo',
+ name=_name,
+ reader_cfg=subjective_reader_cfg,
+ infer_cfg=subjective_infer_cfg,
+ eval_cfg=subjective_eval_cfg
+ ))
diff --git a/configs/datasets/subjective/multiround/mtbench101_judge.py b/configs/datasets/subjective/multiround/mtbench101_judge.py
new file mode 100644
index 00000000..eb1030bf
--- /dev/null
+++ b/configs/datasets/subjective/multiround/mtbench101_judge.py
@@ -0,0 +1,62 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
+from opencompass.openicl.icl_evaluator import LMEvaluator
+from opencompass.datasets import MTBench101Dataset
+
+
+subjective_reader_cfg = dict(
+ input_columns=['dialogue','task','multi_id','turn_id','system_prompt','prompt_template'],
+ output_column='judge',
+ )
+
+subjective_all_sets = [
+ 'mtbench101',
+]
+data_path = 'data/subjective/'
+
+subjective_datasets = []
+
+for _name in subjective_all_sets:
+ subjective_infer_cfg = dict(
+ prompt_template=dict(
+ type=PromptTemplate,
+ template="""{dialogue}""",
+ ),
+ retriever=dict(type=ZeroRetriever),
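+        # infer_mode='last': generate a response only for the final turn of each
+        # multi-turn dialogue; earlier turns are replayed from the dataset as history.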
+ inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'),
+ )
+
+ subjective_eval_cfg = dict(
+ evaluator=dict(
+ type=LMEvaluator,
+ prompt_template=dict(
+ type=PromptTemplate,
+ template=dict(
+ begin=[
+ dict(
+ role='SYSTEM',
+ fallback_role='HUMAN',
+ prompt='{system_prompt}')
+ ],
+ round=[
+ dict(
+ role='HUMAN',
+                        prompt='{prompt_template}'
+ ),
+ ]),
+ ),
+ ),
+ pred_role='BOT',
+ )
+
+ subjective_datasets.append(
+ dict(
+ abbr=f'{_name}',
+ type=MTBench101Dataset,
+ path=data_path,
+ name=_name,
+ reader_cfg=subjective_reader_cfg,
+ infer_cfg=subjective_infer_cfg,
+ eval_cfg=subjective_eval_cfg
+ ))
diff --git a/configs/datasets/triviaqa/README.md b/configs/datasets/triviaqa/README.md
new file mode 100644
index 00000000..c849ed12
--- /dev/null
+++ b/configs/datasets/triviaqa/README.md
@@ -0,0 +1,69 @@
+# TriviaQA
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets triviaqa_wiki_1shot_gen_20a989 --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets triviaqa_wiki_1shot_gen_eaf81e --debug
+```
+
+## Base Models
+
+| model | triviaqa |
+|:------------------------:|-----------:|
+| llama-7b-turbomind | 40.39 |
+| llama-13b-turbomind | 66.41 |
+| llama-30b-turbomind | 75.90 |
+| llama-65b-turbomind | 82.26 |
+| llama-2-7b-turbomind | 43.21 |
+| llama-2-13b-turbomind | 71.32 |
+| llama-2-70b-turbomind | 67.45 |
+| llama-3-8b-turbomind | 71.24 |
+| llama-3-70b-turbomind | 88.16 |
+| internlm2-1.8b-turbomind | 38.42 |
+| internlm2-7b-turbomind | 69.15 |
+| internlm2-20b-turbomind | 74.03 |
+| qwen-1.8b-turbomind | 22.76 |
+| qwen-7b-turbomind | 53.61 |
+| qwen-14b-turbomind | 49.72 |
+| qwen-72b-turbomind | 79.13 |
+| qwen1.5-0.5b-hf | 21.24 |
+| qwen1.5-1.8b-hf | 34.32 |
+| qwen1.5-4b-hf | 44.59 |
+| qwen1.5-7b-hf | 56.60 |
+| qwen1.5-14b-hf | 59.96 |
+| qwen1.5-32b-hf | 56.20 |
+| qwen1.5-72b-hf | 77.81 |
+| qwen1.5-moe-a2-7b-hf | 65.49 |
+| mistral-7b-v0.1-hf | 72.93 |
+| mistral-7b-v0.2-hf | 70.91 |
+| mixtral-8x7b-v0.1-hf | 85.05 |
+| mixtral-8x22b-v0.1-hf | 89.47 |
+| yi-6b-hf | 23.76 |
+| yi-34b-hf | 14.73 |
+| deepseek-7b-base-hf | 59.48 |
+| deepseek-67b-base-hf | 72.15 |
+
+## Chat Models
+
+| model | triviaqa |
+|:-----------------------------:|-----------:|
+| qwen1.5-0.5b-chat-hf | 19.84 |
+| qwen1.5-1.8b-chat-hf | 35.81 |
+| qwen1.5-4b-chat-hf | 48.93 |
+| qwen1.5-7b-chat-hf | 53.65 |
+| qwen1.5-14b-chat-hf | 62.58 |
+| qwen1.5-32b-chat-hf | 74.72 |
+| qwen1.5-72b-chat-hf | 83.25 |
+| qwen1.5-110b-chat-hf | 86.20 |
+| internlm2-chat-1.8b-hf | 46.69 |
+| internlm2-chat-1.8b-sft-hf | 46.50 |
+| internlm2-chat-7b-hf | 69.54 |
+| internlm2-chat-7b-sft-hf | 70.75 |
+| internlm2-chat-20b-hf | 75.53 |
+| internlm2-chat-20b-sft-hf | 75.90 |
+| llama-3-8b-instruct-hf | 78.99 |
+| llama-3-70b-instruct-hf | 89.79 |
+| llama-3-8b-instruct-lmdeploy | 76.77 |
+| llama-3-70b-instruct-lmdeploy | 89.62 |
+| mistral-7b-instruct-v0.1-hf | 62.94 |
+| mistral-7b-instruct-v0.2-hf | 67.72 |
+| mixtral-8x7b-instruct-v0.1-hf | 79.57 |
diff --git a/configs/datasets/winogrande/README.md b/configs/datasets/winogrande/README.md
new file mode 100644
index 00000000..48a5ba70
--- /dev/null
+++ b/configs/datasets/winogrande/README.md
@@ -0,0 +1,69 @@
+# WinoGrande
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets winogrande_5shot_ll_252f01 --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets winogrande_5shot_gen_b36770 --debug
+```
+
+## Base Models
+
+| model | winogrande |
+|:------------------------:|-------------:|
+| llama-7b-turbomind | 71.19 |
+| llama-13b-turbomind | 76.16 |
+| llama-30b-turbomind | 80.66 |
+| llama-65b-turbomind | 82.16 |
+| llama-2-7b-turbomind | 74.03 |
+| llama-2-13b-turbomind | 76.48 |
+| llama-2-70b-turbomind | 83.98 |
+| llama-3-8b-turbomind | 77.82 |
+| llama-3-70b-turbomind | 83.43 |
+| internlm2-1.8b-turbomind | 66.77 |
+| internlm2-7b-turbomind | 83.50 |
+| internlm2-20b-turbomind | 84.69 |
+| qwen-1.8b-turbomind | 61.25 |
+| qwen-7b-turbomind | 72.06 |
+| qwen-14b-turbomind | 72.45 |
+| qwen-72b-turbomind | 82.56 |
+| qwen1.5-0.5b-hf | 57.38 |
+| qwen1.5-1.8b-hf | 60.46 |
+| qwen1.5-4b-hf | 65.90 |
+| qwen1.5-7b-hf | 70.01 |
+| qwen1.5-14b-hf | 72.93 |
+| qwen1.5-32b-hf | 78.69 |
+| qwen1.5-72b-hf | 80.74 |
+| qwen1.5-moe-a2-7b-hf | 71.43 |
+| mistral-7b-v0.1-hf | 78.30 |
+| mistral-7b-v0.2-hf | 77.51 |
+| mixtral-8x7b-v0.1-hf | 81.53 |
+| mixtral-8x22b-v0.1-hf | 86.50 |
+| yi-6b-hf | 74.35 |
+| yi-34b-hf | 79.01 |
+| deepseek-7b-base-hf | 74.11 |
+| deepseek-67b-base-hf | 79.32 |
+
+## Chat Models
+
+| model | winogrande |
+|:-----------------------------:|-------------:|
+| qwen1.5-0.5b-chat-hf | 50.51 |
+| qwen1.5-1.8b-chat-hf | 51.07 |
+| qwen1.5-4b-chat-hf | 57.54 |
+| qwen1.5-7b-chat-hf | 65.27 |
+| qwen1.5-14b-chat-hf | 70.09 |
+| qwen1.5-32b-chat-hf | 77.90 |
+| qwen1.5-72b-chat-hf | 80.82 |
+| qwen1.5-110b-chat-hf | 82.32 |
+| internlm2-chat-1.8b-hf | 57.62 |
+| internlm2-chat-1.8b-sft-hf | 57.93 |
+| internlm2-chat-7b-hf | 73.56 |
+| internlm2-chat-7b-sft-hf | 73.80 |
+| internlm2-chat-20b-hf | 81.06 |
+| internlm2-chat-20b-sft-hf | 81.37 |
+| llama-3-8b-instruct-hf | 66.22 |
+| llama-3-70b-instruct-hf | 81.29 |
+| llama-3-8b-instruct-lmdeploy | 66.93 |
+| llama-3-70b-instruct-lmdeploy | 81.22 |
+| mistral-7b-instruct-v0.1-hf | 58.56 |
+| mistral-7b-instruct-v0.2-hf | 59.43 |
+| mixtral-8x7b-instruct-v0.1-hf | 65.75 |
diff --git a/configs/eval_charm.py b/configs/eval_charm.py
new file mode 100644
index 00000000..c1b65ccc
--- /dev/null
+++ b/configs/eval_charm.py
@@ -0,0 +1,52 @@
+from mmengine.config import read_base
+
+with read_base():
+ from .datasets.CHARM.charm_reason_gen_f8fca2 import charm_reason_datasets as datasets
+ from .models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_7b_chat_model
+
+ # from models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model
+ # from models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model
+
+ # from .models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
+ # from .models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
+ # from .models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
+ # from .models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model
+ # from .models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1
+ # from .models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model
+ # from .models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model
+
+ # from .models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1
+ # from .models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model
+ # from .models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model
+ # from .models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model
+
+ # from .models.yi.hf_yi_34b_chat import models as yi_34b_chat_model
+
+ # from .models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model
+ # from .models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model
+
+ # from .models.hf_llama.hf_llama3_8b_instruct import models as llama3_8b_instruct_model
+ # from .models.hf_llama.hf_llama3_70b_instruct import models as llama3_70b_instruct_model
+ from .summarizers.charm_rea import summarizer
+
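+# Gather every model list imported above whose variable name ends with '_model'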
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
+work_dir = './outputs/CHARM/chat/'
+
+# dataset version metric mode internlm2-chat-7b-turbomind
+# ------------------------------------------------------------- --------- ------------- ------ -----------------------------
+# charm-reason-Direct - naive_average gen 49.51
+# charm-reason-ZH-CoT - naive_average gen 61.33
+# charm-reason-EN-CoT - naive_average gen 54.55
+# charm-reason-XLT - naive_average gen 58.46
+# charm-reason-Translate-EN - naive_average gen 56.15
+# - - - -
+# charm-reason-Chinese_Direct - naive_average gen 47.14
+# charm-reason-Chinese_ZH-CoT - naive_average gen 58.40
+# charm-reason-Chinese_EN-CoT - naive_average gen 48.31
+# charm-reason-Chinese_XLT - naive_average gen 53.57
+# charm-reason-Chinese_Translate-EN - naive_average gen 48.21
+# charm-reason-Global_Direct - naive_average gen 51.88
+# charm-reason-Global_ZH-CoT - naive_average gen 64.26
+# charm-reason-Global_EN-CoT - naive_average gen 60.79
+# charm-reason-Global_XLT - naive_average gen 63.36
+# charm-reason-Global_Translate-EN - naive_average gen 64.10
diff --git a/configs/eval_subjective_alignbench.py b/configs/eval_subjective_alignbench.py
index e4a10df1..1b1a93d6 100644
--- a/configs/eval_subjective_alignbench.py
+++ b/configs/eval_subjective_alignbench.py
@@ -4,7 +4,6 @@ with read_base():
from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
-from opencompass.models.openai_api import OpenAIAllesAPIN
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
diff --git a/configs/eval_subjective_alpacaeval_oc.py b/configs/eval_subjective_alpacaeval_oc.py
index bc5b0448..53c2f245 100644
--- a/configs/eval_subjective_alpacaeval_oc.py
+++ b/configs/eval_subjective_alpacaeval_oc.py
@@ -5,7 +5,7 @@ with read_base():
from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
-from opencompass.models.openai_api import OpenAI, OpenAIAllesAPIN
+from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
diff --git a/configs/eval_subjective_alpacaeval_official.py b/configs/eval_subjective_alpacaeval_official.py
index af9331b8..f1504fda 100644
--- a/configs/eval_subjective_alpacaeval_official.py
+++ b/configs/eval_subjective_alpacaeval_official.py
@@ -4,7 +4,7 @@ with read_base():
from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
-from opencompass.models.openai_api import OpenAI, OpenAIAllesAPIN
+from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
@@ -22,6 +22,9 @@ api_meta_template = dict(
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
+
+# To run this config, please make sure you have successfully installed `alpaca-eval==0.6` and `scikit-learn==1.5`
+
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often set do sample for models
diff --git a/configs/eval_subjective_arena_hard.py b/configs/eval_subjective_arena_hard.py
index a64d1b76..4200fd04 100644
--- a/configs/eval_subjective_arena_hard.py
+++ b/configs/eval_subjective_arena_hard.py
@@ -14,7 +14,7 @@ from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import ArenaHardSummarizer
with read_base():
- from .datasets.subjective.arena_hard.arena_hard_scoring import subjective_datasets
+ from .datasets.subjective.arena_hard.arena_hard_compare import subjective_datasets
api_meta_template = dict(
round=[
@@ -72,8 +72,8 @@ judge_models = [dict(
key='',
meta_template=api_meta_template,
query_per_second=1,
- max_out_len=1024,
- max_seq_len=4096,
+ max_out_len=4096,
+ max_seq_len=8192,
batch_size=10,
retry=10,
temperature = 0,
diff --git a/configs/eval_subjective_compassarena.py b/configs/eval_subjective_compassarena.py
index e3d12218..e39b883f 100644
--- a/configs/eval_subjective_compassarena.py
+++ b/configs/eval_subjective_compassarena.py
@@ -6,7 +6,6 @@ with read_base():
from .datasets.subjective.compassarena.compassarena_compare import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
-from opencompass.models.openai_api import OpenAIAllesAPIN
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
diff --git a/configs/eval_subjective_compassbench.py b/configs/eval_subjective_compassbench.py
new file mode 100644
index 00000000..ebb1c4e5
--- /dev/null
+++ b/configs/eval_subjective_compassbench.py
@@ -0,0 +1,137 @@
+from mmengine.config import read_base
+
+with read_base():
+ from .datasets.subjective.compassbench.compassbench_compare import subjective_datasets
+
+from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
+from opencompass.partitioners import NaivePartitioner, SizePartitioner
+from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
+from opencompass.runners import LocalRunner
+from opencompass.runners import SlurmSequentialRunner
+from opencompass.tasks import OpenICLInferTask
+from opencompass.tasks.subjective_eval import SubjectiveEvalTask
+from opencompass.summarizers import CompassBenchSummarizer
+
+api_meta_template = dict(
+ round=[
+ dict(role='HUMAN', api_role='HUMAN'),
+ dict(role='BOT', api_role='BOT', generate=True),
+ ],
+ reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
+)
+
+# -------------Inference Stage ----------------------------------------
+
+from opencompass.models import HuggingFacewithChatTemplate
+
+models = [
+ dict(
+ type=HuggingFacewithChatTemplate,
+ abbr='internlm2-chat-7b-hf',
+ path='internlm/internlm2-chat-7b',
+ max_out_len=1024,
+ batch_size=8,
+ run_cfg=dict(num_gpus=1),
+ stop_words=['', '<|im_end|>'],
+ generation_kwargs=dict(
+ do_sample=True,
+ ),
+ )
+]
+
+datasets = [*subjective_datasets]
+
+infer = dict(
+ partitioner=dict(type=NaivePartitioner),
+ runner=dict(
+ type=SlurmSequentialRunner,
+ partition='llmeval',
+ quotatype='reserved',
+ max_num_workers=256,
+ task=dict(type=OpenICLInferTask),
+ ),
+)
+
+gpt4 = dict(
+ abbr='gpt4-turbo',
+ type=OpenAI,
+ path='gpt-4-1106-preview',
+ key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
+ meta_template=api_meta_template,
+ query_per_second=1,
+ max_out_len=2048,
+ max_seq_len=4096,
+ batch_size=4,
+ retry=20,
+ temperature=1,
+) # Re-run inference to get gpt4's predictions, or use the pre-committed gpt4 predictions
+
+# -------------Evaluation Stage ----------------------------------------
+
+## ------------- JudgeLLM Configuration
+judge_models = [dict(
+ abbr='GPT4-Turbo',
+ type=OpenAI,
+ path='gpt-4-1106-preview',
+ key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
+ meta_template=api_meta_template,
+ query_per_second=1,
+ max_out_len=1024,
+ max_seq_len=4096,
+ batch_size=2,
+ retry=20,
+ temperature=0,
+)]
+
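+# NOTE: the assignment below overrides the GPT4-Turbo judge defined above;
+# keep (or comment out) whichever judge list you actually want to use.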
+judge_models = [
+ dict(
+ type=HuggingFacewithChatTemplate,
+ abbr='internlm102b',
+ path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
+ max_out_len=1024,
+ batch_size=8,
+ run_cfg=dict(num_gpus=4),
+ stop_words=['', '<|im_end|>'],
+ ),
+ dict(
+ type=HuggingFacewithChatTemplate,
+ abbr='internlm102b2',
+ path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
+ max_out_len=1024,
+ batch_size=8,
+ run_cfg=dict(num_gpus=4),
+ stop_words=['', '<|im_end|>'],
+ ),
+ dict(
+ type=HuggingFacewithChatTemplate,
+ abbr='internlm102b3',
+ path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
+ max_out_len=1024,
+ batch_size=8,
+ run_cfg=dict(num_gpus=4),
+ stop_words=['', '<|im_end|>'],
+ )
+]
+
+## ------------- Evaluation Configuration
+eval = dict(
+ partitioner=dict(
+ type=SubjectiveSizePartitioner,
+ strategy='split',
+ max_task_size=10000000,
+ mode='m2n',
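+        # 'double': every comparison is judged twice with the response order
+        # swapped, to offset judge position bias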
+ infer_order='double',
+ base_models=[gpt4],
+ compare_models=models,
+ judge_models=judge_models,
+ ),
+ runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)),
+ #given_pred = [{'abbr':'gpt4-turbo', 'path':''}]
+)
+
+work_dir = 'outputs/compassbench/'
+
+summarizer = dict(type=CompassBenchSummarizer, summary_type='half_add')
diff --git a/configs/eval_subjective_creationbench.py b/configs/eval_subjective_creationbench.py
index 922225f1..2346de52 100644
--- a/configs/eval_subjective_creationbench.py
+++ b/configs/eval_subjective_creationbench.py
@@ -4,7 +4,6 @@ with read_base():
from .datasets.subjective.creationbench.creationbench_judgeby_gpt4_withref import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
-from opencompass.models.openai_api import OpenAIAllesAPIN
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
diff --git a/configs/eval_subjective_fofo.py b/configs/eval_subjective_fofo.py
new file mode 100644
index 00000000..1cbef1a2
--- /dev/null
+++ b/configs/eval_subjective_fofo.py
@@ -0,0 +1,69 @@
+from mmengine.config import read_base
+
+with read_base():
+ from .datasets.subjective.fofo.fofo_judge import subjective_datasets
+
+from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
+from opencompass.partitioners import NaivePartitioner, SizePartitioner
+from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
+from opencompass.runners import LocalRunner
+from opencompass.runners import SlurmSequentialRunner
+from opencompass.tasks import OpenICLInferTask
+from opencompass.models import HuggingFacewithChatTemplate
+from opencompass.tasks.subjective_eval import SubjectiveEvalTask
+from opencompass.summarizers import FofoSummarizer
+
+api_meta_template = dict(
+ round=[
+ dict(role='HUMAN', api_role='HUMAN'),
+ dict(role='BOT', api_role='BOT', generate=True),
+ ]
+)
+
+# -------------Inference Stage ----------------------------------------
+# For subjective evaluation, we usually enable do_sample for the models
+models = [
+ dict(
+ type=HuggingFacewithChatTemplate,
+ abbr='internlm2-chat-1.8b-hf',
+ path='internlm/internlm2-chat-1_8b',
+ max_out_len=1024,
+ batch_size=8,
+ run_cfg=dict(num_gpus=1),
+ stop_words=['', '<|im_end|>'],
+ generation_kwargs=dict(
+ do_sample=True,
+ ),
+ )
+]
+
+datasets = [*subjective_datasets]
+
+# -------------Evaluation Stage ----------------------------------------
+
+## ------------- JudgeLLM Configuration
+judge_models = [dict(
+ abbr='GPT4-Turbo',
+ type=OpenAI,
+ path='gpt-4-1106-preview',
+ key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
+ meta_template=api_meta_template,
+ query_per_second=16,
+ max_out_len=2048,
+ max_seq_len=2048,
+ batch_size=8,
+ temperature=0,
+)]
+
+## ------------- Evaluation Configuration
+eval = dict(
+ partitioner=dict(
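+        # 'singlescore': each model's outputs are scored independently by the
+        # judge rather than compared pairwise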
+ type=SubjectiveSizePartitioner, max_task_size=10000, mode='singlescore', models=models, judge_models=judge_models,
+ ),
+ runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)),
+)
+
+summarizer = dict(type=FofoSummarizer, judge_type='general')
+
+work_dir = 'outputs/fofo/'
diff --git a/configs/eval_subjective_judge_pandalm.py b/configs/eval_subjective_judge_pandalm.py
index 64e55ae1..b371b198 100644
--- a/configs/eval_subjective_judge_pandalm.py
+++ b/configs/eval_subjective_judge_pandalm.py
@@ -3,7 +3,7 @@ from mmengine.config import read_base
with read_base():
from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets
-from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAIAllesAPIN, HuggingFaceChatGLM3
+from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
diff --git a/configs/eval_subjective_mtbench.py b/configs/eval_subjective_mtbench.py
index 0d682b65..38a442e4 100644
--- a/configs/eval_subjective_mtbench.py
+++ b/configs/eval_subjective_mtbench.py
@@ -4,7 +4,6 @@ with read_base():
from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
-from opencompass.models.openai_api import OpenAIAllesAPIN
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
diff --git a/configs/eval_subjective_mtbench101.py b/configs/eval_subjective_mtbench101.py
new file mode 100644
index 00000000..46438deb
--- /dev/null
+++ b/configs/eval_subjective_mtbench101.py
@@ -0,0 +1,94 @@
+from mmengine.config import read_base
+
+with read_base():
+ from .datasets.subjective.multiround.mtbench101_judge import subjective_datasets
+
+from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
+from opencompass.partitioners import NaivePartitioner, SizePartitioner
+from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
+from opencompass.runners import LocalRunner
+from opencompass.runners import SlurmSequentialRunner
+from opencompass.tasks import OpenICLInferTask
+from opencompass.tasks.subjective_eval import SubjectiveEvalTask
+from opencompass.summarizers import MTBench101Summarizer
+
+# ---------------------------------------------------------------------------------------------------------
+
+api_meta_template = dict(
+ round=[
+ dict(role='SYSTEM', api_role='SYSTEM'),
+ dict(role='HUMAN', api_role='HUMAN'),
+ dict(role='BOT', api_role='BOT', generate=True),
+ ]
+)
+
+# -------------Inference Stage ----------------------------------------
+# For subjective evaluation, we usually enable do_sample for the models
+models = [
+ dict(
+ type=HuggingFaceChatGLM3,
+ abbr='chatglm3-6b-hf',
+ path='THUDM/chatglm3-6b',
+ tokenizer_path='THUDM/chatglm3-6b',
+ model_kwargs=dict(
+ device_map='auto',
+ trust_remote_code=True,
+ ),
+ tokenizer_kwargs=dict(
+ padding_side='left',
+ truncation_side='left',
+ trust_remote_code=True,
+ ),
+ generation_kwargs=dict(
+ do_sample=True,
+ ),
+ meta_template=api_meta_template,
+ max_out_len=4096,
+ max_seq_len=4096,
+ batch_size=1,
+ run_cfg=dict(num_gpus=2, num_procs=1),
+ )
+]
+
+datasets = [*subjective_datasets]
+
+infer = dict(
+ partitioner=dict(type=SizePartitioner, max_task_size=10000),
+ runner=dict(
+ type=SlurmSequentialRunner,
+ partition='llm_dev2',
+ quotatype='auto',
+ max_num_workers=32,
+ task=dict(type=OpenICLInferTask),
+ ),
+)
+
+# -------------Evaluation Stage ----------------------------------------
+
+## ------------- JudgeLLM Configuration
+judge_models = [dict(
+ abbr='GPT4-Turbo',
+ type=OpenAI,
+ path='gpt-4-1106-preview', # To compare with the official leaderboard, please use gpt-4-1106-preview
+ key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
+ meta_template=api_meta_template,
+ query_per_second=16,
+ max_out_len=4096,
+ max_seq_len=4096,
+ batch_size=8,
+ temperature=0.8,
+)]
+
+## ------------- Evaluation Configuration
+
+eval = dict(
+ partitioner=dict(type=SubjectiveSizePartitioner, max_task_size=100000, mode='singlescore', models=models, judge_models=judge_models),
+ runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)),
+)
+
+summarizer = dict(type=MTBench101Summarizer, judge_type='single')
+
+work_dir = 'outputs/mtbench101/'
diff --git a/configs/models/hf_internlm/README.md b/configs/models/hf_internlm/README.md
new file mode 100644
index 00000000..3c4b19d0
--- /dev/null
+++ b/configs/models/hf_internlm/README.md
@@ -0,0 +1,124 @@
+# InternLM Model Details
+
+## InternLM
+
+InternLM is pre-trained on a large corpus of 1.6T tokens through a multi-phase progressive process, and then fine-tuned to align with human preferences. We also developed a training system called Uniscale-LLM for efficient large language model training. The evaluation on a number of benchmarks shows that InternLM achieves state-of-the-art performance in multiple aspects, including knowledge understanding, reading comprehension, mathematics, and coding. With such well-rounded capabilities, InternLM achieves outstanding performance on comprehensive exams, including MMLU, AGIEval, C-Eval and GAOKAO-Bench, without resorting to external tools. On these benchmarks, InternLM not only significantly outperforms open-source models, but also obtains superior performance compared to ChatGPT. InternLM also demonstrates an excellent understanding of the Chinese language and culture, which makes it a suitable foundation model for Chinese-oriented language applications.
+
+## InternLM2
+
+The evolution of Large Language Models (LLMs) like ChatGPT and GPT-4 has sparked discussions on the advent of Artificial General Intelligence (AGI). However, replicating such advancements in open-source models has been challenging. This paper introduces InternLM2, an open-source LLM that outperforms its predecessors in comprehensive evaluations across 6 dimensions and 30 benchmarks, long-context modeling, and open-ended subjective evaluations through innovative pre-training and optimization techniques. The pre-training process of InternLM2 is meticulously detailed, highlighting the preparation of diverse data types including text, code, and long-context data. InternLM2 efficiently captures long-term dependencies, initially trained on 4k tokens before advancing to 32k tokens in pre-training and fine-tuning stages, exhibiting remarkable performance on the 200k "Needle-in-a-Haystack" test. InternLM2 is further aligned using Supervised Fine-Tuning (SFT) and a novel Conditional Online Reinforcement Learning from Human Feedback (COOL RLHF) strategy that addresses conflicting human preferences and reward hacking. By releasing InternLM2 models in different training stages and model sizes, we provide the community with insights into the model's evolution.
+
+# Evaluation Command
+
+## Base Models
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets mmlu_ppl_ac766d --debug
+python3 run.py --models hf_internlm2_7b --datasets cmmlu_ppl_041cbf --debug
+python3 run.py --models hf_internlm2_7b --datasets ceval_internal_ppl_93e5ce --debug
+python3 run.py --models hf_internlm2_7b --datasets GaokaoBench_no_subjective_gen_d21e37 --debug
+python3 run.py --models hf_internlm2_7b --datasets triviaqa_wiki_1shot_gen_20a989 --debug
+python3 run.py --models hf_internlm2_7b --datasets nq_open_1shot_gen_20a989 --debug
+python3 run.py --models hf_internlm2_7b --datasets race_ppl_abed12 --debug
+python3 run.py --models hf_internlm2_7b --datasets winogrande_5shot_ll_252f01 --debug
+python3 run.py --models hf_internlm2_7b --datasets hellaswag_10shot_ppl_59c85e --debug
+python3 run.py --models hf_internlm2_7b --datasets bbh_gen_98fba6 --debug
+python3 run.py --models hf_internlm2_7b --datasets gsm8k_gen_17d0dc --debug
+python3 run.py --models hf_internlm2_7b --datasets math_4shot_base_gen_db136b --debug
+python3 run.py --models hf_internlm2_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug
+python3 run.py --models hf_internlm2_7b --datasets humaneval_gen_d2537e --debug
+python3 run.py --models hf_internlm2_7b --datasets sanitized_mbpp_gen_742f0c --debug
+python3 run.py --models hf_internlm2_7b --datasets lcbench_gen_5ff288 --debug
+python3 run.py --models hf_internlm2_7b --datasets gpqa_ppl_6bf57a --debug
+```
+
+## Chat Models
+
+```bash
+python3 run.py --models hf_internlm2_chat_7b --datasets mmlu_gen_4d595a --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets cmmlu_gen_c13365 --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets ceval_internal_gen_2daf24 --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets GaokaoBench_no_subjective_gen_4c31db --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets triviaqa_wiki_1shot_gen_eaf81e --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets nq_open_1shot_gen_01cf41 --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets race_gen_69ee4f --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets winogrande_5shot_gen_b36770 --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets hellaswag_10shot_gen_e42710 --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets bbh_gen_5b92b0 --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets gsm8k_gen_1d7fe4 --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets math_0shot_gen_393424 --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets sanitized_mbpp_mdblock_gen_a447ff --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets lcbench_gen_5ff288 --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets gpqa_gen_4baadb --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets IFEval_gen_3321a3 --debug
+```
+
+# Benchmarks
+
+We provide reference results for the classic models; you can reproduce these results by following the aforementioned commands.
+
+## Base Models
+
+| dataset | internlm2-1.8b-turbomind | internlm2-7b-turbomind | internlm2-20b-turbomind |
+|:------------:|---------------------------:|-------------------------:|--------------------------:|
+| mmlu | 45.99 | 65.84 | 67.58 |
+| cmmlu | 45.27 | 66.17 | 68.28 |
+| ceval-test | 44.79 | 63.54 | 67.28 |
+| GaokaoBench | 23.78 | 41.41 | 58.99 |
+| triviaqa | 38.42 | 69.15 | 74.03 |
+| nq | 20.66 | 41.05 | 43.55 |
+| race-high | 64.72 | 72.56 | 72.90 |
+| winogrande | 66.77 | 83.50 | 84.69 |
+| hellaswag | 44.86 | 89.52 | 91.41 |
+| bbh | 36.03 | 63.56 | 71.29 |
+| gsm8k | 30.40 | 69.98 | 76.80 |
+| math | 9.42 | 25.16 | 32.24 |
+| TheoremQA | 10.50 | 21.88 | 26.00 |
+| humaneval | 30.49 | 48.17 | 51.83 |
+| mbpp | 30.74 | 54.47 | 59.92 |
+| lcbench | 4.34 | 12.16 | 18.46 |
+| GPQA_diamond | 24.24 | 28.28 | 31.31 |
+
+## Chat Models
+
+| dataset | internlm2-chat-1.8b-hf | internlm2-chat-1.8b-sft-hf | internlm2-chat-7b-hf | internlm2-chat-7b-sft-hf | internlm2-chat-20b-hf | internlm2-chat-20b-sft-hf |
+|:------------:|-------------------------:|-----------------------------:|-----------------------:|---------------------------:|------------------------:|----------------------------:|
+| mmlu | 47.58 | 47.44 | 63.05 | 63.33 | 67.37 | 67.34 |
+| cmmlu | 46.11 | 46.27 | 62.10 | 62.38 | 66.26 | 66.39 |
+| ceval-test | 47.04 | 47.19 | 58.75 | 58.96 | 63.12 | 63.16 |
+| GaokaoBench | 29.73 | 28.79 | 54.54 | 55.39 | 57.95 | 57.62 |
+| triviaqa | 46.69 | 46.50 | 69.54 | 70.75 | 75.53 | 75.90 |
+| nq | 19.09 | 18.14 | 28.73 | 30.78 | 28.75 | 34.10 |
+| race-high | 73.87 | 73.81 | 84.51 | 84.88 | 88.02 | 88.11 |
+| winogrande | 57.62 | 57.93 | 73.56 | 73.80 | 81.06 | 81.37 |
+| hellaswag | 60.47 | 61.58 | 84.80 | 85.21 | 88.48 | 88.95 |
+| bbh | 37.69 | 37.12 | 57.83 | 57.19 | 68.24 | 69.38 |
+| gsm8k | 39.73 | 36.85 | 69.90 | 69.83 | 75.21 | 76.95 |
+| math | 14.06 | 13.10 | 28.08 | 27.60 | 34.68 | 32.54 |
+| TheoremQA | 13.63 | 12.88 | 18.50 | 18.75 | 23.00 | 25.12 |
+| humaneval | 33.54 | 34.15 | 56.71 | 61.59 | 67.68 | 67.68 |
+| mbpp | 39.69 | 36.19 | 57.59 | 55.64 | 68.87 | 69.65 |
+| lcbench | 4.52 | 3.56 | 14.60 | 14.34 | 19.64 | 20.55 |
+| GPQA_diamond | 25.76 | 26.26 | 28.28 | 27.27 | 30.30 | 29.29 |
+| IFEval | 18.30 | 18.67 | 34.75 | 39.19 | 36.41 | 44.55 |
+
+# Citation
+
+```BibTeX
+@misc{2023internlm,
+ title={InternLM: A Multilingual Language Model with Progressively Enhanced Capabilities},
+ author={InternLM Team},
+ howpublished = {\url{https://github.com/InternLM/InternLM-techreport}},
+ year={2023}
+}
+@misc{cai2024internlm2,
+ title={InternLM2 Technical Report},
+ author={Zheng Cai and Maosong Cao and Haojiong Chen and Kai Chen and Keyu Chen and Xin Chen and Xun Chen and Zehui Chen and Zhi Chen and Pei Chu and Xiaoyi Dong and Haodong Duan and Qi Fan and Zhaoye Fei and Yang Gao and Jiaye Ge and Chenya Gu and Yuzhe Gu and Tao Gui and Aijia Guo and Qipeng Guo and Conghui He and Yingfan Hu and Ting Huang and Tao Jiang and Penglong Jiao and Zhenjiang Jin and Zhikai Lei and Jiaxing Li and Jingwen Li and Linyang Li and Shuaibin Li and Wei Li and Yining Li and Hongwei Liu and Jiangning Liu and Jiawei Hong and Kaiwen Liu and Kuikun Liu and Xiaoran Liu and Chengqi Lv and Haijun Lv and Kai Lv and Li Ma and Runyuan Ma and Zerun Ma and Wenchang Ning and Linke Ouyang and Jiantao Qiu and Yuan Qu and Fukai Shang and Yunfan Shao and Demin Song and Zifan Song and Zhihao Sui and Peng Sun and Yu Sun and Huanze Tang and Bin Wang and Guoteng Wang and Jiaqi Wang and Jiayu Wang and Rui Wang and Yudong Wang and Ziyi Wang and Xingjian Wei and Qizhen Weng and Fan Wu and Yingtong Xiong and Chao Xu and Ruiliang Xu and Hang Yan and Yirong Yan and Xiaogui Yang and Haochen Ye and Huaiyuan Ying and Jia Yu and Jing Yu and Yuhang Zang and Chuyu Zhang and Li Zhang and Pan Zhang and Peng Zhang and Ruijie Zhang and Shuo Zhang and Songyang Zhang and Wenjian Zhang and Wenwei Zhang and Xingcheng Zhang and Xinyue Zhang and Hui Zhao and Qian Zhao and Xiaomeng Zhao and Fengzhe Zhou and Zaida Zhou and Jingming Zhuo and Yicheng Zou and Xipeng Qiu and Yu Qiao and Dahua Lin},
+ year={2024},
+ eprint={2403.17297},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+}
+```
diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py b/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py
new file mode 100644
index 00000000..346e9341
--- /dev/null
+++ b/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py
@@ -0,0 +1,23 @@
+from opencompass.models import TurboMindModelwithChatTemplate
+
+models = [
+ dict(
+ type=TurboMindModelwithChatTemplate,
+ abbr='internlm2-chat-1.8b-turbomind',
+ path='internlm/internlm2-chat-1_8b',
+ engine_config=dict(
+ max_batch_size=16,
+ tp=1,
+ ),
+ gen_config=dict(
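+            # top_k=1 with a near-zero temperature makes decoding effectively greedy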
+ top_k=1,
+ temperature=1e-6,
+ top_p=0.9,
+ ),
+ max_seq_len=2048,
+ max_out_len=1024,
+ batch_size=32768,
+ run_cfg=dict(num_gpus=1),
+ stop_words=['', '<|im_end|>'],
+ )
+]
diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_series.py b/configs/models/hf_internlm/lmdeploy_internlm2_series.py
index f01ee714..6c286834 100644
--- a/configs/models/hf_internlm/lmdeploy_internlm2_series.py
+++ b/configs/models/hf_internlm/lmdeploy_internlm2_series.py
@@ -3,7 +3,9 @@ from opencompass.models import TurboMindModel
settings = [
('internlm2-1.8b-turbomind', 'internlm/internlm2-1_8b', 1),
('internlm2-7b-turbomind', 'internlm/internlm2-7b', 1),
+ ('internlm2-base-7b-turbomind', 'internlm/internlm2-base-7b', 1),
('internlm2-20b-turbomind', 'internlm/internlm2-20b', 2),
+ ('internlm2-base-20b-turbomind', 'internlm/internlm2-base-20b', 2),
]
models = []
diff --git a/configs/models/hf_llama/hf_llama2_13b.py b/configs/models/hf_llama/hf_llama2_13b.py
index 4044f87e..9d51ac0c 100644
--- a/configs/models/hf_llama/hf_llama2_13b.py
+++ b/configs/models/hf_llama/hf_llama2_13b.py
@@ -7,6 +7,6 @@ models = [
path='meta-llama/Llama-2-13b-hf',
max_out_len=1024,
batch_size=8,
- run_cfg=dict(num_gpus=1),
+ run_cfg=dict(num_gpus=2),
)
]
diff --git a/configs/models/hf_llama/hf_llama2_13b_chat.py b/configs/models/hf_llama/hf_llama2_13b_chat.py
index 8460ad42..bd2b54be 100644
--- a/configs/models/hf_llama/hf_llama2_13b_chat.py
+++ b/configs/models/hf_llama/hf_llama2_13b_chat.py
@@ -7,6 +7,6 @@ models = [
path='meta-llama/Llama-2-13b-chat-hf',
max_out_len=1024,
batch_size=8,
- run_cfg=dict(num_gpus=1),
+ run_cfg=dict(num_gpus=2),
)
]
diff --git a/configs/models/hf_llama/hf_llama2_70b.py b/configs/models/hf_llama/hf_llama2_70b.py
index 97d28a4b..0aad6e66 100644
--- a/configs/models/hf_llama/hf_llama2_70b.py
+++ b/configs/models/hf_llama/hf_llama2_70b.py
@@ -7,6 +7,6 @@ models = [
path='meta-llama/Llama-2-70b-hf',
max_out_len=1024,
batch_size=8,
- run_cfg=dict(num_gpus=4),
+ run_cfg=dict(num_gpus=8),
)
]
diff --git a/configs/models/hf_llama/hf_llama3_70b.py b/configs/models/hf_llama/hf_llama3_70b.py
index b3cce950..32ab4f55 100644
--- a/configs/models/hf_llama/hf_llama3_70b.py
+++ b/configs/models/hf_llama/hf_llama3_70b.py
@@ -7,6 +7,6 @@ models = [
path='meta-llama/Meta-Llama-3-70B',
max_out_len=1024,
batch_size=8,
- run_cfg=dict(num_gpus=1),
+ run_cfg=dict(num_gpus=8),
)
]
diff --git a/configs/models/hf_llama/hf_llama_13b.py b/configs/models/hf_llama/hf_llama_13b.py
index 70d1b9cb..6c697614 100644
--- a/configs/models/hf_llama/hf_llama_13b.py
+++ b/configs/models/hf_llama/hf_llama_13b.py
@@ -7,6 +7,6 @@ models = [
path='huggyllama/llama-13b',
max_out_len=1024,
batch_size=8,
- run_cfg=dict(num_gpus=1),
+ run_cfg=dict(num_gpus=2),
)
]
diff --git a/configs/models/hf_llama/hf_llama_30b.py b/configs/models/hf_llama/hf_llama_30b.py
index 063a6927..50d0f80d 100644
--- a/configs/models/hf_llama/hf_llama_30b.py
+++ b/configs/models/hf_llama/hf_llama_30b.py
@@ -7,6 +7,6 @@ models = [
path='huggyllama/llama-30b',
max_out_len=1024,
batch_size=8,
- run_cfg=dict(num_gpus=2),
+ run_cfg=dict(num_gpus=4),
)
]
diff --git a/configs/models/hf_llama/hf_llama_65b.py b/configs/models/hf_llama/hf_llama_65b.py
index 9db5fcc9..a5761299 100644
--- a/configs/models/hf_llama/hf_llama_65b.py
+++ b/configs/models/hf_llama/hf_llama_65b.py
@@ -7,6 +7,6 @@ models = [
path='huggyllama/llama-65b',
max_out_len=1024,
batch_size=8,
- run_cfg=dict(num_gpus=4),
+ run_cfg=dict(num_gpus=8),
)
]
diff --git a/configs/models/mistral/hf_mistral_7b_instruct_v0_3.py b/configs/models/mistral/hf_mistral_7b_instruct_v0_3.py
new file mode 100644
index 00000000..fdc51c8d
--- /dev/null
+++ b/configs/models/mistral/hf_mistral_7b_instruct_v0_3.py
@@ -0,0 +1,12 @@
+from opencompass.models import HuggingFacewithChatTemplate
+
+models = [
+ dict(
+ type=HuggingFacewithChatTemplate,
+ abbr='mistral-7b-instruct-v0.3-hf',
+ path='mistralai/Mistral-7B-Instruct-v0.3',
+ max_out_len=1024,
+ batch_size=8,
+ run_cfg=dict(num_gpus=1),
+ )
+]
diff --git a/configs/models/mistral/hf_mistral_7b_v0_3.py b/configs/models/mistral/hf_mistral_7b_v0_3.py
new file mode 100644
index 00000000..90d9a829
--- /dev/null
+++ b/configs/models/mistral/hf_mistral_7b_v0_3.py
@@ -0,0 +1,13 @@
+from opencompass.models import HuggingFaceBaseModel
+
+
+models = [
+ dict(
+ type=HuggingFaceBaseModel,
+ abbr='mistral-7b-v0.3-hf',
+ path='mistralai/Mistral-7B-v0.3',
+ max_out_len=1024,
+ batch_size=8,
+ run_cfg=dict(num_gpus=1),
+ )
+]
diff --git a/configs/models/mistral/vllm_mistral_7b_instruct_v0_1.py b/configs/models/mistral/vllm_mistral_7b_instruct_v0_1.py
index 5f4eee67..db4094f6 100644
--- a/configs/models/mistral/vllm_mistral_7b_instruct_v0_1.py
+++ b/configs/models/mistral/vllm_mistral_7b_instruct_v0_1.py
@@ -19,7 +19,7 @@ models = [
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
- end_str='',
+ stop_words=[''],
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
diff --git a/configs/models/mistral/vllm_mistral_7b_instruct_v0_2.py b/configs/models/mistral/vllm_mistral_7b_instruct_v0_2.py
index ca202d1b..c1dd32ba 100644
--- a/configs/models/mistral/vllm_mistral_7b_instruct_v0_2.py
+++ b/configs/models/mistral/vllm_mistral_7b_instruct_v0_2.py
@@ -19,7 +19,7 @@ models = [
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
- end_str='',
+ stop_words=[''],
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
diff --git a/configs/models/mistral/vllm_mixtral_8x7b_instruct_v0_1.py b/configs/models/mistral/vllm_mixtral_8x7b_instruct_v0_1.py
index 03975355..2bbc0f0d 100644
--- a/configs/models/mistral/vllm_mixtral_8x7b_instruct_v0_1.py
+++ b/configs/models/mistral/vllm_mixtral_8x7b_instruct_v0_1.py
@@ -20,7 +20,7 @@ models = [
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
- end_str='',
+ stop_words=[''],
run_cfg=dict(num_gpus=2, num_procs=1),
)
]
diff --git a/configs/models/opt/hf_opt_125m.py b/configs/models/opt/hf_opt_125m.py
index ec0c68d3..b813d28e 100644
--- a/configs/models/opt/hf_opt_125m.py
+++ b/configs/models/opt/hf_opt_125m.py
@@ -1,12 +1,12 @@
from opencompass.models import HuggingFaceBaseModel
-models = [
- dict(
- type=HuggingFaceBaseModel,
- abbr='opt-125m-hf',
- path='facebook/opt-125m',
- max_out_len=1024,
- batch_size=64,
- run_cfg=dict(num_gpus=1),
- )
-]
+opt125m = dict(
+ type=HuggingFaceBaseModel,
+ abbr='opt-125m-hf',
+ path='facebook/opt-125m',
+ max_out_len=1024,
+ batch_size=64,
+ run_cfg=dict(num_gpus=1),
+)
+
+models = [opt125m]
diff --git a/configs/models/opt/hf_opt_350m.py b/configs/models/opt/hf_opt_350m.py
index 6a25db0a..57aa609c 100644
--- a/configs/models/opt/hf_opt_350m.py
+++ b/configs/models/opt/hf_opt_350m.py
@@ -1,12 +1,12 @@
from opencompass.models import HuggingFaceBaseModel
-models = [
- dict(
- type=HuggingFaceBaseModel,
- abbr='opt-350m-hf',
- path='facebook/opt-350m',
- max_out_len=1024,
- batch_size=32,
- run_cfg=dict(num_gpus=1),
- )
-]
+opt350m = dict(
+ type=HuggingFaceBaseModel,
+ abbr='opt-350m-hf',
+ path='facebook/opt-350m',
+ max_out_len=1024,
+ batch_size=32,
+ run_cfg=dict(num_gpus=1),
+)
+
+models = [opt350m]
diff --git a/configs/models/others/vllm_orionstar_14b_longchat.py b/configs/models/others/vllm_orionstar_14b_longchat.py
index 58cdea8d..2c90f7a3 100644
--- a/configs/models/others/vllm_orionstar_14b_longchat.py
+++ b/configs/models/others/vllm_orionstar_14b_longchat.py
@@ -21,6 +21,6 @@ models = [
max_seq_len=4096,
batch_size=32,
run_cfg=dict(num_gpus=4, num_procs=1),
- end_str='<|endoftext|>',
+ stop_words=['<|endoftext|>'],
)
]
diff --git a/configs/models/qwen/README.md b/configs/models/qwen/README.md
new file mode 100644
index 00000000..f8dbd8ad
--- /dev/null
+++ b/configs/models/qwen/README.md
@@ -0,0 +1,142 @@
+# Qwen Model Details
+
+## Qwen
+
+Large language models (LLMs) have revolutionized the field of artificial intelligence, enabling natural language processing tasks that were previously thought to be exclusive to humans. In this work, we introduce Qwen, the first installment of our large language model series. Qwen is a comprehensive language model series that encompasses distinct models with varying parameter counts. It includes Qwen, the base pretrained language models, and Qwen-Chat, the chat models finetuned with human alignment techniques. The base language models consistently demonstrate superior performance across a multitude of downstream tasks, and the chat models, particularly those trained using Reinforcement Learning from Human Feedback (RLHF), are highly competitive. The chat models possess advanced tool-use and planning capabilities for creating agent applications, showcasing impressive performance even when compared to bigger models on complex tasks like utilizing a code interpreter. Furthermore, we have developed coding-specialized models, Code-Qwen and Code-Qwen-Chat, as well as mathematics-focused models, Math-Qwen-Chat, which are built upon base language models. These models demonstrate significantly improved performance in comparison with open-source models, and slightly fall behind the proprietary models.
+
+## Qwen1.5
+
+Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model pretrained on a large amount of data. In comparison with the previously released Qwen, the improvements include:
+
+- 8 model sizes: 0.5B, 1.8B, 4B, 7B, 14B, 32B, and 72B dense models, plus a 14B MoE model with 2.7B activated parameters;
+- Significant performance improvement in human preference for chat models;
+- Multilingual support in both base and chat models;
+- Stable support of 32K context length for models of all sizes;
+- No need for `trust_remote_code`; see the loading sketch below.
+
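+Because Qwen1.5 uses an architecture natively supported by `transformers`, checkpoints load without `trust_remote_code`. A minimal sketch (the checkpoint name is used for illustration):
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# Qwen1.5 loads with stock transformers; no trust_remote_code needed.
+tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen1.5-7B')
+model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen1.5-7B', device_map='auto')
+```
+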
+# Evaluation Command
+
+## Base Models
+
+```bash
+python3 run.py --models hf_qwen1_5_7b --datasets mmlu_ppl_ac766d --debug
+python3 run.py --models hf_qwen1_5_7b --datasets cmmlu_ppl_041cbf --debug
+python3 run.py --models hf_qwen1_5_7b --datasets ceval_internal_ppl_93e5ce --debug
+python3 run.py --models hf_qwen1_5_7b --datasets GaokaoBench_no_subjective_gen_d21e37 --debug
+python3 run.py --models hf_qwen1_5_7b --datasets triviaqa_wiki_1shot_gen_20a989 --debug
+python3 run.py --models hf_qwen1_5_7b --datasets nq_open_1shot_gen_20a989 --debug
+python3 run.py --models hf_qwen1_5_7b --datasets race_ppl_abed12 --debug
+python3 run.py --models hf_qwen1_5_7b --datasets winogrande_5shot_ll_252f01 --debug
+python3 run.py --models hf_qwen1_5_7b --datasets hellaswag_10shot_ppl_59c85e --debug
+python3 run.py --models hf_qwen1_5_7b --datasets bbh_gen_98fba6 --debug
+python3 run.py --models hf_qwen1_5_7b --datasets gsm8k_gen_17d0dc --debug
+python3 run.py --models hf_qwen1_5_7b --datasets math_4shot_base_gen_db136b --debug
+python3 run.py --models hf_qwen1_5_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug
+python3 run.py --models hf_qwen1_5_7b --datasets humaneval_gen_d2537e --debug
+python3 run.py --models hf_qwen1_5_7b --datasets sanitized_mbpp_gen_742f0c --debug
+python3 run.py --models hf_qwen1_5_7b --datasets lcbench_gen_5ff288 --debug
+python3 run.py --models hf_qwen1_5_7b --datasets gpqa_ppl_6bf57a --debug
+```
+
+## Chat Models
+
+```bash
+python3 run.py --models hf_qwen1_5_7b_chat --datasets mmlu_gen_4d595a --debug
+python3 run.py --models hf_qwen1_5_7b_chat --datasets cmmlu_gen_c13365 --debug
+python3 run.py --models hf_qwen1_5_7b_chat --datasets ceval_internal_gen_2daf24 --debug
+python3 run.py --models hf_qwen1_5_7b_chat --datasets GaokaoBench_no_subjective_gen_4c31db --debug
+python3 run.py --models hf_qwen1_5_7b_chat --datasets triviaqa_wiki_1shot_gen_eaf81e --debug
+python3 run.py --models hf_qwen1_5_7b_chat --datasets nq_open_1shot_gen_01cf41 --debug
+python3 run.py --models hf_qwen1_5_7b_chat --datasets race_gen_69ee4f --debug
+python3 run.py --models hf_qwen1_5_7b_chat --datasets winogrande_5shot_gen_b36770 --debug
+python3 run.py --models hf_qwen1_5_7b_chat --datasets hellaswag_10shot_gen_e42710 --debug
+python3 run.py --models hf_qwen1_5_7b_chat --datasets bbh_gen_5b92b0 --debug
+python3 run.py --models hf_qwen1_5_7b_chat --datasets gsm8k_gen_1d7fe4 --debug
+python3 run.py --models hf_qwen1_5_7b_chat --datasets math_0shot_gen_393424 --debug
+python3 run.py --models hf_qwen1_5_7b_chat --datasets TheoremQA_5shot_gen_6f0af8 --debug
+python3 run.py --models hf_qwen1_5_7b_chat --datasets humaneval_gen_8e312c --debug
+python3 run.py --models hf_qwen1_5_7b_chat --datasets sanitized_mbpp_mdblock_gen_a447ff --debug
+python3 run.py --models hf_qwen1_5_7b_chat --datasets lcbench_gen_5ff288 --debug
+python3 run.py --models hf_qwen1_5_7b_chat --datasets gpqa_gen_4baadb --debug
+python3 run.py --models hf_qwen1_5_7b_chat --datasets IFEval_gen_3321a3 --debug
+```
+
+# Benchmarks
+
+We provide reference results for these classic models; you can reproduce them by following the commands above.
+
+## Base Models
+
+| dataset | qwen-1.8b-turbomind | qwen-7b-turbomind | qwen-14b-turbomind | qwen-72b-turbomind |
+|:------------:|----------------------:|--------------------:|---------------------:|---------------------:|
+| mmlu | 46.61 | 59.75 | 67.85 | 77.36 |
+| cmmlu | 51.98 | 62.10 | 70.05 | 83.32 |
+| ceval-test | 54.24 | 62.06 | 70.33 | 83.25 |
+| GaokaoBench | 22.11 | 35.32 | 54.07 | 77.56 |
+| triviaqa | 22.76 | 53.61 | 49.72 | 79.13 |
+| nq | 5.68 | 17.87 | 13.77 | 18.20 |
+| race-high | 63.09 | 80.30 | 88.11 | 90.62 |
+| winogrande | 61.25 | 72.06 | 72.45 | 82.56 |
+| hellaswag | 38.04 | 64.62 | 85.88 | 90.40 |
+| bbh | 22.53 | 45.89 | 56.75 | 63.35 |
+| gsm8k | 23.73 | 54.36 | 61.64 | 79.68 |
+| math | 6.30 | 15.56 | 30.38 | 44.18 |
+| TheoremQA | 9.38 | 15.00 | 21.62 | 27.12 |
+| humaneval | 16.46 | 23.78 | 23.78 | 66.46 |
+| mbpp | 2.72 | 46.69 | 55.64 | 65.76 |
+| lcbench | 1.82 | 4.95 | 8.86 | 16.86 |
+| GPQA_diamond | 28.79 | 24.75 | 27.78 | 31.31 |
+
+| dataset | qwen1.5-0.5b-hf | qwen1.5-1.8b-hf | qwen1.5-4b-hf | qwen1.5-7b-hf | qwen1.5-14b-hf | qwen1.5-32b-hf | qwen1.5-72b-hf |
+|:------------:|------------------:|------------------:|----------------:|----------------:|-----------------:|-----------------:|-----------------:|
+| mmlu | 39.98 | 47.14 | 57.03 | 62.15 | 69.10 | 73.88 | 77.02 |
+| cmmlu | 46.05 | 57.45 | 66.38 | 71.86 | 76.95 | 81.58 | 83.00 |
+| ceval-test | 48.36 | 58.67 | 66.55 | 72.49 | 76.93 | 82.50 | 83.03 |
+| GaokaoBench | 30.67 | 35.66 | 54.31 | 65.99 | 66.60 | 79.01 | 80.26 |
+| triviaqa | 21.24 | 34.32 | 44.59 | 56.60 | 59.96 | 56.20 | 77.81 |
+| nq | 6.01 | 10.28 | 15.73 | 18.61 | 16.07 | 21.75 | 20.53 |
+| race-high | 54.66 | 67.27 | 78.50 | 82.73 | 87.99 | 90.57 | 90.45 |
+| winogrande | 57.38 | 60.46 | 65.90 | 70.01 | 72.93 | 78.69 | 80.74 |
+| hellaswag | 29.19 | 42.32 | 55.89 | 68.51 | 83.86 | 87.28 | 90.41 |
+| bbh | 20.54 | 27.01 | 34.81 | 39.87 | 50.38 | 67.47 | 58.81 |
+| gsm8k | 13.27 | 34.87 | 47.61 | 54.36 | 63.53 | 72.71 | 79.53 |
+| math | 4.16 | 11.32 | 17.50 | 17.34 | 36.18 | 45.74 | 41.56 |
+| TheoremQA | 5.88 | 12.00 | 13.75 | 4.25 | 12.62 | 26.62 | 26.62 |
+| humaneval | 8.54 | 23.17 | 41.46 | 53.05 | 57.32 | 70.12 | 65.85 |
+| mbpp | 5.06 | 15.95 | 45.91 | 52.14 | 52.14 | 59.14 | 61.09 |
+| lcbench | 0.87 | 2.00 | 5.65 | 6.69 | 12.69 | 14.34 | 15.29 |
+| GPQA_diamond | 23.74 | 28.79 | 23.23 | 20.71 | 32.32 | 30.81 | 31.82 |
+
+## Chat Models
+
+| dataset | qwen1.5-0.5b-chat-hf | qwen1.5-1.8b-chat-hf | qwen1.5-4b-chat-hf | qwen1.5-7b-chat-hf | qwen1.5-14b-chat-hf | qwen1.5-32b-chat-hf | qwen1.5-72b-chat-hf | qwen1.5-110b-chat-hf |
+|:------------:|-----------------------:|-----------------------:|---------------------:|---------------------:|----------------------:|----------------------:|----------------------:|-----------------------:|
+| mmlu | 35.32 | 45.62 | 55.90 | 61.79 | 67.96 | 75.36 | 77.24 | 77.95 |
+| cmmlu | 31.55 | 48.93 | 58.53 | 68.78 | 75.07 | 80.39 | 82.48 | 86.46 |
+| ceval-test | 36.88 | 55.17 | 61.54 | 68.71 | 74.80 | 80.47 | 81.53 | 87.33 |
+| GaokaoBench | 21.51 | 46.19 | 59.11 | 70.55 | 80.39 | 86.15 | 88.58 | 89.59 |
+| triviaqa | 19.84 | 35.81 | 48.93 | 53.65 | 62.58 | 74.72 | 83.25 | 86.20 |
+| nq | 7.42 | 10.22 | 19.31 | 16.87 | 20.53 | 25.26 | 35.21 | 36.98 |
+| race-high | 49.03 | 66.24 | 73.53 | 83.28 | 87.51 | 91.22 | 91.11 | 92.31 |
+| winogrande | 50.51 | 51.07 | 57.54 | 65.27 | 70.09 | 77.90 | 80.82 | 82.32 |
+| hellaswag | 29.60 | 41.71 | 60.45 | 71.58 | 79.70 | 88.56 | 89.37 | 91.11 |
+| bbh | 24.12 | 26.82 | 43.15 | 38.12 | 55.38 | 69.28 | 72.97 | 71.04 |
+| gsm8k | 8.79 | 27.60 | 47.61 | 56.25 | 64.90 | 79.91 | 77.03 | 79.53 |
+| math | 0.56 | 4.94 | 7.34 | 22.14 | 32.22 | 41.80 | 45.22 | 54.38 |
+| TheoremQA | 9.00 | 9.25 | 13.88 | 12.25 | 13.63 | 19.25 | 22.75 | 17.50 |
+| humaneval | 9.15 | 15.85 | 30.49 | 40.85 | 50.00 | 57.93 | 60.37 | 65.24 |
+| mbpp | 11.28 | 22.57 | 43.58 | 50.58 | 56.03 | 65.37 | 66.93 | 68.48 |
+| lcbench | 0.00 | 1.65 | 5.56 | 8.78 | 14.42 | 10.78 | 18.77 | 34.58 |
+| GPQA_diamond | 19.70 | 29.80 | 25.25 | 31.82 | 30.30 | 31.31 | 32.83 | 35.86 |
+| IFEval | 13.12 | 16.08 | 25.51 | 38.82 | 42.51 | 49.54 | 51.02 | 55.08 |
+
+# Citation
+
+```BibTeX
+@article{qwen,
+ title={Qwen Technical Report},
+ author={Jinze Bai and Shuai Bai and Yunfei Chu and Zeyu Cui and Kai Dang and Xiaodong Deng and Yang Fan and Wenbin Ge and Yu Han and Fei Huang and Binyuan Hui and Luo Ji and Mei Li and Junyang Lin and Runji Lin and Dayiheng Liu and Gao Liu and Chengqiang Lu and Keming Lu and Jianxin Ma and Rui Men and Xingzhang Ren and Xuancheng Ren and Chuanqi Tan and Sinan Tan and Jianhong Tu and Peng Wang and Shijie Wang and Wei Wang and Shengguang Wu and Benfeng Xu and Jin Xu and An Yang and Hao Yang and Jian Yang and Shusheng Yang and Yang Yao and Bowen Yu and Hongyi Yuan and Zheng Yuan and Jianwei Zhang and Xingxuan Zhang and Yichang Zhang and Zhenru Zhang and Chang Zhou and Jingren Zhou and Xiaohuan Zhou and Tianhang Zhu},
+ journal={arXiv preprint arXiv:2309.16609},
+ year={2023}
+}
+```
diff --git a/configs/models/qwen/hf_qwen1_5_moe_a2_7b.py b/configs/models/qwen/hf_qwen1_5_moe_a2_7b.py
index 77f0fc57..0e112c8b 100644
--- a/configs/models/qwen/hf_qwen1_5_moe_a2_7b.py
+++ b/configs/models/qwen/hf_qwen1_5_moe_a2_7b.py
@@ -1,25 +1,12 @@
-from opencompass.models import HuggingFaceCausalLM
+from opencompass.models import HuggingFaceBaseModel
models = [
dict(
- type=HuggingFaceCausalLM,
- abbr='qwen1.5-moe-a2-7b-hf',
+ type=HuggingFaceBaseModel,
+ abbr='qwen1.5-moe-a2.7b-hf',
path='Qwen/Qwen1.5-MoE-A2.7B',
- tokenizer_path='Qwen/Qwen1.5-MoE-A2.7B',
- model_kwargs=dict(
- device_map='auto',
- trust_remote_code=True
- ),
- tokenizer_kwargs=dict(
- padding_side='left',
- truncation_side='left',
- trust_remote_code=True,
- use_fast=False,
- ),
- pad_token_id=151645,
- max_out_len=100,
- max_seq_len=2048,
+ max_out_len=1024,
batch_size=8,
- run_cfg=dict(num_gpus=1, num_procs=1),
+ run_cfg=dict(num_gpus=1),
)
]
diff --git a/configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py b/configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py
index ebc09fac..d33ea704 100644
--- a/configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py
+++ b/configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py
@@ -1,33 +1,12 @@
-from opencompass.models import HuggingFaceCausalLM
-
-_meta_template = dict(
- round=[
- dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
- dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
- ],
-)
+from opencompass.models import HuggingFacewithChatTemplate
models = [
dict(
- type=HuggingFaceCausalLM,
- abbr='qwen1.5-moe-a2-7b-chat-hf',
+ type=HuggingFacewithChatTemplate,
+ abbr='qwen1.5-moe-a2.7b-chat-hf',
path='Qwen/Qwen1.5-MoE-A2.7B-Chat',
- model_kwargs=dict(
- device_map='auto',
- trust_remote_code=True
- ),
- tokenizer_kwargs=dict(
- padding_side='left',
- truncation_side='left',
- trust_remote_code=True,
- use_fast=False,
- ),
- meta_template=_meta_template,
- max_out_len=100,
- max_seq_len=2048,
+ max_out_len=1024,
batch_size=8,
- run_cfg=dict(num_gpus=4, num_procs=1),
- end_str='<|im_end|>',
- batch_padding=True,
+ run_cfg=dict(num_gpus=1),
)
]
diff --git a/configs/models/qwen/hf_qwen2_0_5b.py b/configs/models/qwen/hf_qwen2_0_5b.py
new file mode 100644
index 00000000..22305d6f
--- /dev/null
+++ b/configs/models/qwen/hf_qwen2_0_5b.py
@@ -0,0 +1,12 @@
+from opencompass.models import HuggingFaceBaseModel
+
+models = [
+ dict(
+ type=HuggingFaceBaseModel,
+ abbr='qwen2-0.5b-hf',
+ path='Qwen/Qwen2-0.5B',
+ max_out_len=1024,
+ batch_size=8,
+ run_cfg=dict(num_gpus=1),
+ )
+]
diff --git a/configs/models/qwen/hf_qwen2_1_5b.py b/configs/models/qwen/hf_qwen2_1_5b.py
new file mode 100644
index 00000000..2c9d488a
--- /dev/null
+++ b/configs/models/qwen/hf_qwen2_1_5b.py
@@ -0,0 +1,12 @@
+from opencompass.models import HuggingFaceBaseModel
+
+models = [
+ dict(
+ type=HuggingFaceBaseModel,
+ abbr='qwen2-1.5b-hf',
+ path='Qwen/Qwen2-1.5B',
+ max_out_len=1024,
+ batch_size=8,
+ run_cfg=dict(num_gpus=1),
+ )
+]
diff --git a/configs/models/qwen/hf_qwen2_72b.py b/configs/models/qwen/hf_qwen2_72b.py
new file mode 100644
index 00000000..97cc74f6
--- /dev/null
+++ b/configs/models/qwen/hf_qwen2_72b.py
@@ -0,0 +1,12 @@
+from opencompass.models import HuggingFaceBaseModel
+
+models = [
+ dict(
+ type=HuggingFaceBaseModel,
+ abbr='qwen2-72b-hf',
+ path='Qwen/Qwen2-72B',
+ max_out_len=1024,
+ batch_size=8,
+ run_cfg=dict(num_gpus=8),
+ )
+]
diff --git a/configs/models/qwen/hf_qwen2_7b.py b/configs/models/qwen/hf_qwen2_7b.py
new file mode 100644
index 00000000..a861151f
--- /dev/null
+++ b/configs/models/qwen/hf_qwen2_7b.py
@@ -0,0 +1,12 @@
+from opencompass.models import HuggingFaceBaseModel
+
+models = [
+ dict(
+ type=HuggingFaceBaseModel,
+ abbr='qwen2-7b-hf',
+ path='Qwen/Qwen2-7B',
+ max_out_len=1024,
+ batch_size=8,
+ run_cfg=dict(num_gpus=1),
+ )
+]
diff --git a/configs/models/qwen/hf_qwen2_moe_57b_a14b.py b/configs/models/qwen/hf_qwen2_moe_57b_a14b.py
new file mode 100644
index 00000000..adc0ba8f
--- /dev/null
+++ b/configs/models/qwen/hf_qwen2_moe_57b_a14b.py
@@ -0,0 +1,12 @@
+from opencompass.models import HuggingFaceBaseModel
+
+models = [
+ dict(
+ type=HuggingFaceBaseModel,
+ abbr='qwen2-moe-57b-a14b-hf',
+ path='Qwen/Qwen2-MoE-57B-A14B',
+ max_out_len=1024,
+ batch_size=8,
+ run_cfg=dict(num_gpus=4),
+ )
+]
diff --git a/configs/models/qwen/hf_qwen_14b.py b/configs/models/qwen/hf_qwen_14b.py
index 8c15c032..ac098583 100644
--- a/configs/models/qwen/hf_qwen_14b.py
+++ b/configs/models/qwen/hf_qwen_14b.py
@@ -7,6 +7,6 @@ models = [
path='Qwen/Qwen-14B',
max_out_len=1024,
batch_size=8,
- run_cfg=dict(num_gpus=1),
+ run_cfg=dict(num_gpus=2),
)
]
diff --git a/configs/models/qwen/hf_qwen_14b_chat.py b/configs/models/qwen/hf_qwen_14b_chat.py
index 4d9cccee..781451e1 100644
--- a/configs/models/qwen/hf_qwen_14b_chat.py
+++ b/configs/models/qwen/hf_qwen_14b_chat.py
@@ -1,12 +1,31 @@
-from opencompass.models import HuggingFacewithChatTemplate
+from opencompass.models import HuggingFaceCausalLM
+
+_meta_template = dict(
+ round=[
+ dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
+ dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
+ ],
+)
models = [
dict(
- type=HuggingFacewithChatTemplate,
+ type=HuggingFaceCausalLM,
abbr='qwen-14b-chat-hf',
path='Qwen/Qwen-14B-Chat',
+ model_kwargs=dict(device_map='auto', trust_remote_code=True),
+ tokenizer_kwargs=dict(
+ padding_side='left',
+ truncation_side='left',
+ trust_remote_code=True,
+ use_fast=False,
+ ),
+ pad_token_id=151643,
max_out_len=1024,
+ max_seq_len=8192,
batch_size=8,
- run_cfg=dict(num_gpus=1),
+ batch_padding=True,
+ meta_template=_meta_template,
+ run_cfg=dict(num_gpus=2),
+ end_str='<|im_end|>',
)
]
diff --git a/configs/models/qwen/hf_qwen_1_8b_chat.py b/configs/models/qwen/hf_qwen_1_8b_chat.py
index 1838a04a..05b77564 100644
--- a/configs/models/qwen/hf_qwen_1_8b_chat.py
+++ b/configs/models/qwen/hf_qwen_1_8b_chat.py
@@ -1,12 +1,31 @@
-from opencompass.models import HuggingFacewithChatTemplate
+from opencompass.models import HuggingFaceCausalLM
+
+_meta_template = dict(
+ round=[
+ dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
+ dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
+ ],
+)
models = [
dict(
- type=HuggingFacewithChatTemplate,
+ type=HuggingFaceCausalLM,
abbr='qwen-1.8b-chat-hf',
path='Qwen/Qwen-1_8B-Chat',
+ model_kwargs=dict(device_map='auto', trust_remote_code=True),
+ tokenizer_kwargs=dict(
+ padding_side='left',
+ truncation_side='left',
+ trust_remote_code=True,
+ use_fast=False,
+ ),
+ pad_token_id=151643,
max_out_len=1024,
+ max_seq_len=8192,
batch_size=8,
+ batch_padding=True,
+ meta_template=_meta_template,
run_cfg=dict(num_gpus=1),
+ end_str='<|im_end|>',
)
]
diff --git a/configs/models/qwen/hf_qwen_72b.py b/configs/models/qwen/hf_qwen_72b.py
index 325315b8..63eacfef 100644
--- a/configs/models/qwen/hf_qwen_72b.py
+++ b/configs/models/qwen/hf_qwen_72b.py
@@ -7,6 +7,6 @@ models = [
path='Qwen/Qwen-72B',
max_out_len=1024,
batch_size=8,
- run_cfg=dict(num_gpus=4),
+ run_cfg=dict(num_gpus=8),
)
]
diff --git a/configs/models/qwen/hf_qwen_72b_chat.py b/configs/models/qwen/hf_qwen_72b_chat.py
index 255aeb5d..bf214abb 100644
--- a/configs/models/qwen/hf_qwen_72b_chat.py
+++ b/configs/models/qwen/hf_qwen_72b_chat.py
@@ -1,12 +1,31 @@
-from opencompass.models import HuggingFacewithChatTemplate
+from opencompass.models import HuggingFaceCausalLM
+
+_meta_template = dict(
+ round=[
+ dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
+ dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
+ ],
+)
models = [
dict(
- type=HuggingFacewithChatTemplate,
+ type=HuggingFaceCausalLM,
abbr='qwen-72b-chat-hf',
path='Qwen/Qwen-72B-Chat',
+ model_kwargs=dict(device_map='auto', trust_remote_code=True),
+ tokenizer_kwargs=dict(
+ padding_side='left',
+ truncation_side='left',
+ trust_remote_code=True,
+ use_fast=False,
+ ),
+ pad_token_id=151643,
max_out_len=1024,
+ max_seq_len=8192,
batch_size=8,
- run_cfg=dict(num_gpus=4),
+ batch_padding=True,
+ meta_template=_meta_template,
+ run_cfg=dict(num_gpus=8),
+ end_str='<|im_end|>',
)
]
diff --git a/configs/models/qwen/hf_qwen_7b_chat.py b/configs/models/qwen/hf_qwen_7b_chat.py
index e5479fb6..9fe97f83 100644
--- a/configs/models/qwen/hf_qwen_7b_chat.py
+++ b/configs/models/qwen/hf_qwen_7b_chat.py
@@ -1,12 +1,31 @@
-from opencompass.models import HuggingFacewithChatTemplate
+from opencompass.models import HuggingFaceCausalLM
+
+_meta_template = dict(
+ round=[
+ dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
+ dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
+ ],
+)
models = [
dict(
- type=HuggingFacewithChatTemplate,
+ type=HuggingFaceCausalLM,
abbr='qwen-7b-chat-hf',
path='Qwen/Qwen-7B-Chat',
+ model_kwargs=dict(device_map='auto', trust_remote_code=True),
+ tokenizer_kwargs=dict(
+ padding_side='left',
+ truncation_side='left',
+ trust_remote_code=True,
+ use_fast=False,
+ ),
+ pad_token_id=151643,
max_out_len=1024,
+ max_seq_len=8192,
batch_size=8,
+ batch_padding=True,
+ meta_template=_meta_template,
run_cfg=dict(num_gpus=1),
+ end_str='<|im_end|>',
)
]
diff --git a/configs/models/qwen/vllm_qwen1_5_14b_chat.py b/configs/models/qwen/vllm_qwen1_5_14b_chat.py
index e4736857..b5e97d3d 100644
--- a/configs/models/qwen/vllm_qwen1_5_14b_chat.py
+++ b/configs/models/qwen/vllm_qwen1_5_14b_chat.py
@@ -19,7 +19,7 @@ models = [
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
- end_str='<|im_end|>',
+ stop_words=['<|im_end|>'],
run_cfg=dict(num_gpus=2, num_procs=1),
)
]
diff --git a/configs/models/qwen/vllm_qwen1_5_72b_chat.py b/configs/models/qwen/vllm_qwen1_5_72b_chat.py
index 921e220a..2cb2ca53 100644
--- a/configs/models/qwen/vllm_qwen1_5_72b_chat.py
+++ b/configs/models/qwen/vllm_qwen1_5_72b_chat.py
@@ -19,7 +19,7 @@ models = [
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
- end_str='<|im_end|>',
+ stop_words=['<|im_end|>'],
run_cfg=dict(num_gpus=4, num_procs=1),
)
]
diff --git a/configs/models/qwen/vllm_qwen_14b_chat.py b/configs/models/qwen/vllm_qwen_14b_chat.py
index 830010ef..53e221aa 100644
--- a/configs/models/qwen/vllm_qwen_14b_chat.py
+++ b/configs/models/qwen/vllm_qwen_14b_chat.py
@@ -19,7 +19,7 @@ models = [
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
- end_str='<|im_end|>',
+ stop_words=['<|im_end|>'],
run_cfg=dict(num_gpus=4, num_procs=1),
)
]
diff --git a/configs/models/qwen/vllm_qwen_72b_chat.py b/configs/models/qwen/vllm_qwen_72b_chat.py
index cf2caa2c..fb727913 100644
--- a/configs/models/qwen/vllm_qwen_72b_chat.py
+++ b/configs/models/qwen/vllm_qwen_72b_chat.py
@@ -19,7 +19,7 @@ models = [
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
- end_str='<|im_end|>',
+ stop_words=['<|im_end|>'],
run_cfg=dict(num_gpus=4, num_procs=1),
)
]
diff --git a/configs/models/vicuna/vllm_vicuna_13b_v15_16k.py b/configs/models/vicuna/vllm_vicuna_13b_v15_16k.py
index b15b4921..e26b3ff1 100644
--- a/configs/models/vicuna/vllm_vicuna_13b_v15_16k.py
+++ b/configs/models/vicuna/vllm_vicuna_13b_v15_16k.py
@@ -17,7 +17,7 @@ models = [
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
- end_str='',
+ stop_words=[''],
run_cfg=dict(num_gpus=2, num_procs=1),
)
]
diff --git a/configs/models/vicuna/vllm_vicuna_7b_v15_16k.py b/configs/models/vicuna/vllm_vicuna_7b_v15_16k.py
index b45646f0..ca261f13 100644
--- a/configs/models/vicuna/vllm_vicuna_7b_v15_16k.py
+++ b/configs/models/vicuna/vllm_vicuna_7b_v15_16k.py
@@ -17,7 +17,7 @@ models = [
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
- end_str='',
+ stop_words=[''],
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
diff --git a/configs/models/wizardlm/vllm_wizardlm_13b_v1_2.py b/configs/models/wizardlm/vllm_wizardlm_13b_v1_2.py
index 5df42752..2e7fafeb 100644
--- a/configs/models/wizardlm/vllm_wizardlm_13b_v1_2.py
+++ b/configs/models/wizardlm/vllm_wizardlm_13b_v1_2.py
@@ -18,7 +18,7 @@ models = [
max_seq_len=2048,
batch_size=1,
generation_kwargs=dict(temperature=0),
- end_str='',
+ stop_words=[''],
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
diff --git a/configs/models/wizardlm/vllm_wizardlm_70b_v1_0.py b/configs/models/wizardlm/vllm_wizardlm_70b_v1_0.py
index 8c25d010..a722593a 100644
--- a/configs/models/wizardlm/vllm_wizardlm_70b_v1_0.py
+++ b/configs/models/wizardlm/vllm_wizardlm_70b_v1_0.py
@@ -19,7 +19,7 @@ models = [
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
- end_str='',
+ stop_words=[''],
run_cfg=dict(num_gpus=4, num_procs=1),
)
]
diff --git a/configs/models/wizardlm/vllm_wizardlm_7b_v1_0.py b/configs/models/wizardlm/vllm_wizardlm_7b_v1_0.py
index b7fe7f02..60b33c86 100644
--- a/configs/models/wizardlm/vllm_wizardlm_7b_v1_0.py
+++ b/configs/models/wizardlm/vllm_wizardlm_7b_v1_0.py
@@ -18,7 +18,7 @@ models = [
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
- end_str='',
+ stop_words=[''],
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
diff --git a/configs/models/yi/hf_yi_1_5_34b.py b/configs/models/yi/hf_yi_1_5_34b.py
new file mode 100644
index 00000000..38cd6424
--- /dev/null
+++ b/configs/models/yi/hf_yi_1_5_34b.py
@@ -0,0 +1,12 @@
+from opencompass.models import HuggingFaceBaseModel
+
+models = [
+ dict(
+ type=HuggingFaceBaseModel,
+ abbr='yi-1.5-34b-hf',
+ path='01-ai/Yi-1.5-34B',
+ max_out_len=1024,
+ batch_size=8,
+ run_cfg=dict(num_gpus=2),
+ )
+]
diff --git a/configs/models/yi/hf_yi_1_5_6b.py b/configs/models/yi/hf_yi_1_5_6b.py
new file mode 100644
index 00000000..df4fd23c
--- /dev/null
+++ b/configs/models/yi/hf_yi_1_5_6b.py
@@ -0,0 +1,12 @@
+from opencompass.models import HuggingFaceBaseModel
+
+models = [
+ dict(
+ type=HuggingFaceBaseModel,
+ abbr='yi-1.5-6b-hf',
+ path='01-ai/Yi-1.5-6B',
+ max_out_len=1024,
+ batch_size=8,
+ run_cfg=dict(num_gpus=1),
+ )
+]
diff --git a/configs/models/yi/hf_yi_1_5_9b.py b/configs/models/yi/hf_yi_1_5_9b.py
new file mode 100644
index 00000000..c38506a4
--- /dev/null
+++ b/configs/models/yi/hf_yi_1_5_9b.py
@@ -0,0 +1,12 @@
+from opencompass.models import HuggingFaceBaseModel
+
+models = [
+ dict(
+ type=HuggingFaceBaseModel,
+ abbr='yi-1.5-9b-hf',
+ path='01-ai/Yi-1.5-9B',
+ max_out_len=1024,
+ batch_size=8,
+ run_cfg=dict(num_gpus=1),
+ )
+]
diff --git a/configs/models/zephyr/vllm_zephyr_7b_beta.py b/configs/models/zephyr/vllm_zephyr_7b_beta.py
index 956ac073..499b58b6 100644
--- a/configs/models/zephyr/vllm_zephyr_7b_beta.py
+++ b/configs/models/zephyr/vllm_zephyr_7b_beta.py
@@ -17,7 +17,7 @@ models = [
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
- end_str='',
+ stop_words=[''],
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
diff --git a/configs/subjective/model_cfg.py b/configs/subjective/model_cfg.py
index cac3e9df..4b95f2a7 100644
--- a/configs/subjective/model_cfg.py
+++ b/configs/subjective/model_cfg.py
@@ -1,5 +1,5 @@
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
-from opencompass.models.openai_api import OpenAIAllesAPIN
+from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
@@ -44,7 +44,7 @@ models = [
judge_model = dict(
abbr='GPT4-Turbo',
- type=OpenAIAllesAPIN, path='gpt-4-1106-preview',
+ type=OpenAI, path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
url='',
meta_template=api_meta_template,
@@ -70,7 +70,7 @@ runner=dict(type=LocalRunner, max_num_workers=12, task=dict(type=SubjectiveEvalT
gpt4 = dict(
abbr='gpt4-turbo',
- type=OpenAIAllesAPIN,
+ type=OpenAI,
path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
diff --git a/configs/summarizers/charm_reason.py b/configs/summarizers/charm_reason.py
new file mode 100644
index 00000000..8e12e901
--- /dev/null
+++ b/configs/summarizers/charm_reason.py
@@ -0,0 +1,98 @@
+from mmengine.config import read_base
+
+with read_base():
+ from .groups.charm_reason import charm_reason_summary_groups
+
+summarizer = dict(
+ dataset_abbrs=[
+ 'charm-reason-Direct',
+ 'charm-reason-ZH-CoT',
+ 'charm-reason-EN-CoT',
+ 'charm-reason-XLT',
+ 'charm-reason-Translate-EN',
+ '',
+ 'charm-reason-Chinese_Direct',
+ 'charm-reason-Chinese_ZH-CoT',
+ 'charm-reason-Chinese_EN-CoT',
+ 'charm-reason-Chinese_XLT',
+ 'charm-reason-Chinese_Translate-EN',
+ 'charm-reason-Global_Direct',
+ 'charm-reason-Global_ZH-CoT',
+ 'charm-reason-Global_EN-CoT',
+ 'charm-reason-Global_XLT',
+ 'charm-reason-Global_Translate-EN',
+ '',
+ 'charm-reason-Chinese_Anachronisms_Judgment_Direct',
+ 'charm-reason-Chinese_Movie_and_Music_Recommendation_Direct',
+ 'charm-reason-Chinese_Natural_Language_Inference_Direct',
+ 'charm-reason-Chinese_Reading_Comprehension_Direct',
+ 'charm-reason-Chinese_Sequence_Understanding_Direct',
+ 'charm-reason-Chinese_Sport_Understanding_Direct',
+ 'charm-reason-Chinese_Time_Understanding_Direct',
+ 'charm-reason-Global_Anachronisms_Judgment_Direct',
+ 'charm-reason-Global_Movie_and_Music_Recommendation_Direct',
+ 'charm-reason-Global_Natural_Language_Inference_Direct',
+ 'charm-reason-Global_Reading_Comprehension_Direct',
+ 'charm-reason-Global_Sequence_Understanding_Direct',
+ 'charm-reason-Global_Sport_Understanding_Direct',
+ 'charm-reason-Global_Time_Understanding_Direct',
+ 'charm-reason-Chinese_Anachronisms_Judgment_ZH-CoT',
+ 'charm-reason-Chinese_Movie_and_Music_Recommendation_ZH-CoT',
+ 'charm-reason-Chinese_Natural_Language_Inference_ZH-CoT',
+ 'charm-reason-Chinese_Reading_Comprehension_ZH-CoT',
+ 'charm-reason-Chinese_Sequence_Understanding_ZH-CoT',
+ 'charm-reason-Chinese_Sport_Understanding_ZH-CoT',
+ 'charm-reason-Chinese_Time_Understanding_ZH-CoT',
+ 'charm-reason-Global_Anachronisms_Judgment_ZH-CoT',
+ 'charm-reason-Global_Movie_and_Music_Recommendation_ZH-CoT',
+ 'charm-reason-Global_Natural_Language_Inference_ZH-CoT',
+ 'charm-reason-Global_Reading_Comprehension_ZH-CoT',
+ 'charm-reason-Global_Sequence_Understanding_ZH-CoT',
+ 'charm-reason-Global_Sport_Understanding_ZH-CoT',
+ 'charm-reason-Global_Time_Understanding_ZH-CoT',
+ 'charm-reason-Chinese_Anachronisms_Judgment_EN-CoT',
+ 'charm-reason-Chinese_Movie_and_Music_Recommendation_EN-CoT',
+ 'charm-reason-Chinese_Natural_Language_Inference_EN-CoT',
+ 'charm-reason-Chinese_Reading_Comprehension_EN-CoT',
+ 'charm-reason-Chinese_Sequence_Understanding_EN-CoT',
+ 'charm-reason-Chinese_Sport_Understanding_EN-CoT',
+ 'charm-reason-Chinese_Time_Understanding_EN-CoT',
+ 'charm-reason-Global_Anachronisms_Judgment_EN-CoT',
+ 'charm-reason-Global_Movie_and_Music_Recommendation_EN-CoT',
+ 'charm-reason-Global_Natural_Language_Inference_EN-CoT',
+ 'charm-reason-Global_Reading_Comprehension_EN-CoT',
+ 'charm-reason-Global_Sequence_Understanding_EN-CoT',
+ 'charm-reason-Global_Sport_Understanding_EN-CoT',
+ 'charm-reason-Global_Time_Understanding_EN-CoT',
+ 'charm-reason-Chinese_Anachronisms_Judgment_XLT',
+ 'charm-reason-Chinese_Movie_and_Music_Recommendation_XLT',
+ 'charm-reason-Chinese_Natural_Language_Inference_XLT',
+ 'charm-reason-Chinese_Reading_Comprehension_XLT',
+ 'charm-reason-Chinese_Sequence_Understanding_XLT',
+ 'charm-reason-Chinese_Sport_Understanding_XLT',
+ 'charm-reason-Chinese_Time_Understanding_XLT',
+ 'charm-reason-Global_Anachronisms_Judgment_XLT',
+ 'charm-reason-Global_Movie_and_Music_Recommendation_XLT',
+ 'charm-reason-Global_Natural_Language_Inference_XLT',
+ 'charm-reason-Global_Reading_Comprehension_XLT',
+ 'charm-reason-Global_Sequence_Understanding_XLT',
+ 'charm-reason-Global_Sport_Understanding_XLT',
+ 'charm-reason-Global_Time_Understanding_XLT',
+ 'charm-reason-Chinese_Anachronisms_Judgment_Translate-EN',
+ 'charm-reason-Chinese_Movie_and_Music_Recommendation_Translate-EN',
+ 'charm-reason-Chinese_Natural_Language_Inference_Translate-EN',
+ 'charm-reason-Chinese_Reading_Comprehension_Translate-EN',
+ 'charm-reason-Chinese_Sequence_Understanding_Translate-EN',
+ 'charm-reason-Chinese_Sport_Understanding_Translate-EN',
+ 'charm-reason-Chinese_Time_Understanding_Translate-EN',
+ 'charm-reason-Global_Anachronisms_Judgment_Translate-EN',
+ 'charm-reason-Global_Movie_and_Music_Recommendation_Translate-EN',
+ 'charm-reason-Global_Natural_Language_Inference_Translate-EN',
+ 'charm-reason-Global_Reading_Comprehension_Translate-EN',
+ 'charm-reason-Global_Sequence_Understanding_Translate-EN',
+ 'charm-reason-Global_Sport_Understanding_Translate-EN',
+ 'charm-reason-Global_Time_Understanding_Translate-EN',
+ ],
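+    # Gather every *_summary_groups list pulled in via read_base above.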
+ summary_groups=sum(
+ [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
+)
diff --git a/configs/summarizers/chat_OC15_multi_faceted.py b/configs/summarizers/chat_OC15_multi_faceted.py
index 72dfa4cb..c6fc5812 100644
--- a/configs/summarizers/chat_OC15_multi_faceted.py
+++ b/configs/summarizers/chat_OC15_multi_faceted.py
@@ -122,10 +122,10 @@ IFEval_dataset_abbrs = [
['IFEval', 'Inst-level-loose-accuracy'],
]
-
summarizer = dict(
type=MultiFacetedSummarizer,
dataset_abbrs_list=[
+ {'name': 'overall', 'dataset_abbrs': overall_dataset_abbrs},
{'name': 'mmlu', 'dataset_abbrs': mmlu_dataset_abbrs},
{'name': 'cmmlu', 'dataset_abbrs': cmmlu_dataset_abbrs},
{'name': 'ceval', 'dataset_abbrs': ceval_dataset_abbrs},
@@ -143,7 +143,6 @@ summarizer = dict(
{'name': 'humaneval', 'dataset_abbrs': [['openai_humaneval', 'humaneval_pass@1']]},
{'name': 'GPQA', 'dataset_abbrs': [['GPQA_diamond', 'accuracy']]},
{'name': 'IFEval', 'dataset_abbrs': IFEval_dataset_abbrs},
- {'name': 'overall', 'dataset_abbrs': overall_dataset_abbrs},
],
summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
diff --git a/configs/summarizers/groups/charm_reason.py b/configs/summarizers/groups/charm_reason.py
new file mode 100644
index 00000000..3d1f4c19
--- /dev/null
+++ b/configs/summarizers/groups/charm_reason.py
@@ -0,0 +1,35 @@
+charm_tasks = [
+ 'Anachronisms_Judgment',
+ 'Movie_and_Music_Recommendation',
+ 'Natural_Language_Inference',
+ 'Reading_Comprehension',
+ 'Sequence_Understanding',
+ 'Sport_Understanding',
+ 'Time_Understanding',
+]
+regions = [
+ 'Chinese',
+ 'Global',
+]
+prompts = [
+ 'Direct',
+ 'ZH-CoT',
+ 'EN-CoT',
+ 'XLT',
+ 'Translate-EN',
+]
+
+
+charm_reason_summary_groups = []
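+# First group the tasks within each (region, prompt) pair, then aggregate the two regions per prompt.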
+for prompt in prompts:
+ for region in regions:
+ subsets = ['charm-reason-' + region + '_' + task + '_' + prompt for task in charm_tasks]
+ charm_reason_summary_groups.append({'name': 'charm-reason-' + region + '_' + prompt, 'subsets': subsets})
+
+for prompt in prompts:
+ subsets = ['charm-reason-' + region + '_' + prompt for region in regions]
+ charm_reason_summary_groups.append({'name': 'charm-reason-' + prompt, 'subsets': subsets})
+
+charm_reason_summary_groups.append(
+ {'name': 'charm-reason-CoT', 'subsets': ['charm-reason-ZH-CoT', 'charm-reason-EN-CoT']}
+)
diff --git a/configs/summarizers/groups/mathbench_2024.py b/configs/summarizers/groups/mathbench_2024.py
new file mode 100644
index 00000000..05a11e35
--- /dev/null
+++ b/configs/summarizers/groups/mathbench_2024.py
@@ -0,0 +1,26 @@
+
+mathbench_2024_wocircular_summary_groups = [
+ {'name': 'college', 'subsets': ['college-single_choice_cn', 'college-single_choice_en']},
+ {'name': 'high', 'subsets': ['high-single_choice_cn', 'high-single_choice_en']},
+ {'name': 'middle', 'subsets': ['middle-single_choice_cn', 'middle-single_choice_en']},
+ {'name': 'primary', 'subsets': ['primary-cloze_cn', 'primary-cloze_en']},
+ {'name': 'cn', 'subsets': ['college-single_choice_cn', 'high-single_choice_cn', 'middle-single_choice_cn', 'primary-cloze_cn']},
+ {'name': 'en', 'subsets': ['college-single_choice_en', 'high-single_choice_en', 'middle-single_choice_en', 'primary-cloze_en']},
+ {'name': 'a', 'subsets': ['college', 'high', 'middle', 'primary', 'arithmetic-cloze_en']},
+
+ {'name': 'college_knowledge', 'subsets': ['college_knowledge-single_choice_cn', 'college_knowledge-single_choice_en']},
+ {'name': 'high_knowledge', 'subsets': ['high_knowledge-single_choice_cn', 'high_knowledge-single_choice_en']},
+ {'name': 'middle_knowledge', 'subsets': ['middle_knowledge-single_choice_cn', 'middle_knowledge-single_choice_en']},
+ {'name': 'primary_knowledge', 'subsets': ['primary_knowledge-single_choice_cn', 'primary_knowledge-single_choice_en']},
+ {'name': 'knowledge-cn', 'subsets': ['college_knowledge-single_choice_cn', 'high_knowledge-single_choice_cn', 'middle_knowledge-single_choice_cn', 'primary_knowledge-single_choice_cn']},
+ {'name': 'knowledge-en', 'subsets': ['college_knowledge-single_choice_en', 'high_knowledge-single_choice_en', 'middle_knowledge-single_choice_en', 'primary_knowledge-single_choice_en']},
+ {'name': 't', 'subsets': ['college_knowledge', 'high_knowledge', 'middle_knowledge', 'primary_knowledge']},
+
+ {'name': 'overall', 'subsets': ['a', 't']},
+]
+
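+# Prefix every group name and subset with 'mathbench-wocircular-' so they match the dataset abbreviations.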
+for g in mathbench_2024_wocircular_summary_groups:
+ g['name'] = 'mathbench-wocircular-' + g['name']
+ g['subsets'] = ['mathbench-wocircular-' + s for s in g['subsets']]
+
+mathbench_2024_summary_groups = mathbench_2024_wocircular_summary_groups
diff --git a/docs/en/get_started/faq.md b/docs/en/get_started/faq.md
index edaa795a..570605c7 100644
--- a/docs/en/get_started/faq.md
+++ b/docs/en/get_started/faq.md
@@ -37,9 +37,9 @@ This is a complex issue that needs to be explained from both the supply and dema
The supply side refers to how many tasks are being run. A task is a combination of a model and a dataset, and it primarily depends on how many models and datasets need to be tested. Additionally, since OpenCompass splits a larger task into multiple smaller tasks, the number of data entries per sub-task (`--max-partition-size`) also affects the number of tasks. (The `--max-partition-size` is proportional to the actual number of data entries, but the relationship is not 1:1).
-The demand side refers to how many workers are running. Since OpenCompass instantiates multiple models for inference simultaneously, we use `--num-gpus` to specify how many GPUs each instance uses. Note that `--num-gpus` is a parameter specific to HuggingFace models and setting this parameter for non-HuggingFace models will not have any effect. We also use `--max-num-workers` to indicate the maximum number of instances running at the same time. Lastly, due to issues like GPU memory and insufficient load, OpenCompass also supports running multiple instances on the same GPU, which is managed by the parameter `--max-num-workers-per-gpu`. Therefore, it can be generally assumed that we will use a total of `--num-gpus` * `--max-num-workers` / `--max-num-workers-per-gpu` GPUs.
+The demand side refers to how many workers are running. Since OpenCompass instantiates multiple models for inference simultaneously, we use `--hf-num-gpus` to specify how many GPUs each instance uses. Note that `--hf-num-gpus` is a parameter specific to HuggingFace models and setting this parameter for non-HuggingFace models will not have any effect. We also use `--max-num-workers` to indicate the maximum number of instances running at the same time. Lastly, due to issues like GPU memory and insufficient load, OpenCompass also supports running multiple instances on the same GPU, which is managed by the parameter `--max-num-workers-per-gpu`. Therefore, it can be generally assumed that we will use a total of `--hf-num-gpus` * `--max-num-workers` / `--max-num-workers-per-gpu` GPUs.
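+For example, with `--hf-num-gpus 2`, `--max-num-workers 8`, and `--max-num-workers-per-gpu 2`, OpenCompass would use roughly 2 * 8 / 2 = 8 GPUs in total.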
-In summary, when tasks run slowly or the GPU load is low, we first need to check if the supply is sufficient. If not, consider reducing `--max-partition-size` to split the tasks into finer parts. Next, we need to check if the demand is sufficient. If not, consider increasing `--max-num-workers` and `--max-num-workers-per-gpu`. Generally, **we set `--num-gpus` to the minimum value that meets the demand and do not adjust it further.**
+In summary, when tasks run slowly or the GPU load is low, we first need to check if the supply is sufficient. If not, consider reducing `--max-partition-size` to split the tasks into finer parts. Next, we need to check if the demand is sufficient. If not, consider increasing `--max-num-workers` and `--max-num-workers-per-gpu`. Generally, **we set `--hf-num-gpus` to the minimum value that meets the demand and do not adjust it further.**
### How do I control the number of GPUs that OpenCompass occupies?
@@ -114,17 +114,8 @@ Hence, if users find that the number of tasks greatly exceeds the available GPUs
### How to use the downloaded huggingface models?
-If you have already download the checkpoints of the model, you can specify the local path of the model and tokenizer, and add `trust_remote_code=True` for `--model-kwargs` and `--tokenizer-kwargs`. For example
+If you have already downloaded the model checkpoints, you can specify the local path to the model. For example:
```bash
-python run.py --datasets siqa_gen winograd_ppl \
---hf-path /path/to/model \ # HuggingFace 模型地址
---tokenizer-path /path/to/model \ # HuggingFace 模型地址
---model-kwargs device_map='auto' trust_remote_code=True \ # 构造 model 的参数
---tokenizer-kwargs padding_side='left' truncation='left' use_fast=False trust_remote_code=True \ # 构造 tokenizer 的参数
---max-out-len 100 \ # 模型能接受的最大序列长度
---max-seq-len 2048 \ # 最长生成 token 数
---batch-size 8 \ # 批次大小
---no-batch-padding \ # 不打开 batch padding,通过 for loop 推理,避免精度损失
---num-gpus 1 # 所需 gpu 数
+python run.py --datasets siqa_gen winograd_ppl --hf-type base --hf-path /path/to/model
```
diff --git a/docs/en/get_started/quick_start.md b/docs/en/get_started/quick_start.md
index caba742c..9e89e4df 100644
--- a/docs/en/get_started/quick_start.md
+++ b/docs/en/get_started/quick_start.md
@@ -87,7 +87,7 @@ python run.py --datasets siqa_gen winograd_ppl \
Note that in this way, OpenCompass only evaluates one model at a time, while other ways can evaluate multiple models at once.
```{caution}
-`--num-gpus` does not stand for the actual number of GPUs to use in evaluation, but the minimum required number of GPUs for this model. [More](faq.md#how-does-opencompass-allocate-gpus)
+`--hf-num-gpus` does not stand for the actual number of GPUs to use in evaluation, but the minimum required number of GPUs for this model. [More](faq.md#how-does-opencompass-allocate-gpus)
```
:::{dropdown} More detailed example
@@ -103,7 +103,7 @@ python run.py --datasets siqa_gen winograd_ppl \
--max-out-len 100 \ # Maximum number of tokens to generate
--min-out-len 100 \ # Minimum number of tokens to generate
--batch-size 64 \ # Batch size
---num-gpus 1 # Number of GPUs required to run the model
+--hf-num-gpus 1 # Number of GPUs required to run the model
```
```{seealso}
For all HuggingFace related parameters supported by `run.py`, please read [Launching Evaluation Task](../user_guides/experimentation.md#launching-an-evaluation-task).
diff --git a/docs/en/user_guides/experimentation.md b/docs/en/user_guides/experimentation.md
index 96b3cca3..0f5575a0 100644
--- a/docs/en/user_guides/experimentation.md
+++ b/docs/en/user_guides/experimentation.md
@@ -25,15 +25,7 @@ Task Configuration (`$EXP`):
- For HuggingFace related models, users can also define a model quickly in the command line through HuggingFace parameters and then specify datasets using `--datasets DATASET1 DATASET2 ...`.
```bash
- python run.py --datasets siqa_gen winograd_ppl \
- --hf-path huggyllama/llama-7b \ # HuggingFace model path
- --model-kwargs device_map='auto' \ # Parameters for constructing the model
- --tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \ # Parameters for constructing the tokenizer
- --max-out-len 100 \ # Maximum sequence length the model can accept
- --max-seq-len 2048 \ # Maximum generated token count
- --batch-size 8 \ # Batch size
- --no-batch-padding \ # Disable batch padding and infer through a for loop to avoid accuracy loss
- --num-gpus 1 # Number of minimum required GPUs for this model
+ python run.py --datasets siqa_gen winograd_ppl --hf-type base --hf-path huggyllama/llama-7b
```
Complete HuggingFace parameter descriptions:
@@ -45,9 +37,8 @@ Task Configuration (`$EXP`):
- `--tokenizer-kwargs`: Parameters for constructing the tokenizer
- `--max-out-len`: Maximum generated token count
- `--max-seq-len`: Maximum sequence length the model can accept
- - `--no-batch-padding`: Disable batch padding and infer through a for loop to avoid accuracy loss
- `--batch-size`: Batch size
- - `--num-gpus`: Number of GPUs required to run the model. Please note that this parameter is only used to determine the number of GPUs required to run the model, and does not affect the actual number of GPUs used for the task. Refer to [Efficient Evaluation](./evaluation.md) for more details.
+ - `--hf-num-gpus`: Number of GPUs required to run the model. Please note that this parameter is only used to determine the number of GPUs required to run the model, and does not affect the actual number of GPUs used for the task. Refer to [Efficient Evaluation](./evaluation.md) for more details.
Starting Methods:
diff --git a/docs/zh_cn/get_started/faq.md b/docs/zh_cn/get_started/faq.md
index 5cc6cfbb..44e0f9ea 100644
--- a/docs/zh_cn/get_started/faq.md
+++ b/docs/zh_cn/get_started/faq.md
@@ -37,9 +37,9 @@ OpenCompass 使用称为 task (任务) 的单位处理评估请求。每个任
供给侧就是运行多少任务。任务是模型和数据集的组合,它首先取决于要测多少模型和多少数据集。另外由于 OpenCompass 会将一个较大的任务拆分成多个小任务,因此每个子任务有多少条数据 `--max-partition-size` 也会影响任务的数量。(`--max-partition-size` 与真实数据条目成正比,但并不是 1:1 的关系)。
-需求侧就是有多少 worker 在运行。由于 OpenCompass 会同时实例化多个模型去进行推理,因此我们用 `--num-gpus` 来指定每个实例使用多少 GPU。注意 `--num-gpus` 是一个 HuggingFace 模型专用的参数,非 HuggingFace 模型设置该参数是不会起作用的。同时我们使用 `--max-num-workers` 去表示最多有多少个实例在运行。最后由于 GPU 显存、负载不充分等问题,OpenCompass 也支持在同一个 GPU 上运行多个实例,这个参数是 `--max-num-workers-per-gpu`。因此可以笼统地认为,我们总共会使用 `--num-gpus` * `--max-num-workers` / `--max-num-workers-per-gpu` 个 GPU。
+需求侧就是有多少 worker 在运行。由于 OpenCompass 会同时实例化多个模型去进行推理,因此我们用 `--hf-num-gpus` 来指定每个实例使用多少 GPU。注意 `--hf-num-gpus` 是一个 HuggingFace 模型专用的参数,非 HuggingFace 模型设置该参数是不会起作用的。同时我们使用 `--max-num-workers` 去表示最多有多少个实例在运行。最后由于 GPU 显存、负载不充分等问题,OpenCompass 也支持在同一个 GPU 上运行多个实例,这个参数是 `--max-num-workers-per-gpu`。因此可以笼统地认为,我们总共会使用 `--hf-num-gpus` * `--max-num-workers` / `--max-num-workers-per-gpu` 个 GPU。
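+例如,当 `--hf-num-gpus` 为 2、`--max-num-workers` 为 8、`--max-num-workers-per-gpu` 为 2 时,总共大约会使用 2 * 8 / 2 = 8 个 GPU。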
-综上,当任务运行较慢,GPU 负载不高的时候,我们首先需要检查供给是否充足,如果不充足,可以考虑调小 `--max-partition-size` 来将任务拆分地更细;其次需要检查需求是否充足,如果不充足,可以考虑增大 `--max-num-workers` 和 `--max-num-workers-per-gpu`。一般来说,**我们会将 `--num-gpus` 设定为最小的满足需求的值,并不会再进行调整**。
+综上,当任务运行较慢,GPU 负载不高的时候,我们首先需要检查供给是否充足,如果不充足,可以考虑调小 `--max-partition-size` 来将任务拆分地更细;其次需要检查需求是否充足,如果不充足,可以考虑增大 `--max-num-workers` 和 `--max-num-workers-per-gpu`。一般来说,**我们会将 `--hf-num-gpus` 设定为最小的满足需求的值,并不会再进行调整**。
### 我如何控制 OpenCompass 占用的 GPU 数量?
@@ -114,17 +114,8 @@ OpenCompass 中的每个任务代表等待评估的特定模型和数据集部
### 如何使用本地已下好的 Huggingface 模型?
-如果您已经提前下载好 Huggingface 的模型文件,请手动指定模型路径,并在`--model-kwargs` 和 `--tokenizer-kwargs`中添加 `trust_remote_code=True`. 示例如下
+如果您已经提前下载好 Huggingface 的模型文件,请手动指定模型路径。示例如下:
```bash
-python run.py --datasets siqa_gen winograd_ppl \
---hf-path /path/to/model \ # HuggingFace 模型地址
---tokenizer-path /path/to/model \ # HuggingFace 模型地址
---model-kwargs device_map='auto' trust_remote_code=True \ # 构造 model 的参数
---tokenizer-kwargs padding_side='left' truncation='left' use_fast=False trust_remote_code=True \ # 构造 tokenizer 的参数
---max-out-len 100 \ # 模型能接受的最大序列长度
---max-seq-len 2048 \ # 最长生成 token 数
---batch-size 8 \ # 批次大小
---no-batch-padding \ # 不打开 batch padding,通过 for loop 推理,避免精度损失
---num-gpus 1 # 所需 gpu 数
+python run.py --datasets siqa_gen winograd_ppl --hf-type base --hf-path /path/to/model
```
diff --git a/docs/zh_cn/get_started/quick_start.md b/docs/zh_cn/get_started/quick_start.md
index 2e092b16..a624552d 100644
--- a/docs/zh_cn/get_started/quick_start.md
+++ b/docs/zh_cn/get_started/quick_start.md
@@ -86,7 +86,7 @@ python run.py --datasets siqa_gen winograd_ppl \
请注意,通过这种方式,OpenCompass 一次只评估一个模型,而其他方式可以一次评估多个模型。
```{caution}
-`--num-gpus` 不代表实际用于评估的 GPU 数量,而是该模型所需的最少 GPU 数量。[更多](faq.md#opencompass-如何分配-gpu)
+`--hf-num-gpus` 不代表实际用于评估的 GPU 数量,而是该模型所需的最少 GPU 数量。[更多](faq.md#opencompass-如何分配-gpu)
```
@@ -104,7 +104,7 @@ python run.py --datasets siqa_gen winograd_ppl \
--max-out-len 100 \ # 生成的最大 token 数
--min-out-len 100 \ # 生成的最小 token 数
--batch-size 64 \ # 批量大小
---num-gpus 1 # 运行模型所需的 GPU 数量
+--hf-num-gpus 1 # 运行模型所需的 GPU 数量
```
```{seealso}
有关 `run.py` 支持的所有与 HuggingFace 相关的参数,请阅读 [评测任务发起](../user_guides/experimentation.md#评测任务发起)
diff --git a/docs/zh_cn/user_guides/experimentation.md b/docs/zh_cn/user_guides/experimentation.md
index 5b781f27..f7b234f7 100644
--- a/docs/zh_cn/user_guides/experimentation.md
+++ b/docs/zh_cn/user_guides/experimentation.md
@@ -25,15 +25,7 @@ python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--deb
- 对于 HuggingFace 相关模型,用户也可以通过 HuggingFace 参数快速在命令行中定义一个模型,再通过 `--datasets DATASET1 DATASET2 ...` 定义数据集。
```bash
- python run.py --datasets siqa_gen winograd_ppl \
- --hf-path huggyllama/llama-7b \ # HuggingFace 模型地址
- --model-kwargs device_map='auto' \ # 构造 model 的参数
- --tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \ # 构造 tokenizer 的参数
- --max-out-len 100 \ # 模型能接受的最大序列长度
- --max-seq-len 2048 \ # 最长生成 token 数
- --batch-size 8 \ # 批次大小
- --no-batch-padding \ # 不打开 batch padding,通过 for loop 推理,避免精度损失
- --num-gpus 1 # 所需 gpu 数
+ python run.py --datasets siqa_gen winograd_ppl --hf-type base --hf-path huggyllama/llama-7b
```
HuggingFace 全量参数介绍如下:
@@ -45,9 +37,8 @@ python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--deb
- `--tokenizer-kwargs`: 构造 tokenizer 的参数
- `--max-out-len`: 最长生成 token 数
- `--max-seq-len`: 模型能接受的最大序列长度
- - `--no-batch-padding`: 不打开 batch padding,通过 for loop 推理,避免精度损失
- `--batch-size`: 批次大小
- - `--num-gpus`: 运行模型所需的gpu数
+ - `--hf-num-gpus`: 运行模型所需的gpu数
启动方式:
diff --git a/opencompass/__init__.py b/opencompass/__init__.py
index a6587aea..13a85f77 100644
--- a/opencompass/__init__.py
+++ b/opencompass/__init__.py
@@ -1 +1 @@
-__version__ = '0.2.4'
+__version__ = '0.2.5'
diff --git a/opencompass/cli/__init__.py b/opencompass/cli/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/opencompass/cli/main.py b/opencompass/cli/main.py
index 7defe785..f0aaa5f4 100644
--- a/opencompass/cli/main.py
+++ b/opencompass/cli/main.py
@@ -186,7 +186,8 @@ def parse_hf_args(hf_parser):
hf_parser.add_argument('--max-out-len', type=int, default=256, help='The max output length for the HuggingFace model')
hf_parser.add_argument('--min-out-len', type=int, default=1, help='The min output length for the HuggingFace model')
hf_parser.add_argument('--batch-size', type=int, default=8, help='The batch size for the HuggingFace model')
- hf_parser.add_argument('--num-gpus', type=int, default=1, help='The number of GPUs for **the HuggingFace model passed via cli**')
+ hf_parser.add_argument('--num-gpus', type=int, default=None, help='Deprecated, please use --hf-num-gpus instead')
+ hf_parser.add_argument('--hf-num-gpus', type=int, default=1, help='The number of GPUs for the HuggingFace model passed via cli')
hf_parser.add_argument('--pad-token-id', type=int, help='The pad token id for the HuggingFace model')
hf_parser.add_argument('--stop-words', nargs='+', default=[], help='The stop words for the HuggingFace model')
@@ -205,6 +206,12 @@ def parse_custom_dataset_args(custom_dataset_parser):
def main():
args = parse_args()
+
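+    # Surface a hard error for the removed flag and point users to `--hf-num-gpus`.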
+ if args.num_gpus is not None:
+ raise ValueError('The `--num-gpus` argument is deprecated, please use '
+                         '`--hf-num-gpus` to specify the number of GPUs used for '
+ 'the HuggingFace model instead.')
+
if args.dry_run:
args.debug = True
# initialize logger
diff --git a/opencompass/datasets/IFEval/__init__.py b/opencompass/datasets/IFEval/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index c0de376e..f89dfc0a 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -12,6 +12,7 @@ from .bustum import * # noqa: F401, F403
from .c3 import * # noqa: F401, F403
from .cb import * # noqa: F401, F403
from .ceval import * # noqa: F401, F403
+from .charm import * # noqa: F401, F403
from .chembench import * # noqa: F401, F403
from .chid import * # noqa: F401, F403
from .cibench import * # noqa: F401, F403
diff --git a/opencompass/datasets/charm.py b/opencompass/datasets/charm.py
new file mode 100644
index 00000000..474eb139
--- /dev/null
+++ b/opencompass/datasets/charm.py
@@ -0,0 +1,55 @@
+import json
+import os.path as osp
+import re
+
+from datasets import Dataset
+
+from opencompass.openicl.icl_evaluator import BaseEvaluator
+from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
+ TEXT_POSTPROCESSORS)
+
+from .base import BaseDataset
+
+
+@TEXT_POSTPROCESSORS.register_module('charm-reason')
+def charm_reason_postprocess(text: str) -> str:
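+    # Keep only the text after the first 'answer is ' marker, if present.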
+ ans = text
+ ans_line = ans.split('answer is ')
+ if len(ans_line) != 1:
+ ans = ans_line[1].strip()
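+    # Prefer a parenthesized option such as '(A)'; otherwise fall back to the
+    # first capital letter found.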
+ match = re.search(r'\(([A-Z])\)*', ans)
+ if match:
+ return match.group(1)
+ match = re.search(r'([A-Z])', ans)
+ if match:
+ return match.group(1)
+ return ans
+
+
+@ICL_EVALUATORS.register_module()
+class CharmReasonEvaluator(BaseEvaluator):
+
+ def score(self, predictions, references):
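+        # Exact-match accuracy over the post-processed option letters, in percent.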
+ if len(predictions) != len(references):
+            return {'error': 'preds and refs have different lengths'}
+ details = []
+ cnt = 0
+ for pred, ref in zip(predictions, references):
+ detail = {'pred': pred, 'answer': ref, 'correct': False}
+ if pred == ref:
+ cnt += 1
+ detail['correct'] = True
+ details.append(detail)
+ score = cnt / len(predictions) * 100
+ return {'score': score, 'details': details}
+
+
+@LOAD_DATASET.register_module()
+class CharmDataset(BaseDataset):
+
+ @staticmethod
+ def load(path: str, name: str):
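+        # Each CHARM task is a JSON file whose 'examples' field holds the samples.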
+ with open(osp.join(path, f'{name}.json'), 'r') as f:
+ data = json.load(f)['examples']
+ dataset = Dataset.from_list(data)
+ return dataset
diff --git a/opencompass/datasets/lawbench/evaluation_functions/__init__.py b/opencompass/datasets/lawbench/evaluation_functions/__init__.py
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/opencompass/datasets/lawbench/evaluation_functions/__init__.py
@@ -0,0 +1 @@
+
diff --git a/opencompass/datasets/lawbench/evaluation_functions/wsjd.py b/opencompass/datasets/lawbench/evaluation_functions/wsjd.py
index 231ea77e..4c0a6dad 100644
--- a/opencompass/datasets/lawbench/evaluation_functions/wsjd.py
+++ b/opencompass/datasets/lawbench/evaluation_functions/wsjd.py
@@ -1,6 +1,7 @@
import re
import os
import subprocess
+import tempfile
"""
Task: legal document grammar correction
@@ -30,21 +31,22 @@ def compute_wsjd(data_dict):
now_path = os.path.abspath(os.getcwd())
utils_path = os.path.abspath(os.path.join(__file__, '..', '..', 'utils'))
- uid = os.getuid()
os.chdir(utils_path)
- with open(f'/tmp/tmp_pred_{uid}.para', 'w') as f:
- f.writelines(preds)
- with open(f'/tmp/tmp_gold_{uid}.para', 'w') as f:
- f.writelines(golds)
- os.environ['KMP_DUPLICATE_LIB_OK']='True'
- os.system(f'python3 parallel_to_m2.py -f /tmp/tmp_pred_{uid}.para -o /tmp/tmp_pred_{uid}.para.m2 -g char')
- os.system(f'python3 parallel_to_m2.py -f /tmp/tmp_gold_{uid}.para -o /tmp/tmp_gold_{uid}.para.m2 -g char')
- output = subprocess.check_output(f"python3 compare_m2_for_evaluation.py -hyp /tmp/tmp_pred_{uid}.para.m2 -ref /tmp/tmp_gold_{uid}.para.m2", shell = True)
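+    # Unique temp files avoid collisions when several evaluations run concurrently.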
+ with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_pred_file, \
+ tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_gold_file:
+ tmp_pred_file.writelines(preds)
+ tmp_gold_file.writelines(golds)
+
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
+ os.system(f'python3 parallel_to_m2.py -f {tmp_pred_file.name} -o {tmp_pred_file.name}.m2 -g char')
+ os.system(f'python3 parallel_to_m2.py -f {tmp_gold_file.name} -o {tmp_gold_file.name}.m2 -g char')
+ output = subprocess.check_output(
+ f"python3 compare_m2_for_evaluation.py -hyp {tmp_pred_file.name}.m2 -ref {tmp_gold_file.name}.m2", shell=True)
score = float(output.decode().split('\t')[-1].split('\n')[0])
#remove prediction files
- os.remove(f'/tmp/tmp_pred_{uid}.para')
- os.remove(f'/tmp/tmp_gold_{uid}.para')
- os.remove(f'/tmp/tmp_pred_{uid}.para.m2')
- os.remove(f'/tmp/tmp_gold_{uid}.para.m2')
+ os.remove(tmp_pred_file.name)
+ os.remove(tmp_gold_file.name)
+ os.remove(f"{tmp_pred_file.name}.m2")
+ os.remove(f"{tmp_gold_file.name}.m2")
os.chdir(now_path)
return {"score": score}
diff --git a/opencompass/datasets/lawbench/utils/__init__.py b/opencompass/datasets/lawbench/utils/__init__.py
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/opencompass/datasets/lawbench/utils/__init__.py
@@ -0,0 +1 @@
+
diff --git a/opencompass/datasets/lawbench/utils/modules/__init__.py b/opencompass/datasets/lawbench/utils/modules/__init__.py
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/opencompass/datasets/lawbench/utils/modules/__init__.py
@@ -0,0 +1 @@
+
diff --git a/opencompass/datasets/mbpp.py b/opencompass/datasets/mbpp.py
index 14579405..35ea6d30 100644
--- a/opencompass/datasets/mbpp.py
+++ b/opencompass/datasets/mbpp.py
@@ -208,7 +208,8 @@ class MBPPEvaluator(BaseEvaluator):
assert self.metric in ['MBPP', 'MBPPPlus']
def score(self, predictions, references):
- assert len(predictions) == len(references)
+ if len(predictions) != len(references):
+            return {'error': 'preds and refs have different lengths'}
if self.metric == 'MBPP':
result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
@@ -286,6 +287,8 @@ class MBPPEvaluator(BaseEvaluator):
r'BEGIN\s*(.*)\s*DONE',
r'```python\s*(.*)\s*```',
r'```\s*(.*)\s*```',
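+            # Also accept generations whose closing code fence is missing.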
+ r'```python\s*(.*)\s*$',
+ r'```\s*(.*)\s*$',
r'(.*)\s*```.*',
r"\[BEGIN\]\s*'(.*)",
r'\[BEGIN\](.*)',
diff --git a/opencompass/datasets/needlebench/__init__.py b/opencompass/datasets/needlebench/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/opencompass/datasets/subjective/__init__.py b/opencompass/datasets/subjective/__init__.py
index 1c4de400..5219700c 100644
--- a/opencompass/datasets/subjective/__init__.py
+++ b/opencompass/datasets/subjective/__init__.py
@@ -1,9 +1,12 @@
from .alignbench import AlignmentBenchDataset # noqa: F401, F403
from .arena_hard import ArenaHardDataset # noqa: F401, F403
from .compass_arena import CompassArenaDataset # noqa: F401, F403
+from .compassbench import CompassBenchDataset # noqa: F401, F403
from .corev2 import Corev2Dataset # noqa: F401, F403
from .creationbench import CreationBenchDataset # noqa: F401, F403
+from .fofo import FofoDataset # noqa: F401, F403
from .information_retrival import IRDataset # noqa: F401, F403
from .mtbench import MTBenchDataset # noqa: F401, F403
+from .mtbench101 import MTBench101Dataset # noqa: F401, F403
from .multiround import MultiroundDataset # noqa: F401, F403
from .subjective_cmp import SubjectiveCmpDataset # noqa: F401, F403
diff --git a/opencompass/datasets/subjective/compassbench.py b/opencompass/datasets/subjective/compassbench.py
new file mode 100644
index 00000000..fba999cf
--- /dev/null
+++ b/opencompass/datasets/subjective/compassbench.py
@@ -0,0 +1,102 @@
+# flake8: noqa
+import json
+import os.path as osp
+
+from datasets import Dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from ..base import BaseDataset
+
+base_prompt_zh = """请根据 用户问题 以及 相应的两个回答,判断哪一个回答更好。
+[用户问题]
+{question}
+
+[回答1开始]
+{prediction}
+[回答1结束]
+
+[回答2开始]
+{prediction2}
+[回答2结束]
+
+请先对两个回答进行评价,最后在以下 3 个选项中做出选择:
+A. 回答1更好
+B. 回答2更好
+C. 回答1、2平局
+
+如果你认为回答1更好,你的输出应形如:
+评价1:回答1 xxx
+评价2:回答2 xxx
+选择:[[A]]
+
+如果你认为回答2更好,你的输出应形如:
+评价1:回答1 xxx
+评价2:回答2 xxx
+选择:[[B]]
+
+如果你认为回答1、2打成平手,你的输出应形如:
+评价1:回答1 xxx
+评价2:回答2 xxx
+选择:[[C]]
+"""
+
+base_prompt_en = """Please evaluate the two responses based on the user's question and then choose from the following three options:
+A. Response 1 is better
+B. Response 2 is better
+C. Both responses are equal
+
+[user's question]
+{question}
+
+[Response 1 Start]
+{prediction}
+[Response 1 End]
+
+[Response 2 Start]
+{prediction2}
+[Response 2 End]
+
+If you believe that Response 1 is better, your output should be formatted as follows:
+Evaluation 1: Response 1 xxx
+Evaluation 2: Response 2 xxx
+Choice: [[A]]
+
+If you believe that Response 2 is better, your output should be formatted as follows:
+Evaluation 1: Response 1 xxx
+Evaluation 2: Response 2 xxx
+Choice: [[B]]
+
+If you believe that both responses are equally good, your output should be formatted as follows:
+Evaluation 1: Response 1 xxx
+Evaluation 2: Response 2 xxx
+Choice: [[C]]
+"""
+
+
+@LOAD_DATASET.register_module()
+class CompassBenchDataset(BaseDataset):
+
+ def load(self, path: str, name: str):
+ filename = osp.join(path, f'{name}.json')
+ raw_data = []
+ with open(filename, 'r', encoding='utf-8') as f:
+ json_data = json.load(f)
+ for problem in json_data:
+ question = problem['question']
+ lan = problem['language']
+ others = problem['others']
+ judge_prompt = base_prompt_zh if lan == 'zh' else base_prompt_en
+ judge_prompt = judge_prompt.replace('{question}', question)
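+                # Only {question} is filled in here; {prediction} and {prediction2}
+                # remain as placeholders for the judge stage.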
+ raw_data.append({
+ 'question': question,
+ 'judge_prompt': judge_prompt,
+ 'judge': {
+ 'lan': lan,
+ 'level': others['level'],
+ 'category': problem['category'],
+ 'question': question
+ }
+ })
+ dataset = Dataset.from_list(raw_data)
+ return dataset
diff --git a/opencompass/datasets/subjective/fofo.py b/opencompass/datasets/subjective/fofo.py
new file mode 100644
index 00000000..ad92fffa
--- /dev/null
+++ b/opencompass/datasets/subjective/fofo.py
@@ -0,0 +1,36 @@
+# flake8: noqa
+import json
+import os.path as osp
+
+from datasets import Dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from ..base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class FofoDataset(BaseDataset):
+
+ def load(self, path: str, name: str):
+ filename = osp.join(path, f'{name}.json')
+ raw_data = []
+ with open(filename, 'r', encoding='utf-8') as f:
+ json_data = json.load(f)
+ for problem in json_data:
+ question = problem['instruction']
+ lan = 'cn' if 'cn' in name else 'en'
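+                # A config name containing 'cn' marks the Chinese split; otherwise English.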
+ raw_data.append({
+ 'question': question,
+ 'judge': {
+ 'lan': lan,
+ 'id': problem['id'],
+ 'domain': problem['domain'],
+ 'sub_domain': problem['sub_domain'],
+ 'format': problem['format'],
+ 'format_type': problem['format_type'],
+ 'question': question
+ }
+ })
+ dataset = Dataset.from_list(raw_data)
+ return dataset
diff --git a/opencompass/datasets/subjective/mtbench101.py b/opencompass/datasets/subjective/mtbench101.py
new file mode 100644
index 00000000..70d1caed
--- /dev/null
+++ b/opencompass/datasets/subjective/mtbench101.py
@@ -0,0 +1,327 @@
+# flake8: noqa: E501
+import copy
+import json
+import os.path as osp
+import re
+from typing import Optional
+
+from datasets import Dataset, DatasetDict
+
+from opencompass.registry import LOAD_DATASET
+
+from ..base import BaseDataset
+
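+# For these tasks the first round only provides context and is not scored itself.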
+skip_first_tasks = ['FR', 'CR', 'AR', 'SA', 'SC', 'CM']
+
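+# These tasks are judged against a reference solution embedded in the judge prompt.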
+need_ref_tasks = ['MR', 'GR']
+
+judge = "Please act as an impartial judge follow this instructions: In the following conversations, only the response of the 'assistant' in the last round of conversations is the output of the large language model (AI assistant) that needs to be evaluated. Please act as an impartial judge and score this response on a scale of 1 to 10, where 1 indicates that the response completely fails to meet the criteria, and 10 indicates that the response perfectly meets all the evaluation criteria.\
+ Note that only the response of the 'assistant' in the LAST ROUND of conversations is the output of the large language model (the AI assistant) that needs to be evaluated; the previous conversations is the groud truth history which do NOT need to be evaluated."
+
+score_format = "\n\n Note that only the response of the 'assistant' in the LAST ROUND of conversations is the output of the large language model (the AI assistant) that needs to be evaluated!! You must provide your explanation. After providing your explanation, please show the score by strictly following this format: 'Rating: [[score]]', for example 'Rating: [[6]]'. The DIALOGUE that needs to be judged is in this format: \n *** \n DIALOGUE \n ***"
+
+eval_CM = "The capacity of a large language model to recall and utilize previously mentioned information from earlier in the conversation is a critical indicator of its conversational memory abilities. This competency is essential for maintaining context and coherence throughout an extended dialogue. The performance of the AI assistant should be evaluated based on its ability to consistently reference and integrate past information into current responses. The evaluation criteria are as follows:\n\
+\n\
+1.Analyze whether the AI assistant appropriately recalls relevant details from earlier parts of the conversation when responding to 'Human's inquiries or comments.\n\
+2.Assess the AI assistant's ability to integrate the remembered information into its current responses in a way that is coherent and adds value to the dialogue.\n\
+3.Examine the AI assistant's consistency in maintaining the context established by previous dialogue exchanges throughout the entire conversation.\n\
+4.Evaluate the effectiveness of the AI assistant's memory recall in facilitating a smooth and logical progression of the conversation, avoiding repetitive or contradictory statements.\n\
+Scoring Guidelines:\n\
+\n\
+1-3 points: The AI assistant demonstrates poor recall of previous conversation details, leading to inconsistent or contradictory responses, and fails to maintain the dialogue's context, resulting in a disjointed or unclear conversation flow.\n\
+4-6 points: The AI assistant exhibits a moderate ability to remember past information, but its integration into the conversation is sporadic or partially effective, leading to a conversation that lacks full coherence or occasionally disregards established context.\n\
+7-9 points: The AI assistant reliably recalls and utilizes earlier information, contributing to a coherent dialogue that respects the conversation's context, with minor lapses in memory that do not significantly disrupt the conversation flow.\n\
+10 points: The AI assistant demonstrates exceptional memory recall, seamlessly weaving past details into current responses to enrich the dialogue and preserve context, ensuring a smooth and logical conversation that progresses naturally.\n\
+When scoring, consider the significance of the AI assistant's memory recall to the overall quality of the conversation. If recalling past information was not necessary for a particular exchange, the AI assistant's failure to reference earlier dialogue should not impact the score negatively. However, if recalling previous information enhances the dialogue's clarity, relevance, and continuity, this should be regarded as a positive attribute of the language model's performance.\n\
+\n\
+Please provide a rationale for your score, specifically addressing how the AI assistant's memory recall and the use of past information align with the evaluation criteria and contribute to the conversation's effectiveness."
+
+eval_SI = "\n We aim to specifically evaluate the command-following ability of the large language model (AI assistant). The criteria for evaluation are as follows:\
+\n \
+1. In the first round, 'Human' will present a task request without providing details about what needs to be done. If the AI Assistant being evaluated generates a response for the first round, it should ask 'Human' for the specific details of the task required or wait for 'Human' to provide specific details of the required tasks, rather than directly attempting to answer the task.\
+2. Starting from the second round, 'Human' will provide the specific content of what needs to be carried out for the task, without repeating the task requirement. The AI Assistant being evaluated should then provide correct and specific answers directly addressing the task requirements.\
+\n \
+Please rate the AI assistant's response using a 1 to 10 scale based on the following guidelines:\
+\n \
+- 1-3 points: The AI assistant failed to understand the task request and neither asked relevant questions nor provided information related to the task.\
+- 4-6 points: The AI assistant understood some aspects of the task request but the response could be more specific or relevant.\
+- 7-9 points: The AI assistant provided a useful response that was mostly correct and targeted, even though there may be minor oversights.\
+- 10 points: The AI assistant demonstrated a perfect understanding of the task requirements and provided a comprehensive and accurate answer, fully meeting 'Human's expectations.\
+\n \
+Additionally, please provide a brief justification for the score given, particularly highlighting how the AI assistant's response aligns with or deviates from the above criteria. This will help us understand the performance of the AI assistant and take steps for improvement if necessary."
+
+eval_CR = "\nWe aim to specifically evaluate the paraphrasing ability of the large language model (AI assistant). The criteria for evaluation are as follows:\n \
+\n \
+1. The content of the AI assistant's rewritten response must maintain the same main idea as the Assistant's response in the first round.\n \
+2. The rewritten content must comply with the specific rewriting requirements set forth by the Human in the current round.\n \
+\n \
+Scoring Guidelines:\n \
+\n \
+- 1-3 points: The rewritten response significantly deviates from the original main idea or fails to meet the rewriting requirements.\n \
+- 4-6 points: The rewritten response captures the original main idea but only partially meets the rewriting requirements or lacks fluency/coherence.\n \
+- 7-9 points: The rewritten response maintains the original main idea and satisfies most of the rewriting requirements with minor discrepancies or stylistic issues.\n \
+- 10 points: The rewritten response perfectly preserves the original main idea and fulfills all of the rewriting requirements set by Human, exhibiting a seamless and natural integration of the required changes.\n \
+\n \
+Please provide a brief justification for the score you give and present your score. Please judge the response and Do Not answer the question in the dialogue directly."
+
+eval_FR = "\nWe aim to specifically evaluate the paraphrasing ability of the large language model (AI assistant). The criteria for evaluation are as follows:\n \
+\n \
+1. The content of the AI assistant's rewritten response must maintain the same main idea as the Assistant's response in the first round.\n \
+2. The rewritten content must comply with the specific rewriting requirements set forth by the Human in the current round.\n \
+\n \
+Scoring Guidelines:\n \
+\n \
+- 1-3 points: The rewritten response significantly deviates from the original main idea or fails to meet the rewriting requirements.\n \
+- 4-6 points: The rewritten response captures the original main idea but only partially meets the rewriting requirements or lacks fluency/coherence.\n \
+- 7-9 points: The rewritten response maintains the original main idea and satisfies most of the rewriting requirements with minor discrepancies or stylistic issues.\n \
+- 10 points: The rewritten response perfectly preserves the original main idea and fulfills all of the rewriting requirements set by Human, exhibiting a seamless and natural integration of the required changes.\n \
+\n \
+Please provide a brief justification for the score you give and present your score. Please judge the response and Do Not answer the question in the dialogue directly."
+
+eval_CC = "\nThe AI assistant's capability to resist interference will be measured against these criteria:\
+\n\n \
+1. The AI assistant's response must directly correspond to the content of the Human's question in the current round, providing true and accurate information.\n\
+2. The response must not be influenced by the question and answer pattern from the previous dialogue, ensuring that it remains relevant and focused on the current question only.\
+\n\n \
+Scoring Guidelines:\
+\n\n \
+- 1-3 points: The AI assistant's response is largely influenced by previous interactions, fails to address the current question accurately, or provides false information.\n\
+- 4-6 points: The AI assistant's response shows some resistance to interference but includes irrelevant details from previous dialogues or only partially addresses the current question.\n\
+- 7-9 points: The AI assistant's response is mostly resistant to interference and accurately addresses the current question, with only minor relevancies to previous interactions.\n\
+- 10 points: The AI assistant's response is completely free from interference, focusing solely on the current question and providing a response that is both accurate and wholly relevant.\
+\n\n \
+Please provide a brief justification for the score you give, focusing on how well the AI assistant's response aligns with the two evaluation criteria. "
+
+eval_TS = "\nThe AI assistant's ability to handle shifts in conversation topics is crucial for maintaining relevance and adaptability during a dialogue. This skill is particularly important when 'Human' introduces a new topic or changes the subject abruptly. The performance of the AI assistant should be evaluated on its capacity to smoothly transition between topics without being inappropriately influenced by previous dialogue content. The evaluation criteria are as follows:\n \
+\n \
+1. Identify whether the AI assistant can detect and acknowledge the change in topic introduced by 'Human' without reverting back to or becoming stuck on the previous subject.\n \
+2. Evaluate the relevance of the AI assistant's responses to the new topic, ensuring they are not improperly influenced or colored by the preceding dialogue rounds.\n \
+3. Assess the AI assistant's ability to provide coherent and contextually appropriate responses to the new subject, displaying an understanding of the conversation's evolving nature.\n \
+4. Consider the AI assistant's proficiency in offering complete and insightful answers to the new topic, which demonstrate a clear break from past conversation threads.\n \
+Scoring Guidelines:\n \
+\n \
+1-3 points: The AI assistant struggles with topic transitions, frequently reverting to or being influenced by the previous topic, resulting in irrelevant or confused responses to the new subject matter.\n \
+4-6 points: The AI assistant shows a moderate ability to adapt to new topics, but occasionally exhibits lingering effects from earlier discussions, leading to partially relevant or less focused responses to the topic shifts.\n \
+7-9 points: The AI assistant adapts to topic changes well, with minimal reference to or influence from prior topics, providing responses that are largely relevant and well-aligned with the new conversation direction.\n \
+10 points: The AI assistant excels at adapting to topic shifts, seamlessly transitioning to and fully engaging with the new subject matter without any irrelevant carryover from previous dialogue content.\n \
+When scoring, consider the smoothness of the AI assistant's transition between topics and its ability to engage with the new subject matter independently of the prior conversation. If a topic shift is not present or is so subtle that continuity with previous content is warranted, the AI assistant's ability to maintain coherence should not negatively affect the score. However, if a clear topic shift occurs and the AI assistant handles it deftly, providing relevant and insightful input on the new topic, this should be recognized as a positive aspect of its conversational capabilities.\n \
+\n \
+Please provide a rationale for your score, specifically addressing the effectiveness of the AI assistant's topic transition and its relevance to the new subject matter in accordance with the evaluation criteria."
+
+eval_AR = "The AI assistant's understanding of references is essential for maintaining a coherent dialogue. The following criteria should be used to evaluate its performance:\n\
+\n \
+1. The AI assistant's response must demonstrate a correct understanding of referential information from questions asked by 'Human,' which typically relate to content from the previous dialogue. Ideally, the AI should explicitly acknowledge or clarify these references in its reply.\n\
+2. The response from the AI assistant should be consistent with the content of the 'Human's question in the current round, providing true and accurate information, free from misunderstandings or inaccuracies related to the references.\n\
+\n \
+Scoring Guidelines:\n\
+\n\
+- 1-3 points: The AI assistant fails to recognize or correctly interpret the referential information, leading to responses that are either inaccurate or unrelated to the previous content.\n\
+- 4-6 points: The AI assistant shows a partial understanding of references, but the response might include some inaccuracies or fail to fully utilize the referential information.\n\
+- 7-9 points: The AI assistant's response indicates a good understanding of the references, with only slight inaccuracies or omissions in the connection to the previous dialogue.\n\
+- 10 points: The AI assistant demonstrates excellent understanding and use of referential information, perfectly aligning its response with the previous content and the current question accurately and precisely.\n\
+\n \
+In addition to the score, please provide an explanation that specifically addresses how the AI assistant's response demonstrates its ability or inability to understand and use referential information in accordance with the criteria above. "
+
+eval_IC = "The AI assistant’s ability to engage in a productive dialogue is often enhanced by its use of counter-questions, particularly when dealing with incomplete or vague queries. The assistant's performance should be assessed based on its ability to recognize when a rhetorical question is necessary and to use it effectively to clarify the 'Human's intent. The evaluation criteria are as follows:\n \
+\n \
+1. Assess whether the question posed by 'Human' contains ambiguities or lacks specific details that would require the AI assistant to use a counter-questions for clarification.\n \
+2. If the question does require clarification through a counter-question, evaluate how the AI assistant employs this strategy to address the ambiguities or missing information in 'Human's query.\n \
+3. Once 'Human' provides the necessary conditions or clarifies the question, evaluate whether the AI assistant offers a true and detailed response that fully addresses the clarified query.\n \
+\n \
+Scoring Guidelines:\n \
+\n \
+- 1-3 points: The AI assistant fails to identify the need for a rhetorical question when necessary, or it employs rhetorical questions ineffectively, leading to answers that do not align with 'Human's query, or lack the detail required to fully clarify the question.\n \
+- 4-6 points: The AI assistant recognizes situations requiring rhetorical questions but uses them suboptimally, only partially addressing the query's deficiencies. Subsequent answers may lack full detail or accuracy even after the query is clarified.\n \
+- 7-9 points: The AI assistant effectively uses rhetorical questions to pinpoint and address the missing or unclear elements in 'Human's query, and provides a largely accurate and detailed response to the perfected question.\n \
+- 10 points: The AI assistant expertly discerns when to use rhetorical questions and employs them precisely to address the ambiguities or missing information in the query. Once clarified, it responds with detailed, accurate information that perfectly satisfies the question.\n \
+\n \
+When scoring, consider whether the use of a counter-question was essential and whether the AI assistant's decision to use or not use one improved the clarity and outcome of the dialogue. If a counter-question was not necessary, and the AI assistant refrained from using one, this should not negatively affect the score. However, if the use of a rhetorical question or follow-up query by the AI assistant brought clarity to an otherwise ambiguous situation, this should be seen as a positive contribution to the dialogue.\n \
+\n \
+Please provide a rationale for your score, specifically addressing how the AI assistant's use or omission of rhetorical questions and its responses align with the evaluation criteria and the necessity of such an approach for each particular query."
+
+eval_SC = "We aim to specifically evaluate the self-correction ability of the large language model (AI assistant) when the 'Human' identifies an error in the AI assistant's initial response. The criteria for evaluation are as follows:\n\
+\n\
+1. Upon 'Human' pointing out a potential mistake, the AI assistant must thoroughly assess its previous response and engage with 'Human's' observation.\n\
+2. If the 'Human' has correctly identified an error, the AI assistant must acknowledge the mistake, correct it, and provide an updated and accurate response.\n\
+\n\
+Scoring Guidelines:\n\
+\n\
+- 1-3 points: The AI assistant fails to recognize or adequately address the error identified by 'Human,' and does not make the necessary corrections to its response.\n\
+- 4-6 points: The AI assistant recognizes the error identified by 'Human' but may only partially correct the mistake or provide an incomplete updated response.\n\
+- 7-9 points: The AI assistant correctly identifies and acknowledges the error, making a substantial correction to its response and effectively updating the answer.\n\
+- 10 points: The AI assistant exhibits exceptional self-correction ability, promptly acknowledging the error and providing a comprehensive and precise updated response.\n\
+\n\
+In addition to scoring, please provide a justification for your assessment, focusing on how effectively the AI assistant's reaction to 'Human's' identified error demonstrates its ability to self-correct and address the criteria outlined above."
+
+eval_SA = "We aim to specifically evaluate the ability of the large language model (AI assistant) to maintain confidence in its original response when challenged by 'Human', provided that the challenge is incorrect. The criteria for evaluation are as follows:\n\
+\n\
+1. When 'Human' questions the content of the AI assistant's response, the AI assistant must engage with 'Human's' query and assess the validity of the challenge.\n\
+2. If 'Human's challenge is based on incorrect information or assumptions, the AI assistant must confidently maintain its original answer and provide a clear and reasoned explanation to justify its stance.\n\
+\n\
+Scoring Guidelines:\n\
+\n\
+- 1-3 points: The AI assistant either does not engage with 'Human's' challenge appropriately or changes its original response without proper justification, even when 'Human's' challenge is incorrect.\n\
+- 4-6 points: The AI assistant engages with the challenge but may lack confidence in its response, providing a weak justification for maintaining its original answer.\n\
+- 7-9 points: The AI assistant demonstrates the ability to appropriately assess the incorrect challenge and maintains its original response with a clear and well-supported justification.\n\
+- 10 points: The AI assistant exhibits excellent ability to maintain confidence in its original response, providing a strong and convincing explanation that effectively addresses 'Human's' incorrect challenge.\n\
+\n\
+In addition to scoring, please provide a justification for your assessment, focusing on how the AI assistant's reaction to the challenge reflects its understanding and confidence in its original response, and how well it meets the criteria outlined above."
+
+eval_PI = "The AI assistant's interactivity, represented by its ability to proactively initiate and sustain engaging dialogues with 'Human', is a key aspect of a dynamic conversational experience. The model should not only respond passively but should also contribute to the momentum of the conversation by introducing questions, suggesting topics, or encouraging further discourse. The performance of the AI assistant should be evaluated on its capacity for active engagement and conversational leadership. The evaluation criteria are as follows:\n\
+\n\
+1. Observe the AI assistant's initiative in contributing to the conversation beyond providing direct answers, including its ability to ask relevant follow-up questions or propose new topics.\n\
+2. Assess the AI assistant's aptness in maintaining the flow of the conversation, including how well it encourages 'Human' to provide more information or share their thoughts.\n\
+3. Examine the appropriateness of the AI assistant's interactive elements in the context of the dialogue, ensuring they foster a natural and engaging conversation rather than derailing it.\n\
+4. Evaluate the AI assistant's responsiveness to 'Human's input while being proactive, ensuring that it listens and adapts to the conversation's direction as set by 'Human'.\n\
+Scoring Guidelines:\n\
+\n\
+1-3 points: The AI assistant exhibits poor interactivity, often providing minimal responses without encouraging further dialogue, or its attempts at interactivity are misplaced and hamper the natural flow of conversation.\n\
+4-6 points: The AI assistant demonstrates moderate interactivity; it occasionally asks questions or suggests new topics but may not consistently maintain the conversational momentum or fully engage 'Human'.\n\
+7-9 points: The AI assistant is highly interactive, regularly using questions and topics to keep the conversation going, while mostly preserving relevancy and a natural exchange with 'Human'.\n\
+10 points: The AI assistant excels at interactivity, skillfully using questions and dialogue prompts to enrich the conversation, actively engaging 'Human', and enhancing the overall dialogue experience without dominating the conversation.\n\
+When scoring, consider the balance the AI assistant strikes between guiding the conversation and allowing 'Human' to steer the dialogue. The AI assistant's interactivity should feel like a natural extension of the conversation, not forced or distracting from 'Human's intent. If the conversation benefits from the AI assistant's interactive elements, leading to a richer dialogue, this should be reflected in a higher score.\n\
+\n\
+Please provide a rationale for your score, specifically addressing how the AI assistant's proactive contributions and interactive strategies align with the evaluation criteria and enrich the conversational experience."
+
+eval_MR = "The AI assistant's mathematical reasoning capabilities are vital for accurately solving and explaining mathematical problems posed by 'Human'. The model should leverage both the conditions provided in the current question and any relevant information from the historical dialogue. The evaluation of the AI assistant's performance will be based on the correctness of its answers and the clarity of its reasoning process. The evaluation criteria are as follows:\n\
+\n\
+1. Verify the accuracy of the AI assistant's answer against the provided reference solution in the format '### reference solution ###' for the mathematical problem.\n\
+2. Assess the completeness and step-by-step clarity of the AI assistant's reasoning process, ensuring it is logical and follows mathematical principles.\n\
+3. Evaluate the AI assistant's ability to incorporate any relevant historical dialogue information that influences the problem-solving process or the solution itself.\n\
+4. Appraise the AI assistant's communication of the solution in a manner that is understandable and instructive to 'Human', potentially aiding their learning or comprehension.\n\
+Scoring Guidelines:\n\
+\n\
+1-3 points: The AI assistant provides incorrect answers and/or fails to offer a clear and logical reasoning process, missing key steps or providing explanations that do not align with mathematical standards.\n\
+4-6 points: The AI assistant's answer is partially correct with minor errors in the reasoning process, which may lack detail or clarity in some steps, but generally follows mathematical principles.\n\
+7-9 points: The AI assistant gives correct answers with a reasoning process that includes most necessary steps and details, facilitating a good understanding of the solution.\n\
+10 points: The AI assistant provides a completely correct answer accompanied by a detailed and meticulously clear step-by-step reasoning process that is fully aligned with mathematical principles and enhances 'Human's understanding.\n\
+When scoring, focus on the precision of the AI assistant's answer and the extent to which the reasoning process is elaborated. The assistant's ability to effectively communicate complex mathematical solutions in a manner that supports 'Human's learning is indicative of high performance. If the reasoning process is exemplary and the answer is accurate, this should be reflected in a top score.\n\
+\n\
+Please provide a rationale for your score, specifically addressing the accuracy of the AI assistant's answer and the quality of the mathematical reasoning process, considering the evaluation criteria and the comparison with the reference solution."
+
+eval_GR = "The AI assistant's general reasoning capabilities are crucial for accurately addressing and explaining a wide range of problems posed by 'Human'. The evaluation of the AI assistant's performance will be based on the correctness of its answers and the cogency of its reasoning process. The evaluation criteria are as follows:\n\
+\n\
+1. Verify the accuracy of the AI assistant's answer against the provided reference solution in the format '### reference solution ###' for the specific problem.\n\
+2. Assess the completeness and step-by-step clarity of the AI assistant's reasoning process, ensuring it is logical and follows the principles of sound reasoning.\n\
+3. Evaluate the AI assistant's ability to integrate any relevant historical dialogue information that influences the problem-solving process or the solution itself.\n\
+4. Appraise the AI assistant's communication of the solution in a manner that is understandable and instructive to 'Human', potentially aiding their learning or comprehension.\n\
+Scoring Guidelines:\n\
+\n\
+1-3 points: The AI assistant provides incorrect answers and/or fails to offer a clear and logical reasoning process, missing key steps or providing explanations that do not adhere to standards of sound reasoning.\n\
+4-6 points: The AI assistant's answer is partially correct with minor errors in the reasoning process, which may lack detail or clarity in some steps but generally follows sound reasoning principles.\n\
+7-9 points: The AI assistant gives correct answers with a well-articulated reasoning process that includes most necessary steps and details, facilitating a good understanding of the solution.\n\
+10 points: The AI assistant provides a completely correct answer accompanied by a detailed and meticulously clear step-by-step reasoning process that is fully aligned with sound reasoning principles and enhances 'Human's understanding.\n\
+When scoring, focus on the precision of the AI assistant's answer and the extent to which the reasoning process is elaborated. The assistant's ability to effectively communicate complex solutions in a manner that supports 'Human's learning is indicative of high performance. If the reasoning process is exemplary and the answer is accurate, this should be reflected in a top score.\n\
+\n\
+Please provide a rationale for your score, specifically addressing the accuracy of the AI assistant's answer and the quality of the general reasoning process, considering the evaluation criteria and the comparison with the reference solution."
+
+unique_prompt = {
+ 'CM': eval_CM,
+ 'SI': eval_SI,
+ 'AR': eval_AR,
+ 'TS': eval_TS,
+ 'CC': eval_CC,
+ 'CR': eval_CR,
+ 'FR': eval_FR,
+ 'SC': eval_SC,
+ 'SA': eval_SA,
+ 'MR': eval_MR,
+ 'GR': eval_GR,
+ 'IC': eval_IC,
+ 'PI': eval_PI,
+}
+
+
+def eval_prompt_construct(task, ref_answer, history):
+
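+    # Both branches share the same system prompt; reference-based tasks (MR, GR)
+    # additionally embed the gold solution in the template for the judge.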
+ if task in need_ref_tasks:
+ system_prompt = judge + unique_prompt[task] + score_format
+        prompt_template = 'The dialogue that needs to be judged is: \n *** \n {history} {prediction} \n ***\n\n\
+ The reference solution is: \n ### \n {ref_answer} \n ###\n\n'.format(
+ history=history, prediction='{prediction}', ref_answer=ref_answer)
+
+ else:
+ system_prompt = judge + unique_prompt[task] + score_format
+        prompt_template = 'The dialogue that needs to be judged is: \n *** \n {history} {prediction} \n ***'.format(
+ history=history, prediction='{prediction}')
+
+ return system_prompt, prompt_template
+
+
+def add_format(question, answer):
+ history = [dict(role='user', content=question)]
+ if answer:
+ history += [dict(role='assistant', content=answer)]
+ return history
+
+
+@LOAD_DATASET.register_module()
+class MTBench101Dataset(BaseDataset):
+
+ def load(self, path: str, name: str):
+        filename = osp.join(path, f'{name}.jsonl')
+ raw_data = []
+
+        with open(filename, 'r', encoding='utf-8') as f:
+            conversations = [json.loads(line) for line in f]
+
+ for dialogue in conversations:
+ multi_id = dialogue['id']
+ task = dialogue['task']
+            skip_first = task in skip_first_tasks
+
+ current_multi_id = None
+ pre_dia = []
+ history = ''
+ dia_list = []
+ for turn_index, turn in enumerate(dialogue['history']):
+ human = turn['user']
+ assistant = turn['bot']
+ turn_id = str(turn_index + 1)
+
+ if current_multi_id is not None and multi_id != current_multi_id:
+ pre_dia = []
+ history = ''
+
+ current_multi_id = multi_id
+
+ if skip_first and turn_index == 0:
+ pre_dia = add_format(question=human, answer=assistant)
+ history = '\n\n Human: ' + human + '\n\nAssistant: ' + assistant
+ continue
+
+ history = history + '\n\n Human: ' + human + '\n\nAssistant: '
+ pre_dia += add_format(question=human, answer=assistant)
+
+ pre_dia_copy = copy.deepcopy(pre_dia)
+
+ system_prompt, prompt_template = eval_prompt_construct(
+ task, pre_dia, history)
+
+ raw_data.append({
+ 'dialogue': pre_dia_copy,
+ 'task': task,
+ 'multi_id': current_multi_id,
+ 'turn_id': turn_id,
+ 'system_prompt': system_prompt,
+ 'prompt_template': prompt_template,
+ 'judge': {
+ 'task': task,
+ 'multi_id': current_multi_id,
+ 'turn_id': turn_id,
+ }
+ })
+ history = history + assistant
+
+ dataset = Dataset.from_list(raw_data)
+ return dataset
diff --git a/opencompass/models/__init__.py b/opencompass/models/__init__.py
index 4b35b160..f41ff5d3 100644
--- a/opencompass/models/__init__.py
+++ b/opencompass/models/__init__.py
@@ -18,7 +18,7 @@ from .huggingface_above_v4_33 import HuggingFacewithChatTemplate # noqa: F401
from .hunyuan_api import Hunyuan # noqa: F401
from .intern_model import InternLM # noqa: F401
from .krgpt_api import KrGPT # noqa: F401
-from .lightllm_api import LightllmAPI # noqa: F401
+from .lightllm_api import LightllmAPI, LightllmChatAPI # noqa: F401
from .llama2 import Llama2, Llama2Chat # noqa: F401
from .lmdeploy_pytorch import LmdeployPytorchModel # noqa: F401
from .lmdeploy_tis import LmdeployTisModel # noqa: F401
diff --git a/opencompass/models/huggingface_above_v4_33.py b/opencompass/models/huggingface_above_v4_33.py
index 1f12d84f..ae13ae10 100644
--- a/opencompass/models/huggingface_above_v4_33.py
+++ b/opencompass/models/huggingface_above_v4_33.py
@@ -84,7 +84,6 @@ def _convert_chat_messages(inputs, merge_role=True):
messages = merged_messages
outputs.append(messages)
- print(messages)
return outputs
diff --git a/opencompass/models/lightllm_api.py b/opencompass/models/lightllm_api.py
index b0062525..2ac04732 100644
--- a/opencompass/models/lightllm_api.py
+++ b/opencompass/models/lightllm_api.py
@@ -1,16 +1,20 @@
import json
import re
+import time
from concurrent.futures import ThreadPoolExecutor
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union
import numpy as np
import requests
from opencompass.registry import MODELS
from opencompass.utils.logging import get_logger
+from opencompass.utils.prompt import PromptList
from .base import BaseModel
-from .base_api import TokenBucket
+from .base_api import BaseAPIModel, TokenBucket
+
+PromptType = Union[PromptList, str]
@MODELS.register_module()
@@ -189,3 +193,158 @@ class LightllmAPI(BaseModel):
chinese_count = sum(len(part) for part in chinese_parts)
return english_count + chinese_count
+
+
+class LightllmChatAPI(BaseAPIModel):
+ """Model wrapper around YiAPI.
+
+ Documentation:
+
+ Args:
+ path (str): The name of YiAPI model.
+ e.g. `moonshot-v1-32k`
+ key (str): Authorization key.
+ query_per_second (int): The maximum queries allowed per second
+ between two consecutive calls of the API. Defaults to 1.
+ max_seq_len (int): Unused here.
+ meta_template (Dict, optional): The model's meta prompt
+ template if needed, in case the requirement of injecting or
+ wrapping of any meta instructions.
+ retry (int): Number of retires if the API call fails. Defaults to 2.
+ """
+
+ def __init__(
+ self,
+ path: str,
+ url: str,
+ query_per_second: int = 2,
+ max_seq_len: int = 2048,
+ meta_template: Optional[Dict] = None,
+ retry: int = 2,
+ ):
+ super().__init__(path=path,
+ max_seq_len=max_seq_len,
+ query_per_second=query_per_second,
+ meta_template=meta_template,
+ retry=retry)
+ self.url = url
+ self.model = path
+
+ def generate(
+ self,
+ inputs: List[PromptType],
+ max_out_len: int = 512,
+ ) -> List[str]:
+ """Generate results given a list of inputs.
+
+ Args:
+ inputs (List[PromptType]): A list of strings or PromptDicts.
+ The PromptDict should be organized in OpenCompass'
+ API format.
+ max_out_len (int): The maximum length of the output.
+
+ Returns:
+ List[str]: A list of generated strings.
+ """
+ with ThreadPoolExecutor() as executor:
+ results = list(
+ executor.map(self._generate, inputs,
+ [max_out_len] * len(inputs)))
+ self.flush()
+ return results
+
+ def _generate(
+ self,
+ input: PromptType,
+ max_out_len: int = 512,
+ ) -> str:
+ """Generate results given an input.
+
+ Args:
+            input (PromptType): A string or PromptDict.
+ The PromptDict should be organized in OpenCompass'
+ API format.
+ max_out_len (int): The maximum length of the output.
+
+ Returns:
+ str: The generated string.
+ """
+ assert isinstance(input, (str, PromptList))
+
+ if isinstance(input, str):
+ messages = [{'role': 'user', 'content': input}]
+ else:
+ messages = []
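+            # Merge consecutive turns that share a role into one message, since
+            # chat endpoints typically expect alternating user/assistant roles.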
+ msg_buffer, last_role = [], None
+ for item in input:
+ item['role'] = 'assistant' if item['role'] == 'BOT' else 'user'
+ if item['role'] != last_role and last_role is not None:
+ messages.append({
+ 'content': '\n'.join(msg_buffer),
+ 'role': last_role
+ })
+ msg_buffer = []
+ msg_buffer.append(item['prompt'])
+ last_role = item['role']
+ messages.append({
+ 'content': '\n'.join(msg_buffer),
+ 'role': last_role
+ })
+
+ data = {'messages': messages}
+
+ max_num_retries = 0
+ while max_num_retries < self.retry:
+ self.acquire()
+ try:
+ raw_response = requests.request('POST',
+ url=self.url,
+ json=data)
+ except Exception as err:
+ print('Request Error:{}'.format(err))
+ time.sleep(2)
+ continue
+
+ try:
+ response = raw_response.json()
+ except Exception as err:
+ print('Response Error:{}'.format(err))
+ response = None
+ self.release()
+
+ if response is None:
+ print('Connection error, reconnect.')
+                # If a connection error occurs, frequent requests will cause
+                # continued network instability, so wait here to slow down
+                # the requests.
+ self.wait()
+ continue
+
+ if raw_response.status_code == 200:
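+                # Assumes an OpenAI-style payload: choices[0].message.content.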
+ msg = response['choices'][0]['message']['content']
+ self.logger.debug(f'Generated: {msg}')
+ return msg
+
+ if raw_response.status_code == 401:
+                print('Request rejected: invalid api_key')
+ continue
+ elif raw_response.status_code == 400:
+ print(messages, response)
+                print('Request failed, status code:', raw_response)
+                msg = 'The request was rejected because of high risk'
+ return msg
+ elif raw_response.status_code == 429:
+ print(messages, response)
+                print('Request failed, status code:', raw_response)
+ time.sleep(5)
+ continue
+ else:
+ print(messages, response)
+                print('Request failed, status code:', raw_response)
+ time.sleep(1)
+
+ max_num_retries += 1
+
+ raise RuntimeError(raw_response)
diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py
index 47a942df..5497f4c2 100644
--- a/opencompass/models/openai_api.py
+++ b/opencompass/models/openai_api.py
@@ -50,6 +50,9 @@ class OpenAI(BaseAPIModel):
temperature (float, optional): What sampling temperature to use.
If not None, will override the temperature in the `generate()`
call. Defaults to None.
+        tokenizer_path (str, optional): The path to the tokenizer used for
+            token counting. If None, the model `path` is used instead.
+            Defaults to None.
"""
is_api: bool = True
@@ -67,7 +70,8 @@ class OpenAI(BaseAPIModel):
mode: str = 'none',
logprobs: Optional[bool] = False,
top_logprobs: Optional[int] = None,
- temperature: Optional[float] = None):
+ temperature: Optional[float] = None,
+ tokenizer_path: Optional[str] = None):
super().__init__(path=path,
max_seq_len=max_seq_len,
@@ -82,6 +86,7 @@ class OpenAI(BaseAPIModel):
self.mode = mode
self.logprobs = logprobs
self.top_logprobs = top_logprobs
+ self.tokenizer_path = tokenizer_path
if isinstance(key, str):
if key == 'ENV':
@@ -184,8 +189,12 @@ class OpenAI(BaseAPIModel):
messages.append(msg)
# Hold out 100 tokens due to potential errors in tiktoken calculation
- max_out_len = min(
- max_out_len, context_window - self.get_token_len(str(input)) - 100)
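+        # get_token_len may raise KeyError when tiktoken has no encoding
+        # registered for this model name; keep the requested length then.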
+ try:
+ max_out_len = min(
+ max_out_len,
+ context_window - self.get_token_len(str(input)) - 100)
+ except KeyError:
+            pass
if max_out_len <= 0:
return ''
@@ -283,7 +292,8 @@ class OpenAI(BaseAPIModel):
Returns:
int: Length of the input tokens
"""
- enc = self.tiktoken.encoding_for_model(self.path)
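+        # Prefer the explicit tokenizer_path when given; fall back to the model path.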
+        enc = self.tiktoken.encoding_for_model(self.tokenizer_path
+                                               or self.path)
return len(enc.encode(prompt))
def bin_trim(self, prompt: str, num_token: int) -> str:
@@ -329,140 +339,3 @@ class OpenAI(BaseAPIModel):
elif self.mode == 'rear':
prompt = sep.join(words[:l])
return prompt
-
-
-class OpenAIAllesAPIN(OpenAI):
- """Model wrapper around OpenAI-AllesAPIN.
-
- Args:
- path (str): The name of OpenAI's model.
- url (str): URL to AllesAPIN.
- key (str): AllesAPIN key.
- query_per_second (int): The maximum queries allowed per second
- between two consecutive calls of the API. Defaults to 1.
- max_seq_len (int): Unused here.
- meta_template (Dict, optional): The model's meta prompt
- template if needed, in case the requirement of injecting or
- wrapping of any meta instructions.
- retry (int): Number of retires if the API call fails. Defaults to 2.
- """
-
- is_api: bool = True
-
- def __init__(self,
- path: str,
- url: str,
- key: str,
- temperature: float = 1.0,
- query_per_second: int = 1,
- rpm_verbose: bool = False,
- max_seq_len: int = 2048,
- meta_template: Optional[Dict] = None,
- retry: int = 2):
- super().__init__(path=path,
- max_seq_len=max_seq_len,
- query_per_second=query_per_second,
- rpm_verbose=rpm_verbose,
- meta_template=meta_template,
- retry=retry)
- self.url = url
- self.temperature = temperature
- self.headers = {
- 'alles-apin-token': key,
- 'content-type': 'application/json',
- }
-
- def _generate(self, input: PromptType, max_out_len: int,
- temperature: float) -> str:
- """Generate results given an input.
-
- Args:
- inputs (PromptType): A string or PromptDict.
- The PromptDict should be organized in OpenCompass'
- API format.
- max_out_len (int): The maximum length of the output.
- temperature (float): What sampling temperature to use,
- between 0 and 2. Higher values like 0.8 will make the output
- more random, while lower values like 0.2 will make it more
- focused and deterministic.
-
- Returns:
- str: The generated string.
- """
- assert isinstance(input, (str, PromptList))
-
- if isinstance(input, str):
- messages = [{'role': 'user', 'content': input}]
- else:
- messages = []
- for item in input:
- msg = {'content': item['prompt']}
- if item['role'] == 'HUMAN':
- msg['role'] = 'user'
- elif item['role'] == 'BOT':
- msg['role'] = 'assistant'
- elif item['role'] == 'SYSTEM':
- msg['role'] = 'system'
- messages.append(msg)
-
- # model can be response with user and system
- # when it comes with agent involved.
- assert msg['role'] in ['user', 'system']
-
- data = {
- 'model': self.path,
- 'messages': messages,
- 'temperature': temperature
- }
- for _ in range(self.retry):
- self.wait()
- try:
- raw_response = requests.post(self.url,
- headers=self.headers,
- data=json.dumps(data))
- except requests.ConnectionError:
- self.logger.error('Request error, got',
- str(raw_response.content))
- time.sleep(1)
- continue
- try:
- response = raw_response.json()
- except requests.JSONDecodeError:
- self.logger.error('JsonDecode error, got',
- str(raw_response.content))
- time.sleep(1)
- continue
- if raw_response.status_code == 200 and response[
- 'msgCode'] == '10000':
- data = response['data']
- choices = data['choices']
- if choices is None:
- self.logger.error(data)
- else:
- return choices[0]['message']['content'].strip()
- try:
- match = re.match(r'Error code: \d+ - (.*)', response['data'])
- err = eval(match.group(1))['error']
- if err['code'] == 'content_filter' and err['status'] == 400:
- return err['message']
- except Exception:
- pass
- self.logger.error(response['msg'])
- self.logger.error(response)
- time.sleep(1)
-
- raise RuntimeError('API call failed.')
-
- def get_token_len(self, prompt: str) -> int:
- """Get lengths of the tokenized string. Only English and Chinese
- characters are counted for now. Users are encouraged to override this
- method if more accurate length is needed.
-
- Args:
- prompt (str): Input string.
-
- Returns:
- int: Length of the input tokens
- """
- enc = self.tiktoken.encoding_for_model(self.path)
- return len(enc.encode(prompt))
diff --git a/opencompass/openicl/icl_evaluator/lm_evaluator.py b/opencompass/openicl/icl_evaluator/lm_evaluator.py
index bd89533c..5d903738 100644
--- a/opencompass/openicl/icl_evaluator/lm_evaluator.py
+++ b/opencompass/openicl/icl_evaluator/lm_evaluator.py
@@ -215,6 +215,7 @@ class LMEvaluator:
for k, v in pred_dict.items():
dataset.reader.dataset['test'] = dataset.test.add_column(k, v)
dataset.reader.input_columns.append(k)
+
if references:
dataset.reader.input_columns.append('reference')
dataset.reader.dataset['test'] = dataset.test.add_column(
diff --git a/opencompass/openicl/icl_inferencer/icl_base_inferencer.py b/opencompass/openicl/icl_inferencer/icl_base_inferencer.py
index 1775ba12..b08a6fab 100644
--- a/opencompass/openicl/icl_inferencer/icl_base_inferencer.py
+++ b/opencompass/openicl/icl_inferencer/icl_base_inferencer.py
@@ -145,6 +145,8 @@ class PPLInferencerOutputHandler:
def save_prompt_and_ppl(self, label, input, prompt, ppl, idx):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
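+        # Record the raw prompt once per example so PPL results keep the input text.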
+ if 'origin_prompt' not in self.results_dict[str(idx)]:
+ self.results_dict[str(idx)]['origin_prompt'] = input
if 'label: ' + str(label) not in self.results_dict[str(idx)].keys():
self.results_dict[str(idx)]['label: ' + str(label)] = {}
self.results_dict[str(idx)]['label: ' +
diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py
index 0590d223..e568a965 100644
--- a/opencompass/runners/dlc.py
+++ b/opencompass/runners/dlc.py
@@ -161,6 +161,9 @@ class DLCRunner(BaseRunner):
shell_cmd += 'umask 0000; '
shell_cmd += '{task_cmd}'
+ # Set the task priority (defaults to 1).
+ task_priority = self.aliyun_cfg.get('priority', 1)
+
tmpl = (
'dlc submit pytorchjob'
f" --command '{shell_cmd}'"
@@ -168,6 +171,7 @@ class DLCRunner(BaseRunner):
f" --config {self.aliyun_cfg['dlc_config_path']}"
f" --workspace_id {self.aliyun_cfg['workspace_id']}"
f" --resource_id {self.aliyun_cfg['resource_id']}"
+ f' --priority {task_priority}'
' --workers 1'
f' --worker_cpu {max(num_gpus * 8, 12)}'
f' --worker_gpu {num_gpus}'
diff --git a/opencompass/summarizers/default.py b/opencompass/summarizers/default.py
index f16b208d..8eeca328 100644
--- a/opencompass/summarizers/default.py
+++ b/opencompass/summarizers/default.py
@@ -226,7 +226,7 @@ class DefaultSummarizer:
return raw_results, parsed_results, dataset_metrics, dataset_eval_mode
- def _format_table(self, parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=None):
+ def _format_table(self, parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=None, skip_all_slash=False):
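+ # With skip_all_slash=True, rows that would consist solely of '-' placeholders are omitted.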
dataset_abbrs = [dataset_abbr_from_cfg(dataset) for dataset in self.dataset_cfgs]
prompt_version = {dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6] for d in self.dataset_cfgs}
@@ -257,14 +257,16 @@ class DefaultSummarizer:
table.append(header)
for dataset_abbr, metric in summarizer_dataset_abbrs:
if dataset_abbr not in dataset_metrics:
- table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
+ if not skip_all_slash:
+ table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
continue
if metric is None:
metric = dataset_metrics[dataset_abbr][0]
elif metric in dataset_metrics[dataset_abbr]:
pass
else:
- table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
+ if not skip_all_slash:
+ table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
continue
row = [dataset_abbr, prompt_version.get(dataset_abbr, '-'), metric, dataset_eval_mode.get(dataset_abbr, '-')]
diff --git a/opencompass/summarizers/multi_faceted.py b/opencompass/summarizers/multi_faceted.py
index 27848ba3..a272947e 100644
--- a/opencompass/summarizers/multi_faceted.py
+++ b/opencompass/summarizers/multi_faceted.py
@@ -1,11 +1,9 @@
# flake8: noqa
# yapf: disable
-import functools
-import getpass
-import math
+import json
import os
from datetime import datetime
-from typing import Any, Dict, List, Optional
+from typing import Dict, List, Optional
import tabulate
from mmengine import ConfigDict
@@ -33,7 +31,9 @@ class MultiFacetedSummarizer(DefaultSummarizer):
profile_dataset_abbrs = dataset_abbrs_item['dataset_abbrs']
# format table
- table = self._format_table(parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=profile_dataset_abbrs)
+ table = self._format_table(parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=profile_dataset_abbrs, skip_all_slash=True)
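+ # Only the header row is left once all-'-' rows are skipped, so there is nothing to report.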
+ if len(table) == 1:
+ continue
# output to screen
print(tabulate.tabulate(table, headers='firstrow', floatfmt='.2f'))
diff --git a/opencompass/summarizers/subjective/__init__.py b/opencompass/summarizers/subjective/__init__.py
index 54ed56ad..7457f14f 100644
--- a/opencompass/summarizers/subjective/__init__.py
+++ b/opencompass/summarizers/subjective/__init__.py
@@ -4,9 +4,12 @@ from .all_obj import AllObjSummarizer
from .alpacaeval import AlpacaSummarizer
from .arenahard import ArenaHardSummarizer
from .compass_arena import CompassArenaSummarizer
+from .compassbench import CompassBenchSummarizer
from .corev2 import Corev2Summarizer
from .creationbench import CreationBenchSummarizer
from .flames import FlamesSummarizer
+from .fofo import FofoSummarizer
from .information_retrival import IRSummarizer
from .mtbench import MTBenchSummarizer
+from .mtbench101 import MTBench101Summarizer
from .multiround import MultiroundSummarizer
diff --git a/opencompass/summarizers/subjective/arenahard.py b/opencompass/summarizers/subjective/arenahard.py
index 5958bed9..d82c800f 100644
--- a/opencompass/summarizers/subjective/arenahard.py
+++ b/opencompass/summarizers/subjective/arenahard.py
@@ -133,6 +133,27 @@ def get_win_rate_column(df, column, baseline='gpt4-0314'):
return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x * 100, 2))
+def load_model_preds(filename):
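+ """Load prediction strings for a model.
+
+ Predictions may sit in a single JSON file or be sharded across files
+ named <name>_0.json, <name>_1.json, ... by partitioned inference runs;
+ both layouts are handled here.
+ """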
+ root, ext = osp.splitext(filename)
+ partial_filename = root + '_0' + ext
+ if osp.exists(osp.realpath(filename)):
+ preds = mmengine.load(filename)
+ pred_strs = [
+ preds[str(i)]['prediction'] for i in range(len(preds))
+ ]
+ else:
+ filename = partial_filename
+ pred_strs = []
+ i = 1
+ while osp.exists(osp.realpath(filename)):
+ preds = mmengine.load(filename)
+ filename = root + f'_{i}' + ext
+ i += 1
+ pred_strs += [
+ preds[str(i)]['prediction'] for i in range(len(preds))
+ ]
+ return pred_strs
+
def get_battles_from_judgment(dataset, subdir_path, post_process, WEIGHT=3):
arena_hard_battles = pd.DataFrame()
dataset_abbr = dataset_abbr_from_cfg(dataset)
@@ -274,12 +295,12 @@ class ArenaHardSummarizer:
if model == 'gpt4-0314':
stats.at[i, 'avg_tokens'] = 423
else:
- with open(os.path.join(output_dir.split('summary')[0], 'predictions', model, dataset_abbr+'.json'), 'r') as f:
- model_preds = json.load(f)
- pred_length = 0
- for k, v in model_preds.items():
- pred_length += len(tiktoken.encoding_for_model('gpt-3.5-turbo').encode(v['prediction']))
- pred_length /= len(model_preds)
+ file_name = os.path.join(output_dir.split('summary')[0], 'predictions', model, dataset_abbr+'.json')
+ model_preds = load_model_preds(file_name)
+ pred_length = 0
+ for model_pred in model_preds:
+ pred_length += len(tiktoken.encoding_for_model('gpt-3.5-turbo').encode(model_pred, disallowed_special=()))
+ pred_length /= len(model_preds)
stats.at[i, 'avg_tokens'] = pred_length
stats.at[i, 'results'] = bootstrap_elo_lu[model].tolist()
stats.sort_values(by='model', inplace=True)
diff --git a/opencompass/summarizers/subjective/compassbench.py b/opencompass/summarizers/subjective/compassbench.py
new file mode 100644
index 00000000..7a34c54a
--- /dev/null
+++ b/opencompass/summarizers/subjective/compassbench.py
@@ -0,0 +1,241 @@
+# flake8: noqa
+# yapf: disable
+import os
+import os.path as osp
+import re
+from collections import defaultdict
+from datetime import datetime
+from itertools import product
+
+import mmengine
+from mmengine import ConfigDict
+from tabulate import tabulate
+
+from opencompass.partitioners.sub_naive import remove_duplicate_pairs
+from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
+
+from .utils import get_judgeanswer_and_reference, get_outdir
+
+
+def model_abbr_from_cfg_used_in_summarizer(model):
+ if model.get('summarizer_abbr', None):
+ return model['summarizer_abbr']
+ else:
+ return model_abbr_from_cfg(model)
+
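+ # The judge is expected to state its verdict as "选择:[[A]]" or "Choice: [[A]]" (A/B/C).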
+def post_process_compass_arena(s):
+ if result := re.findall(r'(?:选择:|Choice: )\[\[([ABC])\]\]', s):
+ return result[0]
+ else:
+ return None
+
+
+def check_position_bias(judged_answers, references, banned_choice=['C']):
+ """Check position bias for judgellm's judgement.
+
+ Args:
+ judged_answers: The successfully extracted judgement.
+ references: The references contains original question, which is used to located the same question for different position judgement.
+ """
+ position_bias_flag = 0
+ position_bias_dict = {}
+ for judge, ref in zip(judged_answers, references):
+ question = ref['question']
+ question_hash = hash(question)
+ if question_hash not in position_bias_dict:
+ position_bias_dict[question_hash] = {
+ 'question': question,
+ 'judge': judge
+ }
+ else:
+ first_judge = position_bias_dict[question_hash]['judge']
+ if judge == first_judge and first_judge not in banned_choice and judge not in banned_choice:
+ # The same verdict for both answer orders indicates position bias.
+ position_bias_flag += 1
+ return position_bias_flag
+
+
+class CompassBenchSummarizer:
+ """Do the subjectivity analyze based on evaluation results.
+
+ Args:
+ config (ConfigDict): The configuration object of the evaluation task.
+ It's expected to be filled out at runtime.
+ """
+
+ def __init__(self,
+ config: ConfigDict,
+ judge_type='general',
+ check_pos_bias=True,
+ summary_type='single') -> None:
+ self.tasks = []
+ self.cfg = config
+ self.base_models = self.cfg['eval']['partitioner']['base_models']
+ self.compare_models = self.cfg['eval']['partitioner']['compare_models']
+ self.judge_models = self.cfg.get('judge_models', None)
+ self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
+ self.judge_type = judge_type
+ assert self.judge_type in ['general']
+ self.judge_map = {'general': post_process_compass_arena}
+ self.judge_function = self.judge_map[self.judge_type]
+ self.check_pos_bias = check_pos_bias
+ self.summary_type = summary_type
+
+ def get_score(self, time_str):
+ output_dir, results_folder = get_outdir(self.cfg, time_str)
+ model_combinations = list(product(self.base_models, self.compare_models))
+ unique_combinations = remove_duplicate_pairs([combo for combo in model_combinations if combo[0] != combo[1]])
+
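+ # When a meta judge is configured, it is appended as an extra (final) judge model.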
+ if self.meta_judge_model is not None:
+ self.judge_models.append(self.meta_judge_model)
+
+ scores = {}
+
+ for idx, judge_model_cfg in enumerate(self.judge_models):
+ judge_model = model_abbr_from_cfg(judge_model_cfg)
+ for dataset in self.cfg['datasets']:
+ dataset_abbr = dataset_abbr_from_cfg(dataset)
+ for model_pair in unique_combinations:
+ model1 = model_pair[0]['abbr']
+ model2 = model_pair[1]['abbr']
+ if self.meta_judge_model is not None and idx == len(self.judge_models) - 1:
+ subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model
+ else:
+ subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
+ subdir_path = os.path.join(results_folder, subdir)
+ if not os.path.isdir(subdir_path):
+ print(subdir_path + ' does not exist! Please check!')
+ continue
+ judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
+ if self.check_pos_bias:
+ bias_num = check_position_bias(judged_answers, references)
+ else:
+ bias_num = 0
+ win_model1 = defaultdict(float)
+ win_model2 = defaultdict(float)
+ categories = defaultdict(float)
+ difficulties = defaultdict(float)
+ model1 = references[0]['answer1']
+ model2 = references[0]['answer2']
+ for prediction, reference in zip(judged_answers, references):
+ categories[dataset_abbr] += 1
+ categories[reference['category']] += 1
+ difficulties[reference['level']] += 1
+
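+ # Each question is judged with the two answers in both orders, so a
+ # verdict of 'A'/'B' is mapped back to the underlying model via
+ # reference['answer1']. 'C' is a tie: worth 0.5 per side under
+ # summary_type='half_add', otherwise 0.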
+ if prediction == 'A':
+ if reference['answer1'] == model1:
+ score_1, score_2 = 1, 0
+ else:
+ score_1, score_2 = 0, 1
+ elif prediction == 'B':
+ if reference['answer1'] == model1:
+ score_1, score_2 = 0, 1
+ else:
+ score_1, score_2 = 1, 0
+ elif prediction == 'C':
+ if self.summary_type == 'half_add':
+ score_1, score_2 = 0.5, 0.5
+ else:
+ score_1, score_2 = 0, 0
+
+ win_model1[reference['category']] += score_1
+ win_model1[dataset_abbr] += score_1
+ win_model2[reference['category']] += score_2
+ win_model2[dataset_abbr] += score_2
+ for category in categories:
+ win_model1[category] = win_model1[category] / categories[category] * 100
+ win_model1[category] = round(win_model1[category], 2)
+ win_model2[category] = win_model2[category] / categories[category] * 100
+ win_model2[category] = round(win_model2[category], 2)
+ win_model1['position_bias'] = bias_num
+ win_model2['position_bias'] = bias_num
+
+ if judge_model not in scores:
+ scores[judge_model] = {}
+ if dataset_abbr not in scores[judge_model]:
+ scores[judge_model][dataset_abbr] = {}
+ scores[judge_model][dataset_abbr][model2] = win_model2
+
+ return scores
+
+ def summarize(
+ self,
+ time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
+ ):
+ """Summarize the subjectivity analysis based on evaluation results.
+
+ Args:
+ time_str (str): Timestamp for file naming.
+
+ Returns:
+ pd.DataFrame: The summary results.
+ """
+
+ scores = self.get_score(time_str)
+ output_dir, results_folder = get_outdir(self.cfg, time_str)
+
+ for idx, judge_model in enumerate(self.judge_models):
+ judge_abbr = model_abbr_from_cfg(judge_model)
+ for dataset in self.cfg['datasets']:
+ dataset_abbr = dataset_abbr_from_cfg(dataset)
+ summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models]
+ one_column = list(scores[judge_abbr][dataset_abbr].values())[0]
+ row_headers = [i for i in one_column.keys() if i not in [dataset_abbr, 'position_bias']]
+ row_headers = [dataset_abbr, 'position_bias'] + row_headers
+ headers = [''] + summarizer_model_abbrs
+ table = []
+ for row_header in row_headers:
+ row = [row_header]
+ for model_cfg in self.compare_models:
+ model_abbr = model_abbr_from_cfg(model_cfg)
+ s = scores[judge_abbr][dataset_abbr][model_abbr].get(row_header, '')
+ if isinstance(s, float):
+ s = f'{s:.2f}'
+ if isinstance(s, int):
+ s = str(s)
+ row.append(s)
+ table.append(row)
+ txt = tabulate(table, headers=headers)
+ print(txt)
+
+ if self.meta_judge_model is not None and idx == len(self.judge_models) - 1:
+ output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
+ else:
+ output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
+
+ with open(output_filename, 'w') as f:
+ f.write(','.join(headers) + '\n')
+ for line in table:
+ f.write(','.join(line) + '\n')
+ print(output_filename)
+
+ table = []
+ summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models]
+ headers = [''] + summarizer_model_abbrs
+ for dataset in self.cfg['datasets']:
+ dataset_abbr = dataset_abbr_from_cfg(dataset)
+ row = [dataset_abbr]
+ for model_cfg in self.compare_models:
+ model_abbr = model_abbr_from_cfg(model_cfg)
+ s = scores[judge_abbr][dataset_abbr][model_abbr].get(dataset_abbr, '')
+ if isinstance(s, float):
+ s = f'{s:.2f}'
+ if isinstance(s, int):
+ s = str(s)
+ row.append(s)
+ table.append(row)
+ txt = tabulate(table, headers=headers)
+ print(txt)
+
+ if self.meta_judge_model is not None and idx == len(self.judge_models) - 1:
+ output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-overall-report.csv')
+ else:
+ output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-overall-report.csv')
+ with open(output_filename, 'w') as f:
+ f.write(','.join(headers) + '\n')
+ for line in table:
+ f.write(','.join(line) + '\n')
+ print(output_filename)
diff --git a/opencompass/summarizers/subjective/fofo.py b/opencompass/summarizers/subjective/fofo.py
new file mode 100644
index 00000000..a00f622d
--- /dev/null
+++ b/opencompass/summarizers/subjective/fofo.py
@@ -0,0 +1,154 @@
+# flake8: noqa: E501
+import csv
+import os
+import os.path as osp
+import re
+from collections import defaultdict
+from datetime import datetime
+
+import numpy as np
+from mmengine import ConfigDict
+from tabulate import tabulate
+
+try:
+ from prettytable import from_csv
+except ImportError:
+ from_csv = None
+
+from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
+
+from .compass_arena import CompassArenaSummarizer
+from .utils import get_judgeanswer_and_reference, get_outdir
+
+
+def post_process_fofo(judgement: str):
+ """Input a string like below:
+
+ xxx[[5]]xxx, and extract the score
+ """
+ match = re.search(r"[\"']format_correctness[\"']:\s*([0-1]+)", judgement)
+ if match:
+ score = int(match.group(1))
+ else:
+ return None
+
+ return {'score': score, 'judgement': judgement}
+
+
+class FofoSummarizer:
+ """Do the subjectivity analyze based on evaluation results.
+
+ Args:
+ config (ConfigDict): The configuration object of the evaluation task.
+ It's expected to be filled out at runtime.
+ """
+
+ def __init__(self, config: ConfigDict, judge_type='single') -> None:
+
+ self.tasks = []
+ self.cfg = config
+
+ self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
+ self.eval_model_abbrs = [
+ model_abbr_from_cfg(model) for model in self.eval_model_cfgs
+ ]
+
+ self.judge_models = self.cfg.get('judge_models', None)
+
+ self.judge_function = post_process_fofo
+
+ def get_score(self, time_str):
+ output_dir, results_folder = get_outdir(self.cfg, time_str)
+ total_scores = {}
+ for idx, judge_model_cfg in enumerate(self.judge_models):
+ judge_model = model_abbr_from_cfg(judge_model_cfg)
+ for dataset in self.cfg['datasets']:
+ dataset_abbr = dataset_abbr_from_cfg(dataset)
+ for eval_model_abbr in self.eval_model_abbrs:
+ subdir = eval_model_abbr + '_judged-by--' + judge_model
+ subdir_path = os.path.join(results_folder, subdir)
+ if os.path.isdir(subdir_path):
+ judged_answers, references = get_judgeanswer_and_reference(
+ dataset, subdir_path, self.judge_function)
+ scores = defaultdict(list)
+ for ans, ref in zip(judged_answers, references):
+ domain = ref['domain']
+ format_name = ref['format']
+ format_type = ref['format_type']
+ score = ans['score']
+ if score is not None:
+ scores['overall'].append(score)
+ scores[domain].append(score)
+ if format_type == 'general':
+ scores[format_name].append(score)
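+ # Average every bucket: overall, per domain, and per format name
+ # (format buckets are only collected for 'general' format_type entries).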
+ single_model_scores = {
+ task: sum(score) / len(score)
+ for task, score in scores.items()
+ }
+ if judge_model not in total_scores:
+ total_scores[judge_model] = {}
+ if dataset_abbr not in total_scores[judge_model]:
+ total_scores[judge_model][dataset_abbr] = {}
+ total_scores[judge_model][dataset_abbr][
+ eval_model_abbr] = single_model_scores
+ else:
+ print(subdir_path + ' does not exist! Please check!')
+ return total_scores
+
+ def summarize(self,
+ time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
+ """Summarize the subjectivity analysis based on evaluation results.
+
+ Args:
+ time_str (str): Timestamp for file naming.
+
+ Returns:
+ pd.DataFrame: The summary results.
+ """
+ scores = self.get_score(time_str)
+ print(scores)
+ output_dir, results_folder = get_outdir(self.cfg, time_str)
+ for idx, judge_model in enumerate(self.judge_models):
+ judge_abbr = model_abbr_from_cfg(judge_model)
+ for dataset in self.cfg['datasets']:
+ dataset_abbr = dataset_abbr_from_cfg(dataset)
+ summarizer_model_abbrs = self.eval_model_abbrs
+ one_column = list(scores[judge_abbr][dataset_abbr].values())[0]
+ format_types = ['Json', 'CSV', 'XML', 'YAML', 'Markdown']
+ row_headers = [
+ i for i in one_column.keys()
+ if i not in [dataset_abbr] + format_types
+ ]
+ row_headers = ['overall'] + format_types + row_headers
+ headers = [dataset_abbr] + summarizer_model_abbrs
+ table = []
+ for row_header in row_headers:
+ row = [row_header]
+ for model_abbr in summarizer_model_abbrs:
+ s = scores[judge_abbr][dataset_abbr][model_abbr].get(
+ row_header, '')
+ if isinstance(s, float):
+ s = f'{s:.2f}'
+ if isinstance(s, int):
+ s = str(s)
+ row.append(s)
+ table.append(row)
+ txt = tabulate(table, headers=headers)
+ print(txt)
+
+ if idx == len(self.judge_models):
+ output_filename = osp.join(
+ output_dir, 'summarized-by--' + judge_abbr + '-' +
+ dataset_abbr + '-report.csv')
+ else:
+ output_filename = osp.join(
+ output_dir, 'judged-by--' + judge_abbr + '-' +
+ dataset_abbr + '-report.csv')
+
+ with open(output_filename, 'w') as f:
+ f.write(','.join(headers) + '\n')
+ for line in table:
+ f.write(','.join(line) + '\n')
+ print(output_filename)
diff --git a/opencompass/summarizers/subjective/mtbench101.py b/opencompass/summarizers/subjective/mtbench101.py
new file mode 100644
index 00000000..a77f5617
--- /dev/null
+++ b/opencompass/summarizers/subjective/mtbench101.py
@@ -0,0 +1,142 @@
+# flake8: noqa: E501
+import csv
+import os
+import os.path as osp
+import re
+from collections import defaultdict
+from datetime import datetime
+
+import numpy as np
+from mmengine import ConfigDict
+
+try:
+ from prettytable import from_csv
+except ImportError:
+ from_csv = None
+
+from opencompass.utils import model_abbr_from_cfg
+
+from .compass_arena import CompassArenaSummarizer
+from .utils import get_judgeanswer_and_reference, get_outdir
+
+
+def post_process_mtbench_pair(judgement: str):
+ """Input a string like below:
+
+ xxx[[A]]xxx, and extract the judge
+ """
+ pattern = r'\[([A-C]+)\]'
+ matched_result = re.findall(pattern, judgement)
+ if matched_result:
+ return matched_result[0]
+ else:
+ return None
+
+
+def post_process_mtbench101(judgement: str):
+ """Input a string like below:
+
+ xxx[[5]]xxx, and extract the score
+ """
+ match = re.search(r'\[([0-9]+)\]', judgement)
+ if match:
+ score = int(match.group(1))
+
+ else:
+ return None
+
+ return {'score': score, 'judgement': judgement}
+
+
+def get_final_results(judged_answers, references, output_dir, fout_flag,
+ model):
+
+ task_multi_id_scores = defaultdict(list)
+ task_scores = defaultdict(list)
+
+ for ans, ref in zip(judged_answers, references):
+
+ task = ref['task']
+ multi_id = ref['multi_id']
+ score = ans['score']
+
+ task_multi_id_scores[(task, multi_id)].append(score)
+
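+ # Score each multi-turn session (task, multi_id) by its worst turn,
+ # then average these minima to obtain the per-task score.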
+ for (task, multi_id), scores in task_multi_id_scores.items():
+ min_score = min(scores)
+ task_scores[task].append(min_score)
+
+ final_task_scores = {
+ task: sum(scores) / len(scores) if scores else 0
+ for task, scores in task_scores.items()
+ }
+
+ fout = osp.join(output_dir, 'task_score.csv')
+
+ columns = list(final_task_scores.keys())
+
+ print('================task_score=====================')
+ print(final_task_scores)
+
+ with open(fout, 'a+', newline='') as csvfile:
+
+ writer = csv.writer(csvfile)
+ if fout_flag == 0:
+ writer.writerow(['model'] + columns)
+ writer.writerow([model] +
+ [final_task_scores[column] for column in columns])
+ return 0
+
+
+class MTBench101Summarizer(CompassArenaSummarizer):
+ """Do the subjectivity analyze based on evaluation results.
+
+ Args:
+ config (ConfigDict): The configuration object of the evaluation task.
+ It's expected to be filled out at runtime.
+ """
+
+ def __init__(self, config: ConfigDict, judge_type='single') -> None:
+
+ self.tasks = []
+ self.cfg = config
+
+ self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
+ self.eval_model_abbrs = [
+ model_abbr_from_cfg(model) for model in self.eval_model_cfgs
+ ]
+
+ self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
+
+ self.judge_function = post_process_mtbench101
+
+ def summarize(self,
+ time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
+ """Summarize the subjectivity analysis based on evaluation results.
+
+ Args:
+ time_str (str): Timestamp for file naming.
+
+ Returns:
+ pd.DataFrame: The summary results.
+ """
+ dataset_cfgs = self.cfg['datasets']
+ output_dir, results_folder = get_outdir(self.cfg, time_str)
+ fout_flag = 0
+ for eval_model_abbr in self.eval_model_abbrs:
+ subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
+ subdir_path = os.path.join(results_folder, subdir)
+ if os.path.isdir(subdir_path):
+ model, judge_model = eval_model_abbr, self.judge_abbr
+
+ for dataset in dataset_cfgs:
+ print()
+ judged_answers, references = get_judgeanswer_and_reference(
+ dataset, subdir_path, self.judge_function)
+ get_final_results(judged_answers, references, output_dir,
+ fout_flag, model)
+ fout_flag += 1
+ else:
+ print(subdir_path + ' does not exist! Please check!')
diff --git a/opencompass/tasks/openicl_eval.py b/opencompass/tasks/openicl_eval.py
index a11bb4cd..8d2494d8 100644
--- a/opencompass/tasks/openicl_eval.py
+++ b/opencompass/tasks/openicl_eval.py
@@ -214,6 +214,8 @@ class OpenICLEvalTask(BaseTask):
preds['references'] = (test_set[self.output_column]
if self.output_column else None)
preds['test_set'] = test_set
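+ # Prediction files written before 'origin_prompt' was recorded may lack
+ # the key; pad with None so evaluators that accept it keep working.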
+ if 'origin_prompt' not in preds:
+ preds['origin_prompt'] = [None for _ in range(len(pred_strs))]
preds = {
k: preds[k]
for k in signature(icl_evaluator.score).parameters
diff --git a/opencompass/utils/prompt.py b/opencompass/utils/prompt.py
index 496eec15..d65f6a03 100644
--- a/opencompass/utils/prompt.py
+++ b/opencompass/utils/prompt.py
@@ -2,6 +2,7 @@ from __future__ import annotations
import hashlib
import json
+import re
from copy import deepcopy
from typing import Dict, List, Union
@@ -19,9 +20,15 @@ def safe_format(input_str: str, **kwargs) -> str:
Returns:
str: The formatted string.
"""
+ segs = [input_str]
for k, v in kwargs.items():
- input_str = input_str.replace(f'{{{k}}}', str(v))
- return input_str
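+ # Split every segment around literal '{k}' tokens first (the zero-width
+ # branch keeps adjacent identical placeholders apart), then substitute
+ # each token exactly once, so braces inside substituted values are
+ # never re-interpreted as placeholders.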
+ regex = re.compile(f'(?<={{{k}}})(?={{{k}}})|({{{k}}})')
+ segs = [regex.split(seg) for seg in segs]
+ segs = sum(segs, [])
+ replace_dict = {f'{{{k}}}': str(v) for k, v in kwargs.items()}
+ segs = [replace_dict.get(seg, seg) for seg in segs]
+ output_str = ''.join(segs)
+ return output_str
def get_prompt_hash(dataset_cfg: Union[ConfigDict, List[ConfigDict]]) -> str:
diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py
index fbcf60c3..5eaa0271 100644
--- a/opencompass/utils/run.py
+++ b/opencompass/utils/run.py
@@ -86,8 +86,14 @@ def get_config_from_arg(args) -> Config:
config['models'] = change_accelerator(config['models'], args.accelerator)
if config.get('eval', {}).get('partitioner', {}).get('models') is not None:
config['eval']['partitioner']['models'] = change_accelerator(config['eval']['partitioner']['models'], args.accelerator)
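+ # Subjective evaluations keep separate model lists; convert those too.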
+ if config.get('eval', {}).get('partitioner', {}).get('base_models') is not None:
+ config['eval']['partitioner']['base_models'] = change_accelerator(config['eval']['partitioner']['base_models'], args.accelerator)
+ if config.get('eval', {}).get('partitioner', {}).get('compare_models') is not None:
+ config['eval']['partitioner']['compare_models'] = change_accelerator(config['eval']['partitioner']['compare_models'], args.accelerator)
if config.get('eval', {}).get('partitioner', {}).get('judge_models') is not None:
config['eval']['partitioner']['judge_models'] = change_accelerator(config['eval']['partitioner']['judge_models'], args.accelerator)
+ if config.get('judge_models') is not None:
+ config['judge_models'] = change_accelerator(config['judge_models'], args.accelerator)
return config
# parse dataset args
@@ -151,7 +157,7 @@ def get_config_from_arg(args) -> Config:
batch_size=args.batch_size,
pad_token_id=args.pad_token_id,
stop_words=args.stop_words,
- run_cfg=dict(num_gpus=args.num_gpus))
+ run_cfg=dict(num_gpus=args.hf_num_gpus))
logger.debug(f'Using model: {model}')
models.append(model)
# set infer accelerator if needed
@@ -211,7 +217,7 @@ def change_accelerator(models, accelerator):
mod = TurboMindModel
acc_model = dict(
type=f'{mod.__module__}.{mod.__name__}',
- abbr=model['abbr'].replace('hf', 'lmdeploy') if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy',
+ abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind',
path=model['path'],
engine_config=dict(session_len=model['max_seq_len'],
max_batch_size=model['batch_size'],
@@ -254,11 +260,11 @@ def change_accelerator(models, accelerator):
mod = VLLMwithChatTemplate
acc_model = dict(
type=f'{mod.__module__}.{mod.__name__}',
- abbr='-hf'.join(model['abbr'].split('-hf')[:-1]) + '-vllm',
+ abbr=model['abbr'].replace('hf', 'vllm') if '-hf' in model['abbr'] else model['abbr'] + '-vllm',
path=model['path'],
model_kwargs=dict(tensor_parallel_size=model['run_cfg']['num_gpus']),
max_out_len=model['max_out_len'],
- batch_size=32768,
+ batch_size=16,
run_cfg=model['run_cfg'],
stop_words=model.get('stop_words', []),
)
@@ -266,20 +272,21 @@ def change_accelerator(models, accelerator):
mod = TurboMindModelwithChatTemplate
acc_model = dict(
type=f'{mod.__module__}.{mod.__name__}',
- abbr='-hf'.join(model['abbr'].split('-hf')[:-1]) + '-turbomind',
+ abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind',
path=model['path'],
engine_config=dict(max_batch_size=model.get('batch_size', 16), tp=model['run_cfg']['num_gpus']),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9),
max_seq_len=model.get('max_seq_len', 2048),
max_out_len=model['max_out_len'],
- batch_size=32768,
+ batch_size=16,
run_cfg=model['run_cfg'],
stop_words=model.get('stop_words', []),
)
else:
raise ValueError(f'Unsupported accelerator {accelerator} for model type {model["type"]}')
else:
- raise ValueError(f'Unsupported model type {model["type"]}')
+ acc_model = model
+ logger.warning(f'Unsupported model type {model["type"]}, keeping the original model config.')
model_accels.append(acc_model)
return model_accels
diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py
index b1b0c173..7baddbb4 100644
--- a/opencompass/utils/text_postprocessors.py
+++ b/opencompass/utils/text_postprocessors.py
@@ -63,15 +63,15 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
# yapf: disable
# flake8: noqa: W605
patterns = [
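+ # \s* (previously \s?) tolerates any amount of whitespace, and the two
+ # colon patterns cover both the full-width ':' and ASCII ':' variants.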
- f'答案是?\s?([{options}])',
- f'答案是?\s?:([{options}])',
- f'答案是?\s?:([{options}])',
- f'答案应该?是\s?([{options}])',
- f'答案应该?选\s?([{options}])',
- f'答案为\s?([{options}])',
- f'答案选\s?([{options}])',
- f'选择?\s?([{options}])',
- f'故选?\s?([{options}])'
+ f'答案是?\s*([{options}])',
+ f'答案是?\s*:\s*([{options}])',
+ f'答案是?\s*:\s*([{options}])',
+ f'答案应该?是\s*([{options}])',
+ f'答案应该?选\s*([{options}])',
+ f'答案为\s*([{options}])',
+ f'答案选\s*([{options}])',
+ f'选择?\s*([{options}])',
+ f'故选?\s*([{options}])',
f'只有选?项?\s?([{options}])\s?是?对',
f'只有选?项?\s?([{options}])\s?是?错',
f'只有选?项?\s?([{options}])\s?不?正确',
diff --git a/requirements/extra.txt b/requirements/extra.txt
index 6b3409e7..84f88e6a 100644
--- a/requirements/extra.txt
+++ b/requirements/extra.txt
@@ -1,2 +1,3 @@
-alpaca-eval
+alpaca-eval==0.6
faiss_gpu==1.7.2
+scikit-learn==1.5
diff --git a/requirements/runtime.txt b/requirements/runtime.txt
index 8452052e..b1c10ae8 100644
--- a/requirements/runtime.txt
+++ b/requirements/runtime.txt
@@ -22,6 +22,7 @@ OpenCC
opencv-python-headless
pandas<2.0.0
prettytable
+protobuf
pyext
pypinyin
python-Levenshtein