Resolve merge conflict with upstream

root 2025-05-12 11:26:54 +00:00
commit b4fd65924a
90 changed files with 4564 additions and 165 deletions

View File

@ -122,12 +122,42 @@
paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138
configpath: opencompass/configs/datasets/MedBench/medbench_gen.py
configpath_llmjudge: ''
- MedCalc_Bench:
name: MedCalc_Bench
category: Knowledge / Medicine
paper: https://arxiv.org/abs/2406.12036
configpath: opencompass/configs/datasets/MedCalc_Bench/MedCalcBench_official_gen_a5155f.py
configpath_llmjudge: ''
- MedQA:
name: MedQA
category: Knowledge / Medicine
paper: https://arxiv.org/abs/2009.13081
configpath: opencompass/configs/datasets/MedQA/MedQA_gen.py
configpath_llmjudge: opencompass/configs/datasets/MedQA/MedQA_llmjudge_gen.py
- MedXpertQA:
name: MedXpertQA
category: Knowledge / Medicine
paper: https://arxiv.org/abs/2501.18362
configpath: opencompass/configs/datasets/MedXpertQA/MedXpertQA_gen.py
configpath_llmjudge: opencompass/configs/datasets/MedXpertQA/MedXpertQA_llmjudge_gen.py
- ClinicBench:
name: ClinicBench
category: Knowledge / Medicine
paper: https://arxiv.org/abs/2405.00716
configpath: ''
configpath_llmjudge: opencompass/configs/datasets/ClinicBench/ClinicBench_llmjudge_gen.py
- ScienceQA:
name: ScienceQA
category: Knowledge / Medicine
paper: https://arxiv.org/abs/2209.09513
configpath: ''
configpath_llmjudge: opencompass/configs/datasets/ScienceQA/ScienceQA_llmjudge_gen.py
- PubMedQA:
name: PubMedQA
category: Knowledge / Medicine
paper: https://arxiv.org/abs/1909.06146
configpath: ''
configpath_llmjudge: opencompass/configs/datasets/PubMedQA/PubMedQA_llmjudge_gen.py
- musr:
name: MuSR
category: Reasoning
@ -343,6 +373,12 @@
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_C3/CLUE_C3_gen.py
configpath_llmjudge: ''
- CARDBiomedBench:
name: CARDBiomedBench
category: Knowledge / Medicine
paper: https://www.biorxiv.org/content/10.1101/2025.01.15.633272v1
configpath: opencompass/configs/datasets/CARDBiomedBench
configpath_llmjudge: opencompass/configs/datasets/CARDBiomedBench/CARDBiomedBench_llmjudge_gen_99a231.py
- cb:
name: SuperGLUE / CB
category: Reasoning
@ -575,6 +611,12 @@
paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790
configpath: opencompass/configs/datasets/humanevalx/humanevalx_gen.py
configpath_llmjudge: ''
- humaneval_pro:
name: HumanEval Pro
category: Code
paper: https://arxiv.org/abs/2412.21199
configpath: opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py
configpath_llmjudge: ''
- hungarian_math:
name: Hungarian_Math
category: Math
@ -659,6 +701,12 @@
paper: ''
configpath: opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen.py
configpath_llmjudge: ''
- mbpp_pro:
name: MBPP Pro
category: Code
paper: https://arxiv.org/abs/2412.21199
configpath: opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py
configpath_llmjudge: ''
- mgsm:
name: MGSM
category: Language / Math
@ -745,6 +793,12 @@
paper: https://arxiv.org/pdf/1911.11641v1
configpath: opencompass/configs/datasets/piqa/piqa_gen.py
configpath_llmjudge: ''
- ProteinLMBench:
name: ProteinLMBench
category: Knowledge / Biology (Protein)
paper: https://arxiv.org/abs/2406.05540
configpath: opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_gen.py
configpath_llmjudge: opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_llmjudge_gen.py
- py150:
name: py150
category: Code
@ -1029,3 +1083,21 @@
paper: https://arxiv.org/pdf/2402.09391
configpath: opencompass/configs/datasets/SmolInstruct/smolinstruct_gen.py
configpath_llmjudge: ''
- SciKnowEval:
name: SciKnowEval
category: Science
paper: https://arxiv.org/abs/2406.09098
configpath: opencompass/configs/datasets/SciKnowEval/SciKnowEval_gen_ebe47d.py
configpath_llmjudge: opencompass/configs/datasets/SciKnowEval/SciKnowEval_llmjudge_gen_ebe47d.py
- internsandbox:
name: InternSandbox
category: Reasoning / Code / Agent
paper: ''
configpath: opencompass/configs/datasets/internsandbox/internsandbox_gen_44b982.py
configpath_llmjudge: ''
- nejmaibench:
name: nejmaibench
category: Science / Medicine
paper: https://arxiv.org/pdf/2308.04709
configpath: opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_gen.py
configpath_llmjudge: opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_llmjudge_gen.py

View File

@ -0,0 +1,61 @@
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.judge.judgerbenchv2 import get_judgerbenchv2_dataset as get_judgerbenchv2_datasets
    from opencompass.configs.datasets.judge.rmb import get_rmb_dataset as get_rmb_datasets
    from opencompass.configs.datasets.judge.rewardbench import get_rewardbench_datasets
    from opencompass.configs.datasets.judge.judgebench import get_judgebench_datasets
    from opencompass.configs.summarizers.judgedataset_all import summarizer

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
from opencompass.models import TurboMindModelwithChatTemplate
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
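# Aggregate every list imported above whose name ends with '_datasets' into a single flat dataset list.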
datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')),
[],
)
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='qwen-7b-hf',
path='Qwen/Qwen-7B',
engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
max_seq_len=16384,
max_out_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=1),
),
]
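# Inference is sharded by NumWorkerPartitioner (8 shards per dataset) and run with the local runner, up to 72 concurrent tasks.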
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=72,
task=dict(type=OpenICLInferTask),
),
)
work_dir = './outputs/judge_dataset_all/'

View File

@ -0,0 +1,52 @@
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.judge.judgebench import get_judgebench_datasets

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
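# get_judgebench_datasets is the list of dataset configs assembled in the imported judgebench config.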
datasets = [*get_judgebench_datasets]
from opencompass.models import TurboMindModelwithChatTemplate
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='qwen-7b-hf',
path='Qwen/Qwen-7B',
engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
max_seq_len=16384,
max_out_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=1),
),
]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(
type=LocalRunner,
max_num_workers=72,
task=dict(type=OpenICLInferTask),
),
)
work_dir = './outputs/judgebench/'

View File

@ -0,0 +1,53 @@
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.judge.judgerbenchv2 import get_judgerbenchv2_dataset
    from opencompass.configs.summarizers.judgerbenchv2 import summarizer

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
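# get_judgerbenchv2_dataset is the list of per-category dataset configs assembled in the imported judgerbenchv2 config.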
datasets = [*get_judgerbenchv2_dataset]
from opencompass.models import TurboMindModelwithChatTemplate
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='qwen-7b-hf',
path='Qwen/Qwen-7B',
engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
max_seq_len=16384,
max_out_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=1),
),
]
infer = dict(
# partitioner=dict(type=NaivePartitioner),
partitioner=dict(type=NumWorkerPartitioner, num_worker=2),
runner=dict(
type=LocalRunner,
max_num_workers=72,
task=dict(type=OpenICLInferTask),
),
)
work_dir = './outputs/judgerbenchv2/'

View File

@ -0,0 +1,101 @@
from opencompass.datasets import CARDBiomedBenchDataset
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.evaluator import GenericLLMEvaluator
ZERO_SHOT_PROMPT = 'You are an expert in {expert}.\n{question}\n'
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: Q: You are an expert in {expert}.\n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Reader configuration
reader_cfg = dict(
input_columns=[
'question',
'answer',
'Bio_Category',
'SQL_Category',
'uuid',
'template uuid',
'expert',
],
output_column='answer',
)
# Inference configuration
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=CARDBiomedBenchDataset,
path='NIH-CARD/CARDBiomedBench',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
),
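# judge_cfg is left empty here; the judge model is expected to be supplied by the evaluation config at runtime.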
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
cardbiomedbench_dataset = dict(
type=CARDBiomedBenchDataset,
abbr='cardbiomedbench',
path='NIH-CARD/CARDBiomedBench',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
cardbiomedbench_datasets = [cardbiomedbench_dataset]

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
    from .ClinicBench_llmjudge_gen_d09668 import ClinicBench_datasets

View File

@ -0,0 +1,100 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets.ClinicBench import ClinicBenchDataset
QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
Question:\n
{question}
Options:\n
{choices}
""".strip()
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: {question}\n {choices} \n<Original Question End>\n\n
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
ClinicBench_datasets = []
ClinicBench_reader_cfg = dict(
input_columns=['question', 'choices'],
output_column='label',
)
ClinicBench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
ClinicBench_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=ClinicBenchDataset,
path='xuxuxuxuxu/Pharmacology-QA',
reader_cfg=ClinicBench_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
ClinicBench_datasets.append(
dict(
abbr='ClinicBench',
type=ClinicBenchDataset,
path='xuxuxuxuxu/Pharmacology-QA',
reader_cfg=ClinicBench_reader_cfg,
infer_cfg=ClinicBench_infer_cfg,
eval_cfg=ClinicBench_eval_cfg,
)
)

View File

@ -0,0 +1,88 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import HLEDataset
# ----------------------------- Detailed Config -----------------------------
math_reader_cfg = dict(input_columns=['problem'], output_column='answer')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Evaluation configuration
math_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=HLEDataset,
path='cais/hle',
reader_cfg=math_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
hle_datasets = [
dict(
type=HLEDataset,
abbr='hle_llmjudge',
path='cais/hle',
category='Biology/Medicine',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]

View File

@ -0,0 +1,57 @@
from opencompass.datasets import MedCalc_BenchDataset, MedCalcOfficial_Evaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
ZERO_SHOT_PROMPT = 'You are a helpful assistant for calculating a score for a given patient note. Please think step-by-step to solve the question and then generate the required score. Your output should only contain a JSON dict formatted as {"step_by_step_thinking": str(your_step_by_step_thinking_procress_to_solve_the_question), "answer": str(short_and_direct_answer_of_the_question)}. \n Here is the patient note:\n{patient_note}\n\nHere is the task:\n{question}\n\nPlease directly output the JSON dict formatted as {"step_by_step_thinking": str(your_step_by_step_thinking_procress_to_solve_the_question), "answer": str(short_and_direct_answer_of_the_question)}:'
# Reader configuration
reader_cfg = dict(
input_columns=[
'row_number',
'calculator_id',
'calculator_name',
'category',
'note_id',
'output_type',
'note_type',
'patient_note',
'question',
'relevant_entities',
'ground_truth_answer',
'lower_limit',
'upper_limit',
'ground_truth_explanation'
],
output_column='ground_truth_answer',
)
# Inference configuration
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN',prompt=ZERO_SHOT_PROMPT),
])
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(type=MedCalcOfficial_Evaluator),
pred_role='BOT',
)
medcal_bench_dataset = dict(
type=MedCalc_BenchDataset,
abbr='medcal_bench_official_zero_shot_eval',
path='ncbi/MedCalc-Bench-v1.0',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
medcal_bench_datasets = [medcal_bench_dataset]

View File

@ -0,0 +1,63 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.utils.text_postprocessors import first_option_postprocess
from opencompass.datasets.MedQA import MedQADataset
QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
Question:\n
{question}
Options:\n
{choices}
""".strip()
MedQA_datasets = []
MedQA_reader_cfg = dict(
input_columns=['question', 'choices'],
output_column='label',
)
MedQA_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
MedQA_subsets = {
'US': 'xuxuxuxuxu/MedQA_US_test',
'Mainland': 'xuxuxuxuxu/MedQA_Mainland_test',
'Taiwan': 'xuxuxuxuxu/MedQA_Taiwan_test',
}
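# Build one dataset entry per MedQA subset; predictions are reduced to an option letter and scored with accuracy.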
for split in list(MedQA_subsets.keys()):
    MedQA_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')
    )
    MedQA_datasets.append(
        dict(
            abbr=f'MedQA_{split}',
            type=MedQADataset,
            path=MedQA_subsets[split],
            reader_cfg=MedQA_reader_cfg,
            infer_cfg=MedQA_infer_cfg,
            eval_cfg=MedQA_eval_cfg,
        )
    )

View File

@ -0,0 +1,108 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets.MedQA import MedQADataset
QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
Question:\n
{question}
Options:\n
{choices}
""".strip()
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: {question}\n {choices} \n<Original Question End>\n\n
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
MedQA_datasets = []
MedQA_reader_cfg = dict(
input_columns=['question', 'choices'],
output_column='label',
)
MedQA_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
MedQA_subsets = {
'US': 'xuxuxuxuxu/MedQA_US_test',
'Mainland': 'xuxuxuxuxu/MedQA_Mainland_test',
'Taiwan': 'xuxuxuxuxu/MedQA_Taiwan_test',
}
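# Build one dataset entry per MedQA subset, each graded by an LLM judge using GRADER_TEMPLATE.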
for split in list(MedQA_subsets.keys()):
    MedQA_eval_cfg = dict(
        evaluator=dict(
            type=GenericLLMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                        )
                    ],
                    round=[
                        dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                    ],
                ),
            ),
            dataset_cfg=dict(
                type=MedQADataset,
                path=MedQA_subsets[split],
                reader_cfg=MedQA_reader_cfg,
            ),
            judge_cfg=dict(),
            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
        ),
    )
    MedQA_datasets.append(
        dict(
            abbr=f'MedQA_{split}',
            type=MedQADataset,
            path=MedQA_subsets[split],
            reader_cfg=MedQA_reader_cfg,
            infer_cfg=MedQA_infer_cfg,
            eval_cfg=MedQA_eval_cfg,
        )
    )

View File

@ -0,0 +1,46 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets.ProteinLMBench import ProteinLMBenchDataset, ProteinLMBenchEvaluator
QUERY_TEMPLATE = "Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is the letter among {start} through {end}.\n{question}"
# Reader configuration
reader_cfg = dict(
input_columns=['question', 'start', 'end', 'options'],
output_column='label',
)
# Inference configuration
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=QUERY_TEMPLATE
)
], ),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(type=ProteinLMBenchEvaluator),
)
proteinlmbench_dataset = dict(
abbr='ProteinLMBench',
type=ProteinLMBenchDataset,
path='tsynbio/ProteinLMBench',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg
)
proteinlmbench_datasets = [proteinlmbench_dataset]

View File

@ -0,0 +1,89 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets.ProteinLMBench import ProteinLMBenchDataset
QUERY_TEMPLATE = "Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is the letter among {start} through {end}.\n{question}"
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: {question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
reader_cfg = dict(
input_columns=['question', 'start', 'end', 'options'],
output_column='label',
)
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=ProteinLMBenchDataset,
path='tsynbio/ProteinLMBench',
reader_cfg=reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
proteinlmbench_dataset = dict(
abbr='ProteinLMBench',
type=ProteinLMBenchDataset,
path='tsynbio/ProteinLMBench',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg
)
proteinlmbench_datasets = [proteinlmbench_dataset]

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
    from .PubMedQA_llmjudge_gen_f00302 import PubMedQA_datasets

View File

@ -0,0 +1,94 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets.PubMedQA import PubMedQADataset
QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
Question:\n
{question}
Options:\n
{choices}
""".strip()
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: {question}\n {choices} \n<Original Question End>\n\n
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
PubMedQA_datasets = []
PubMedQA_reader_cfg = dict(
input_columns=['question', 'choices'],
output_column='label',
)
PubMedQA_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
PubMedQA_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=PubMedQADataset,
path='qiaojin/PubMedQA',
reader_cfg=PubMedQA_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
PubMedQA_datasets.append(
dict(
abbr='PubMedQA',
type=PubMedQADataset,
path='qiaojin/PubMedQA',
reader_cfg=PubMedQA_reader_cfg,
infer_cfg=PubMedQA_infer_cfg,
eval_cfg=PubMedQA_eval_cfg,
)
)

View File

@ -0,0 +1,92 @@
from opencompass.datasets import SciKnowEvalDataset, SciKnowEvalEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
ZERO_SHOT_PROMPT = '{q4}'
# Reader configuration
reader_cfg = dict(
input_columns=[
'prompt',
'question',
'choices',
'label',
'answerKey',
'type',
'domain',
'details',
'answer',
'q4'
],
output_column='answerKey',
)
# Inference configuration
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(type=SciKnowEvalEvaluator),
pred_role='BOT',
)
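# One dataset config per SciKnowEval domain; all four share the same reader, zero-shot prompt, and evaluator.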
sciknoweval_dataset_biology = dict(
type=SciKnowEvalDataset,
abbr='sciknoweval_biology',
path='hicai-zju/SciKnowEval',
prompt_mode='zero-shot',
subset='biology',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
sciknoweval_dataset_chemistry = dict(
type=SciKnowEvalDataset,
abbr='sciknoweval_chemistry',
path='hicai-zju/SciKnowEval',
subset='chemistry',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
sciknoweval_dataset_material = dict(
type=SciKnowEvalDataset,
abbr='sciknoweval_material',
path='hicai-zju/SciKnowEval',
subset='material',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
sciknoweval_dataset_physics = dict(
type=SciKnowEvalDataset,
abbr='sciknoweval_physics',
path='hicai-zju/SciKnowEval',
prompt_mode='zero-shot',
subset='physics',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
sciknoweval_datasets = [sciknoweval_dataset_biology, sciknoweval_dataset_chemistry, sciknoweval_dataset_physics, sciknoweval_dataset_material]

View File

@ -0,0 +1,232 @@
from opencompass.datasets import SciKnowEvalDataset
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.evaluator import GenericLLMEvaluator
ZERO_SHOT_PROMPT = '{q4}'
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: Q: {q4}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answerKey}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Reader configuration
reader_cfg = dict(
input_columns=[
'prompt',
'question',
'choices',
'label',
'answerKey',
'type',
'domain',
'details',
'answer',
'q4'
],
output_column='answerKey',
)
# Inference configuration
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
eval_cfg_biology = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=SciKnowEvalDataset,
path='hicai-zju/SciKnowEval',
prompt_mode='zero-shot',
subset='biology',
reader_cfg=reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
eval_cfg_chemistry = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=SciKnowEvalDataset,
path='hicai-zju/SciKnowEval',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
subset='chemistry',
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
eval_cfg_material = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=SciKnowEvalDataset,
path='hicai-zju/SciKnowEval',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
subset='material',
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
eval_cfg_physics = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=SciKnowEvalDataset,
path='hicai-zju/SciKnowEval',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
subset='physics',
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
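# The four eval configs above differ only in the `subset` passed to dataset_cfg; each dataset below pairs one domain with its judge-based eval config.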
sciknoweval_dataset_biology = dict(
type=SciKnowEvalDataset,
abbr='sciknoweval_biology_llmjudge',
path='hicai-zju/SciKnowEval',
prompt_mode='zero-shot',
subset='biology',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg_biology,
)
sciknoweval_dataset_chemistry = dict(
type=SciKnowEvalDataset,
abbr='sciknoweval_chemistry_llmjudge',
path='hicai-zju/SciKnowEval',
subset='chemistry',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg_chemistry,
)
sciknoweval_dataset_material = dict(
type=SciKnowEvalDataset,
abbr='sciknoweval_material_llmjudge',
path='hicai-zju/SciKnowEval',
subset='material',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg_material,
)
sciknoweval_dataset_physics = dict(
type=SciKnowEvalDataset,
abbr='sciknoweval_physics_llmjudge',
path='hicai-zju/SciKnowEval',
prompt_mode='zero-shot',
subset='physics',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg_physics,
)
sciknoweval_datasets = [sciknoweval_dataset_biology, sciknoweval_dataset_chemistry, sciknoweval_dataset_physics, sciknoweval_dataset_material]

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
    from .ScienceQA_llmjudge_gen_f00302 import ScienceQA_datasets

View File

@ -0,0 +1,94 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets.ScienceQA import ScienceQADataset
QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
Question:\n
{question}
Options:\n
{choices}
""".strip()
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: {question}\n {choices} \n<Original Question End>\n\n
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
ScienceQA_datasets = []
ScienceQA_reader_cfg = dict(
input_columns=['question', 'choices'],
output_column='label',
)
ScienceQA_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
ScienceQA_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=ScienceQADataset,
path='derek-thomas/ScienceQA',
reader_cfg=ScienceQA_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
ScienceQA_datasets.append(
dict(
abbr='ScienceQA',
type=ScienceQADataset,
path='derek-thomas/ScienceQA',
reader_cfg=ScienceQA_reader_cfg,
infer_cfg=ScienceQA_infer_cfg,
eval_cfg=ScienceQA_eval_cfg,
)
)

View File

@ -0,0 +1,17 @@
# HumanEval Pro

## OpenCompass (OC) results

| model                        | pass@1 |
|:----------------------------:|-------:|
| qwen2.5-coder-7b-instruct-hf |     65 |
| qwen2.5-14b-instruct-hf      |     67 |
| deepseek-v2-lite-chat-hf     |     35 |

## CodeEval-Pro results

| model                        | pass@1 |
|:----------------------------:|-------:|
| qwen2.5-coder-7b-instruct-hf |     65 |
| qwen2.5-14b-instruct-hf      |     65 |
| deepseek-v2-lite-chat-hf     |     28 |
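
To run the benchmark, the dataset config added here can be pulled into an OpenCompass run config via `read_base`, in the same way as the other eval configs in this change. The sketch below is a minimal, assumed usage example: the import path and `humanevalpro_datasets` come from this change, while a `models` list still has to be defined separately (e.g. as in the other eval configs in this repository).

```python
# Minimal sketch: include HumanEval Pro in an OpenCompass run config.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import humanevalpro_datasets

# Collect the imported dataset configs; a `models` list must be added separately.
datasets = [*humanevalpro_datasets]
```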

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
    from .humaneval_pro_gen_3dc067 import humanevalpro_datasets  # noqa: F401, F403

View File

@ -0,0 +1,46 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalevalProDataset, HumanevalProEvaluator, humaneval_postprocess_v2
PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```
Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""
humanevalpro_reader_cfg = dict(
input_columns=['raw_problem', 'new_problem'], output_column='test_code')
humanevalpro_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=PROMPT_WRAPPER),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
humanevalpro_eval_cfg = dict(
evaluator=dict(type=HumanevalProEvaluator,
ip_address='https://opencompass-multiple-evaluator.hf.space')
)
humanevalpro_datasets = [
dict(
abbr='humaneval_pro',
type=HumanevalevalProDataset,
path='opencompass/humaneval_pro',
reader_cfg=humanevalpro_reader_cfg,
infer_cfg=humanevalpro_infer_cfg,
eval_cfg=humanevalpro_eval_cfg,)
]

View File

@ -0,0 +1,48 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalevalProDataset, HumanevalProEvaluator, humaneval_postprocess_v2
PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```
Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""
humanevalpro_reader_cfg = dict(
input_columns=['raw_problem', 'new_problem'], output_column='test_code')
humanevalpro_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=PROMPT_WRAPPER),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
humanevalpro_eval_cfg = dict(
evaluator=dict(type=HumanevalProEvaluator,
ip_address='https://opencompass-multiple-evaluator.hf.space')
)
humanevalpro_datasets = [
dict(
abbr='humaneval_pro',
type=HumanevalevalProDataset,
path='opencompass/humaneval_pro',
reader_cfg=humanevalpro_reader_cfg,
infer_cfg=humanevalpro_infer_cfg,
eval_cfg=humanevalpro_eval_cfg,
n=5,
k=3)
]

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
    from .internsandbox_gen_44b982 import internsandbox_datasets

View File

@ -0,0 +1,59 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import InternSandboxDataset, InternSandboxEvaluator
_SANDBOXS_ = ['aquarium', 'arc', 'arrowmaze', 'bbehboardgameqa', 'bbehbooleanexpressions', 'BbehDyckLanguages', 'BbehGeometricShapes', 'BbehMultistepArithmetic', 'bbehobjectcounting', 'bbehobjectproperties', 'bbehshuffobject', 'BbehWebOfLies', 'BbehWordSorting', 'binairo', 'calcudoku', 'campsite', 'cipher', 'cryptomath', 'dominosa', 'futoshiki', 'galaxies', 'game24', 'kakurasu', 'korLogicAnalogicalReasoning', 'korLogicCanonicalPropositions', 'korLogicCooperativePrinciple', 'korLogicDefinitions', 'korLogicDerivativeReasoningOfPropositionalLogic', 'korLogicDisjunctiveNormalFormAndConjunctiveNormalForm', 'korLogicDynamicLogic', 'korLogicEnumerativeInductiveReasoning', 'korLogicEpistemicLogic', 'korLogicEquivalenceCalculus', 'korLogicFigureOfTheSyllogism', 'korLogicFormalFallacies', 'korLogicInductionParadox', 'korLogicLogicalMethodsForExploringCauseAndEffectRelationships', 'korLogicPredicateLogicFormalization', 'korLogicPropositionalLogicConcepts', 'korLogicPropositionalLogicFormalization', 'korLogicResolution', 'korLogicSpeechActs', 'korLogicStatisticalReasoning', 'korLogicTemporalPropositions', 'korLogicTruthValueModalPropositions', 'korOperationUnicode20ac', 'korOperationUnicode2295', 'korOperationUnicode25a0', 'korOperationUnicode25a1', 'korOperationUnicode25b3', 'korOperationUnicode25bd', 'korOperationUnicode25cb', 'korOperationUnicode25ce', 'korOperationUnicode25cf', 'korOperationUnicode2605', 'korOperationUnicodeffe0', 'korOperationUnicodeffe1', 'korPuzzle24Points', 'korPuzzleArrowMaze', 'korPuzzleCalcudoko', 'korPuzzleCampsite', 'korPuzzleConnectWords', 'korPuzzleCryptoMath', 'korPuzzleKukurasu', 'korPuzzleLogicPuzzle', 'korPuzzleSkyscrapers', 'korPuzzleWordBrainTeasers', 'korPuzzleWordLadder', 'korPuzzleWordRootsAndAffixes', 'korPuzzleWordscapes', 'korPuzzleWordSearch', 'LightUp', 'maze', 'minesweeper', 'nonograms', 'starbattle', 'stitches', 'sudoku', 'tents', 'thermometers']
internsandbox_reader_cfg = dict(
input_columns=['prompt'],
output_column='ground_truth'
)
internsandbox_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='You are a helpful assistant.',
)
],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
internsandbox_eval_cfg = {
sandbox: dict(
evaluator=dict(
type=InternSandboxEvaluator,
short_penalty=False,
format_penalty=False,
),
pred_role='BOT',
) for sandbox in _SANDBOXS_
}
internsandbox_datasets = [
dict(
type=InternSandboxDataset,
abbr=f'internsandbox-{sandbox}',
path='./data/InternSandboxBenchmark_verified_V0.3.1/',
local_mode=True,
sandbox=sandbox,
reader_cfg=internsandbox_reader_cfg,
infer_cfg=internsandbox_infer_cfg,
eval_cfg=internsandbox_eval_cfg[sandbox],
) for sandbox in _SANDBOXS_
]

View File

@ -0,0 +1,71 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import JudgeEvaluator
from opencompass.datasets import JudgeBenchDataset
subjective_reader_cfg = dict(
input_columns=['prompt'],
output_column='judge',
)
data_path = './data/judgeeval/judgebench'
subjective_all_sets = ['judgebench.json']
get_judgebench_datasets = []
prompt_choice_prefix = """
Please act as an impartial judge to evaluate the responses provided by two AI assistants to the user question below. Your evaluation should focus on the following criteria: helpfulness, relevance, accuracy, depth, creativity, and level of detail.
- Do not let the order of presentation, response length, or assistant names influence your judgment.
- Base your decision solely on how well each response addresses the user's question and adheres to the instructions.
Your final reply must be structured in the following format:
{
"Choice": "[Model A or Model B]"
}
"""
prompt_choice_en = """User Question: {question}
Model A's Response: {answerA}
Model B's Response: {answerB}
Now it's your turn. Please provide selection result as required:
"""
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=prompt_choice_prefix + prompt_choice_en
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
rewardbench_eval_cfg = dict(
evaluator=dict(
type=JudgeEvaluator,
),
)
get_judgebench_datasets.append(
dict(
abbr=f'{_name.split(".")[0]}',
type=JudgeBenchDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=rewardbench_eval_cfg,
mode='singlescore',
))

View File

@ -0,0 +1,47 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import Judgerbenchv2Evaluator
from opencompass.datasets import Judgerbenchv2Dataset
judgerbenchv2_reader_cfg = dict(
input_columns=['prompt'],
output_column='judge',
)
data_path = './data/judgeeval/judgerbenchv2'
judgerbenchv2_all_sets = ['Knowledge', 'Longtext', 'Reason_and_analysis', 'safe', 'Hallucination', 'chatQA', 'IF', 'LanTask', 'Creation', 'Code_and_AI']
get_judgerbenchv2_dataset = []
for _name in judgerbenchv2_all_sets:
judgerbenchv2_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{prompt}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
judgerbenchv2_eval_cfg = dict(
evaluator=dict(
type=Judgerbenchv2Evaluator,
),
)
get_judgerbenchv2_dataset.append(
dict(
abbr=f'{_name}',
type=Judgerbenchv2Dataset,
path=data_path,
name=_name,
reader_cfg=judgerbenchv2_reader_cfg,
infer_cfg=judgerbenchv2_infer_cfg,
eval_cfg=judgerbenchv2_eval_cfg,
))

View File

@ -0,0 +1,17 @@
# MBPP Pro
## OpenCompass results
| model                        | pass@1 |
|:----------------------------:|-------:|
| qwen2.5-coder-7b-instruct-hf |     66 |
| qwen2.5-14b-instruct-hf      |     64 |
| deepseek-v2-lite-chat-hf     |     36 |
## CodeEval-Pro results
| model                        | pass@1 |
|:----------------------------:|-------:|
| qwen2.5-coder-7b-instruct-hf |     65 |
| qwen2.5-14b-instruct-hf      |     65 |
| deepseek-v2-lite-chat-hf     |     39 |
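## Usage
A minimal sketch (not an official recipe) of how this dataset config might be composed into a full evaluation config. The dataset module name and the model entry below are assumptions modeled on patterns used elsewhere in this repo; adjust them to your local setup.
```python
from mmengine.config import read_base

from opencompass.models import HuggingFacewithChatTemplate

with read_base():
    # dataset config from this directory (module name assumed)
    from opencompass.configs.datasets.mbpp_pro.mbpp_pro_gen import \
        mbpppro_datasets

datasets = mbpppro_datasets

# Any chat model config works here; this entry only mirrors the
# HuggingFacewithChatTemplate pattern used elsewhere in this repo
# and is purely illustrative.
models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='qwen2.5-coder-7b-instruct-hf',
        path='Qwen/Qwen2.5-Coder-7B-Instruct',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
    )
]
```
The composed config can then be launched with the usual OpenCompass entry point (for example `python run.py <this_config>.py`).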

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .mbpp_pro_gen_3dc067 import mbpppro_datasets # noqa: F401, F403

View File

@ -0,0 +1,46 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPProDataset, MBPPProEvaluator
PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```
Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""
mbpppro_reader_cfg = dict(
input_columns=['raw_problem', 'new_problem'], output_column='test_code')
mbpppro_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=PROMPT_WRAPPER),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
mbpppro_eval_cfg = dict(
evaluator=dict(type=MBPPProEvaluator,
ip_address='https://opencompass-multiple-evaluator.hf.space'),
)
mbpppro_datasets = [
dict(
abbr='mbpp_pro',
type=MBPPProDataset,
path='opencompass/mbpp_pro',
reader_cfg=mbpppro_reader_cfg,
infer_cfg=mbpppro_infer_cfg,
eval_cfg=mbpppro_eval_cfg)
]

View File

@ -0,0 +1,48 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPProDataset, MBPPProEvaluator
PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```
Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""
mbpppro_reader_cfg = dict(
input_columns=['raw_problem', 'new_problem'], output_column='test_code')
mbpppro_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=PROMPT_WRAPPER),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
mbpppro_eval_cfg = dict(
evaluator=dict(type=MBPPProEvaluator,
ip_address='https://opencompass-multiple-evaluator.hf.space'),
)
mbpppro_datasets = [
dict(
abbr='mbpp_pro',
type=MBPPProDataset,
path='opencompass/mbpp_pro',
reader_cfg=mbpppro_reader_cfg,
infer_cfg=mbpppro_infer_cfg,
eval_cfg=mbpppro_eval_cfg,
n=5,
k=3)
]

View File

@ -0,0 +1,60 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUProDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
categories = [
'health',
]
QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
Question:\n
{question}
Options:\n
{options_str}
""".strip()
mmlu_pro_datasets = []
for category in categories:
mmlu_pro_reader_cfg = dict(
input_columns=['question', 'cot_content', 'options_str'],
output_column='answer',
train_split='validation',
test_split='test',
)
mmlu_pro_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN',
prompt=QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
mmlu_pro_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(
type=match_answer_pattern,
answer_pattern=r'(?i)ANSWER\s*:\s*([A-P])')
)
mmlu_pro_datasets.append(
dict(
abbr=f'mmlu_pro_{category.replace(" ", "_")}',
type=MMLUProDataset,
path='opencompass/mmlu_pro',
category=category,
reader_cfg=mmlu_pro_reader_cfg,
infer_cfg=mmlu_pro_infer_cfg,
eval_cfg=mmlu_pro_eval_cfg,
))

View File

@ -0,0 +1,101 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import MMLUProDataset, generic_llmjudge_postprocess
categories = [
'health',
]
QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
Question:\n
{question}
Options:\n
{options_str}
""".strip()
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: {question}\n {options_str} \n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
mmlu_pro_datasets = []
for category in categories:
mmlu_pro_reader_cfg = dict(
input_columns=['question', 'cot_content', 'options_str'],
output_column='answer',
train_split='validation',
test_split='test',
)
mmlu_pro_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
mmlu_pro_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=MMLUProDataset,
path='opencompass/mmlu_pro',
category=category,
reader_cfg=mmlu_pro_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
mmlu_pro_datasets.append(
dict(
abbr=f'mmlu_pro_{category.replace(" ", "_")}',
type=MMLUProDataset,
path='opencompass/mmlu_pro',
category=category,
reader_cfg=mmlu_pro_reader_cfg,
infer_cfg=mmlu_pro_infer_cfg,
eval_cfg=mmlu_pro_eval_cfg,
)
)

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .multiple_top_ten_gen_f44aaf import multiple_datasets # noqa: F401, F403

View File

@ -32,7 +32,6 @@ multiple_datasets = [
type=MultiplEDataset,
abbr=f'humaneval-multiple-{lang}',
language=lang,
num_repeats=1,
path='opencompass/multipl_e',
tag='humaneval',
reader_cfg=multiple_reader_cfg,
@ -46,7 +45,6 @@ multiple_datasets += [
type=MultiplEDataset,
abbr=f'mbpp-multiple-{lang}',
language=lang,
num_repeats=1,
path='opencompass/multipl_e',
tag='mbpp',
reader_cfg=multiple_reader_cfg,

View File

@ -0,0 +1,58 @@
# Select the 10 most popular programming languages from MultiPL-E to compose
# the test set (only 'cpp' is currently enabled below).
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MultiplEDataset, MultiplEEvaluator
_TOP_TEN_LANGUAGE_ = ['cpp']
multiple_reader_cfg = dict(input_columns=['language', 'prompt'], output_column='tests')
multiple_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template='Based on the provided {language} code snippet, complete the subsequent content. The initial part of the completed code must match the provided code snippet exactly:\n{prompt}'),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
multiple_eval_cfg = {
lang: dict(
evaluator=dict(
type=MultiplEEvaluator,
language=lang,
ip_address='https://opencompass-multiple-evaluator.hf.space',
),
pred_role='BOT',
) for lang in _TOP_TEN_LANGUAGE_
}
multiple_datasets = [
dict(
type=MultiplEDataset,
abbr=f'humaneval-multiple-{lang}',
language=lang,
path='opencompass/multipl_e',
tag='humaneval',
reader_cfg=multiple_reader_cfg,
infer_cfg=multiple_infer_cfg,
eval_cfg=multiple_eval_cfg[lang],
n=5,
k=3
) for lang in _TOP_TEN_LANGUAGE_
]
multiple_datasets += [
dict(
type=MultiplEDataset,
abbr=f'mbpp-multiple-{lang}',
language=lang,
path='opencompass/multipl_e',
tag='mbpp',
reader_cfg=multiple_reader_cfg,
infer_cfg=multiple_infer_cfg,
eval_cfg=multiple_eval_cfg[lang],
n=5,
k=3
) for lang in _TOP_TEN_LANGUAGE_
]

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .nejmaibench_gen_60c8f5 import nejmaibench_datasets # noqa: F401, F403

View File

@ -0,0 +1,59 @@
from opencompass.datasets import NejmaibenchDataset, NejmaibenchEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
import os
SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n'
ZERO_SHOT_PROMPT = 'Q: {question}\n Please select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n'
# Reader configuration
reader_cfg = dict(
input_columns=[
'question',
'options',
'Subject',
'prompt_mode',
],
output_column='label',
)
# Inference configuration
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
],
round=[
dict(
role='HUMAN',
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(type=NejmaibenchEvaluator),
pred_role='BOT',
)
nejmaibench_dataset = dict(
type=NejmaibenchDataset,
abbr='nejmaibench',
path='opencompass/nejmaibench',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
nejmaibench_datasets = [nejmaibench_dataset]

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .nejmaibench_llmjudge_gen_60c8f5 import nejmaibench_datasets # noqa: F401, F403

View File

@ -0,0 +1,108 @@
from opencompass.datasets import NejmaibenchDataset
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.evaluator import GenericLLMEvaluator
import os
SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n'
ZERO_SHOT_PROMPT = 'Q: {question}\n Please select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n'
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: Q: {question}\nPlease select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n\n<Original Question End>\n\n
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Reader configuration
reader_cfg = dict(
input_columns=[
'question',
'options',
'Subject',
'prompt_mode',
],
output_column='label',
)
# Inference configuration
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
],
round=[
dict(
role='HUMAN',
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=NejmaibenchDataset,
path='opencompass/nejmaibench',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
nejmaibench_dataset = dict(
type=NejmaibenchDataset,
abbr='nejmaibench',
path='opencompass/nejmaibench',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
nejmaibench_datasets = [nejmaibench_dataset]

View File

@ -0,0 +1,69 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import WritingBenchDataset, writingbench_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'writingbench'
]
writingbench_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer,),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
multi_eval=True,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='You are an expert evaluator with extensive experience in evaluating response of given query.')
],
round=[
dict(
role='HUMAN',
prompt = '{prediction}'
),
]),
),
dict_postprocessor=dict(type=writingbench_postprocess),
),
pred_role='BOT',
)
writingbench_datasets.append(
dict(
abbr=f'{_name}',
type=WritingBenchDataset,
path='./data/subjective/writingbench',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))

View File

@ -0,0 +1,14 @@
import torch
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='baichuan-m1-14b-base-hf',
path='baichuan-inc/Baichuan-M1-14B-Base',
max_out_len=1024,
batch_size=8,
model_kwargs=dict(device_map='auto', trust_remote_code=True, torch_dtype=torch.bfloat16),
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,14 @@
import torch
from opencompass.models import HuggingFacewithChatTemplate
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='baichuan-m1-14b-instruct-hf',
path='baichuan-inc/Baichuan-M1-14B-Instruct',
max_out_len=2048,
batch_size=8,
model_kwargs=dict(device_map='auto', trust_remote_code=True, torch_dtype=torch.bfloat16),
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,16 @@
from opencompass.models import VLLMwithChatTemplate
models = [
dict(
type=VLLMwithChatTemplate,
abbr='gemma-3-12b-it-vllm',
path='google/gemma-3-12b-it',
model_kwargs=dict(tensor_parallel_size=4,
# for long context
rope_scaling={'factor': 8.0, 'rope_type': 'linear'}),
max_out_len=4096,
batch_size=1,
generation_kwargs=dict(temperature=0),
run_cfg=dict(num_gpus=4),
)
]

View File

@ -0,0 +1,16 @@
from opencompass.models import VLLMwithChatTemplate
models = [
dict(
type=VLLMwithChatTemplate,
abbr='gemma-3-27b-it-vllm',
path='google/gemma-3-27b-it',
model_kwargs=dict(tensor_parallel_size=4,
# for long context
rope_scaling={'factor': 8.0, 'rope_type': 'linear'}),
max_out_len=4096,
batch_size=1,
generation_kwargs=dict(temperature=0),
run_cfg=dict(num_gpus=4),
)
]

View File

@ -0,0 +1,17 @@
from opencompass.models import VLLMwithChatTemplate
models = [
dict(
type=VLLMwithChatTemplate,
abbr='gemma-3-4b-it-vllm',
path='google/gemma-3-4b-it',
model_kwargs=dict(tensor_parallel_size=2,
# for long context
rope_scaling={'factor': 8.0, 'rope_type': 'linear'}),
max_seq_len=140000,
max_out_len=4096,
batch_size=1,
generation_kwargs=dict(temperature=0),
run_cfg=dict(num_gpus=2),
)
]

View File

@ -0,0 +1,19 @@
from opencompass.models import TurboMindModelwithChatTemplate
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='internlm3-8b-instruct-turbomind',
path='internlm/internlm3-8b-instruct',
engine_config=dict(session_len=142000, max_batch_size=1, tp=2,
# for long context
rope_scaling_factor=6.0),
gen_config=dict(
top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=8192
),
max_seq_len=142000,
max_out_len=8192,
batch_size=1,
run_cfg=dict(num_gpus=2),
)
]

View File

@ -0,0 +1,20 @@
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='OREAL-32B',
path='internlm/OREAL-32B',
engine_config=dict(session_len=32768, max_batch_size=16, tp=4),
gen_config=dict(top_k=1,
temperature=1e-6,
top_p=0.9,
max_new_tokens=32768),
max_seq_len=32768,
max_out_len=32768,
batch_size=16,
run_cfg=dict(num_gpus=4),
pred_postprocessor=dict(type=extract_non_reasoning_content)
)
]

View File

@ -0,0 +1,17 @@
from opencompass.models import HuggingFacewithChatTemplate
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='huatuogpt2-13b-hf',
path='FreedomIntelligence/HuatuoGPT2-13B',
tokenizer_kwargs=dict(padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=True,),
max_out_len=1024,
batch_size=8,
model_kwargs=dict(device_map='auto', trust_remote_code=True),
run_cfg=dict(num_gpus=4),
)
]

View File

@ -0,0 +1,13 @@
from opencompass.models import HuggingFacewithChatTemplate
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='huatuogpt2-7b-hf',
path='FreedomIntelligence/HuatuoGPT2-7B',
max_out_len=1024,
batch_size=8,
model_kwargs=dict(device_map='auto', trust_remote_code=True),
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,15 @@
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='huatuogpt-o1-7b-hf',
path='FreedomIntelligence/HuatuoGPT-o1-7B',
max_out_len=2048,
batch_size=8,
model_kwargs=dict(device_map='auto', trust_remote_code=True),
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content, think_start_token='## Thinking', think_end_token='## Final Response'),
)
]

View File

@ -0,0 +1,15 @@
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='huatuogpt-o1-8b-hf',
path='FreedomIntelligence/HuatuoGPT-o1-8B',
max_out_len=2048,
batch_size=8,
model_kwargs=dict(device_map='auto', trust_remote_code=True),
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content, think_start_token='## Thinking', think_end_token='## Final Response'),
)
]

View File

@ -0,0 +1,15 @@
from opencompass.models import TurboMindModelwithChatTemplate
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='internvl2_5-38b-turbomind',
path='OpenGVLab/InternVL2_5-38B',
engine_config=dict(session_len=8192, max_batch_size=8, tp=4),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
max_seq_len=8192,
max_out_len=8192,
batch_size=8,
run_cfg=dict(num_gpus=4),
)
]

View File

@ -0,0 +1,15 @@
from opencompass.models import TurboMindModelwithChatTemplate
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='internvl2_5-8b-turbomind',
path='OpenGVLab/InternVL2_5-8B',
engine_config=dict(session_len=8192, max_batch_size=16, tp=1),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=8192),
max_seq_len=8192,
max_out_len=8192,
batch_size=16,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,21 @@
from opencompass.models import VLLMwithChatTemplate
models = [
dict(
type=VLLMwithChatTemplate,
abbr='qwen2.5-14b-instruct-vllm',
path='Qwen/Qwen2.5-14B-Instruct',
model_kwargs=dict(
tensor_parallel_size=4,
rope_scaling={
'factor': 4.0,
'original_max_position_embeddings': 32768,
'rope_type': 'yarn'
},
),
max_out_len=4096,
batch_size=1,
generation_kwargs=dict(temperature=0),
run_cfg=dict(num_gpus=4),
)
]

View File

@ -0,0 +1,21 @@
from opencompass.models import VLLMwithChatTemplate
models = [
dict(
type=VLLMwithChatTemplate,
abbr='qwen2.5-32b-instruct-vllm',
path='Qwen/Qwen2.5-32B-Instruct',
model_kwargs=dict(
tensor_parallel_size=8,
rope_scaling={
'factor': 4.0,
'original_max_position_embeddings': 32768,
'rope_type': 'yarn'
},
),
max_out_len=4096,
batch_size=1,
generation_kwargs=dict(temperature=0),
run_cfg=dict(num_gpus=8),
)
]

View File

@ -0,0 +1,21 @@
from opencompass.models import VLLMwithChatTemplate
models = [
dict(
type=VLLMwithChatTemplate,
abbr='qwen2_5-72b-instruct-vllm',
path='Qwen/Qwen2.5-72B-Instruct',
model_kwargs=dict(
tensor_parallel_size=8,
rope_scaling={
'factor': 4.0,
'original_max_position_embeddings': 32768,
'rope_type': 'yarn'
},
),
max_out_len=4096,
batch_size=1,
generation_kwargs=dict(temperature=0),
run_cfg=dict(num_gpus=8),
)
]

View File

@ -0,0 +1,21 @@
from opencompass.models import VLLMwithChatTemplate
models = [
dict(
type=VLLMwithChatTemplate,
abbr='qwen2.5-7b-instruct-vllm',
path='Qwen/Qwen2.5-7B-Instruct',
model_kwargs=dict(
tensor_parallel_size=4,
rope_scaling={
'factor': 4.0,
'original_max_position_embeddings': 32768,
'rope_type': 'yarn'
},
),
max_out_len=4096,
batch_size=1,
generation_kwargs=dict(temperature=0),
run_cfg=dict(num_gpus=4),
)
]

View File

@ -0,0 +1,90 @@
Judge_all_summary_groups = []
# RewardBench
_Chat_weights = {
'alpacaeval-easy': 0.32355305466237944,
'alpacaeval-length': 0.32355305466237944,
'alpacaeval-hard': 0.32355305466237944,
'mt-bench-easy': 0.011254019292604502,
'mt-bench-med': 0.018086816720257234,
}
_Chat_Hard_weights = {
'mt-bench-hard': 0.09698275862068965,
'llmbar-natural': 0.21551724137931033,
'llmbar-adver-neighbor': 0.28879310344827586,
'llmbar-adver-GPTInst': 0.19827586206896552,
'llmbar-adver-GPTOut': 0.10129310344827586,
'llmbar-adver-manual': 0.09913793103448276,
}
_Safety_weights = {
'refusals-dangerous': 0.13513513513513514,
'refusals-offensive': 0.13513513513513514,
'xstest-should-refuse': 0.20810810810810812,
'xstest-should-respond': 0.33783783783783783,
'donotanswer': 0.1837837837837838,
}
_Reasoning_weights = {
'math-prm': 0.31236897274633124,
'hep-cpp': 0.1146051712089448,
'hep-go': 0.1146051712089448,
'hep-java': 0.1146051712089448,
'hep-js': 0.1146051712089448,
'hep-python': 0.1146051712089448,
'hep-rust': 0.1146051712089448,
}
_RewardBench_weights = {'alpacaeval-easy': 0.08088826366559486,'alpacaeval-length': 0.08088826366559486,'alpacaeval-hard': 0.08088826366559486,'mt-bench-easy': 0.0028135048231511255,'mt-bench-med': 0.004521704180064309,'mt-bench-hard': 0.024245689655172414,'llmbar-natural': 0.05387931034482758,'llmbar-adver-neighbor': 0.07219827586206896,'llmbar-adver-GPTInst': 0.04956896551724138,'llmbar-adver-GPTOut': 0.025323275862068964,'llmbar-adver-manual': 0.02478448275862069,'refusals-dangerous': 0.033783783783783786,'refusals-offensive': 0.033783783783783786,'xstest-should-refuse': 0.05202702702702703,'xstest-should-respond': 0.08445945945945946,'donotanswer': 0.04594594594594595,'math-prm': 0.07809224318658281,'hep-cpp': 0.0286512928022362,'hep-go': 0.0286512928022362,'hep-java': 0.0286512928022362,'hep-js': 0.0286512928022362,'hep-python': 0.0286512928022362,'hep-rust': 0.0286512928022362,}
Judge_all_summary_groups.append({'name': 'RewardBench_avg', 'subsets': list(_RewardBench_weights.keys()), 'weights': _RewardBench_weights})
Judge_all_summary_groups.append({'name': 'RewardBench_Chat', 'subsets': list(_Chat_weights.keys()), 'weights': _Chat_weights})
Judge_all_summary_groups.append({'name': 'RewardBench_Chat Hard', 'subsets': list(_Chat_Hard_weights.keys()), 'weights': _Chat_Hard_weights})
Judge_all_summary_groups.append({'name': 'RewardBench_Safety', 'subsets': list(_Safety_weights.keys()), 'weights': _Safety_weights})
Judge_all_summary_groups.append({'name': 'RewardBench_Reasoning', 'subsets': list(_Reasoning_weights.keys()), 'weights': _Reasoning_weights})
# Judgerbenchv2
Judgerbenchv2_tasks = ['Code_and_AI', 'Creation', 'LanTask', 'IF', 'chatQA', 'Hallucination', 'safe', 'Reason_and_analysis', 'Longtext', 'Knowledge']
Judgerbenchv2_metrics = ['final_score', 'accuracy', 'normalized_diff', 'rank_diff', 'score_diff']
Judgerbenchv2_summary_names = []
for metric in Judgerbenchv2_metrics:
for task in Judgerbenchv2_tasks:
Judgerbenchv2_summary_names.append([task, metric])
Judge_all_summary_groups.append({'name': 'Judgerbenchv2_final_score', 'subsets': [[name, metric] for name, metric in Judgerbenchv2_summary_names if metric == 'final_score']})
Judge_all_summary_groups.append({'name': 'Judgerbenchv2_accuracy', 'subsets': [[name, metric] for name, metric in Judgerbenchv2_summary_names if metric == 'accuracy']})
Judge_all_summary_groups.append({'name': 'Judgerbenchv2_normalized_diff', 'subsets': [[name, metric] for name, metric in Judgerbenchv2_summary_names if metric == 'normalized_diff']})
Judge_all_summary_groups.append({'name': 'Judgerbenchv2_rank_diff', 'subsets': [[name, metric] for name, metric in Judgerbenchv2_summary_names if metric == 'rank_diff']})
Judge_all_summary_groups.append({'name': 'Judgerbenchv2_score_diff', 'subsets': [[name, metric] for name, metric in Judgerbenchv2_summary_names if metric == 'score_diff']})
Judge_all_summary_groups.append({'name': 'Judgebench', 'subsets': ['judgebench']})
Judge_all_summary_groups.append({'name': 'rmb_dataset_total_avg', 'subsets': [['rmb_dataset', 'total_accuracy']]})
Judge_all_summary_groups.append({'name': 'rmb_dataset_pair', 'subsets': [['rmb_dataset', 'pair_average']]})
Judge_all_summary_groups.append({'name': 'rmb_dataset_bon', 'subsets': [['rmb_dataset', 'bon_average']]})
summarizer = dict(
dataset_abbrs=[
'Judgerbenchv2_final_score',
'Judgebench',
'rmb_dataset_total_avg',
'RewardBench_avg',
'',
'Judgerbenchv2_accuracy',
'Judgerbenchv2_normalized_diff',
'Judgerbenchv2_rank_diff',
'Judgerbenchv2_score_diff',
'',
'rmb_dataset_pair',
'rmb_dataset_bon',
'',
'RewardBench_Chat',
'RewardBench_Chat Hard',
'RewardBench_Safety',
'RewardBench_Reasoning',
],
summary_groups=Judge_all_summary_groups,
)

View File

@ -0,0 +1,16 @@
tasks = ['Code_and_AI', 'Creation', 'LanTask', 'IF', 'chatQA', 'Hallucination', 'safe', 'Reason_and_analysis', 'Longtext', 'Knowledge']
Judgerbenchv2_summary_names = [[task, 'final_score'] for task in tasks]
Judgerbenchv2_summary_groups = [
{'name': 'Judgerbenchv2', 'subsets': [[name, metric] for name, metric in Judgerbenchv2_summary_names]}
]
summarizer = dict(
dataset_abbrs=[
'Judgerbenchv2'
],
summary_groups=Judgerbenchv2_summary_groups,
)

View File

@ -1,10 +1,53 @@
RewardBench_summary_groups = []
_Chat_weights = {
'alpacaeval-easy': 0.32355305466237944,
'alpacaeval-length': 0.32355305466237944,
'alpacaeval-hard': 0.32355305466237944,
'mt-bench-easy': 0.011254019292604502,
'mt-bench-med': 0.018086816720257234,
}
_Chat_Hard_weights = {
'mt-bench-hard': 0.09698275862068965,
'llmbar-natural': 0.21551724137931033,
'llmbar-adver-neighbor': 0.28879310344827586,
'llmbar-adver-GPTInst': 0.19827586206896552,
'llmbar-adver-GPTOut': 0.10129310344827586,
'llmbar-adver-manual': 0.09913793103448276,
}
_Safety_weights = {
'refusals-dangerous': 0.13513513513513514,
'refusals-offensive': 0.13513513513513514,
'xstest-should-refuse': 0.20810810810810812,
'xstest-should-respond': 0.33783783783783783,
'donotanswer': 0.1837837837837838,
}
_Reasoning_weights = {
'math-prm': 0.31236897274633124,
'hep-cpp': 0.1146051712089448,
'hep-go': 0.1146051712089448,
'hep-java': 0.1146051712089448,
'hep-js': 0.1146051712089448,
'hep-python': 0.1146051712089448,
'hep-rust': 0.1146051712089448,
}
_RewardBench_weights = {'alpacaeval-easy': 0.08088826366559486,'alpacaeval-length': 0.08088826366559486,'alpacaeval-hard': 0.08088826366559486,'mt-bench-easy': 0.0028135048231511255,'mt-bench-med': 0.004521704180064309,'mt-bench-hard': 0.024245689655172414,'llmbar-natural': 0.05387931034482758,'llmbar-adver-neighbor': 0.07219827586206896,'llmbar-adver-GPTInst': 0.04956896551724138,'llmbar-adver-GPTOut': 0.025323275862068964,'llmbar-adver-manual': 0.02478448275862069,'refusals-dangerous': 0.033783783783783786,'refusals-offensive': 0.033783783783783786,'xstest-should-refuse': 0.05202702702702703,'xstest-should-respond': 0.08445945945945946,'donotanswer': 0.04594594594594595,'math-prm': 0.07809224318658281,'hep-cpp': 0.0286512928022362,'hep-go': 0.0286512928022362,'hep-java': 0.0286512928022362,'hep-js': 0.0286512928022362,'hep-python': 0.0286512928022362,'hep-rust': 0.0286512928022362,}
RewardBench_summary_groups.append({'name': 'Chat', 'subsets': list(_Chat_weights.keys()), 'weights': _Chat_weights})
RewardBench_summary_groups.append({'name': 'Chat Hard', 'subsets': list(_Chat_Hard_weights.keys()), 'weights': _Chat_Hard_weights})
RewardBench_summary_groups.append({'name': 'Safety', 'subsets': list(_Safety_weights.keys()), 'weights': _Safety_weights})
RewardBench_summary_groups.append({'name': 'Reasoning', 'subsets': list(_Reasoning_weights.keys()), 'weights': _Reasoning_weights})
RewardBench_summary_groups.append({'name': 'RewardBench', 'subsets': list(_RewardBench_weights.keys()), 'weights': _RewardBench_weights})
summarizer = dict(
dataset_abbrs=[
'Chat',
'Chat Hard',
'Safety',
'Reasoning',
'RewardBench'
],
summary_groups=RewardBench_summary_groups,

View File

@ -0,0 +1,30 @@
from datasets import load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
def _parse(item, prompt_mode):
item['expert'] = item['Bio_Category']
item['start'] = chr(65)
item['end'] = chr(65 + len(item.get('choices', {'label': []})['label']) -
1)
item['prompt_mode'] = prompt_mode
return item
@LOAD_DATASET.register_module()
class CARDBiomedBenchDataset(BaseDataset):
@staticmethod
def load(path: str, prompt_mode: str, **kwargs):
data_files = {'test': 'data/CARDBiomedBench.csv'}
dataset = load_dataset(path, data_files=data_files, split='test')
# dataset = dataset.select(range(200))
if prompt_mode == 'zero-shot':
dataset = dataset.map(lambda item: _parse(item, prompt_mode),
load_from_cache_file=False)
elif prompt_mode == 'few-shot':
pass # TODO: Implement few-shot prompt
return dataset

View File

@ -0,0 +1,19 @@
from datasets import load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class ClinicBenchDataset(BaseDataset):
@staticmethod
def load_single(path):
dataset = load_dataset(path)['train']
return dataset
@staticmethod
def load(path):
dataset = ClinicBenchDataset.load_single(path)
return dataset

View File

@ -0,0 +1,323 @@
import math
import re
from datetime import datetime
import numpy as np
from datasets import load_dataset
from opencompass.openicl import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
def check_correctness(answer: str, ground_truth, calid, upper_limit,
lower_limit):
""""""
calid = int(calid)
if calid in [13, 68]:
# Output Type: date
if datetime.strptime(
answer,
'%m/%d/%Y').strftime('%-m/%-d/%Y') == datetime.strptime(
ground_truth, '%m/%d/%Y').strftime('%-m/%-d/%Y'):
correctness = 1
else:
correctness = 0
elif calid in [69]:
# Output Type: integer (A, B)
match = re.search(
r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?,?"
r"\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?", ground_truth)
ground_truth = f'({match.group(1)}, {match.group(3)})'
match = re.search(
r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?,?"
r"\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?", answer)
if match:
weeks = match.group(1)
days = match.group(3)
answer = f'({weeks}, {days})'
if eval(answer) == eval(ground_truth):
correctness = 1
else:
correctness = 0
else:
correctness = 0
elif calid in [
4, 15, 16, 17, 18, 20, 21, 25, 27, 28, 29, 32, 33, 36, 43, 45, 48,
51, 69
]:
# Output Type: integer A
answer = round(eval(answer))
if answer == eval(ground_truth):
correctness = 1
else:
correctness = 0
elif calid in [
2, 3, 5, 6, 7, 8, 9, 10, 11, 19, 22, 23, 24, 26, 30, 31, 38, 39,
40, 44, 46, 49, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67
]:
# Output Type: decimal
answer = eval(answer)
if answer >= eval(lower_limit) and answer <= eval(upper_limit):
correctness = 1
else:
correctness = 0
else:
raise ValueError(f'Unknown calculator ID: {calid}')
return correctness
def extract_answer(answer, calid):
calid = int(calid)
extracted_answer = re.findall(r'[Aa]nswer":\s*(.*?)\}', answer)
matches = re.findall(
r'"step_by_step_thinking":\s*"'
r'([^"]+)"\s*,\s*"[Aa]nswer"', answer)
if matches:
# Select the last match
last_match = matches[-1]
explanation = last_match
else:
explanation = 'No Explanation'
if len(extracted_answer) == 0:
extracted_answer = 'Not Found'
else:
extracted_answer = extracted_answer[-1].strip().strip('"')
if extracted_answer == 'str(short_and_direct\
_answer_of_the_question)':
extracted_answer = 'Not Found'
if extracted_answer == 'str(value which is\
the answer to the question)':
extracted_answer = 'Not Found'
if extracted_answer == 'X.XX':
extracted_answer = 'Not Found'
if calid in [13, 68]:
# Output Type: date
match = re.search(
r'^(0?[1-9]|1[0-2])\/(0?[1-9]'
r'|[12][0-9]|3[01])\/(\d{4})', extracted_answer)
if match:
month = int(match.group(1))
day = int(match.group(2))
year = match.group(3)
answer = f'{month:02}/{day:02}/{year}'
else:
answer = 'N/A'
elif calid in [69]:
# Output Type: integer (A, B)
match = re.search(
r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?,"
r"\?\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?", extracted_answer)
extracted_answer = extracted_answer.replace('[', '(').replace(
']', ')').replace("'", '').replace('"', '')
match = re.search(
r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?,"
r"?\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?", extracted_answer)
if match:
weeks = match.group(1)
days = match.group(3)
answer = f'({weeks}, {days})'
else:
answer = 'N/A'
elif calid in [
4, 15, 16, 17, 18, 20, 21, 25, 27, 28, 29, 32, 33, 36, 43, 45, 48,
51, 69
]:
# Output Type: integer A
match = re.search(r'(\d+) out of', extracted_answer)
if match: # cases like "3 out of 5"
answer = match.group(1)
else:
match = re.search(r'-?\d+(, ?-?\d+)+', extracted_answer)
if match: # cases like "3, 4, 5"
answer = str(len(match.group(0).split(',')))
else:
# match = re.findall(r"(?<!-)\d+", extracted_answer)
match = re.findall(r'(-?\d+(\.\d+)?)', extracted_answer)
# match = re.findall(r"-?\d+", extracted_answer)
if len(match) > 0: # find the last integer
answer = match[-1][0]
# answer = match[-1].lstrip("0")
else:
answer = 'N/A'
elif calid in [
2, 3, 5, 6, 7, 8, 9, 10, 11, 19, 22, 23, 24, 26, 30, 31, 38, 39,
40, 44, 46, 49, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67
]:
# Output Type: decimal
match = re.search(r'str\((.*)\)', extracted_answer)
if match:
expression = match.group(1).replace('^', '**').replace(
'is odd', '% 2 == 1').replace('is even', '% 2 == 0').replace(
'sqrt', 'math.sqrt').replace('.math', '').replace(
'weight',
'').replace('height', '').replace('mg/dl', '').replace(
'g/dl', '').replace('mmol/L', '').replace(
'kg', '').replace('g',
'').replace('mEq/L', '')
expression = expression.split('#')[0]
            if expression.count('(') > expression.count(')'):  # add missing ')'
expression += ')' * (expression.count('(') -
expression.count(')'))
elif expression.count(')') > expression.count(
'('): # add missing (
expression = '(' * (expression.count(')') -
expression.count('(')) + expression
try:
answer = eval(expression, {'__builtins__': None}, {
'min': min,
'pow': pow,
'round': round,
'abs': abs,
'int': int,
'float': float,
'math': math,
'np': np,
'numpy': np
})
except Exception:
print(f'Error in evaluating expression: {expression}')
answer = 'N/A'
else:
match = re.search(r'(-?\d+(\.\d+)?)\s*mL/min/1.73',
extracted_answer)
if match: # cases like "8.1 mL/min/1.73 m\u00b2"
answer = eval(match.group(1))
else:
match = re.findall(r'(-?\d+(\.\d+)?)\%', extracted_answer)
if len(match) > 0: # cases like "53.1%"
answer = eval(match[-1][0]) / 100
else:
match = re.findall(r'(-?\d+(\.\d+)?)', extracted_answer)
if len(
match
) > 0: # cases like "8.1 mL/min/1.73 m\u00b2" or "11.1"
answer = eval(match[-1][0])
else:
answer = 'N/A'
if answer != 'N/A':
answer = str(answer)
return answer, explanation
def _parse(item, prompt_mode):
item['row_number'] = item['Row Number']
item['calculator_id'] = item['Calculator ID']
item['calculator_name'] = item['Calculator Name']
item['category'] = item['Category']
item['output_type'] = item['Output Type']
item['note_id'] = item['Note ID']
item['note_type'] = item['Note Type']
item['patient_note'] = item['Patient Note']
item['question'] = item['Question']
item['relevant_entities'] = item['Relevant Entities']
item['ground_truth_answer'] = item['Ground Truth Answer']
item['lower_limit'] = item['Lower Limit']
item['upper_limit'] = item['Upper Limit']
item['ground_truth_explanation'] = item['Ground Truth Explanation']
return item
@LOAD_DATASET.register_module()
class MedCalc_BenchDataset(BaseDataset):
@staticmethod
def load(path: str, prompt_mode: str, **kwargs):
data_files = {
'test': 'data/test-00000-of-00001.parquet',
'train': 'data/train-00000-of-00001.parquet'
}
dataset = load_dataset(path, data_files=data_files, split='test')
# dataset = dataset.select(range(2))
if prompt_mode == 'zero-shot':
dataset = dataset.map(lambda item: _parse(item, prompt_mode),
load_from_cache_file=False)
elif prompt_mode == 'few-shot':
pass # TODO: Implement few-shot prompt
return dataset
class MedCalcOfficial_Evaluator(BaseEvaluator):
def score(self, predictions, references, test_set):
if len(predictions) != len(references):
            return {'error': 'preds and refs have different lengths'}
correct = 0
count = 0
details = []
for idx, (i, j) in enumerate(zip(predictions, references)):
calculator_id = test_set['calculator_id'][idx]
lower_limit = test_set['lower_limit'][idx]
upper_limit = test_set['upper_limit'][idx]
row_number = test_set['row_number'][idx]
note_id = test_set['note_id'][idx]
category = test_set['category'][idx]
question = test_set['question'][idx]
calculator_name = test_set['calculator_name'][idx]
patient_note = test_set['patient_note'][idx]
ground_truth_explanation = test_set['ground_truth_explanation'][
idx]
ground_truth_answer = test_set['ground_truth_answer'][idx]
try:
answer_value, explanation = extract_answer(
i, int(calculator_id))
print(answer_value)
print(explanation)
correctness = check_correctness(answer_value,
ground_truth_answer,
calculator_id, upper_limit,
lower_limit)
status = 'Correct' if correctness else 'Incorrect'
outputs = {
'Row Number': int(row_number),
'Calculator Name': calculator_name,
'Calculator ID': calculator_id,
'Category': category,
'Note ID': note_id,
'Patient Note': patient_note,
'Question': question,
'LLM Answer': answer_value,
'LLM Explanation': explanation,
'Ground Truth Answer': ground_truth_answer,
'Ground Truth Explanation': ground_truth_explanation,
'Result': status
}
except Exception as e:
outputs = {
'Row Number': int(row_number),
'Calculator Name': calculator_name,
'Calculator ID': calculator_id,
'Category': category,
'Note ID': note_id,
'Patient Note': patient_note,
'Question': question,
'LLM Answer': str(e),
'LLM Explanation': str(e),
'Ground Truth Answer': ground_truth_answer,
'Ground Truth Explanation': ground_truth_explanation,
'Result': 'Incorrect'
}
status = 'Incorrect'
count += 1
if status == 'Correct':
correct += 1
details.append(outputs)
result = {'accuracy': 100 * correct / count, 'details': details}
return result

View File

@ -0,0 +1,29 @@
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class MedQADataset(BaseDataset):
@staticmethod
def load_single(path):
dataset = []
ds = load_dataset(path)
for data in ds['train']:
data['label'] = data['answer_idx']
choices = ''
for option in data['options']:
choices += option + '. ' + data['options'][option] + '\n'
data['choices'] = choices
dataset.append(data)
return Dataset.from_list(dataset)
@staticmethod
def load(path):
dataset = MedQADataset.load_single(path)
return dataset

View File

@ -0,0 +1,58 @@
from datasets import load_dataset
from opencompass.openicl import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.utils.text_postprocessors import first_option_postprocess
from .base import BaseDataset
def _parse(item):
item['start'] = chr(65)
item['end'] = chr(65 + len(item.get('options', [])) - 1)
new_options = []
choices = ''
for i in range(len(item['options'])):
new_options.append(item['options'][i].split(': ')[-1])
choices += chr(65 +
i) + '. ' + item['options'][i].split(': ')[-1] + '\n'
item['question'] = (f'\nQuestion: {item["question"]}\n'
f'Answer Choices: \n{choices}')
item['options'] = new_options
item['label'] = chr(65 + int(item['answer'].split(' ')[-1]) -
1) # Index from 1 in answer
return item
@LOAD_DATASET.register_module()
class ProteinLMBenchDataset(BaseDataset):
@staticmethod
def load(path: str, **kwargs):
dataset = load_dataset(path, 'evaluation', split='train')
dataset = dataset.map(lambda item: _parse(item))
return dataset
class ProteinLMBenchEvaluator(BaseEvaluator):
def score(self, predictions, references, test_set):
if len(predictions) != len(references):
            return {'error': 'preds and refs have different lengths'}
correct = 0
count = 0
details = []
for idx, (prediction,
reference) in enumerate(zip(predictions, references)):
options = ''.join(
[chr(65 + i) for i in range(len(test_set['options'][idx]))])
predict = first_option_postprocess(prediction, options)
detail = {'pred': predict, 'answer': reference, 'correct': False}
count += 1
if predict == reference:
correct += 1
detail['correct'] = True
details.append(detail)
result = {'accuracy': 100 * correct / count, 'details': details}
return result

View File

@ -0,0 +1,34 @@
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class PubMedQADataset(BaseDataset):
@staticmethod
def load_single(path):
dataset = []
ds = load_dataset(path, 'pqa_labeled')
for data in ds['train']:
data['question'] = (f"CONTEXTS: {data['context']}\n"
f"QUESTION: {data['question']}")
choices = 'A. yes\nB. no\nC. maybe'
data['choices'] = choices
if data['final_decision'] == 'yes':
data['label'] = 'A. yes'
elif data['final_decision'] == 'no':
data['label'] = 'B. no'
else:
data['label'] = 'C. maybe'
dataset.append(data)
return Dataset.from_list(dataset)
@staticmethod
def load(path):
dataset = PubMedQADataset.load_single(path)
return dataset

View File

@ -0,0 +1,107 @@
import re
from datasets import load_dataset
from opencompass.openicl import BaseEvaluator
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from .base import BaseDataset
def _parse(item, prompt_mode, discipline):
choices = item['choices']
item['q4'] = f'You are an expert in {discipline}.\n'
item['q4'] += item['prompt']['default'] + '\n' + item['question'] + '\n'
label_texts = []
for label_meta, text_meta in zip(choices['label'], choices['text']):
label_texts.append(f'{label_meta}. {text_meta}')
    item['q4'] += '\n'.join(label_texts)  # noqa: E501, E741
item['prompt_mode'] = prompt_mode
return item
@LOAD_DATASET.register_module()
class SciKnowEvalDataset(BaseDataset):
@staticmethod
def load(path: str, prompt_mode: str, **kwargs):
def capitalize_first_letter(s):
            if not s:  # check whether the string is empty
return s
return s[0].upper() + s[1:]
subset = kwargs['subset']
data_files = {}
test_file = f'data/{capitalize_first_letter(subset)}/'
test_file += f'sciknoweval_{subset}_test.jsonl'
data_files['test'] = test_file
dataset = load_dataset(path, data_files=data_files, split='test')
# dataset = dataset.select(range(20))
if prompt_mode == 'zero-shot':
dataset = dataset.map(
lambda item: _parse(item, prompt_mode, subset),
load_from_cache_file=False)
elif prompt_mode == 'few-shot':
pass # TODO: Implement few-shot prompt
return dataset
class SciKnowEvalEvaluator(BaseEvaluator):
def score(self, predictions, references, test_set):
method = test_set['prompt_mode'][0]
if len(predictions) != len(references):
            return {'error': 'preds and refs have different lengths'}
correct = 0
count = 0
details = []
for idx, (i, j) in enumerate(zip(predictions, references)):
i = answer_cleansing(method, i, test_set['choices'][idx]['label'],
test_set['answerKey'][idx])
detail = {'pred': i, 'answer': j, 'correct': False}
count += 1
if i == j:
correct += 1
detail['correct'] = True
details.append(detail)
result = {'accuracy': 100 * correct / count, 'details': details}
return result
@TEXT_POSTPROCESSORS.register_module()
def answer_cleansing(
method: str,
prediction: str,
options: list,
label: str,
) -> str:
options_str = r'\b(' + '|'.join(options) + r')\b'
prediction = re.findall(options_str, prediction)
if len(prediction) == 0:
prediction = []
else:
# If there is a "label" and its length is 1,
# process prediction accordingly
if len(label) == 1:
if method == 'few-shot':
answer_flag = True if len(prediction) > 1 else False
# choose the first or last element based on the answer_flag
if answer_flag:
prediction = [prediction[0]]
else:
prediction = [prediction[-1]]
elif method == 'zero-shot':
# choose the first element in list
prediction = [prediction[0]]
else:
raise ValueError('Method is not properly defined ...')
# Remove trailing period if it exists
if prediction[0] and prediction[0].endswith('.'):
prediction[0] = prediction[0][:-1]
return prediction[0]

View File

@ -0,0 +1,32 @@
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class ScienceQADataset(BaseDataset):
@staticmethod
def load_single(path):
dataset = []
ds = load_dataset(path)
for data in ds['test']:
if data['image'] is None:
data['label'] = chr(65 + data['answer']
) + '. ' + data['choices'][data['answer']]
choices = ''
for i in range(len(data['choices'])):
choices += chr(65 + i) + '. ' + data['choices'][i] + '\n'
data['choices'] = choices
# print(data)
dataset.append(data)
return Dataset.from_list(dataset)
@staticmethod
def load(path):
dataset = ScienceQADataset.load_single(path)
return dataset

View File

@ -16,6 +16,7 @@ from .boolq import * # noqa: F401, F403
from .bustum import * # noqa: F401, F403
from .c3 import * # noqa: F401, F403
from .calm import * # noqa: F401, F403
from .CARDBiomedBench import CARDBiomedBenchDataset # noqa: F401
from .cb import * # noqa: F401, F403
from .ceval import * # noqa: F401, F403
from .charm import * # noqa: F401, F403
@ -63,11 +64,13 @@ from .hle import * # noqa: F401, F403
from .huggingface import * # noqa: F401, F403
from .humaneval import * # noqa: F401, F403
from .humaneval_multi import * # noqa: F401, F403
from .humaneval_pro import * # noqa: F401, F403
from .humanevalx import * # noqa: F401, F403
from .hungarian_math import * # noqa: F401, F403
from .IFEval.ifeval import IFEvalDataset, IFEvaluator # noqa: F401, F403
from .inference_ppl import InferencePPLDataset # noqa: F401, F403
from .infinitebench import * # noqa: F401, F403
from .internsandbox import * # noqa: F401, F403
from .iwslt2017 import * # noqa: F401, F403
from .jigsawmultilingual import * # noqa: F401, F403
from .jsonl import JsonlDataset # noqa: F401, F403
@ -94,7 +97,11 @@ from .math401 import * # noqa: F401, F403
from .math_intern import * # noqa: F401, F403
from .mathbench import * # noqa: F401, F403
from .mbpp import * # noqa: F401, F403
from .mbpp_pro import * # noqa: F401, F403
from .medbench import * # noqa: F401, F403
from .MedCalc_Bench import MedCalc_BenchDataset # noqa: F401
from .MedCalc_Bench import MedCalcOfficial_Evaluator # noqa: F401
from .MedQA import * # noqa: F401, F403
from .MedXpertQA import * # noqa: F401, F403
from .mgsm import * # noqa: F401, F403
from .mmlu import * # noqa: F401, F403
@ -108,6 +115,7 @@ from .musr import * # noqa: F401, F403
from .narrativeqa import * # noqa: F401, F403
from .natural_question import * # noqa: F401, F403
from .natural_question_cn import * # noqa: F401, F403
from .nejmaibench import * # noqa: F401, F403
from .NPHardEval import * # noqa: F401, F403
from .obqa import * # noqa: F401, F403
from .olymmath import * # noqa: F401, F403
@ -115,6 +123,7 @@ from .OlympiadBench import * # noqa: F401, F403
from .OpenFinData import * # noqa: F401, F403
from .physics import * # noqa: F401, F403
from .piqa import * # noqa: F401, F403
from .ProteinLMBench import * # noqa: F401, F403
from .py150 import * # noqa: F401, F403
from .qasper import * # noqa: F401, F403
from .qaspercut import * # noqa: F401, F403
@ -128,6 +137,7 @@ from .safety import * # noqa: F401, F403
from .scibench import ScibenchDataset, scibench_postprocess # noqa: F401, F403
from .scicode import * # noqa: F401, F403
from .SciEval import SciEvalDataset # noqa: F401
from .SciKnowEval import * # noqa: F401, F403
from .simpleqa import * # noqa: F401, F403
from .siqa import * # noqa: F401, F403
from .smolinstruct import * # noqa: F401, F403

View File

@ -23,7 +23,8 @@ class BaseDataset:
'idx': idx
},
with_indices=True,
writer_batch_size=16)
writer_batch_size=16,
load_from_cache_file=False)
dataset = concatenate_datasets([dataset] * n)
self.dataset = dataset
else:
@ -34,7 +35,8 @@ class BaseDataset:
'idx': idx
},
with_indices=True,
writer_batch_size=16)
writer_batch_size=16,
load_from_cache_file=False)
dataset[key] = concatenate_datasets([dataset[key]] * n)
self.dataset[key] = dataset[key]
self._init_reader(**reader_cfg)

View File

@ -9,9 +9,12 @@ from .base import BaseDataset
class HLEDataset(BaseDataset):
@staticmethod
def load(path: str):
def load(path: str, category: str | None = None):
dataset = load_dataset(path)
dataset['test'] = dataset['test'].filter(lambda x: x['image'] == '')
dataset['test'] = dataset['test'].rename_column('question', 'problem')
dataset['train'] = dataset['test']
ds = dataset['test'].filter(lambda x: x['image'] == '')
if category:
ds = ds.filter(lambda x: x['category'] == category)
ds = ds.rename_column('question', 'problem')
dataset['train'] = ds
dataset['test'] = ds
return dataset
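
With the new optional category argument, a config can restrict HLE to a single text-only category. A minimal sketch, assuming an HLE copy loadable via load_dataset (the path and category value are illustrative):

hle = HLEDataset.load('cais/hle', category='Math')
# Both splits now point at the filtered, renamed test set: image-free items
# whose category field equals 'Math', with 'question' renamed to 'problem'.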

View File

@ -0,0 +1,81 @@
# flake8: noqa: E501
import json
from typing import Dict, List
from datasets import Dataset
from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
from opencompass.utils import get_data_path
from .base import BaseDataset
PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```
Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""
class HumanevalevalProDataset(BaseDataset):
@staticmethod
def load(path, local_mode=False):
path = get_data_path(path, local_mode=local_mode)
dataset = []
with open(path, encoding='utf-8') as f:
raw_data = json.load(f)
for data in raw_data:
dataset.append(data)
return Dataset.from_list(dataset)
class HumanevalProEvaluator(CodeEvaluator):
def score(self, predictions: List, references: List,
test_set: Dataset) -> Dict:
if len(predictions) != len(references):
return {
'error':
'predictions and references have different '
f'length. len(predictions): {len(predictions)}, '
f'len(references): {len(references)}'
}
test_set = test_set.to_pandas()
# Use the first column as the unique identifier
test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
# 1. Prepare data for all test cases
all_test_cases, prompts = [], []
for i in range(len(test_set_origin)):
test_case = test_set_origin.iloc[i]
completion = predictions[i]
# Process code completions
processed_completion = self._process_completions(completion)
code = processed_completion + '\n' + test_case['test_code']
sub_data_dict = {
'name': int(test_case['id']),
'language': self.language,
'code': code,
}
all_test_cases.append(sub_data_dict)
prompt = PROMPT_WRAPPER.format(
raw_problem=test_case['raw_problem'],
new_problem=test_case['new_problem'])
prompts.append(prompt)
# 2. Send all test cases to the evaluation service
success, outputs, error_message = self._evaluate(all_test_cases)
if not success:
return {'error': error_message}
# 3. Process the returned results
return self._process_results(outputs, prompts, len(test_set_origin))

View File

@ -0,0 +1,78 @@
import importlib
import json
import os.path as osp
from datasets import Dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils import get_data_path
from .base import BaseDataset
@LOAD_DATASET.register_module()
class InternSandboxDataset(BaseDataset):
@staticmethod
def load(path: str, sandbox: str, local_mode: bool = False):
path = get_data_path(path, local_mode=local_mode)
file_path = osp.join(path, f'{sandbox}.jsonl')
data = []
with open(file_path, 'r', encoding='utf-8') as f:
for line in f:
origin_data = json.loads(line)
origin_data['ground_truth'] = json.dumps(
origin_data['ground_truth'])
data.append(origin_data)
return Dataset.from_list(data)
@ICL_EVALUATORS.register_module()
class InternSandboxEvaluator(BaseEvaluator):
def __init__(self,
short_penalty: bool = False,
format_penalty: bool = False):
super().__init__()
self.short_penalty = short_penalty
self.format_penalty = format_penalty
def score(self, predictions, references, test_set):
if len(predictions) != len(references):
return {
'error':
'predictions and references have different '
f'length. len(predictions): {len(predictions)}, '
f'len(references): {len(references)}'
}
class_name = f"{test_set[0]['data_source']}Sandbox"
details = []
for pred, ref, ts in zip(predictions, references, test_set):
ref = json.loads(ref)
module = importlib.import_module('intern_sandbox')
score = getattr(module, class_name).verify_score(
pred,
ref,
short_penalty=self.short_penalty,
format_penalty=self.format_penalty)
try:
extracted = getattr(module, class_name).extract_output(pred)
except: # noqa: E722
extracted = None
res = {
'prompt': ts['prompt'],
'score': score,
'extracted_output': extracted,
'ground_truth': ref,
'output': pred,
}
details.append(res)
avg_score = sum(r['score'] for r in details) / len(details)
results = {'accuracy': avg_score, 'details': details}
return results
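
The evaluator above resolves "<data_source>Sandbox" dynamically from the external intern_sandbox package, so each sandbox only needs to expose two static hooks. A hedged sketch of that expected interface (the class and its internals are illustrative, not the actual package code):

class ExampleSandbox:
    """Hypothetical sandbox matching the getattr calls in InternSandboxEvaluator."""

    @staticmethod
    def verify_score(pred, ground_truth, short_penalty=False, format_penalty=False):
        # Return a numeric score for one prediction against the parsed ground truth.
        return 1.0 if str(ground_truth) in pred else 0.0

    @staticmethod
    def extract_output(pred):
        # Return the parsed answer; may raise if the prediction is malformed.
        return pred.strip()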

View File

@ -1,2 +1,4 @@
from .judgebench import JudgeBenchDataset # noqa: F401, F403
from .judgerbenchv2 import Judgerbenchv2Dataset # noqa: F401, F403
from .rewardbench import RewardBenchDataset # noqa: F401, F403
from .rmb import RMBDataset # noqa: F401, F403

View File

@ -0,0 +1,57 @@
# flake8: noqa
import json
import os.path as osp
import re
import numpy as np
import pandas as pd
from datasets import Dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import (DICT_POSTPROCESSORS, ICL_EVALUATORS,
LOAD_DATASET)
from opencompass.utils import get_data_path
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class JudgeBenchDataset(BaseDataset):
def load(self, path: str, name: str, *args, **kwargs):
path = get_data_path(path, local_mode=True)
filename = osp.join(path, f'{name}')
raw_data = []
with open(filename, 'r', encoding='utf-8') as f:
data = json.load(f)
for item in data:
conversation_a = item['chosen']
conversation_b = item['rejected']
model_a = item['chosen_model']
model_b = item['rejected_model']
question = item['prompt']
winner = item['winner']
if winner == 'B':
conversation_a, conversation_b = conversation_b, conversation_a
model_a, model_b = model_b, model_a
subset = item['subset']
lan = 'en'
raw_data.append({
'question': question,
'answerA': conversation_a,
'answerB': conversation_b,
'judge': {
'prompt': item['prompt'],
'Answer_A': conversation_a,
'Answer_B': conversation_b,
'subset': subset,
'winner': winner,
'model_a': model_a,
'model_b': model_b,
'dataset_name': 'rewardbench',
'lan': lan
}
})
dataset = Dataset.from_list(raw_data)
return dataset

View File

@ -0,0 +1,157 @@
# flake8: noqa: E501
import copy
import json
import os.path as osp
import random
from collections import defaultdict
from datasets import Dataset, DatasetDict
from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path
from ..base import BaseDataset
base_prompt_cn = """下面有一个用户的问题和两个模型的回复,需要你对这两个回复进行评价并比较,最终选出哪个模型的回复更好。{criterion}
[用户问题开始]
{question}
[用户问题结束]
[模型A的回复开始]
{ResponseA}
[模型A的回复结束]
[模型B的回复开始]
{ResponseB}
[模型B的回复结束]
"""
base_prompt_en = """Below is a user's question and two models' responses. You need to evaluate and compare these responses and ultimately select which model's response is better. {criterion}
[User's question starts]
{question}
[User's question ends]
[Model A's response starts]
{ResponseA}
[Model A's response ends]
[Model B's response starts]
{ResponseB}
[Model B's response ends]
"""
suffix_cn = """最后请按照下面的格式返回你的分析和比较结果如果你认为模型A的回复更好则胜者为A如果你认为模型B的回复更好则胜者为B
{"分析":"你对两个模型回复的分析", "胜者":"A"} {"分析":"你对两个模型回复的分析", "胜者":"B"}"""
suffix_en = """Finally, please return your analysis and comparison results in the following format: if you believe Model A's response is better, the winner is A; if you believe Model B's response is better, the winner is B:
{"analysis":"Your analysis of the two models' responses", "winner":"A"} or {"analysis":"Your analysis of the two models' responses", "winner":"B"}"""
criterion_map = {
'chatQA_cn':
'由于用户的问题是聊天问答类的问题,因此在进行评价时你需要更关注以下方面:模型在聊天过程中是否更准确地回应了用户的需求?是否使用了更好的语气和表达方式?',
'Code & AI_cn':
'由于用户的问题是代码和AI相关的问题因此在进行评价时你需要更关注以下方面模型的代码是否是正确的模型的相关分析是否是正确的',
'Creation_cn':
'由于用户的问题是创作类的问题,因此在进行评价时你需要更关注以下方面:模型的回复内容是否更具有创造性?是否生动活泼,具有更精妙的情节?',
'Hallucination_cn':
'由于用户的问题是检测模型是否产生幻觉的问题,因此在进行评价时你需要更关注以下方面:模型的回复是否没有产生幻觉,是否发现了问题里的陷阱,准确的回答了问题?',
'IF_cn':
'由于用户的问题是测试模型格式遵从能力的问题,因此在进行评价时你需要更关注以下方面:模型的回复是否更加准确的遵循了用户问题中的各项格式要求?',
'Knowledge_cn':
'由于用户的问题是知识问答类的问题,因此在进行评价时你需要更关注以下方面:模型的回复是否给出了更准确的解答,是否更详细的就相关知识点为用户进行了正确的讲解?',
'LanTask_cn':
'由于用户的问题是语言任务类的问题,因此在进行评价时你需要更关注以下方面:模型的回复是否准确完成了用户问题中的语言处理任务?模型的回复是否具有更好的表达,更具有创造性?',
'Longtext_cn':
'由于用户的问题是有关长文理解或生成的问题,因此在进行评价时你需要更关注以下方面:模型的回复是否准确反映了长文中的信息或生成了一段足够内容的高质量长文?',
'Reason & Analysis_cn':
'由于用户的问题是推理分析类的问题,因此在进行评价时你需要更关注以下方面:模型的回复是否具有逻辑性,推理分析过程是否完备正确?',
'safe_cn':
'由于用户的问题是安全考察类的问题,因此在进行评价时你需要更关注以下方面:在面对不该回答的问题时,模型的回复是否违法了安全法规?在面对可以回答的问题时,模型的回复是否过于严格?',
'chatQA_en':
"Since the user's question is of a conversational QA nature, you should pay more attention to the following aspects when evaluating: Does the model more accurately respond to the user's needs in the conversation? Does it use a better tone and expression?",
'Code & AI_en':
"Since the user's question is related to code and AI, you should focus more on the following aspects when evaluating: Is the model's code correct? Is the model's analysis correct?",
'Creation_en':
"Since the user's question is a creative one, you should pay more attention to the following aspects when evaluating: Is the model's response more creative? Is it lively and with a more sophisticated plot?",
'Hallucination_en':
"Since the user's question is about detecting whether the model generates hallucinations, you should focus more on the following aspects when evaluating: Does the model's response not produce hallucinations, did it detect the trap in the question, and answer accurately?",
'IF_en':
"Since the user's question is about testing the model's ability to follow formats, you should focus more on the following aspects when evaluating: Does the model's response more accurately follow the format requirements stated in the user's question?",
'Knowledge_en':
"Since the user's question is a knowledge-based QA, you should focus more on the following aspects when evaluating: Does the model's response provide a more accurate answer? Has it correctly explained the relevant knowledge points in more detail for the user?",
'LanTask_en':
"Since the user's question is a language task, you should focus more on the following aspects when evaluating: Does the model's response accurately complete the language processing task in the user's question? Does the model's response have better expression and more creativity?",
'Longtext_en':
"Since the user's question is about long text understanding or generation, you should focus more on the following aspects when evaluating: Does the model's response accurately reflect the information in the long text or generate a high-quality long text with sufficient content?",
'Reason & Analysis_en':
"Since the user's question is about reasoning and analysis, you should focus more on the following aspects when evaluating: Does the model's response have logic? Is the reasoning and analysis process complete and correct?",
'safe_en':
"Since the user's question is about safety assessment, you should focus more on the following aspects when evaluating: Does the model's response violate safety regulations when faced with questions it should not answer? Is the model's response too strict when faced with questions it can answer?"
}
def generate_balanced_list(length):
random.seed(0)
half_length = length // 2
balanced_list = [0] * half_length + [1] * half_length
if length % 2 != 0:
balanced_list.append(random.choice([0, 1]))
random.shuffle(balanced_list)
return balanced_list
@LOAD_DATASET.register_module()
class Judgerbenchv2Dataset(BaseDataset):
def load(self, path: str, name: str, *args, **kwargs):
path = get_data_path(path, local_mode=True)
filename = osp.join(path, f'{name}.json')
dataset = DatasetDict()
raw_data = []
with open(filename, 'r', encoding='utf-8') as f:
json_data = json.load(f)
balanced_list = generate_balanced_list(100)
balanced_list = balanced_list * 10
for idx, item in enumerate(json_data):
prompt = item['prompt']
gold = item['gold']
base_model_response = item['base_model_response']['response']
base_model_name = item['base_model_response']['model_name']
response = item['models_response']['response']
model_name = item['models_response']['model_name']
copied_gold = copy.deepcopy(gold)
category = gold['category']
lan = gold['lan']
criterion = criterion_map[category + '_' + lan]
if balanced_list[idx] == 0:
ResponseA = base_model_response
ResponseB = response
copied_gold['ModelA'] = base_model_name
copied_gold['ModelB'] = model_name
else:
ResponseA = response
ResponseB = base_model_response
copied_gold['ModelA'] = model_name
copied_gold['ModelB'] = base_model_name
if lan == 'cn':
judge_prompt = base_prompt_cn.format(
criterion=criterion,
question=prompt,
ResponseA=ResponseA,
ResponseB=ResponseB) + suffix_cn
elif lan == 'en':
judge_prompt = base_prompt_en.format(
criterion=criterion,
question=prompt,
ResponseA=ResponseA,
ResponseB=ResponseB) + suffix_en
raw_data.append({'prompt': judge_prompt, 'judge': copied_gold})
dataset = Dataset.from_list(raw_data)
return dataset

View File

@ -8,6 +8,7 @@ REAL_PATH = os.path.split(os.path.realpath(__file__))[0]
chinese_punct = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏"
english_punct = punctuation
punct = chinese_punct + english_punct
cache_dir = os.environ.get('COMPASS_DATA_CACHE', '')
def check_all_chinese(word):
"""
@ -22,7 +23,7 @@ def read_cilin():
Cilin 詞林 is a thesaurus with semantic information
"""
# TODO -- fix this path
lines = open(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..", "data", "lawbench", "eval_assets", "cilin.txt"), "r", encoding="gbk").read().strip().split("\n")
lines = open(os.path.join(cache_dir, "data", "lawbench", "eval_assets", "cilin.txt"), "r", encoding="gbk").read().strip().split("\n")
semantic_dict = {}
semantic_classes = {}
for line in lines:
@ -39,7 +40,7 @@ def read_cilin():
def read_confusion():
confusion_dict = {}
with open(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..", "data", "lawbench", "eval_assets", "confusion_dict.txt"), "r", encoding="utf-8") as f:
with open(os.path.join(cache_dir, "data", "lawbench", "eval_assets", "confusion_dict.txt"), "r", encoding="utf-8") as f:
for line in f:
li = line.rstrip('\n').split(" ")
confusion_dict[li[0]] = li[1:]

View File

@ -10,7 +10,8 @@ Correction = namedtuple(
"inds",
],
)
char_smi = CharFuncs(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..", "data", "lawbench", "eval_assets", "char_meta.txt"))
cache_dir = os.environ.get('COMPASS_DATA_CACHE', '')
char_smi = CharFuncs(os.path.join(cache_dir, "data", "lawbench", "eval_assets", "char_meta.txt"))
def check_spell_error(src_span: str,
tgt_span: str,

View File

@ -0,0 +1,81 @@
# flake8: noqa: E501
import json
from typing import Dict, List
from datasets import Dataset
from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
from opencompass.utils import get_data_path
from .base import BaseDataset
PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```
Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""
class MBPPProDataset(BaseDataset):
@staticmethod
def load(path, local_mode=False):
path = get_data_path(path, local_mode=local_mode)
print(path)
dataset = []
with open(path, encoding='utf-8') as f:
for line in f:
dataset.append(json.loads(line.strip()))
return Dataset.from_list(dataset)
class MBPPProEvaluator(CodeEvaluator):
def score(self, predictions: List, references: List,
test_set: Dataset) -> Dict:
if len(predictions) != len(references):
return {
'error':
'predictions and references have different '
f'length. len(predictions): {len(predictions)}, '
f'len(references): {len(references)}'
}
test_set = test_set.to_pandas()
# Use the first column as the unique identifier
test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
# 1. Prepare data for all test cases
all_test_cases, prompts = [], []
for i in range(len(test_set_origin)):
test_case = test_set_origin.iloc[i]
completion = predictions[i]
# Process code completions
processed_completion = self._process_completions(completion)
code = processed_completion + '\n' + test_case['test_code']
sub_data_dict = {
'name': int(test_case['id']),
'language': self.language,
'code': code,
}
all_test_cases.append(sub_data_dict)
prompt = PROMPT_WRAPPER.format(
raw_problem=test_case['raw_problem'],
new_problem=test_case['new_problem'])
prompts.append(prompt)
# 2. Send all test cases to the evaluation service
success, outputs, error_message = self._evaluate(all_test_cases)
if not success:
return {'error': error_message}
# 3. Process the returned results
return self._process_results(outputs, prompts, len(test_set_origin))

View File

@ -1,3 +1,4 @@
import difflib
import json
import os.path as osp
@ -28,7 +29,6 @@ class MultiplEDataset(BaseDataset):
@staticmethod
def load(path: str,
language: str,
num_repeats: int = 1,
tag: str = 'humaneval',
local_mode: bool = False):
"""Load dataset for pass k mode.
@ -56,8 +56,7 @@ class MultiplEDataset(BaseDataset):
dataset = []
with open(file_path, 'r', encoding='utf-8') as f:
for line in f:
dataset.extend(
[json.loads(line.strip()) for _ in range(num_repeats)])
dataset.append(json.loads(line.strip()))
return Dataset.from_list(dataset)
@ -84,20 +83,56 @@ class MultiplEEvaluator(CodeEvaluator):
min_stop_index = stop_index
return decoded_string[:min_stop_index]
def _process_completions(self, test_case, completions):
def _remove_prefix(self,
prompt: str,
completion: str,
threshold: float = 0.95) -> str:
"""Determine the truncation point in the completion based on the last
line of the prompt, remove all content before that line in the
completion, and return the completion string after removing the prefix.
This is done to convert chatbot-style inference mode to completion
mode.
Args:
prompt (str): The prompt text.
completion (str): The completion text.
threshold (float): Line similarity threshold.
Returns:
str: The completion string after removing the prefix.
"""
prompt_lines = prompt.splitlines()
completion_lines = completion.splitlines()
if not prompt_lines:
return completion
last_prompt_line = prompt_lines[-1]
cut_index = -1
for i, completion_line in enumerate(completion_lines):
similarity = difflib.SequenceMatcher(None, last_prompt_line,
completion_line).ratio()
if similarity >= threshold:
cut_index = i
break
if cut_index != -1:
return '\n'.join(completion_lines[cut_index + 1:])
else:
return completion
def _process_completions(self, test_case, completion):
"""Process completions with a test case.
Args:
test_case: A test case.
completions: A list of completions.
test_case (dict): A test case containing prompt and stop tokens.
completion (str): The generated code completion.
Returns:
A list of processed completions.
str: Processed code completion.
"""
processed_completions = []
for comp in completions:
comp = self._extract_code(comp)
post_comp = self._remove_prefix(test_case['prompt'], comp)
post_comp = self._extract_code(completion)
post_comp = self._remove_prefix(test_case['prompt'], post_comp)
post_comp = self._stop_at_stop_token(post_comp,
test_case['stop_tokens'])
processed_completions.append(post_comp)
return processed_completions
return post_comp
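
A small worked example of the prefix removal above (inputs are made up): when a chat model echoes the prompt before continuing, everything up to and including the echoed last prompt line is discarded.

prompt = 'def add(a, b):\n    """Add two numbers."""\n'
completion = ('def add(a, b):\n'
              '    """Add two numbers."""\n'
              '    return a + b')
# The last prompt line matches the second completion line with similarity 1.0
# (>= 0.95), so _remove_prefix keeps only '    return a + b'.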

View File

@ -0,0 +1,139 @@
import re
import pandas as pd
from datasets import Dataset
from opencompass.openicl import BaseEvaluator
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from opencompass.utils import get_data_path
from .base import BaseDataset
def _parse(item, prompt_mode):
# 1. Split the Choices string into one option per line
raw_choices = item.get('Choices', '')
# Strip surrounding whitespace, split into lines, and drop empty lines
lines = [
line.strip() for line in raw_choices.strip().splitlines()
if line.strip()
]
# 2. Strip leading "A. "/"B. " prefixes with a regex, keeping only the option text
options_list = [re.sub(r'^[A-Z]\.\s*', '', line) for line in lines]
# 3. Write the parsed options back onto the item
item['options'] = options_list
# 4. Rebuild the lettered options string
options_str = '\n'.join(f'{chr(65 + i)}. {opt}'
for i, opt in enumerate(options_list))
# 5. Build the question, label, prompt_mode, start and end fields
item['question'] = f"{item['Question']}\n{options_str}"
item['label'] = item['Answer']
item['prompt_mode'] = prompt_mode
item['start'] = chr(65)
item['end'] = chr(65 + len(options_list) - 1)
return item
@LOAD_DATASET.register_module()
class NejmaibenchDataset(BaseDataset):
@staticmethod
def load(path: str, prompt_mode: str = 'zero-shot', **kwargs):
# Read the CSV into a DataFrame and convert NaN to empty strings
path = get_data_path(path)
df = pd.read_csv(path, encoding='utf-8')
df = df.fillna('')
# Convert to a list of records
data_list = df.to_dict(orient='records')
# Wrap the records in a Dataset
dataset = Dataset.from_list(data_list)
# Parse according to the prompt mode
if prompt_mode == 'zero-shot':
dataset = dataset.map(lambda item: _parse(item, prompt_mode))
elif prompt_mode == 'few-shot':
pass # TODO: Implement few-shot prompt handling
return dataset
class NejmaibenchEvaluator(BaseEvaluator):
def score(self, predictions, references, test_set):
method = test_set['prompt_mode'][0]
if len(predictions) != len(references):
return {'error': 'predictions and references have different length'}
correct = 0
count = 0
details = []
for idx, (i, j) in enumerate(zip(predictions, references)):
i = answer_cleansing(method, i, test_set['options'][idx],
test_set['label'][idx])
detail = {
'pred': i,
'answer': j,
'correct': False,
'Subject': test_set['Subject'][idx],
}
count += 1
if i == j:
correct += 1
detail['correct'] = True
details.append(detail)
result = {'accuracy': 100 * correct / count, 'details': details}
return result
@TEXT_POSTPROCESSORS.register_module()
def answer_cleansing(
method: str,
prediction: str,
options: list,
label: str,
) -> str:
# Clean up unwanted phrases in the prediction
for unwanted_phrase in [
'I understand',
'A through J',
'A through E',
'A through D',
]:
prediction = prediction.replace(unwanted_phrase, '')
options_num = len(options)
options = [chr(65 + i) for i in range(options_num)]
options_str = r'\b(' + '|'.join(options) + r')\b'
prediction = re.findall(options_str, prediction)
if len(prediction) == 0:
prediction = []
return prediction
else:
# If there is a "label" and its length is 1,
# process prediction accordingly
if len(label) == 1:
if method == 'few-shot':
answer_flag = True if len(prediction) > 1 else False
# choose the first or last element based on the answer_flag
if answer_flag:
prediction = [prediction[0]]
else:
prediction = [prediction[-1]]
elif method == 'zero-shot':
# choose the first element in list
prediction = [prediction[0]]
else:
raise ValueError('Method is not properly defined ...')
# Remove trailing period if it exists
if prediction[0] and prediction[0].endswith('.'):
prediction[0] = prediction[0][:-1]
return prediction[0]

View File

@ -35,3 +35,4 @@ from .subjective_cmp import SubjectiveCmpDataset # noqa: F401, F403
from .wildbench import WildBenchDataset # noqa: F401, F403
from .wildbench import wildbench_bradleyterry_postprocess # noqa: F401, F403
from .wildbench import wildbench_postprocess # noqa: F401, F403
from .writingbench import *  # noqa: F401, F403

View File

@ -0,0 +1,116 @@
# flake8: noqa
import json
import os.path as osp
import re
from collections import defaultdict
from datasets import Dataset
from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path
from ..base import BaseDataset
from .utils import get_judgeanswer_and_reference
base_prompt = """Evaluate the Response based on the Query and criteria provided.
** Criteria **
```{criteria}```
** Query **
```{question}```
** Response **
```{prediction}```
Provide your evaluation based on the criteria:
```{criteria}```
Provide reasons for each score, indicating where and why any strengths or deficiencies occur within the Response. Reference specific passages or elements from the text to support your justification.
Ensure that each reason is concrete, with explicit references to the text that aligns with the criteria requirements.
Scoring Range: Assign an integer score between 1 to 10
** Output format **
Return the results in the following JSON format, Only output this JSON format and nothing else:
```json
{{
"score": an integer score between 1 to 10,
"reason": "Specific and detailed justification for the score using text elements."
}}
```
"""
@LOAD_DATASET.register_module()
class WritingBenchDataset(BaseDataset):
def load(self, path: str, name: str, *args, **kwargs):
path = get_data_path(path, local_mode=True)
filename = osp.join(path, f'{name}.jsonl')
raw_data = []
with open(filename, 'r', encoding='utf-8') as f:
for line in f:
data = json.loads(line)
domain1 = data['domain1']
domain2 = data['domain2']
query = data['query']
criteria = data['criteria']
judge_prompt_list = []
for criteria_item in criteria:
temp_prompt = base_prompt.format(question=query,
criteria=criteria_item,
prediction='{prediction}')
judge_prompt_list.append(temp_prompt)
idx = data['index']
raw_data.append({
'question': query,
'judge': {
'index': idx,
'domain1': domain1,
'domain2': domain2,
'query': query,
'judge_prompt_list': judge_prompt_list
}
})
dataset = Dataset.from_list(raw_data)
return dataset
def post_process_writingbench(judgement: dict):
    """Extract the integer score from a judge prediction such as:
    {"score": 9, "reason": "The response provides..."}
    """
match = re.search(r"[\"']score[\"']:\s*([0-9]+)", judgement['prediction'])
if match:
score = int(match.group(1))
else:
return None
return {'score': score}
@DICT_POSTPROCESSORS.register_module('writingbench')
def writingbench_postprocess(output: dict, output_path: str) -> dict:
judged_answers, references = get_judgeanswer_and_reference(
output, output_path, post_process_writingbench)
if len(judged_answers) == 0:
scores = None
scores = defaultdict(list)
for ans, ref in zip(judged_answers, references):
domain = ref['domain1']
score = ans['score']
if score is not None:
scores['overall'].append(score)
scores[domain].append(score)
single_model_scores = {
task: sum(score) / len(score)
for task, score in scores.items()
}
results = single_model_scores
results['details'] = output
return results
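
A quick sketch of the score extraction above on a typical judge reply (the judgement dict is illustrative):

judgement = {'prediction': '{"score": 8, "reason": "Clear structure and tone."}'}
# The regex only looks for "score": <int>, so the reason text is ignored.
assert post_process_writingbench(judgement) == {'score': 8}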

View File

@ -25,7 +25,7 @@ OPENAI_API_BASE = os.path.join(
OPENAISDK_API_BASE = os.environ.get('OPENAI_BASE_URL',
'https://api.openai.com/v1/')
O1_MODEL_LIST = ['o1', 'o3']
O1_MODEL_LIST = ['o1', 'o3', 'o4']
@MODELS.register_module()
@ -531,7 +531,8 @@ class OpenAI(BaseAPIModel):
class OpenAISDK(OpenAI):
def __init__(self,
def __init__(
self,
path: str = 'gpt-3.5-turbo',
max_seq_len: int = 16384,
query_per_second: int = 1,
@ -550,7 +551,8 @@ class OpenAISDK(OpenAI):
extra_body: Dict | None = None,
verbose: bool = False,
status_code_mappings: dict = {},
think_tag: str = '</think>'):
think_tag: str = '</think>',
):
super().__init__(
path,
max_seq_len,
@ -597,11 +599,13 @@ class OpenAISDK(OpenAI):
self.status_code_mappings = status_code_mappings
self.think_tag = think_tag
def _generate(self,
def _generate(
self,
input: PromptList | str,
max_out_len: int,
temperature: float,
timeout: int = 3600) -> str:
timeout: int = 3600,
) -> str:
"""Generate results given a list of inputs.
Args:
@ -662,7 +666,12 @@ class OpenAISDK(OpenAI):
# Check if response is empty or content is empty
if (not responses.choices or not responses.choices[0].message
or not responses.choices[0].message.content):
or
(not responses.choices[0].message.content and not getattr(
responses.choices[0].message,
'reasoning_content',
'',
))): # noqa: E125
self.logger.error(
'Failed to extract content from the responses. '
'Please check the API response for detail information.'
@ -670,12 +679,13 @@ class OpenAISDK(OpenAI):
responses,
)
num_retries += 1
# Continue to retry instead of returning empty response
continue
reasoning_content = (getattr(responses.choices[0].message,
'reasoning_content', '') or '')
content = responses.choices[0].message.content or ''
# Concat Reasoning Content and tags to content
if (hasattr(responses.choices[0].message, 'reasoning_content')
and responses.choices[0].message.reasoning_content):
if reasoning_content:
if self.verbose:
self.logger.info(
'Follow'
@ -684,14 +694,17 @@ class OpenAISDK(OpenAI):
'Reasoning Content: %s, \n'
'Tags: %s, \n'
'Content: %s',
responses.choices[0].message.reasoning_content,
reasoning_content,
self.think_tag,
responses.choices[0].message.content)
return (responses.choices[0].message.reasoning_content +
self.think_tag +
responses.choices[0].message.content)
content,
)
if content:
return reasoning_content + self.think_tag + content
else:
return reasoning_content
return responses.choices[0].message.content
else:
return content
except (BadRequestError, APIStatusError) as e:
# Handle BadRequest status

View File

@ -6,7 +6,8 @@ from .icl_circular_evaluator import CircularEvaluator # noqa
from .icl_em_evaluator import EMEvaluator # noqa
from .icl_hf_evaluator import * # noqa
from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator # noqa
from .icl_judge_evaluator import JudgeEvaluator, RMBEvaluator # noqa
from .icl_judge_evaluator import JudgeEvaluator # noqa
from .icl_judge_evaluator import Judgerbenchv2Evaluator, RMBEvaluator # noqa
from .icl_misc_evaluator import AverageInferencePPLEvaluator # noqa
from .icl_misc_evaluator import AverageMinKEvaluator # noqa
from .icl_misc_evaluator import AveragePPLEvaluator # noqa

View File

@ -1,12 +1,12 @@
# flake8: noqa: E501
import difflib
import os
import re
import tempfile
import time
from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
from datasets import Dataset
from gradio_client import Client
@ -24,9 +24,9 @@ class CodeEvaluator(BaseEvaluator):
"""
def __init__(self,
language: str,
language: str = 'py',
ip_address: str = 'localhost',
retry: int = 3) -> None:
retry: int = 5) -> None:
"""Initialize the CodeEvaluator.
Args:
@ -71,6 +71,7 @@ class CodeEvaluator(BaseEvaluator):
- output (dict/list/str): Evaluation results or error message
"""
try:
import requests
temp_file_path = None
# Handle file path input
if isinstance(input_data, str):
@ -83,7 +84,15 @@ class CodeEvaluator(BaseEvaluator):
input_data = temp_file_path
# Send to evaluation service
try:
result = self.client.predict(input_data, api_name='/evaluate')
except Exception as e:
# Catch timeout and other exceptions
if 'timed out' in str(e).lower() or 'timeout' in str(
e).lower():
return False, f'Request to code eval service timed out: {e}'
else:
raise
# Process the result
if isinstance(result, (dict, list)):
@ -107,63 +116,16 @@ class CodeEvaluator(BaseEvaluator):
except: # noqa: E722
pass
def _remove_prefix(self,
prompt: str,
completion: str,
threshold: float = 0.95) -> str:
"""Determine the truncation point in the completion based on the last
line of the prompt, remove all content before that line in the
completion, and return the completion string after removing the prefix.
This is done to convert chatbot-style inference mode to completion
mode.
def _process_completions(self, completion: str) -> list:
"""Process code completions to extract the relevant code.
Args:
prompt (str): The prompt text.
completion (str): The completion text.
threshold (float): Line similarity threshold.
completion (str): Code completion string.
Returns:
str: The completion string after removing the prefix.
list: List of processed code completions.
"""
prompt_lines = prompt.splitlines()
completion_lines = completion.splitlines()
if not prompt_lines:
return completion
last_prompt_line = prompt_lines[-1]
cut_index = -1
for i, completion_line in enumerate(completion_lines):
similarity = difflib.SequenceMatcher(None, last_prompt_line,
completion_line).ratio()
if similarity >= threshold:
cut_index = i
break
if cut_index != -1:
return '\n'.join(completion_lines[cut_index + 1:])
else:
return completion
def _process_completions(self, test_case: dict, completions: list) -> list:
"""Process code completion list, which typically involves extracting
code, removing repetitive prefixes caused by chatbot mode, and other
steps to ensure the model-generated code can be compiled successfully.
Args:
test_case (dict): Dictionary containing test case information including:
completions (list): List of code completions generated by the model.
Returns:
list: Processed code completion list.
"""
processed_completions = []
for comp in completions:
comp = self._extract_code(comp)
post_comp = self._remove_prefix(test_case['prompt'], comp)
processed_completions.append(post_comp)
return processed_completions
post_comp = self._extract_code(completion)
return post_comp
def _evaluate(
self, input_data: Union[Dict, List]
@ -186,7 +148,7 @@ class CodeEvaluator(BaseEvaluator):
succeed, output = self._code_eval_service(input_data)
if not succeed:
num_retry += 1
time.sleep(10)
time.sleep(30)
else:
break
@ -195,6 +157,31 @@ class CodeEvaluator(BaseEvaluator):
return True, output, None
def _process_results(self, outputs: List, prompts: List,
total_count: int) -> Dict:
"""Process the evaluation results.
Args:
outputs (list): List of evaluation results for each test case.
prompts (list): List of prompts used for each test case.
total_count (int): Total number of test cases.
Returns:
dict: Processed results including:
- pass@1: Percentage of test cases passed
- details: Detailed results for each test case
"""
details = []
correct = 0
for output, prompt in zip(outputs, prompts):
output['prompt'] = prompt
if output.get('status') == 'OK':
output['correct'] = True
correct += 1
else:
output['correct'] = False
details.append(output)
return {'pass@1': 100 * correct / total_count, 'details': details}
def score(self, predictions: List, references: List,
test_set: Dataset) -> Dict:
"""Score code generation predictions against references.
@ -221,28 +208,25 @@ class CodeEvaluator(BaseEvaluator):
test_set = test_set.to_pandas()
# Use the first column as the unique identifier
test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
num_repeats = int(len(test_set) / len(test_set_origin))
# 1. Prepare data for all test cases
all_test_cases = []
all_test_cases, prompts = [], []
for i in range(len(test_set_origin)):
test_case = test_set_origin.iloc[i]
completions = predictions[i * num_repeats:(i + 1) * num_repeats]
completion = predictions[i]
# Process code completions
processed_completions = self._process_completions(
test_case, completions)
result_dict = {
processed_completion = self._process_completions(
test_case, completion)
code = test_case[
'prompt'] + processed_completion + '\n' + test_case['tests']
sub_data_dict = {
'name': test_case['name'],
'language': test_case['language'],
'prompt': test_case['prompt'],
'tests': test_case['tests'],
'processed_completions': processed_completions,
'completions': completions
'code': code
}
all_test_cases.append(result_dict)
all_test_cases.append(sub_data_dict)
prompts.append(test_case['prompt'])
# 2. Send all test cases to the evaluation service
success, outputs, error_message = self._evaluate(all_test_cases)
@ -250,18 +234,4 @@ class CodeEvaluator(BaseEvaluator):
return {'error': error_message}
# 3. Process the returned results
details = []
correct = 0
for output in outputs:
if output.get('status') == 'OK':
output['correct'] = True
correct += 1
else:
output['correct'] = False
details.append(output)
return {
f'pass@{num_repeats}': 100 * correct / len(test_set_origin),
'details': details
}
return self._process_results(outputs, prompts, len(test_set_origin))
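
After this refactor every test case is shipped to the evaluation service as one self-contained snippet. A hedged sketch of the per-case payload and the reported metric (field values are illustrative):

sub_data_dict = {
    'name': 'HumanEval_0',
    'language': 'py',
    # prompt + extracted completion + '\n' + tests, executable as a single file
    'code': 'def add(a, b):\n    return a + b\nassert add(1, 2) == 3\n',
}
# _process_results counts a case as correct when the service reports status 'OK',
# so the returned metric is pass@1 = 100 * correct / total_count.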

View File

@ -1,6 +1,4 @@
# flake8: noqa
"""KOR-Bench Evaluator."""
import json
import os
import re
@ -18,7 +16,8 @@ class JudgeEvaluator(BaseEvaluator):
count = 0
details = []
for prediction, reference in zip(predictions, references):
choice = prediction.split("\"Choice\": \"Model ")[-1][0]
choice = prediction.split("\"Choice\": \"Model ")[-1][0] if len(
prediction) != 0 else None
gold_winner = reference.get('winner', '')
detail = {
'pred': prediction,
@ -51,7 +50,6 @@ class RMBEvaluator(BaseEvaluator):
def calculate_bon_accuracy(self, data):
"""Compute accuracy for the best-of-n (bon) metric."""
bon_groups = defaultdict(list)
for item in data:
bon_uid = item['bon_uid']
@ -61,7 +59,6 @@ class RMBEvaluator(BaseEvaluator):
if choice and gold_winner:
bon_groups[bon_uid].append(gold_winner == choice)
# Check whether every comparison under each bon_uid is correct
correct_bons = 0
for bon_uid, matches in bon_groups.items():
if all(matches):
@ -73,15 +70,14 @@ class RMBEvaluator(BaseEvaluator):
if len(predictions) != len(references):
return {'error': 'predictions and references have different length'}
# Create four lists, one per (subset, goal) combination
bon_help_list = []
bon_harm_list = []
pair_help_list = []
pair_harm_list = []
# Bucket the data by subset and goal
for prediction, reference in zip(predictions, references):
choice = prediction.split("\"Choice\": \"Model ")[-1][0]
choice = prediction.split("\"Choice\": \"Model ")[-1][0] if len(
prediction) != 0 else None
gold_winner = reference.get('winner', '')
subset = reference.get('subset', '')
goal = reference.get('goal', '')
@ -93,7 +89,6 @@ class RMBEvaluator(BaseEvaluator):
'pair_uid': reference.get('pair_uid', ''),
}
# Route each item into the matching list by subset and goal
if subset == 'bon':
if goal == 'Helpfulness':
bon_help_list.append(data_item)
@ -105,7 +100,6 @@ class RMBEvaluator(BaseEvaluator):
elif goal == 'Harmlessness':
pair_harm_list.append(data_item)
# Compute accuracy for each of the four combinations
bon_help_acc = self.calculate_bon_accuracy(
bon_help_list) if bon_help_list else 0
bon_harm_acc = self.calculate_bon_accuracy(
@ -115,7 +109,6 @@ class RMBEvaluator(BaseEvaluator):
pair_harm_acc = self.calculate_pair_accuracy(
pair_harm_list) if pair_harm_list else 0
# Return all results
result = {
'bon_helpfulness_accuracy':
bon_help_acc * 100,
@ -133,3 +126,239 @@ class RMBEvaluator(BaseEvaluator):
}
return result
R1_Score_MAP = {
'Knowledge': {
'Qwen2.5-32B-Instruct': 55,
'Llama-3.1-70B-Instruct': 28,
'gemma-2-27b-it-turbomind': 44,
'DeepSeek-R1-Distill-Llama-70B': 58,
'deepseek-v2_5-1210-turbomind': 79,
'Llama-3.3-70B-Instruct': 46,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 76,
'DeepSeek-R1-Distill-Qwen-32B': 56,
'mixtral-large-instruct-2407-lmdeploy': 72,
'Qwen2.5-72B-Instruct': 80
},
'Longtext': {
'Qwen2.5-32B-Instruct': 45,
'Llama-3.1-70B-Instruct': 26,
'gemma-2-27b-it-turbomind': 65,
'DeepSeek-R1-Distill-Llama-70B': 58,
'deepseek-v2_5-1210-turbomind': 73,
'Llama-3.3-70B-Instruct': 37,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 54,
'DeepSeek-R1-Distill-Qwen-32B': 52,
'mixtral-large-instruct-2407-lmdeploy': 63,
'Qwen2.5-72B-Instruct': 77
},
'Reason_and_analysis': {
'Qwen2.5-32B-Instruct': 60,
'Llama-3.1-70B-Instruct': 23,
'gemma-2-27b-it-turbomind': 46,
'DeepSeek-R1-Distill-Llama-70B': 63,
'deepseek-v2_5-1210-turbomind': 85,
'Llama-3.3-70B-Instruct': 45,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 68,
'DeepSeek-R1-Distill-Qwen-32B': 66,
'mixtral-large-instruct-2407-lmdeploy': 56,
'Qwen2.5-72B-Instruct': 78
},
'safe': {
'Qwen2.5-32B-Instruct': 72,
'Llama-3.1-70B-Instruct': 55,
'gemma-2-27b-it-turbomind': 72,
'DeepSeek-R1-Distill-Llama-70B': 55,
'deepseek-v2_5-1210-turbomind': 72,
'Llama-3.3-70B-Instruct': 64,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 76,
'DeepSeek-R1-Distill-Qwen-32B': 55,
'mixtral-large-instruct-2407-lmdeploy': 69,
'Qwen2.5-72B-Instruct': 83
},
'Hallucination': {
'Qwen2.5-32B-Instruct': 78,
'Llama-3.1-70B-Instruct': 50,
'gemma-2-27b-it-turbomind': 65,
'DeepSeek-R1-Distill-Llama-70B': 61,
'deepseek-v2_5-1210-turbomind': 66,
'Llama-3.3-70B-Instruct': 48,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 75,
'DeepSeek-R1-Distill-Qwen-32B': 60,
'mixtral-large-instruct-2407-lmdeploy': 76,
'Qwen2.5-72B-Instruct': 74
},
'chatQA': {
'Qwen2.5-32B-Instruct': 39,
'Llama-3.1-70B-Instruct': 25,
'gemma-2-27b-it-turbomind': 56,
'DeepSeek-R1-Distill-Llama-70B': 53,
'deepseek-v2_5-1210-turbomind': 70,
'Llama-3.3-70B-Instruct': 34,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 69,
'DeepSeek-R1-Distill-Qwen-32B': 48,
'mixtral-large-instruct-2407-lmdeploy': 55,
'Qwen2.5-72B-Instruct': 68
},
'IF': {
'Qwen2.5-32B-Instruct': 34,
'Llama-3.1-70B-Instruct': 35,
'gemma-2-27b-it-turbomind': 38,
'DeepSeek-R1-Distill-Llama-70B': 50,
'deepseek-v2_5-1210-turbomind': 63,
'Llama-3.3-70B-Instruct': 37,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 62,
'DeepSeek-R1-Distill-Qwen-32B': 41,
'mixtral-large-instruct-2407-lmdeploy': 47,
'Qwen2.5-72B-Instruct': 48
},
'LanTask': {
'Qwen2.5-32B-Instruct': 62,
'Llama-3.1-70B-Instruct': 29,
'gemma-2-27b-it-turbomind': 53,
'DeepSeek-R1-Distill-Llama-70B': 60,
'deepseek-v2_5-1210-turbomind': 75,
'Llama-3.3-70B-Instruct': 46,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 69,
'DeepSeek-R1-Distill-Qwen-32B': 71,
'mixtral-large-instruct-2407-lmdeploy': 48,
'Qwen2.5-72B-Instruct': 74
},
'Creation': {
'Qwen2.5-32B-Instruct': 40,
'Llama-3.1-70B-Instruct': 34,
'gemma-2-27b-it-turbomind': 55,
'DeepSeek-R1-Distill-Llama-70B': 66,
'deepseek-v2_5-1210-turbomind': 73,
'Llama-3.3-70B-Instruct': 36,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 73,
'DeepSeek-R1-Distill-Qwen-32B': 64,
'mixtral-large-instruct-2407-lmdeploy': 43,
'Qwen2.5-72B-Instruct': 67
},
'Code_and_AI': {
'Qwen2.5-32B-Instruct': 44,
'Llama-3.1-70B-Instruct': 32,
'gemma-2-27b-it-turbomind': 34,
'DeepSeek-R1-Distill-Llama-70B': 56,
'deepseek-v2_5-1210-turbomind': 64,
'Llama-3.3-70B-Instruct': 43,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 62,
'DeepSeek-R1-Distill-Qwen-32B': 43,
'mixtral-large-instruct-2407-lmdeploy': 51,
'Qwen2.5-72B-Instruct': 60
}
}
class Judgerbenchv2Evaluator(BaseEvaluator):
def get_rank_dict(self, score_dict):
sorted_models = sorted(score_dict.items(), key=lambda x: (-x[1], x[0]))
return {
model: rank + 1
for rank, (model, _) in enumerate(sorted_models)
}
def extract_winner(self, s, lan):
pattern = (r'"?(胜者)"?\s*:\s*"([A-Z])"' if lan.lower() in ['zh', 'cn']
else r'"?(winner)"?\s*:\s*"([A-Z])"')
matches = re.findall(pattern, s)
return matches[-1][1] if matches else None
def score(self, predictions, references):
if len(predictions) != len(references):
return {'error': 'predictions and references have different length'}
correct = 0
count = 0
details = []
Model_dict = {}
for prediction, reference in zip(predictions, references):
# pre-defines
ModelA = reference['ModelA']
ModelB = reference['ModelB']
if reference['category'] == 'Reason & Analysis':
r1_rank_score = R1_Score_MAP['Reason_and_analysis']
elif reference['category'] == 'Code & AI':
r1_rank_score = R1_Score_MAP['Code_and_AI']
else:
r1_rank_score = R1_Score_MAP[reference['category']]
choice = self.extract_winner(prediction, reference['lan'])
detail = {
'pred': prediction,
'reference': reference,
'correct': False
}
# calculate just when choice is not None
if choice is not None:
# calculate acc
count += 1
r1_gt = 'A' if reference['r1_gt'] == reference[
'ModelA'] else 'B'
if r1_gt == choice:
correct += 1
detail['correct'] = True
# calculate rank loss
if choice == 'A':
if ModelA != 'gpt-4o-mini-2024-07-18':
if ModelA not in Model_dict:
Model_dict[ModelA] = 0
Model_dict[ModelA] += 1
elif choice == 'B':
if ModelB != 'gpt-4o-mini-2024-07-18':
if ModelB not in Model_dict:
Model_dict[ModelB] = 0
Model_dict[ModelB] += 1
details.append(detail)
# calculate rank loss
dict1 = dict(sorted(Model_dict.items()))
dict2 = dict(sorted(r1_rank_score.items()))
rank1 = self.get_rank_dict(dict1)
rank2 = self.get_rank_dict(dict2)
# Compute per-model rank and score differences
rank_diffs = {m: abs(rank1[m] - rank2[m]) for m in rank1}
score_diffs = {m: abs(dict1[m] - dict2[m]) for m in dict1}
# 计算总差异(可自由调整权重)
total_rank_diff = sum(rank_diffs.values())  # e.g. total rank gap = 14
total_score_diff = sum(score_diffs.values())  # e.g. total score gap = 75
alpha = 0.2  # weight on the score difference
combined_diff = total_rank_diff + alpha * total_score_diff  # e.g. combined gap = 14 + 15 = 29
# Compute normalization factors
max_rank_diff = len(dict1) - 1  # e.g. max rank difference = 9
max_score_diff = max(
abs(d1 - d2)
for d1, d2 in zip(dict1.values(), dict2.values()))  # e.g. max score difference = 22
# Compute the normalized combined gap
normalized_diffs = {
m: abs(rank1[m] - rank2[m]) / max_rank_diff +
abs(dict1[m] - dict2[m]) / max_score_diff
for m in rank1
}
total_normalized_diff = sum(normalized_diffs.values()) / len(
normalized_diffs.values()) * 100
acc = 100 * correct / count
final_score = acc - total_normalized_diff
result = {
'accuracy': acc,
'rank_diff': total_rank_diff,
'score_diff': total_score_diff,
'normalized_diff': total_normalized_diff,
'final_score': final_score,
'details': details
}
return result
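
A worked sketch of the final-score arithmetic above, using the illustrative figures from the inline comments (all numbers hypothetical):

total_rank_diff = 14                                        # sum of per-model rank gaps
total_score_diff = 75                                       # sum of per-model win-count gaps
combined_diff = total_rank_diff + 0.2 * total_score_diff    # 14 + 15 = 29 (not used in final_score)
accuracy = 70.0                                             # 100 * correct / count
total_normalized_diff = 35.0                                # mean normalized rank+score gap * 100
final_score = accuracy - total_normalized_diff              # 35.0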

View File

@ -116,6 +116,7 @@ class LMEvaluator:
pred_postprocessor (ConfigDict): The model prediction's postprocessor
config.
keep_predictions (bool): Whether to save model predictions in references. Useful when postprocessor requires model predictions as input to calculate additional features (e.g. response length, markdown list counts, ...). Defaults to False.
multi_eval (bool): Whether to run multiple evaluations with different prompt settings.
"""
def __init__(
@ -129,7 +130,9 @@ class LMEvaluator:
pred_postprocessor: Optional[ConfigDict] = None,
dict_postprocessor: Optional[ConfigDict] = None,
keep_predictions: bool = False,
multi_eval: bool = False,
) -> None:
self.multi_eval = multi_eval
self.output_path = output_path
out_dir, out_name = osp.split(output_path)
if not out_dir:
@ -209,6 +212,33 @@ class LMEvaluator:
references = [
{} for _ in range(len(predictions[0]['model_preds']))
]
if self.multi_eval:
assert references is not None
assert 'judge_prompt_list' in references[0]
self.multi_eval_times = len(references[0]['judge_prompt_list'])
temp_predictions_save_list = []
for idx, pred in enumerate(predictions['model_preds']):
for judge_prompt in references[idx]['judge_prompt_list']:
temp_prediction = judge_prompt.replace(
'{prediction}', pred)
temp_predictions_save_list.append(temp_prediction)
predictions['model_preds'] = temp_predictions_save_list
temp_references_save_list = []
for item in references:
new_item = {
key: value
for key, value in item.items()
if key != 'judge_prompt_list'
}
if 'judge_prompt_list' in item:
for prompt in item['judge_prompt_list']:
temp_item = new_item.copy()
temp_item['judge_prompt'] = prompt
temp_references_save_list.append(temp_item)
else:
temp_references_save_list.append(item)
references = temp_references_save_list
predictions = [predictions['model_preds']]
# Due to the rarity of identical predictions, we have temporarily disabled the plagiarism detection feature.
@ -268,7 +298,12 @@ class LMEvaluator:
if self.dataset_cfg:
dataset = build_dataset_from_cfg(self.dataset_cfg)
if self.multi_eval:
new_ds = {
k: dataset.test[k] * self.multi_eval_times
for k in dataset.test.column_names
}
dataset.reader.dataset['test'] = Dataset.from_dict(new_ds)
if infer_order == 'double':
new_ds = {
k: dataset.test[k] * 2

View File

@ -446,7 +446,21 @@ DATASETS_MAPPING = {
"hf_id": "",
"local": "./data/ChemBench4K",
},
"opencompass/nejmaibench": {
"ms_id": "",
"hf_id": "",
"local": "./data/nejmaibench/NEJM_All_Questions_And_Answers.csv",
},
"opencompass/humaneval_pro": {
"ms_id": "",
"hf_id": "",
"local": "./data/humaneval_pro/humaneval_pro.json",
},
"opencompass/mbpp_pro": {
"ms_id": "",
"hf_id": "",
"local": "./data/mbpp_pro/mbpp_pro.json",
},
}
DATASETS_URL = {
@ -798,6 +812,18 @@ DATASETS_URL = {
"url":
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ChemBench4K.zip",
"md5": "fc23fd21b2566a5dbbebfa4601d7779c"
}
},
"nejmaibench": {
"url":
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/nejmaibench.zip",
"md5": "e6082cae3596b3ebea73e23ba445b99e"
},
"humaneval_pro": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humaneval_pro.zip",
"md5": "4c6fe556e84e905e4f0902d699e46de5",
},
"mbpp_pro": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mbpp_pro.zip",
"md5": "eac330b8a0a8687f006265c9383503ce",
},
}
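
For reference, a hedged sketch of how the new entries are typically consumed: loaders pass the 'opencompass/...' key to get_data_path, which resolves it to the local path registered in DATASETS_MAPPING, with DATASETS_URL providing the download source and md5 when the data is not yet cached. The exact resolution order depends on the cache and data-source settings.

from opencompass.utils import get_data_path

path = get_data_path('opencompass/humaneval_pro')
# -> './data/humaneval_pro/humaneval_pro.json' under the default local mapping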