from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.subjective.judgerbench.judgerbench import judgerbench_datasets

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI, TurboMindModelwithChatTemplate
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
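# Note: read_base() temporarily allows importing other OpenCompass config files, so the
# judgerbench_datasets list imported above becomes part of this config's namespace.
# Several of the imported classes (the HuggingFace model wrappers, OpenAI, SizePartitioner,
# SlurmSequentialRunner) are not referenced below; they are available as alternatives.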
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)
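# The meta template maps OpenCompass's internal conversation roles (HUMAN/BOT) onto
# API-style chat roles, with generate=True marking the turn to be generated. It is not
# referenced by the local TurboMind model below; it would typically be passed as
# meta_template when evaluating an API model such as OpenAI.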
# ------------- Inference Stage ----------------------------------------
# For subjective evaluation, sampling (do_sample) is often enabled for the models.
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='Qwen2-7B',
        path='Qwen/Qwen2-7B-Instruct',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    )
]
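# The model above is served through LMDeploy's TurboMind backend. Note that top_k=1
# combined with a near-zero temperature makes decoding effectively greedy here, despite
# the sampling convention mentioned above; session_len matches max_seq_len (16384) and
# tp matches run_cfg.num_gpus (1).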
datasets = judgerbench_datasets
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=OpenICLInferTask)),
)
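# NaivePartitioner creates one inference task per model-dataset pair, and LocalRunner
# executes up to max_num_workers of these tasks concurrently on the local machine.
# On a Slurm cluster, the runner could be swapped for the imported SlurmSequentialRunner,
# e.g. (the partition name is a placeholder):
#   runner=dict(type=SlurmSequentialRunner, partition='YOUR_PARTITION',
#               max_num_workers=16, task=dict(type=OpenICLInferTask)),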
# ------------- Evaluation Stage ----------------------------------------

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=OpenICLEvalTask)),
)
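# Here n=10 appears to group every 10 model-dataset pairs into a single evaluation task
# (per NaivePartitioner's grouping parameter), so the LocalRunner launches fewer but
# larger jobs during evaluation.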
work_dir = 'outputs/judgerbench/'
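# Predictions, evaluation results and summaries are written under work_dir; OpenCompass
# normally creates a timestamped subdirectory per run. A typical launch, assuming the
# standard entry point, is:
#   python run.py <path_to_this_config>.py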