Upload HelloBench (#1607)

* upload hellobench

* update hellobench

* update readme.md

* update eval_hellobench.py

* update latest

---------

Co-authored-by: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com>
Haoran Que 2024-10-15 17:11:37 +08:00 committed by GitHub
parent fa54aa62f6
commit 4fe251729b
6 changed files with 535 additions and 0 deletions


@@ -0,0 +1,96 @@
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.subjective.hellobench.hellobench import hellobench_datasets

from opencompass.models import HuggingFacewithChatTemplate, OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import DefaultSubjectiveSummarizer

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we usually enable sampling (do_sample=True) for the evaluated models.
# Make sure the generation parameters of your models are set consistently: for example, if you use temperature=0.8, set temperature=0.8 for every evaluated model.
models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='glm-4-9b-chat-hf',
        path='THUDM/glm-4-9b-chat',
        max_out_len=16384,
        generation_kwargs=dict(
            temperature=0.8,
            do_sample=True,  # For subjective evaluation, we suggest setting do_sample=True during model inference.
        ),
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        batch_size=1,
        run_cfg=dict(num_gpus=2, num_procs=1),
        stop_words=['<|endoftext|>', '<|user|>', '<|observation|>'],
    )
]

datasets = [*hellobench_datasets]  # add datasets you want

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=OpenICLInferTask)),
)

# -------------Evaluation Stage ----------------------------------------
# ------------- JudgeLLM Configuration
# We recommend using GPT-4o-mini as the judge model.
# If you want to use open-source LLMs as judge models, you can uncomment the following code:
# judge_models = [
#     dict(
#         type=HuggingFacewithChatTemplate,
#         abbr='glm-4-9b-chat-hf',
#         path='THUDM/glm-4-9b-chat',
#         max_out_len=16384,
#         generation_kwargs=dict(
#             temperature=0.8,
#             do_sample=True,  # For subjective evaluation, we suggest setting do_sample=True during model inference.
#         ),
#         model_kwargs=dict(
#             device_map='auto',
#             trust_remote_code=True,
#         ),
#         batch_size=1,
#         run_cfg=dict(num_gpus=2, num_procs=1),
#         stop_words=['<|endoftext|>', '<|user|>', '<|observation|>'],
#     )
# ]

judge_models = [dict(
    abbr='GPT4o',
    type=OpenAI,
    path='gpt-4o',
    key='xxxx',  # The key is read from $OPENAI_API_KEY, but you can also write your key here directly.
    meta_template=api_meta_template,
    query_per_second=16,
    max_out_len=4096,
    batch_size=1,
    temperature=0.8,
    seed=42,
)]

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(type=SubjectiveNaivePartitioner, models=models, judge_models=judge_models,),
    runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=SubjectiveEvalTask)),
)

summarizer = dict(type=DefaultSubjectiveSummarizer)

work_dir = 'outputs/hellobench/'


@@ -0,0 +1,51 @@
# Guideline for evaluating HelloBench on Diverse LLMs
HelloBench is a comprehensive, in-the-wild, and open-ended benchmark for evaluating the long text generation capabilities of LLMs. More details can be found in the [🌐Github Repo](https://github.com/Quehry/HelloBench) and the [📖Paper](https://arxiv.org/abs/2409.16191).
## Detailed instructions for evaluating HelloBench in OpenCompass
1. Clone OpenCompass
```shell
cd ~
git clone git@github.com:open-compass/opencompass.git
cd opencompass
```
2. Download the HelloBench data from this [Google Drive URL](https://drive.google.com/file/d/1EJTmMFgCs2pDy9l0wB5idvp3XzjYEsi9/view?usp=sharing) (or fetch it from the command line, as sketched after this step), unzip it, and place it at `OPENCOMPASS_PATH/data/HelloBench`, so that the directory looks like this:
```
~/opencompass/data/
└── HelloBench
├── chat.jsonl
├── heuristic_text_generation.jsonl
├── length_constrained_data
│ ├── heuristic_text_generation_16k.jsonl
│ ├── heuristic_text_generation_2k.jsonl
│ ├── heuristic_text_generation_4k.jsonl
│ └── heuristic_text_generation_8k.jsonl
├── open_ended_qa.jsonl
├── summarization.jsonl
└── text_completion.jsonl
```
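
You can also fetch the archive from the command line; here is a minimal sketch using the third-party `gdown` package (the file id comes from the Google Drive URL above, and you may need to adjust the unzip target so the layout matches the tree shown):
```shell
pip install gdown
gdown 'https://drive.google.com/uc?id=1EJTmMFgCs2pDy9l0wB5idvp3XzjYEsi9' -O HelloBench.zip
unzip HelloBench.zip -d ~/opencompass/data/
```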
3. Set up OpenCompass
```
cd ~/opencompass
pip install -e .
```
4. Configure your run in `configs/eval_hellobench.py`
- set the models to be evaluated
- set the judge model (we recommend using GPT-4o-mini)
5. Launch the evaluation (see the note on the judge API key at the end of this guide):
```
python run.py configs/eval_hellobench.py
```
6. After that, you can find the results in `outputs/hellobench/xxx/summary`.
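
Note: the default judge model in `configs/eval_hellobench.py` is an OpenAI model, so make sure your API key is available in the environment before launching, for example:
```shell
export OPENAI_API_KEY=your_api_key  # placeholder; use your own key
python run.py configs/eval_hellobench.py
```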


@@ -0,0 +1,111 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import HelloBenchDataset, hellobench_postprocess
system_prompt = """You are a helpful evaluator. Your task is to evaluate the checklists of the responses given by the Large Language Models (LLMs) based on user instructions. These checklists consist of yes or no questions."""
user_prompt = """Your core task is to evaluate the checklists based on the user's instruction and the LLM's response, with each checklist item being a yes or no question indicating a specific aspect that the LLM's response should meet. You need to judge each checklist item based on the instruction and response. The evaluation results are scored from 0 to 1, with 5 scores in total, which are:
0: The response fails to meet the checklist requirements, demonstrating substantial need for improvement across multiple areas.
0.25: The response partially meets some checklist requirements, but significant elements remain unaddressed.
0.5: The response meets several checklist requirements, yet the overall evaluation appears ambiguous or unclear.
0.75: The response aligns with most checklist requirements, though there are still minor areas that could be refined or enhanced.
1: The response fully satisfies all checklist requirements, with no identifiable issues or areas for improvement. It means this response is already perfect; you can't find any significant flaws in it.
Here is the instruction:
{{\"instruction\": {instruction}}}
Here is the response given by LLM:
{{\"response\": {prediction}}}
Since the response may be rather long, I am specifically reminding you here that the response has ended.
Here are checklists of this instruction:
{{\"checklists\": {formatted_checklists}}}
To further remind you, I will repeat my requirements:
Your core task is to evaluate the checklists based on the user's instruction and the LLM's response, with each checklist item being a yes or no question indicating a specific aspect that the LLM's response should meet. You need to judge each checklist item based on the instruction and response. The evaluation results are scored from 0 to 1, with 5 scores in total, which are:
0: The response fails to meet the checklist requirements, demonstrating substantial need for improvement across multiple areas.
0.25: The response partially meets some checklist requirements, but significant elements remain unaddressed.
0.5: The response meets several checklist requirements, yet the overall evaluation appears ambiguous or unclear.
0.75: The response aligns with most checklist requirements, though there are still minor areas that could be refined or enhanced.
1: The response fully satisfies all checklist requirements, with no identifiable issues or areas for improvement. It means this response is already perfect; you can't find any significant flaws in it.
Always provide the reason for your evaluation results. You should be strict but fair in your evaluation. A score of 1 means that the response perfectly meets all the checklist requirements and you think there are really no room for improvements. When giving a score of 1, you need to carefully consider whether this checklist has been perfectly satisfied.
Evaluate all the checklists and return the evaluation results of the checklists. Output a Python List consisting of the Python Dictionary formatted as follows:
[{{\"checklist_id\": \"the id of the checklist\", \"reason\": \"The reason for your evaluation results\", \"evaluation_score\": \"Your evaluation score for this checklist\"}},{{\"checklist_id\": \"the id of the checklist\",
\"reason\": \"The reason for your evaluation results\", \"evaluation_score\": \"Your evaluation score for this checklist\"}}]
There are {num_checklist} checklists in total that you need to evaluate. The length of the output list must equal the number of checklists, and you should give an evaluation score for each checklist. You should be very, very strict in the evaluation to further compare the responses from different models. Your response must be a valid Python List and should contain nothing else, as it will be directly executed in Python."""

subjective_reader_cfg = dict(
    input_columns=['instruction', 'formatted_checklists', 'num_checklist'],
    output_column='judgement',
)

hellobench_categories = [
    'open_ended_qa',
    'summarization',
    'chat',
    'text_completion',
    'heuristic_text_generation',
]
data_path = 'data/HelloBench'

hellobench_datasets = []
for category_name in hellobench_categories:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt='{instruction}'
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=16384),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt=system_prompt)
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=user_prompt
                        ),
                    ]),
            ),
            dict_postprocessor=dict(type=hellobench_postprocess,),
        ),
        pred_role='BOT',
    )

    hellobench_datasets.append(
        dict(
            abbr=f'HelloBench-{category_name}',
            type=HelloBenchDataset,
            path=data_path,
            category_name=category_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg,
            mode='singlescore',
        ))


@@ -16,6 +16,7 @@ from .flames import FlamesDataset # noqa: F401, F403
from .fofo import FofoDataset, fofo_postprocess # noqa: F401, F403
from .followbench import FollowBenchDataset # noqa: F401, F403
from .followbench import followbench_postprocess
from .hellobench import * # noqa: F401, F403
from .judgerbench import JudgerBenchDataset # noqa: F401, F403
from .judgerbench import JudgerBenchEvaluator # noqa: F401, F403
from .mtbench import MTBenchDataset, mtbench_postprocess # noqa: F401, F403


@@ -0,0 +1,274 @@
# flake8: noqa: E501
import json
import numpy as np
from datasets import Dataset
from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from ..base import BaseDataset
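
# Per-subcategory weights used in get_judgeanswer() to combine the 0-1
# checklist scores into an overall score via a dot product. Each list sums to
# roughly 100, presumably with one weight per checklist item of that subcategory.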
REGRESSION_DICT = {
'travel': [
14.816357060044538, 9.912640189521913, 6.854178078417421,
16.548365732493735, 12.49440306294194, 19.925026350726633,
19.449029525853824
],
'Tech': [
9.730382391494699, 15.439961810101806, 8.71267868266836,
17.047912114497525, 8.188210881912578, 18.27285649160541,
22.607997627719627
],
'Sport': [
10.470669731543392, 9.628138754444748, 5.8376755613192275,
18.908737698203687, 11.170106247242, 22.555525595175727,
21.42914641207122
],
'Science': [
10.215624094265426, 11.85130160404758, 13.199743482703303,
15.771351181725294, 10.772433227719386, 18.259334358981764,
19.93021205055725
],
'music': [
10.570558131445923, 10.250703197641212, 8.71555097518865,
20.767746121844873, 15.130089494653312, 17.459999261696932,
17.10535281752909
],
'health': [
14.409021815474166, 8.996654196952731, 9.058311451032425,
20.374818020413127, 13.113089390107218, 13.622853268531996,
20.425251857488348
],
'write': [
17.20178646947119, 11.647858398657238, 13.549614784591249,
18.451414657788348, 8.415665936780018, 15.693785853424465,
15.039873899287489
],
'book': [
10.987786546263385, 10.80583601777249, 11.110641533898052,
21.917965372650762, 7.7575931269958955, 15.37978249492496,
22.040394907494466
],
'food': [
10.88637513076461, 11.972608253231327, 13.762365658958538,
18.449103701644535, 10.828866753473488, 15.403319360473219,
18.69736114145427
],
'movie': [
13.987702750429126, 13.781107282170971, 10.70081300442185,
14.950249677014197, 9.043151114164273, 14.990326778304123,
22.54664939349545
],
'long_dialogue': [
12.655129633685263, 12.128629670452108, 15.417359033606798,
8.805077038076321, 22.44683162734655, 19.3826287336546,
9.164344263178364
],
'blogs': [
7.586054691386359, 19.535003668901773, 15.361732493611802,
17.16924394200404, 16.86984484117092, 23.47812036292512
],
'academic_article': [
7.1513786821899865, 13.027210863148744, 17.517148962264663,
14.222879878391684, 18.018026707391165, 30.06335490661375
],
'report': [
8.962021075489186, 17.645150656180856, 17.07695284575253,
12.962529199816222, 18.77731391007885, 24.57603231268235
],
'news': [
5.746318852823619, 18.828108458188307, 18.348616241165825,
16.546667215885762, 20.49878321641544, 20.03150601552105
],
'question_generation': [
15.630644520221793, 17.815836405315725, 5.260151108793491,
5.260151108793491, 30.281435872156237, 25.751780984719254
],
'character_creation': [
13.387472615551518, 16.154170714995903, 5.3564749039425825,
17.745872651899493, 27.8316766814783, 19.5243324321322
],
'script_write': [
16.020800876858075, 12.537284513149297, 7.583604904411543,
10.962130120971509, 21.55253807214911, 31.343641512460472
],
'report_write': [
23.715406207770044, 11.322739017895511, 6.455129156251138,
7.266817046605194, 20.795517896089837, 30.444390675388284
],
'science_problem_solve': [
14.532002727010074, 13.988091295206875, 5.78110629330191,
13.906652976851941, 29.749526456076786, 22.042620251552407
],
'academic_write': [
18.274980968685334, 17.668799475735167, 5.373737221539396,
15.33990358340595, 27.116855004727352, 16.225723745906805
],
'guide_generation': [
24.991645087603484, 12.995989180532902, 11.348066943331492,
13.176536571757417, 19.238518079064633, 18.249244137710075
],
'creative_write': [
20.56735945510573, 13.865892755893375, 9.95947810767433,
16.610586533096885, 21.307725530193018, 17.68895761803666
],
'question_answering': [
14.257396776453227, 12.59853746572811, 5.7410180060529985,
15.959901439015228, 28.83810948056622, 22.60503683218423
],
'curriculum_development': [
20.68850512855878, 22.200461872620195, 5.8343282109082,
5.8343282109082, 17.89639729448703, 27.545979282517592
],
'continue_write': [
18.669885223104068, 21.418933575454858, 13.841889274353397,
6.502715042824038, 17.14288545529491, 22.423691428968727
],
'idea_generation': [
16.608491609592104, 24.45709647197801, 12.235414254617053,
5.504078770891624, 18.79437075684626, 22.400548136074956
],
'data_analysis': [
18.29675276651988, 5.722157365550123, 5.740218388378298,
20.92508664739828, 26.510684489335194, 22.80510034281823
],
'rewrite': [
20.801683025093183, 8.510828270810512, 11.130570080160155,
13.722027611417639, 19.803701313664753, 26.03118969885375
],
'explanation': [
10.313604819556165, 18.631545950717513, 16.412914400566404,
11.838586893660816, 19.111282531748692, 23.69206540375043
],
'continuation': [
21.427707308340644, 19.022158840412466, 16.220256947514333,
20.57043807105919, 22.759438832673375
],
'imitative_writing': [
19.87078310837695, 19.793380163686955, 19.346176082395687,
21.77086167116268, 19.218798974377737
],
'style_transfer': [
16.438886068023052, 18.226961726018953, 21.448441756584106,
23.961762450033103, 19.923947999340776
],
'story_writing': [
23.5319284319259, 22.420937450120597, 10.539906363853124,
17.047083302574496, 26.460144451525895
],
'keyword_writing': [
16.27370693012242, 27.30111800645728, 15.728682122621054,
18.81389796206547, 21.882594978733778
],
'screenplay_writing': [
19.822086987393824, 20.973270981524056, 17.095645893112255,
19.56592278203641, 22.543073355933444
],
'argumentative_writing': [
18.302865025230115, 24.50501277580138, 20.483643154138807,
14.552018259438853, 22.15646078539085
],
'roleplaying_writing': [
18.23837535323756, 22.299189217994243, 12.860964861892231,
19.918295740192793, 26.683174826683164
]
}


@LOAD_DATASET.register_module()
class HelloBenchDataset(BaseDataset):

    def load(self, path: str, category_name: str, *args, **kwargs):
        with open(f'{path}/{category_name}.jsonl', 'r', encoding='utf-8') as f:
            hellobench_dataset = [json.loads(line) for line in f.readlines()]
        for hellobench_dict in hellobench_dataset:
            hellobench_dict['judgement'] = {
                'category': category_name,
                'subcategory': hellobench_dict['category'],
                'num_checklist': hellobench_dict['num_checklist']
            }
        dataset = Dataset.from_list(hellobench_dataset)
        return dataset


def post_process_hellobench(judgement):
    """Extract each checklist score from the judge prediction.

    The prediction is expected to be a (possibly code-fenced) JSON list of
    dicts like {'checklist_id': 0, 'reason': 'xxx', 'evaluation_score': 0.5}.
    Returns the list of scores, or None if parsing or validation fails.
    """
    num_checklist = judgement['gold']['num_checklist']
    judgement = judgement['prediction']
    try:
        judgement = judgement.replace('```json', '').replace(
            '```python', '').replace('```', '')
        judgement = judgement.replace('\n', '').replace('\\', '')
        judgement_list = json.loads(judgement)
        return_list = []
        for judgement_dict in judgement_list:
            judgement_dict['checklist_id'] = int(
                judgement_dict['checklist_id'])
            judgement_dict['evaluation_score'] = float(
                judgement_dict['evaluation_score'])
            assert 0.0 <= judgement_dict['evaluation_score'] <= 1.0
            return_list.append(judgement_dict['evaluation_score'])
        assert len(return_list) == num_checklist
        return return_list
    except Exception:
        return None
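

# Example of the judge output handled by post_process_hellobench
# (hypothetical values):
#   post_process_hellobench({
#       'gold': {'num_checklist': 2},
#       'prediction': '[{"checklist_id": 0, "reason": "ok", "evaluation_score": 0.75}, '
#                     '{"checklist_id": 1, "reason": "ok", "evaluation_score": 1.0}]',
#   })  # -> [0.75, 1.0]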


def get_judgeanswer(result, filename, post_process):
    """Extract judgements (scores) from the judge model's outputs.

    Args:
        result (dict): result dict.
        filename (str): result path.
        post_process (function): The pre-defined extraction function.
    """
    if len(result) == 0:
        print('*' * 100)
        print('There are no results for ' + filename)
        print('*' * 100)
    rescaled_score_dict = {}
    for k, v in result.items():
        processed_judge = post_process(v)
        if processed_judge is not None:
            subcategory = v['gold']['subcategory']
            weighted_dict = REGRESSION_DICT[subcategory]
            overall_score = np.dot(weighted_dict, processed_judge)
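            # Rescale the 0-100 weighted score so that 75 maps to 0 and 100 maps
            # to 100 (scores below 75 become negative).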
            rescaled_score = (overall_score - 75) * 4
            rescaled_score_dict[k] = rescaled_score
    if len(rescaled_score_dict) <= 0.95 * len(result):
        print('*' * 100)
        print(
            f'For your {filename} judge, only {len(rescaled_score_dict)} of {len(result)} judgements were successfully extracted; please check!'
        )
        print('*' * 100)
    return rescaled_score_dict


@DICT_POSTPROCESSORS.register_module('hellobench')
def hellobench_postprocess(
    output: dict,
    output_path: str,
) -> dict:
    rescaled_score_dict = get_judgeanswer(output, output_path,
                                          post_process_hellobench)
    results = {}
    results['overall_score'] = np.mean(list(rescaled_score_dict.values()))
    results['details'] = output
    for k, v in results['details'].items():
        if k in rescaled_score_dict:
            results['details'][k]['rescaled_score'] = rescaled_score_dict[k]
        else:
            results['details'][k]['rescaled_score'] = None
    return results


@@ -324,7 +324,9 @@ class DefaultSubjectiveSummarizer:
        else:
            output_csv_path = output_path.replace('.txt', '.csv')
            output_path = output_path.split('.txt')[0] + '_by_' + judge_abbr + '.txt'
            output_csv_path = output_csv_path.split('.csv')[0] + '_by_' + judge_abbr + '.csv'
        output_dir = osp.split(output_path)[0]
        mmengine.mkdir_or_exist(output_dir)
        with open(output_path, 'w', encoding='utf-8') as f: