Upload HelloBench (#1607)

* upload hellobench

* update hellobench

* update readme.md

* update eval_hellobench.py

* update latest

---------

Co-authored-by: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com>
Haoran Que 2024-10-15 17:11:37 +08:00 committed by GitHub
parent fa54aa62f6
commit 4fe251729b
6 changed files with 535 additions and 0 deletions


@@ -0,0 +1,96 @@
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.subjective.hellobench.hellobench import hellobench_datasets

from opencompass.models import HuggingFacewithChatTemplate, OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import DefaultSubjectiveSummarizer

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we usually enable sampling (do_sample=True) for the evaluated models.
# Make sure the generation parameters of your models are set consistently: for example, if you use temperature=0.8, set temperature=0.8 for every evaluated model.
models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='glm-4-9b-chat-hf',
        path='THUDM/glm-4-9b-chat',
        max_out_len=16384,
        generation_kwargs=dict(
            temperature=0.8,
            do_sample=True,  # For subjective evaluation, we suggest setting do_sample=True during model inference.
        ),
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        batch_size=1,
        run_cfg=dict(num_gpus=2, num_procs=1),
        stop_words=['<|endoftext|>', '<|user|>', '<|observation|>'],
    )
]

datasets = [*hellobench_datasets]  # add datasets you want

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=OpenICLInferTask)),
)

# -------------Evaluation Stage ----------------------------------------
# ------------- JudgeLLM Configuration
# We recommend using GPT-4o-mini as the judge model.
# If you want to use open-source LLMs as judge models, you can uncomment the following code:
# judge_models = [
#     dict(
#         type=HuggingFacewithChatTemplate,
#         abbr='glm-4-9b-chat-hf',
#         path='THUDM/glm-4-9b-chat',
#         max_out_len=16384,
#         generation_kwargs=dict(
#             temperature=0.8,
#             do_sample=True,  # For subjective evaluation, we suggest setting do_sample=True during model inference.
#         ),
#         model_kwargs=dict(
#             device_map='auto',
#             trust_remote_code=True,
#         ),
#         batch_size=1,
#         run_cfg=dict(num_gpus=2, num_procs=1),
#         stop_words=['<|endoftext|>', '<|user|>', '<|observation|>'],
#     )
# ]

judge_models = [dict(
    abbr='GPT4o',
    type=OpenAI,
    path='gpt-4o',
    key='xxxx',  # The key is read from $OPENAI_API_KEY, but you can also write your key here directly.
    meta_template=api_meta_template,
    query_per_second=16,
    max_out_len=4096,
    batch_size=1,
    temperature=0.8,
    seed=42,
)]

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(type=SubjectiveNaivePartitioner, models=models, judge_models=judge_models,),
    runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=SubjectiveEvalTask)),
)

summarizer = dict(type=DefaultSubjectiveSummarizer)

work_dir = 'outputs/hellobench/'


@@ -0,0 +1,51 @@
# Guideline for evaluating HelloBench on Diverse LLMs
HelloBench is a comprehensive, in-the-wild, and open-ended benchmark for evaluating the long text generation capabilities of LLMs. More details can be found in the [🌐Github Repo](https://github.com/Quehry/HelloBench) and the [📖Paper](https://arxiv.org/abs/2409.16191).
## Detailed instructions for evaluating HelloBench in OpenCompass
1. Clone OpenCompass
```shell
cd ~
git clone git@github.com:open-compass/opencompass.git
cd opencompass
```
2. Download the HelloBench data from this [Google Drive URL](https://drive.google.com/file/d/1EJTmMFgCs2pDy9l0wB5idvp3XzjYEsi9/view?usp=sharing) (or fetch it from the command line, as sketched after this step), unzip it, and place it at `OPENCOMPASS_PATH/data/HelloBench`, so that the directory looks like this:
```
~/opencompass/data/
└── HelloBench
├── chat.jsonl
├── heuristic_text_generation.jsonl
├── length_constrained_data
│ ├── heuristic_text_generation_16k.jsonl
│ ├── heuristic_text_generation_2k.jsonl
│ ├── heuristic_text_generation_4k.jsonl
│ └── heuristic_text_generation_8k.jsonl
├── open_ended_qa.jsonl
├── summarization.jsonl
└── text_completion.jsonl
```
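
You can also fetch the archive from the command line; here is a minimal sketch using the third-party `gdown` package (the file id comes from the Google Drive URL above, and you may need to adjust the unzip target so the layout matches the tree shown):
```shell
pip install gdown
gdown 'https://drive.google.com/uc?id=1EJTmMFgCs2pDy9l0wB5idvp3XzjYEsi9' -O HelloBench.zip
unzip HelloBench.zip -d ~/opencompass/data/
```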
3. Set up OpenCompass
```
cd ~/opencompass
pip install -e .
```
4. Configure your run in `configs/eval_hellobench.py`
- set the models to be evaluated
- set the judge model (we recommend using GPT-4o-mini)
5. Launch the evaluation (see the note on the judge API key at the end of this guide):
```
python run.py configs/eval_hellobench.py
```
6. After that, you can find the results in `outputs/hellobench/xxx/summary`.
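
Note: the default judge model in `configs/eval_hellobench.py` is an OpenAI model, so make sure your API key is available in the environment before launching, for example:
```shell
export OPENAI_API_KEY=your_api_key  # placeholder; use your own key
python run.py configs/eval_hellobench.py
```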


@@ -0,0 +1,111 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import HelloBenchDataset, hellobench_postprocess
system_prompt = """You are a helpful evaluator. Your task is to evaluate the checklists of the responses given by the Large Language Models (LLMs) based on user instructions. These checklists consist of yes or no questions."""
user_prompt = """Your core task is to evaluate the checklists based on the user's instruction and the LLM's response, with each checklist item being a yes or no question indicating a specific aspect that the LLM's response should meet. You need to judge each checklist item based on the instruction and response. The evaluation results are scored from 0 to 1, with 5 scores in total, which are:
0: The response fails to meet the checklist requirements, demonstrating substantial need for improvement across multiple areas.
0.25: The response partially meets some checklist requirements, but significant elements remain unaddressed.
0.5: The response meets several checklist requirements, yet the overall evaluation appears ambiguous or unclear.
0.75: The response aligns with most checklist requirements, though there are still minor areas that could be refined or enhanced.
1: The response fully satisfies all checklist requirements, with no identifiable issues or areas for improvement. It means this response is already perfect; you can't find any significant flaws in it.
Here is the instruction:
{{\"instruction\": {instruction}}}
Here is the response given by LLM:
{{\"response\": {prediction}}}
Since the response may be rather long, I am specifically reminding you here that the response has ended.
Here are checklists of this instruction:
{{\"checklists\": {formatted_checklists}}}
To further remind you, I will repeat my requirements:
Your core task is to evaluate the checklists based on the user's instruction and the LLM's response, with each checklist item being a yes or no question indicating a specific aspect that the LLM's response should meet. You need to judge each checklist item based on the instruction and response. The evaluation results are scored from 0 to 1, with 5 scores in total, which are:
0: The response fails to meet the checklist requirements, demonstrating substantial need for improvement across multiple areas.
0.25: The response partially meets some checklist requirements, but significant elements remain unaddressed.
0.5: The response meets several checklist requirements, yet the overall evaluation appears ambiguous or unclear.
0.75: The response aligns with most checklist requirements, though there are still minor areas that could be refined or enhanced.
1: The response fully satisfies all checklist requirements, with no identifiable issues or areas for improvement. It means this response is already perfect; you can't find any significant flaws in it.
Always provide the reason for your evaluation results. You should be strict but fair in your evaluation. A score of 1 means that the response perfectly meets all the checklist requirements and you think there are really no room for improvements. When giving a score of 1, you need to carefully consider whether this checklist has been perfectly satisfied.
Evaluate all the checklists and return the evaluation results of the checklists. Output a Python List consisting of the Python Dictionary formatted as follows:
[{{\"checklist_id\": \"the id of the checklist\", \"reason\": \"The reason for your evaluation results\", \"evaluation_score\": \"Your evaluation score for this checklist\"}},{{\"checklist_id\": \"the id of the checklist\",
\"reason\": \"The reason for your evaluation results\", \"evaluation_score\": \"Your evaluation score for this checklist\"}}]
There are {num_checklist} checklists in total that you need to evaluate. The length of the output list must equal the number of checklists, and you should give an evaluation score for each checklist. You should be very, very strict in the evaluation to further compare the responses from different models. Your response must be a valid Python List and should contain nothing else, as it will be directly executed in Python."""

subjective_reader_cfg = dict(
    input_columns=['instruction', 'formatted_checklists', 'num_checklist'],
    output_column='judgement',
)

hellobench_categories = [
    'open_ended_qa',
    'summarization',
    'chat',
    'text_completion',
    'heuristic_text_generation',
]
data_path = 'data/HelloBench'

hellobench_datasets = []
for category_name in hellobench_categories:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt='{instruction}'
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=16384),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt=system_prompt)
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=user_prompt
                        ),
                    ]),
            ),
            dict_postprocessor=dict(type=hellobench_postprocess,),
        ),
        pred_role='BOT',
    )

    hellobench_datasets.append(
        dict(
            abbr=f'HelloBench-{category_name}',
            type=HelloBenchDataset,
            path=data_path,
            category_name=category_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg,
            mode='singlescore',
        ))


@@ -16,6 +16,7 @@ from .flames import FlamesDataset # noqa: F401, F403
from .fofo import FofoDataset, fofo_postprocess # noqa: F401, F403
from .followbench import FollowBenchDataset # noqa: F401, F403
from .followbench import followbench_postprocess
from .hellobench import * # noqa: F401, F403
from .judgerbench import JudgerBenchDataset # noqa: F401, F403
from .judgerbench import JudgerBenchEvaluator # noqa: F401, F403
from .mtbench import MTBenchDataset, mtbench_postprocess # noqa: F401, F403


@@ -0,0 +1,274 @@
# flake8: noqa: E501
import json
import numpy as np
from datasets import Dataset
from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from ..base import BaseDataset
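
# Per-subcategory weights used in get_judgeanswer() to combine the 0-1
# checklist scores into an overall score via a dot product. Each list sums to
# roughly 100, presumably with one weight per checklist item of that subcategory.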
REGRESSION_DICT = {
'travel': [
14.816357060044538, 9.912640189521913, 6.854178078417421,
16.548365732493735, 12.49440306294194, 19.925026350726633,
19.449029525853824
],
'Tech': [
9.730382391494699, 15.439961810101806, 8.71267868266836,
17.047912114497525, 8.188210881912578, 18.27285649160541,
22.607997627719627
],
'Sport': [
10.470669731543392, 9.628138754444748, 5.8376755613192275,
18.908737698203687, 11.170106247242, 22.555525595175727,
21.42914641207122
],
'Science': [
10.215624094265426, 11.85130160404758, 13.199743482703303,
15.771351181725294, 10.772433227719386, 18.259334358981764,
19.93021205055725
],
'music': [
10.570558131445923, 10.250703197641212, 8.71555097518865,
20.767746121844873, 15.130089494653312, 17.459999261696932,
17.10535281752909
],
'health': [
14.409021815474166, 8.996654196952731, 9.058311451032425,
20.374818020413127, 13.113089390107218, 13.622853268531996,
20.425251857488348
],
'write': [
17.20178646947119, 11.647858398657238, 13.549614784591249,
18.451414657788348, 8.415665936780018, 15.693785853424465,
15.039873899287489
],
'book': [
10.987786546263385, 10.80583601777249, 11.110641533898052,
21.917965372650762, 7.7575931269958955, 15.37978249492496,
22.040394907494466
],
'food': [
10.88637513076461, 11.972608253231327, 13.762365658958538,
18.449103701644535, 10.828866753473488, 15.403319360473219,
18.69736114145427
],
'movie': [
13.987702750429126, 13.781107282170971, 10.70081300442185,
14.950249677014197, 9.043151114164273, 14.990326778304123,
22.54664939349545
],
'long_dialogue': [
12.655129633685263, 12.128629670452108, 15.417359033606798,
8.805077038076321, 22.44683162734655, 19.3826287336546,
9.164344263178364
],
'blogs': [
7.586054691386359, 19.535003668901773, 15.361732493611802,
17.16924394200404, 16.86984484117092, 23.47812036292512
],
'academic_article': [
7.1513786821899865, 13.027210863148744, 17.517148962264663,
14.222879878391684, 18.018026707391165, 30.06335490661375
],
'report': [
8.962021075489186, 17.645150656180856, 17.07695284575253,
12.962529199816222, 18.77731391007885, 24.57603231268235
],
'news': [
5.746318852823619, 18.828108458188307, 18.348616241165825,
16.546667215885762, 20.49878321641544, 20.03150601552105
],
'question_generation': [
15.630644520221793, 17.815836405315725, 5.260151108793491,
5.260151108793491, 30.281435872156237, 25.751780984719254
],
'character_creation': [
13.387472615551518, 16.154170714995903, 5.3564749039425825,
17.745872651899493, 27.8316766814783, 19.5243324321322
],
'script_write': [
16.020800876858075, 12.537284513149297, 7.583604904411543,
10.962130120971509, 21.55253807214911, 31.343641512460472
],
'report_write': [
23.715406207770044, 11.322739017895511, 6.455129156251138,
7.266817046605194, 20.795517896089837, 30.444390675388284
],
'science_problem_solve': [
14.532002727010074, 13.988091295206875, 5.78110629330191,
13.906652976851941, 29.749526456076786, 22.042620251552407
],
'academic_write': [
18.274980968685334, 17.668799475735167, 5.373737221539396,
15.33990358340595, 27.116855004727352, 16.225723745906805
],
'guide_generation': [
24.991645087603484, 12.995989180532902, 11.348066943331492,
13.176536571757417, 19.238518079064633, 18.249244137710075
],
'creative_write': [
20.56735945510573, 13.865892755893375, 9.95947810767433,
16.610586533096885, 21.307725530193018, 17.68895761803666
],
'question_answering': [
14.257396776453227, 12.59853746572811, 5.7410180060529985,
15.959901439015228, 28.83810948056622, 22.60503683218423
],
'curriculum_development': [
20.68850512855878, 22.200461872620195, 5.8343282109082,
5.8343282109082, 17.89639729448703, 27.545979282517592
],
'continue_write': [
18.669885223104068, 21.418933575454858, 13.841889274353397,
6.502715042824038, 17.14288545529491, 22.423691428968727
],
'idea_generation': [
16.608491609592104, 24.45709647197801, 12.235414254617053,
5.504078770891624, 18.79437075684626, 22.400548136074956
],
'data_analysis': [
18.29675276651988, 5.722157365550123, 5.740218388378298,
20.92508664739828, 26.510684489335194, 22.80510034281823
],
'rewrite': [
20.801683025093183, 8.510828270810512, 11.130570080160155,
13.722027611417639, 19.803701313664753, 26.03118969885375
],
'explanation': [
10.313604819556165, 18.631545950717513, 16.412914400566404,
11.838586893660816, 19.111282531748692, 23.69206540375043
],
'continuation': [
21.427707308340644, 19.022158840412466, 16.220256947514333,
20.57043807105919, 22.759438832673375
],
'imitative_writing': [
19.87078310837695, 19.793380163686955, 19.346176082395687,
21.77086167116268, 19.218798974377737
],
'style_transfer': [
16.438886068023052, 18.226961726018953, 21.448441756584106,
23.961762450033103, 19.923947999340776
],
'story_writing': [
23.5319284319259, 22.420937450120597, 10.539906363853124,
17.047083302574496, 26.460144451525895
],
'keyword_writing': [
16.27370693012242, 27.30111800645728, 15.728682122621054,
18.81389796206547, 21.882594978733778
],
'screenplay_writing': [
19.822086987393824, 20.973270981524056, 17.095645893112255,
19.56592278203641, 22.543073355933444
],
'argumentative_writing': [
18.302865025230115, 24.50501277580138, 20.483643154138807,
14.552018259438853, 22.15646078539085
],
'roleplaying_writing': [
18.23837535323756, 22.299189217994243, 12.860964861892231,
19.918295740192793, 26.683174826683164
]
}


@LOAD_DATASET.register_module()
class HelloBenchDataset(BaseDataset):

    def load(self, path: str, category_name: str, *args, **kwargs):
        with open(f'{path}/{category_name}.jsonl', 'r', encoding='utf-8') as f:
            hellobench_dataset = [json.loads(line) for line in f.readlines()]
        for hellobench_dict in hellobench_dataset:
            hellobench_dict['judgement'] = {
                'category': category_name,
                'subcategory': hellobench_dict['category'],
                'num_checklist': hellobench_dict['num_checklist']
            }
        dataset = Dataset.from_list(hellobench_dataset)
        return dataset


def post_process_hellobench(judgement):
    """Extract each checklist score from the judge prediction.

    The prediction is expected to be a (possibly code-fenced) JSON list of
    dicts like {'checklist_id': 0, 'reason': 'xxx', 'evaluation_score': 0.5}.
    Returns the list of scores, or None if parsing or validation fails.
    """
    num_checklist = judgement['gold']['num_checklist']
    judgement = judgement['prediction']
    try:
        judgement = judgement.replace('```json', '').replace(
            '```python', '').replace('```', '')
        judgement = judgement.replace('\n', '').replace('\\', '')
        judgement_list = json.loads(judgement)
        return_list = []
        for judgement_dict in judgement_list:
            judgement_dict['checklist_id'] = int(
                judgement_dict['checklist_id'])
            judgement_dict['evaluation_score'] = float(
                judgement_dict['evaluation_score'])
            assert 0.0 <= judgement_dict['evaluation_score'] <= 1.0
            return_list.append(judgement_dict['evaluation_score'])
        assert len(return_list) == num_checklist
        return return_list
    except Exception:
        return None
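

# Example of the judge output handled by post_process_hellobench
# (hypothetical values):
#   post_process_hellobench({
#       'gold': {'num_checklist': 2},
#       'prediction': '[{"checklist_id": 0, "reason": "ok", "evaluation_score": 0.75}, '
#                     '{"checklist_id": 1, "reason": "ok", "evaluation_score": 1.0}]',
#   })  # -> [0.75, 1.0]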


def get_judgeanswer(result, filename, post_process):
    """Extract judgements (scores) from the judge model's outputs.

    Args:
        result (dict): result dict.
        filename (str): result path.
        post_process (function): The pre-defined extraction function.
    """
    if len(result) == 0:
        print('*' * 100)
        print('There are no results for ' + filename)
        print('*' * 100)
    rescaled_score_dict = {}
    for k, v in result.items():
        processed_judge = post_process(v)
        if processed_judge is not None:
            subcategory = v['gold']['subcategory']
            weighted_dict = REGRESSION_DICT[subcategory]
            overall_score = np.dot(weighted_dict, processed_judge)
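            # Rescale the 0-100 weighted score so that 75 maps to 0 and 100 maps
            # to 100 (scores below 75 become negative).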
            rescaled_score = (overall_score - 75) * 4
            rescaled_score_dict[k] = rescaled_score
    if len(rescaled_score_dict) <= 0.95 * len(result):
        print('*' * 100)
        print(
            f'For your {filename} judge, only {len(rescaled_score_dict)} of {len(result)} judgements were successfully extracted; please check!'
        )
        print('*' * 100)
    return rescaled_score_dict


@DICT_POSTPROCESSORS.register_module('hellobench')
def hellobench_postprocess(
    output: dict,
    output_path: str,
) -> dict:
    rescaled_score_dict = get_judgeanswer(output, output_path,
                                          post_process_hellobench)
    results = {}
    results['overall_score'] = np.mean(list(rescaled_score_dict.values()))
    results['details'] = output
    for k, v in results['details'].items():
        if k in rescaled_score_dict:
            results['details'][k]['rescaled_score'] = rescaled_score_dict[k]
        else:
            results['details'][k]['rescaled_score'] = None
    return results


@@ -324,7 +324,9 @@ class DefaultSubjectiveSummarizer:
        else:
            output_csv_path = output_path.replace('.txt', '.csv')
            output_path = output_path.split('.txt')[0] + '_by_' + judge_abbr + '.txt'
            output_csv_path = output_csv_path.split('.csv')[0] + '_by_' + judge_abbr + '.csv'
        output_dir = osp.split(output_path)[0]
        mmengine.mkdir_or_exist(output_dir)
        with open(output_path, 'w', encoding='utf-8') as f: