# flake8: noqa: E501
import json

import numpy as np
from datasets import Dataset

from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET

from ..base import BaseDataset

# Per-subcategory weights applied to the checklist scores of one sample.
# The list for a subcategory is dotted with the list of per-checklist
# scores (see ``get_judgeanswer``), so its length has to equal that
# sample's ``num_checklist``.
# NOTE(review): each weight list appears to sum to roughly 100 -- confirm
# against the HelloBench reference implementation before editing values.
REGRESSION_DICT = {
    'travel': [
        14.816357060044538, 9.912640189521913, 6.854178078417421,
        16.548365732493735, 12.49440306294194, 19.925026350726633,
        19.449029525853824
    ],
    'Tech': [
        9.730382391494699, 15.439961810101806, 8.71267868266836,
        17.047912114497525, 8.188210881912578, 18.27285649160541,
        22.607997627719627
    ],
    'Sport': [
        10.470669731543392, 9.628138754444748, 5.8376755613192275,
        18.908737698203687, 11.170106247242, 22.555525595175727,
        21.42914641207122
    ],
    'Science': [
        10.215624094265426, 11.85130160404758, 13.199743482703303,
        15.771351181725294, 10.772433227719386, 18.259334358981764,
        19.93021205055725
    ],
    'music': [
        10.570558131445923, 10.250703197641212, 8.71555097518865,
        20.767746121844873, 15.130089494653312, 17.459999261696932,
        17.10535281752909
    ],
    'health': [
        14.409021815474166, 8.996654196952731, 9.058311451032425,
        20.374818020413127, 13.113089390107218, 13.622853268531996,
        20.425251857488348
    ],
    'write': [
        17.20178646947119, 11.647858398657238, 13.549614784591249,
        18.451414657788348, 8.415665936780018, 15.693785853424465,
        15.039873899287489
    ],
    'book': [
        10.987786546263385, 10.80583601777249, 11.110641533898052,
        21.917965372650762, 7.7575931269958955, 15.37978249492496,
        22.040394907494466
    ],
    'food': [
        10.88637513076461, 11.972608253231327, 13.762365658958538,
        18.449103701644535, 10.828866753473488, 15.403319360473219,
        18.69736114145427
    ],
    'movie': [
        13.987702750429126, 13.781107282170971, 10.70081300442185,
        14.950249677014197, 9.043151114164273, 14.990326778304123,
        22.54664939349545
    ],
    'long_dialogue': [
        12.655129633685263, 12.128629670452108, 15.417359033606798,
        8.805077038076321, 22.44683162734655, 19.3826287336546,
        9.164344263178364
    ],
    'blogs': [
        7.586054691386359, 19.535003668901773, 15.361732493611802,
        17.16924394200404, 16.86984484117092, 23.47812036292512
    ],
    'academic_article': [
        7.1513786821899865, 13.027210863148744, 17.517148962264663,
        14.222879878391684, 18.018026707391165, 30.06335490661375
    ],
    'report': [
        8.962021075489186, 17.645150656180856, 17.07695284575253,
        12.962529199816222, 18.77731391007885, 24.57603231268235
    ],
    'news': [
        5.746318852823619, 18.828108458188307, 18.348616241165825,
        16.546667215885762, 20.49878321641544, 20.03150601552105
    ],
    'question_generation': [
        15.630644520221793, 17.815836405315725, 5.260151108793491,
        5.260151108793491, 30.281435872156237, 25.751780984719254
    ],
    'character_creation': [
        13.387472615551518, 16.154170714995903, 5.3564749039425825,
        17.745872651899493, 27.8316766814783, 19.5243324321322
    ],
    'script_write': [
        16.020800876858075, 12.537284513149297, 7.583604904411543,
        10.962130120971509, 21.55253807214911, 31.343641512460472
    ],
    'report_write': [
        23.715406207770044, 11.322739017895511, 6.455129156251138,
        7.266817046605194, 20.795517896089837, 30.444390675388284
    ],
    'science_problem_solve': [
        14.532002727010074, 13.988091295206875, 5.78110629330191,
        13.906652976851941, 29.749526456076786, 22.042620251552407
    ],
    'academic_write': [
        18.274980968685334, 17.668799475735167, 5.373737221539396,
        15.33990358340595, 27.116855004727352, 16.225723745906805
    ],
    'guide_generation': [
        24.991645087603484, 12.995989180532902, 11.348066943331492,
        13.176536571757417, 19.238518079064633, 18.249244137710075
    ],
    'creative_write': [
        20.56735945510573, 13.865892755893375, 9.95947810767433,
        16.610586533096885, 21.307725530193018, 17.68895761803666
    ],
    'question_answering': [
        14.257396776453227, 12.59853746572811, 5.7410180060529985,
        15.959901439015228, 28.83810948056622, 22.60503683218423
    ],
    'curriculum_development': [
        20.68850512855878, 22.200461872620195, 5.8343282109082,
        5.8343282109082, 17.89639729448703, 27.545979282517592
    ],
    'continue_write': [
        18.669885223104068, 21.418933575454858, 13.841889274353397,
        6.502715042824038, 17.14288545529491, 22.423691428968727
    ],
    'idea_generation': [
        16.608491609592104, 24.45709647197801, 12.235414254617053,
        5.504078770891624, 18.79437075684626, 22.400548136074956
    ],
    'data_analysis': [
        18.29675276651988, 5.722157365550123, 5.740218388378298,
        20.92508664739828, 26.510684489335194, 22.80510034281823
    ],
    'rewrite': [
        20.801683025093183, 8.510828270810512, 11.130570080160155,
        13.722027611417639, 19.803701313664753, 26.03118969885375
    ],
    'explanation': [
        10.313604819556165, 18.631545950717513, 16.412914400566404,
        11.838586893660816, 19.111282531748692, 23.69206540375043
    ],
    'continuation': [
        21.427707308340644, 19.022158840412466, 16.220256947514333,
        20.57043807105919, 22.759438832673375
    ],
    'imitative_writing': [
        19.87078310837695, 19.793380163686955, 19.346176082395687,
        21.77086167116268, 19.218798974377737
    ],
    'style_transfer': [
        16.438886068023052, 18.226961726018953, 21.448441756584106,
        23.961762450033103, 19.923947999340776
    ],
    'story_writing': [
        23.5319284319259, 22.420937450120597, 10.539906363853124,
        17.047083302574496, 26.460144451525895
    ],
    'keyword_writing': [
        16.27370693012242, 27.30111800645728, 15.728682122621054,
        18.81389796206547, 21.882594978733778
    ],
    'screenplay_writing': [
        19.822086987393824, 20.973270981524056, 17.095645893112255,
        19.56592278203641, 22.543073355933444
    ],
    'argumentative_writing': [
        18.302865025230115, 24.50501277580138, 20.483643154138807,
        14.552018259438853, 22.15646078539085
    ],
    'roleplaying_writing': [
        18.23837535323756, 22.299189217994243, 12.860964861892231,
        19.918295740192793, 26.683174826683164
    ]
}


@LOAD_DATASET.register_module()
class HelloBenchDataset(BaseDataset):
    """Loader for one HelloBench category stored as a JSON-lines file."""

    def load(self, path: str, category_name: str, *args, **kwargs):
        """Read ``{path}/{category_name}.jsonl`` into a ``Dataset``.

        Every record gets an extra ``judgement`` entry holding the fields
        the post-processor later reads back from ``gold``.

        Args:
            path (str): Directory that contains the category files.
            category_name (str): File stem of the category to load.
            *args: Accepted and ignored.
            **kwargs: Accepted and ignored.

        Returns:
            Dataset: One row per non-blank line of the file.

        Raises:
            KeyError: If a record lacks ``category`` or ``num_checklist``.
        """
        with open(f'{path}/{category_name}.jsonl', 'r',
                  encoding='utf-8') as f:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # valid JSON documents, so skip them instead of crashing.
            hellobench_dataset = [
                json.loads(line) for line in f if line.strip()
            ]

        for hellobench_dict in hellobench_dataset:
            hellobench_dict['judgement'] = {
                'category': category_name,
                'subcategory': hellobench_dict['category'],
                'num_checklist': hellobench_dict['num_checklist']
            }

        return Dataset.from_list(hellobench_dataset)


def post_process_hellobench(judgement):
    """Extract the per-checklist scores from one judge response.

    The prediction is expected to be a JSON list of objects such as
    {'checklist_id': 0, 'reason': 'xxx', 'evaluation_score': 0.5},
    optionally wrapped in a Markdown code fence.

    Args:
        judgement (dict): Must provide ``judgement['gold']['num_checklist']``
            and the raw judge text in ``judgement['prediction']``.

    Returns:
        list[float] | None: The ``evaluation_score`` values in response
        order, or ``None`` when the response cannot be parsed, a score is
        outside ``[0.0, 1.0]``, or the number of scores differs from
        ``num_checklist``.

    Raises:
        KeyError: If ``gold``, ``num_checklist`` or ``prediction`` is
            missing from ``judgement``.
    """
    num_checklist = judgement['gold']['num_checklist']
    prediction = judgement['prediction']
    try:
        # Strip Markdown code fences the judge may have wrapped around
        # the payload, then newlines and backslashes, before parsing.
        cleaned = prediction.replace('```json',
                                     '').replace('```python',
                                                 '').replace('```', '')
        cleaned = cleaned.replace('\n', '').replace('\\', '')

        scores = []
        for judgement_dict in json.loads(cleaned):
            # The id is not used afterwards; the conversion only checks
            # that the entry carries an integer-like ``checklist_id``.
            judgement_dict['checklist_id'] = int(
                judgement_dict['checklist_id'])
            score = float(judgement_dict['evaluation_score'])
            # Explicit checks rather than ``assert`` so that validation
            # still runs under ``python -O``. NaN fails this comparison.
            if not 0.0 <= score <= 1.0:
                raise ValueError(f'evaluation_score out of range: {score}')
            scores.append(score)

        if len(scores) != num_checklist:
            raise ValueError(
                f'expected {num_checklist} scores, got {len(scores)}')
        return scores
    except Exception:
        # Best-effort parsing of free-form model output: any malformed
        # response is reported as "not extracted". Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        return None


def get_judgeanswer(result, filename, post_process):
    """Extract judgements (scores)

    Args:
        result (dict): result dict.
        filename (str): result path.
        post_process (function): The pre-defined extract function.

    Returns:
        dict: Maps each key of ``result`` whose judgement could be
        extracted to its rescaled score; other keys are omitted.

    Raises:
        KeyError: If a sample's subcategory is not in ``REGRESSION_DICT``.
        ValueError: Raised by ``np.dot`` when the number of extracted
            scores differs from the number of weights.
    """
    if not result:
        print('*' * 100)
        print('There are no results for ' + filename)
        print('*' * 100)

    rescaled_score_dict = {}
    for k, v in result.items():
        processed_judge = post_process(v)
        if processed_judge is None:
            continue
        subcategory = v['gold']['subcategory']
        weights = REGRESSION_DICT[subcategory]
        # Weighted sum of the checklist scores for this sample.
        overall_score = np.dot(weights, processed_judge)
        # Linear rescaling: an overall score of 75 maps to 0, 100 to 100.
        rescaled_score_dict[k] = (overall_score - 75) * 4

    # Warn when 5% or more of the judgements could not be extracted.
    if len(rescaled_score_dict) <= 0.95 * len(result):
        print('*' * 100)
        print(
            f'For your {filename} judge. Among {len(result)} judgements, successfully extracted {len(rescaled_score_dict)} judgements, please check!'
        )
        print('*' * 100)
    return rescaled_score_dict


@DICT_POSTPROCESSORS.register_module('hellobench')
def hellobench_postprocess(
    output: dict,
    output_path: str,
) -> dict:
    """Aggregate judge outputs into an overall HelloBench score.

    Args:
        output (dict): Judge results keyed by sample id. Each value is
            updated in place with a ``rescaled_score`` entry.
        output_path (str): Result path, used only in diagnostic messages.

    Returns:
        dict: ``overall_score`` is the mean rescaled score over the
        samples whose judgement could be extracted (NaN when there are
        none). ``details`` is ``output`` itself, where every entry has
        ``rescaled_score`` set to its score or ``None``.
    """
    rescaled_score_dict = get_judgeanswer(output, output_path,
                                          post_process_hellobench)

    scores = list(rescaled_score_dict.values())
    results = {}
    # ``np.mean`` of an empty list also yields NaN, but with a
    # RuntimeWarning; produce the same value quietly.
    results['overall_score'] = np.mean(scores) if scores else float('nan')
    results['details'] = output
    for k, detail in results['details'].items():
        # ``dict.get`` returns None for samples that were not extracted.
        detail['rescaled_score'] = rescaled_score_dict.get(k)
    return results