mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Feature] Update Models, Summarizers (#1600)
This commit is contained in:
parent
d91d66792a
commit
df57c08ccf
@ -0,0 +1,13 @@
|
|||||||
|
from opencompass.models import HuggingFacewithChatTemplate
|
||||||
|
|
||||||
|
models = [
|
||||||
|
dict(
|
||||||
|
type=HuggingFacewithChatTemplate,
|
||||||
|
abbr='llama-3_2-3b-instruct-hf',
|
||||||
|
path='meta-llama/Llama-3.2-3B-Instruct',
|
||||||
|
max_out_len=1024,
|
||||||
|
batch_size=8,
|
||||||
|
run_cfg=dict(num_gpus=1),
|
||||||
|
stop_words=['<|end_of_text|>', '<|eot_id|>'],
|
||||||
|
)
|
||||||
|
]
|
@ -0,0 +1,16 @@
|
|||||||
|
from opencompass.models import TurboMindModelwithChatTemplate
|
||||||
|
|
||||||
|
models = [
|
||||||
|
dict(
|
||||||
|
type=TurboMindModelwithChatTemplate,
|
||||||
|
abbr='llama-3_2-3b-instruct-turbomind',
|
||||||
|
path='meta-llama/Llama-3.2-3B-Instruct',
|
||||||
|
engine_config=dict(max_batch_size=16, tp=1),
|
||||||
|
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
|
||||||
|
max_seq_len=16384,
|
||||||
|
max_out_len=4096,
|
||||||
|
batch_size=16,
|
||||||
|
run_cfg=dict(num_gpus=1),
|
||||||
|
stop_words=['<|end_of_text|>', '<|eot_id|>'],
|
||||||
|
)
|
||||||
|
]
|
@ -0,0 +1,12 @@
|
|||||||
|
from opencompass.models import HuggingFacewithChatTemplate
|
||||||
|
|
||||||
|
models = [
|
||||||
|
dict(
|
||||||
|
type=HuggingFacewithChatTemplate,
|
||||||
|
abbr='mistral-nemo-instruct-2407-hf',
|
||||||
|
path='mistralai/Mistral-Nemo-Instruct-2407',
|
||||||
|
max_out_len=1024,
|
||||||
|
batch_size=8,
|
||||||
|
run_cfg=dict(num_gpus=1),
|
||||||
|
)
|
||||||
|
]
|
@ -0,0 +1,12 @@
|
|||||||
|
from opencompass.models import HuggingFacewithChatTemplate
|
||||||
|
|
||||||
|
models = [
|
||||||
|
dict(
|
||||||
|
type=HuggingFacewithChatTemplate,
|
||||||
|
abbr='mistral-small-instruct-2409-hf',
|
||||||
|
path='mistralai/Mistral-Small-Instruct-2409',
|
||||||
|
max_out_len=1024,
|
||||||
|
batch_size=8,
|
||||||
|
run_cfg=dict(num_gpus=2),
|
||||||
|
)
|
||||||
|
]
|
@ -0,0 +1,15 @@
|
|||||||
|
from opencompass.models import TurboMindModelwithChatTemplate
|
||||||
|
|
||||||
|
models = [
|
||||||
|
dict(
|
||||||
|
type=TurboMindModelwithChatTemplate,
|
||||||
|
abbr='mistral-nemo-instruct-2407-turbomind',
|
||||||
|
path='mistralai/Mistral-Nemo-Instruct-2407',
|
||||||
|
engine_config=dict(session_len=32768, max_batch_size=16, tp=1),
|
||||||
|
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
|
||||||
|
max_seq_len=32768,
|
||||||
|
max_out_len=4096,
|
||||||
|
batch_size=16,
|
||||||
|
run_cfg=dict(num_gpus=1),
|
||||||
|
)
|
||||||
|
]
|
@ -0,0 +1,15 @@
|
|||||||
|
from opencompass.models import TurboMindModelwithChatTemplate
|
||||||
|
|
||||||
|
models = [
|
||||||
|
dict(
|
||||||
|
type=TurboMindModelwithChatTemplate,
|
||||||
|
abbr="mistral-small-instruct-2409-turbomind",
|
||||||
|
path="mistralai/Mistral-Small-Instruct-2409",
|
||||||
|
engine_config=dict(session_len=32768, max_batch_size=16, tp=2),
|
||||||
|
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
|
||||||
|
max_seq_len=32768,
|
||||||
|
max_out_len=4096,
|
||||||
|
batch_size=16,
|
||||||
|
run_cfg=dict(num_gpus=2),
|
||||||
|
)
|
||||||
|
]
|
@ -26,7 +26,7 @@ class CompassBenchObjectiveV1_3(BaseDataset):
|
|||||||
circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC']
|
circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC']
|
||||||
|
|
||||||
data = []
|
data = []
|
||||||
with open(path, 'r') as infile:
|
with open(path, 'r', encoding='utf-8', errors='ignore') as infile:
|
||||||
for id, line in enumerate(infile):
|
for id, line in enumerate(infile):
|
||||||
entry = json.loads(line)
|
entry = json.loads(line)
|
||||||
if 'cloze' in name:
|
if 'cloze' in name:
|
||||||
|
@ -81,8 +81,8 @@ class BailingAPI(BaseAPIModel):
|
|||||||
self._headers = {'Authorization': f'Bearer {token}'}
|
self._headers = {'Authorization': f'Bearer {token}'}
|
||||||
|
|
||||||
self._headers['Content-Type'] = 'application/json'
|
self._headers['Content-Type'] = 'application/json'
|
||||||
self._url = url if url else \
|
self._url = (url if url else
|
||||||
'https://bailingchat.alipay.com/chat/completions'
|
'https://bailingchat.alipay.com/chat/completions')
|
||||||
self._model = path
|
self._model = path
|
||||||
self._sessions = []
|
self._sessions = []
|
||||||
self._num = (int(os.environ.get('BAILING_API_PARALLEL_NUM'))
|
self._num = (int(os.environ.get('BAILING_API_PARALLEL_NUM'))
|
||||||
@ -136,9 +136,9 @@ class BailingAPI(BaseAPIModel):
|
|||||||
results.append('')
|
results.append('')
|
||||||
else:
|
else:
|
||||||
if (result.get('choices')
|
if (result.get('choices')
|
||||||
and result['choices'][0].get('message')
|
and result['choices'][0].get('message') and
|
||||||
and result['choices'][0]['message'].get(
|
result['choices'][0]['message'].get('content')
|
||||||
'content')):
|
is not None):
|
||||||
results.append(
|
results.append(
|
||||||
result['choices'][0]['message']['content'])
|
result['choices'][0]['message']['content'])
|
||||||
else:
|
else:
|
||||||
|
@ -466,25 +466,28 @@ class OpenAI(BaseAPIModel):
|
|||||||
|
|
||||||
class OpenAISDK(OpenAI):
|
class OpenAISDK(OpenAI):
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(
|
||||||
path: str = 'gpt-3.5-turbo',
|
self,
|
||||||
max_seq_len: int = 4096,
|
path: str = 'gpt-3.5-turbo',
|
||||||
query_per_second: int = 1,
|
max_seq_len: int = 4096,
|
||||||
rpm_verbose: bool = False,
|
query_per_second: int = 1,
|
||||||
retry: int = 2,
|
rpm_verbose: bool = False,
|
||||||
key: str | List[str] = 'ENV',
|
retry: int = 2,
|
||||||
org: str | List[str] | None = None,
|
key: str | List[str] = 'ENV',
|
||||||
meta_template: Dict | None = None,
|
org: str | List[str] | None = None,
|
||||||
openai_api_base: str = OPENAI_API_BASE,
|
meta_template: Dict | None = None,
|
||||||
openai_proxy_url: Optional[str] = None,
|
openai_api_base: str = OPENAI_API_BASE,
|
||||||
mode: str = 'none',
|
openai_proxy_url: Optional[str] = None,
|
||||||
logprobs: bool | None = False,
|
mode: str = 'none',
|
||||||
top_logprobs: int | None = None,
|
logprobs: bool | None = False,
|
||||||
temperature: float | None = None,
|
top_logprobs: int | None = None,
|
||||||
tokenizer_path: str | None = None,
|
temperature: float | None = None,
|
||||||
extra_body: Dict | None = None,
|
tokenizer_path: str | None = None,
|
||||||
max_completion_tokens: int = 16384,
|
extra_body: Dict | None = None,
|
||||||
verbose: bool = False):
|
max_completion_tokens: int = 16384,
|
||||||
|
verbose: bool = False,
|
||||||
|
status_code_mappings: dict = {},
|
||||||
|
):
|
||||||
super().__init__(path,
|
super().__init__(path,
|
||||||
max_seq_len,
|
max_seq_len,
|
||||||
query_per_second,
|
query_per_second,
|
||||||
@ -519,9 +522,11 @@ class OpenAISDK(OpenAI):
|
|||||||
http_client=httpx.Client(proxies=proxies))
|
http_client=httpx.Client(proxies=proxies))
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
self.logger.info(f'Used openai_client: {self.openai_client}')
|
self.logger.info(f'Used openai_client: {self.openai_client}')
|
||||||
|
self.status_code_mappings = status_code_mappings
|
||||||
|
|
||||||
def _generate(self, input: PromptList | str, max_out_len: int,
|
def _generate(self, input: PromptList | str, max_out_len: int,
|
||||||
temperature: float) -> str:
|
temperature: float) -> str:
|
||||||
|
from openai import BadRequestError
|
||||||
assert isinstance(input, (str, PromptList))
|
assert isinstance(input, (str, PromptList))
|
||||||
|
|
||||||
# max num token for gpt-3.5-turbo is 4097
|
# max num token for gpt-3.5-turbo is 4097
|
||||||
@ -605,7 +610,30 @@ class OpenAISDK(OpenAI):
|
|||||||
self.logger.info(responses)
|
self.logger.info(responses)
|
||||||
except Exception as e: # noqa F841
|
except Exception as e: # noqa F841
|
||||||
pass
|
pass
|
||||||
|
if not responses.choices:
|
||||||
|
self.logger.error(
|
||||||
|
'Response is empty, it is an internal server error \
|
||||||
|
from the API provider.')
|
||||||
return responses.choices[0].message.content
|
return responses.choices[0].message.content
|
||||||
|
|
||||||
|
except BadRequestError as e:
|
||||||
|
# Handle BadRequest status
|
||||||
|
# You can specify self.status_code_mappings to bypass \
|
||||||
|
# API sensitivity blocks
|
||||||
|
# For example: status_code_mappings={400: 'Input data \
|
||||||
|
# may contain inappropriate content.'}
|
||||||
|
status_code = e.status_code
|
||||||
|
if (status_code is not None
|
||||||
|
and status_code in self.status_code_mappings):
|
||||||
|
original_error_message = e.body.get('message')
|
||||||
|
error_message = self.status_code_mappings[status_code]
|
||||||
|
self.logger.info(
|
||||||
|
f'Status Code: {status_code}, '
|
||||||
|
f'Original Error Message: {original_error_message},'
|
||||||
|
f'Return Message: {error_message} ')
|
||||||
|
return error_message
|
||||||
|
else:
|
||||||
|
self.logger.error(e)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.error(e)
|
self.logger.error(e)
|
||||||
num_retries += 1
|
num_retries += 1
|
||||||
|
@ -29,13 +29,46 @@ def post_process_wildbench_pair(judgement: str):
|
|||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
MAP = {'language':['总分','中文总分','英文总分','自然语言处理_cn','创作_cn','对话_cn','NLP_en','creation_en','chat_en'],
|
MAP = {
|
||||||
'instruct':['总分','中文总分','英文总分',],
|
'instruct': [
|
||||||
'reasoning':['总分','中文总分','英文总分','Common Sense Reasoning_cn','Social Reasoning_cn','Humanities (History, Finance, etc.) Professional Reasoning_cn', 'Science and Engineering Professional Reasoning_cn',
|
'总分',
|
||||||
'Common Sense Reasoning_en','Social Reasoning_en','Humanities (History, Finance, etc.) Professional Reasoning_en', 'Science and Engineering Professional Reasoning_en',],
|
'中文总分',
|
||||||
'coding':['总分','中文总分','英文总分',]}
|
'英文总分',
|
||||||
|
'instruct/compassbenchv1_4_IF_en_fofo_sub',
|
||||||
MAP = {'instruct':['总分','中文总分','英文总分',]}
|
'instruct/compassbenchv1_4_IF_zh_fofo_sub',
|
||||||
|
],
|
||||||
|
'language': [
|
||||||
|
'总分',
|
||||||
|
'中文总分',
|
||||||
|
'英文总分',
|
||||||
|
'language/compassbenchv1_4_language_zh_chat_sub',
|
||||||
|
'language/compassbenchv1_4_language_zh_creation_sub',
|
||||||
|
'language/compassbenchv1_4_language_zh_NLP_sub',
|
||||||
|
'language/compassbenchv1_4_language_en_chat_sub',
|
||||||
|
'language/compassbenchv1_4_language_en_creation_sub',
|
||||||
|
'language/compassbenchv1_4_language_en_NLP_sub',
|
||||||
|
],
|
||||||
|
'reasoning': [
|
||||||
|
'总分',
|
||||||
|
'中文总分',
|
||||||
|
'英文总分',
|
||||||
|
'reasoning/compassbenchv1_4_reasoning_en_CommonSenseSense_sub',
|
||||||
|
'reasoning/compassbenchv1_4_reasoning_en_Humanities_sub',
|
||||||
|
'reasoning/compassbenchv1_4_reasoning_en_ScienceEngineering_sub',
|
||||||
|
'reasoning/compassbenchv1_4_reasoning_en_Social_sub',
|
||||||
|
'reasoning/compassbenchv1_4_reasoning_zh_CommonSenseSense_sub',
|
||||||
|
'reasoning/compassbenchv1_4_reasoning_zh_Humanities_sub',
|
||||||
|
'reasoning/compassbenchv1_4_reasoning_zh_ScienceEngineering_sub',
|
||||||
|
'reasoning/compassbenchv1_4_reasoning_zh_Social_sub',
|
||||||
|
],
|
||||||
|
'coding': [
|
||||||
|
'总分',
|
||||||
|
'中文总分',
|
||||||
|
'英文总分',
|
||||||
|
'coding/compassbenchv1_4_coding_en_sub',
|
||||||
|
'coding/compassbenchv1_4_coding_zh_sub',
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class CompassBenchSummarizer:
|
class CompassBenchSummarizer:
|
||||||
@ -52,15 +85,18 @@ class CompassBenchSummarizer:
|
|||||||
self.base_models = self.cfg['datasets'][0]['base_models']
|
self.base_models = self.cfg['datasets'][0]['base_models']
|
||||||
self.compare_models = self.cfg['eval']['partitioner']['models']
|
self.compare_models = self.cfg['eval']['partitioner']['models']
|
||||||
self.judge_models = self.cfg.get('judge_models', None)
|
self.judge_models = self.cfg.get('judge_models', None)
|
||||||
self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
|
self.meta_judge_model = self.cfg.eval.partitioner.get(
|
||||||
|
'meta_judge_model', None)
|
||||||
self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
|
self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
|
||||||
self.judge_function = post_process_wildbench_pair
|
self.judge_function = post_process_wildbench_pair
|
||||||
self.check_pos_bias = check_pos_bias
|
self.check_pos_bias = check_pos_bias
|
||||||
|
|
||||||
def get_score(self, time_str):
|
def get_score(self, time_str):
|
||||||
output_dir, results_folder = get_outdir(self.cfg, time_str)
|
output_dir, results_folder = get_outdir(self.cfg, time_str)
|
||||||
model_combinations = list(product(self.base_models, self.compare_models))
|
model_combinations = list(
|
||||||
unique_combinations = remove_duplicate_pairs([combo for combo in model_combinations if combo[0] != combo[1]])
|
product(self.base_models, self.compare_models))
|
||||||
|
unique_combinations = remove_duplicate_pairs(
|
||||||
|
[combo for combo in model_combinations if combo[0] != combo[1]])
|
||||||
|
|
||||||
if self.meta_judge_model is not None:
|
if self.meta_judge_model is not None:
|
||||||
self.judge_models.append(self.meta_judge_model)
|
self.judge_models.append(self.meta_judge_model)
|
||||||
@ -71,33 +107,47 @@ class CompassBenchSummarizer:
|
|||||||
scores[judge_model] = {}
|
scores[judge_model] = {}
|
||||||
for dataset in self.cfg['datasets']:
|
for dataset in self.cfg['datasets']:
|
||||||
dataset_abbr = dataset_abbr_from_cfg(dataset)
|
dataset_abbr = dataset_abbr_from_cfg(dataset)
|
||||||
dataset_root, dataset_detail = dataset_abbr.split('/')[0], dataset_abbr.split('/')[1]
|
dataset_root, dataset_detail = (
|
||||||
|
dataset_abbr.split('/')[0],
|
||||||
|
dataset_abbr.split('/')[1],
|
||||||
|
)
|
||||||
scores[judge_model][dataset_abbr] = {}
|
scores[judge_model][dataset_abbr] = {}
|
||||||
for model_pair in unique_combinations:
|
for model_pair in unique_combinations:
|
||||||
base_model = model_pair[0]['abbr']
|
base_model = model_pair[0]['abbr']
|
||||||
compare_model = model_pair[1]['abbr']
|
compare_model = model_pair[1]['abbr']
|
||||||
if idx == len(self.judge_models):
|
if idx == len(self.judge_models):
|
||||||
subdir = base_model + '_' + compare_model + '_summarized-by--' + judge_model
|
subdir = (base_model + '_' + compare_model +
|
||||||
|
'_summarized-by--' + judge_model)
|
||||||
else:
|
else:
|
||||||
subdir = base_model + '_' + compare_model + '_judged-by--' + judge_model
|
subdir = (base_model + '_' + compare_model +
|
||||||
|
'_judged-by--' + judge_model)
|
||||||
subdir_path = os.path.join(results_folder, subdir)
|
subdir_path = os.path.join(results_folder, subdir)
|
||||||
if not os.path.isdir(subdir_path):
|
if not os.path.isdir(subdir_path):
|
||||||
print(subdir_path + ' is not exist! please check!')
|
print(subdir_path + ' is not exist! please check!')
|
||||||
scores[judge_model][dataset_abbr][compare_model] = None
|
scores[judge_model][dataset_abbr][compare_model] = None
|
||||||
continue
|
continue
|
||||||
|
|
||||||
judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
|
judged_answers, references = get_judgeanswer_and_reference(
|
||||||
|
dataset, subdir_path, self.judge_function)
|
||||||
win_base_model = defaultdict(float)
|
win_base_model = defaultdict(float)
|
||||||
win_compare_model = defaultdict(float)
|
win_compare_model = defaultdict(float)
|
||||||
score_mapping = {'A++': 1, 'A+': 0.5, 'A=B': 0, 'B+': -0.5, 'B++': -1}
|
score_mapping = {
|
||||||
|
'A++': 1,
|
||||||
|
'A+': 0.5,
|
||||||
|
'A=B': 0,
|
||||||
|
'B+': -0.5,
|
||||||
|
'B++': -1,
|
||||||
|
}
|
||||||
cnt = defaultdict(float)
|
cnt = defaultdict(float)
|
||||||
|
|
||||||
for judged_answer, reference in zip(judged_answers, references):
|
for judged_answer, reference in zip(
|
||||||
|
judged_answers, references):
|
||||||
if judged_answer not in score_mapping:
|
if judged_answer not in score_mapping:
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
flag = 1 if reference['answer1'] == base_model else -1
|
flag = (1 if reference['answer1'] == base_model
|
||||||
score_1 = score_mapping[judged_answer]*flag
|
else -1)
|
||||||
|
score_1 = score_mapping[judged_answer] * flag
|
||||||
score_2 = -score_1
|
score_2 = -score_1
|
||||||
|
|
||||||
cnt[dataset_abbr] += 1
|
cnt[dataset_abbr] += 1
|
||||||
@ -107,10 +157,13 @@ class CompassBenchSummarizer:
|
|||||||
for key, value in cnt.items():
|
for key, value in cnt.items():
|
||||||
win_base_model[key] = win_base_model[key] / value * 100
|
win_base_model[key] = win_base_model[key] / value * 100
|
||||||
win_base_model[key] = round(win_base_model[key], 2)
|
win_base_model[key] = round(win_base_model[key], 2)
|
||||||
win_compare_model[key] = win_compare_model[key] / value * 100
|
win_compare_model[key] = (win_compare_model[key] /
|
||||||
win_compare_model[key ] = round(win_compare_model[key], 2)
|
value * 100)
|
||||||
|
win_compare_model[key] = round(win_compare_model[key],
|
||||||
|
2)
|
||||||
|
|
||||||
scores[judge_model][dataset_abbr][compare_model] = win_compare_model
|
scores[judge_model][dataset_abbr][
|
||||||
|
compare_model] = win_compare_model
|
||||||
|
|
||||||
return scores
|
return scores
|
||||||
|
|
||||||
@ -131,7 +184,10 @@ class CompassBenchSummarizer:
|
|||||||
for judge_abbr, judge_scores in scores.items():
|
for judge_abbr, judge_scores in scores.items():
|
||||||
new_score = {}
|
new_score = {}
|
||||||
for dataset_name, model_scores in judge_scores.items():
|
for dataset_name, model_scores in judge_scores.items():
|
||||||
dataset_root, dataset_detail = dataset_name.split('/')[0], dataset_name.split('/')[1]
|
dataset_root, dataset_detail = (
|
||||||
|
dataset_name.split('/')[0],
|
||||||
|
dataset_name.split('/')[1],
|
||||||
|
)
|
||||||
if dataset_root not in new_score:
|
if dataset_root not in new_score:
|
||||||
new_score[dataset_root] = {}
|
new_score[dataset_root] = {}
|
||||||
if '_en_' in dataset_detail:
|
if '_en_' in dataset_detail:
|
||||||
@ -141,8 +197,10 @@ class CompassBenchSummarizer:
|
|||||||
if len(cate_score) == 0:
|
if len(cate_score) == 0:
|
||||||
new_score[dataset_root][model_name]['英文总分'] = None
|
new_score[dataset_root][model_name]['英文总分'] = None
|
||||||
else:
|
else:
|
||||||
new_score[dataset_root][model_name].update(cate_score)
|
new_score[dataset_root][model_name].update(
|
||||||
new_score[dataset_root][model_name]['英文总分'] = sum(cate_score.values()) / len(cate_score)
|
cate_score)
|
||||||
|
new_score[dataset_root][model_name]['英文总分'] = (
|
||||||
|
sum(cate_score.values()) / len(cate_score))
|
||||||
elif '_cn_' in dataset_detail or '_zh_' in dataset_detail:
|
elif '_cn_' in dataset_detail or '_zh_' in dataset_detail:
|
||||||
for model_name, cate_score in model_scores.items():
|
for model_name, cate_score in model_scores.items():
|
||||||
if model_name not in new_score[dataset_root]:
|
if model_name not in new_score[dataset_root]:
|
||||||
@ -150,17 +208,19 @@ class CompassBenchSummarizer:
|
|||||||
if len(cate_score) == 0:
|
if len(cate_score) == 0:
|
||||||
new_score[dataset_root][model_name]['中文总分'] = None
|
new_score[dataset_root][model_name]['中文总分'] = None
|
||||||
else:
|
else:
|
||||||
new_score[dataset_root][model_name].update(cate_score)
|
new_score[dataset_root][model_name].update(
|
||||||
new_score[dataset_root][model_name]['中文总分'] = sum(cate_score.values()) / len(cate_score)
|
cate_score)
|
||||||
|
new_score[dataset_root][model_name]['中文总分'] = (
|
||||||
|
sum(cate_score.values()) / len(cate_score))
|
||||||
for dataset, models in new_score.items():
|
for dataset, models in new_score.items():
|
||||||
for model, details in models.items():
|
for model, details in models.items():
|
||||||
if details['英文总分'] is not None and details['中文总分'] is not None:
|
if (details['英文总分'] is not None
|
||||||
|
and details['中文总分'] is not None):
|
||||||
average_score = (details['英文总分'] + details['中文总分']) / 2
|
average_score = (details['英文总分'] + details['中文总分']) / 2
|
||||||
else:
|
else:
|
||||||
average_score = None
|
average_score = None
|
||||||
details['总分'] = average_score
|
details['总分'] = average_score
|
||||||
|
|
||||||
|
|
||||||
df = pd.DataFrame()
|
df = pd.DataFrame()
|
||||||
# Iterate over the MAP and new_score to populate the DataFrame
|
# Iterate over the MAP and new_score to populate the DataFrame
|
||||||
for category, headers in MAP.items():
|
for category, headers in MAP.items():
|
||||||
@ -173,15 +233,17 @@ class CompassBenchSummarizer:
|
|||||||
category_data.append(row_data)
|
category_data.append(row_data)
|
||||||
|
|
||||||
# Create a DataFrame for the category and concatenate with the main DataFrame
|
# Create a DataFrame for the category and concatenate with the main DataFrame
|
||||||
new_headers = [category+'_'+item for item in headers]
|
new_headers = [category + '_' + item for item in headers]
|
||||||
category_df = pd.DataFrame(category_data, columns=[category] + new_headers)
|
category_df = pd.DataFrame(category_data,
|
||||||
|
columns=[category] + new_headers)
|
||||||
df = pd.concat([df, category_df.set_index(category)], axis=1)
|
df = pd.concat([df, category_df.set_index(category)], axis=1)
|
||||||
|
|
||||||
df_transposed = df.T
|
df_transposed = df.T
|
||||||
|
|
||||||
|
output_filename = osp.join(
|
||||||
output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + '-report.csv')
|
output_dir,
|
||||||
|
'summarized-by--' + judge_abbr + '-' + '-report.csv',
|
||||||
|
)
|
||||||
|
|
||||||
transposed_csv_file_path = output_filename
|
transposed_csv_file_path = output_filename
|
||||||
df_transposed.to_csv(transposed_csv_file_path)
|
df_transposed.to_csv(transposed_csv_file_path)
|
||||||
|
@ -71,6 +71,7 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
|
|||||||
f'答案应该?是\s*([{options}])',
|
f'答案应该?是\s*([{options}])',
|
||||||
f'答案应该?选\s*([{options}])',
|
f'答案应该?选\s*([{options}])',
|
||||||
f'答案选项为?\s*:\s*([{options}])',
|
f'答案选项为?\s*:\s*([{options}])',
|
||||||
|
f'答案选项为?\s+\(?\*?\*?([{options}])\*?\*?\)?',
|
||||||
f'答案选项是?\s*:\s*([{options}])',
|
f'答案选项是?\s*:\s*([{options}])',
|
||||||
f'答案为\s*([{options}])',
|
f'答案为\s*([{options}])',
|
||||||
f'答案选\s*([{options}])',
|
f'答案选\s*([{options}])',
|
||||||
@ -100,6 +101,7 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
|
|||||||
f'答案为\s?(\S+)(?:。|$)',
|
f'答案为\s?(\S+)(?:。|$)',
|
||||||
f'(?i)ANSWER\s*:\s*([{options}])',
|
f'(?i)ANSWER\s*:\s*([{options}])',
|
||||||
f'[Tt]he answer is:?\s+\(?([{options}])\)?',
|
f'[Tt]he answer is:?\s+\(?([{options}])\)?',
|
||||||
|
f'[Tt]he answer is:?\s+\(?\*?\*?([{options}])\*?\*?\)?',
|
||||||
f'[Tt]he answer is option:?\s+\(?([{options}])\)?',
|
f'[Tt]he answer is option:?\s+\(?([{options}])\)?',
|
||||||
f'[Tt]he correct answer is:?\s+\(?([{options}])\)?',
|
f'[Tt]he correct answer is:?\s+\(?([{options}])\)?',
|
||||||
f'[Tt]he correct answer is option:?\s+\(?([{options}])\)?',
|
f'[Tt]he correct answer is option:?\s+\(?([{options}])\)?',
|
||||||
|
Loading…
Reference in New Issue
Block a user