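"""Daily regression tests for OpenCompass.

Each test reads the latest ``summary*.csv`` produced under
``regression_result_daily`` and compares per-model, per-dataset scores
against the YAML baselines under ``.github/scripts``.
"""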
import csv
import os

import pytest
import yaml

output_path = 'regression_result_daily'


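# Baseline layouts inferred from the accessor chains in this file (names and
# values below are illustrative only): oc_score_baseline_testrange.yaml nests
# scores as <type>/<model>/<dataset>, oc_score_baseline_fullbench.yaml as
# <model>/<category>/<dataset>, and oc_score_baseline.yaml as
# <model>/<dataset>, e.g.
#
#   chat:
#     some-chat-model:
#       gsm8k_accuracy: 72.5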
def model_list(model_type):
    config_path = '.github/scripts/oc_score_baseline_testrange.yaml'
    with open(config_path) as f:
        config = yaml.safe_load(f)
    return config.get(model_type).keys()


def dataset_list(model, category):
    config_path = '.github/scripts/oc_score_baseline_fullbench.yaml'
    with open(config_path) as f:
        config = yaml.safe_load(f)
    return config.get(model).get(category).keys()


@pytest.fixture()
def baseline_scores_testrange(request):
    config_path = os.path.join(
        request.config.rootdir,
        '.github/scripts/oc_score_baseline_testrange.yaml')
    with open(config_path) as f:
        return yaml.safe_load(f)


@pytest.fixture()
def baseline_scores(request):
    config_path = os.path.join(request.config.rootdir,
                               '.github/scripts/oc_score_baseline.yaml')
    with open(config_path) as f:
        return yaml.safe_load(f)


@pytest.fixture()
def baseline_scores_fullbench(request):
    config_path = os.path.join(
        request.config.rootdir,
        '.github/scripts/oc_score_baseline_fullbench.yaml')
    with open(config_path) as f:
        return yaml.safe_load(f)


@pytest.fixture()
def result_scores():
    # Parse the most recent summary CSV; None when no run has produced one.
    file = find_csv_files(output_path)
    if file is None:
        return None
    return read_csv_file(file)


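# Convention used throughout: tests append '_batch' to the model name passed
# to assert_score to select its loose-tolerance path; plain model names must
# match the baseline to within 0.01 (see assert_score below).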
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_testrange')
@pytest.mark.chat_models
class TestChat:
    """Test cases for chat models."""

    @pytest.mark.parametrize(
        'model, dataset', [(p1, p2) for p1 in model_list('chat')
                           for p2 in ['gsm8k_accuracy', 'race-high_accuracy']])
    def test_model_dataset_score(self, baseline_scores_testrange,
                                 result_scores, model, dataset):
        base_score = baseline_scores_testrange.get('chat').get(model).get(
            dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model, result_score, base_score, dataset)


@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_testrange')
@pytest.mark.base_models
class TestBase:
    """Test cases for base models."""

    @pytest.mark.parametrize('model, dataset',
                             [(p1, p2) for p1 in model_list('base') for p2 in [
                                 'gsm8k_accuracy', 'GPQA_diamond_accuracy',
                                 'race-high_accuracy', 'winogrande_accuracy'
                             ]])
    def test_model_dataset_score(self, baseline_scores_testrange,
                                 result_scores, model, dataset):
        # Only gsm8k is checked for the gemma vLLM models; skip the rest.
        if model in ['gemma-2b-vllm', 'gemma-7b-vllm'
                     ] and dataset != 'gsm8k_accuracy':
            return
        base_score = baseline_scores_testrange.get('base').get(model).get(
            dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model, result_score, base_score, dataset)


@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.chat_obj_fullbench
class TestChatObjFullbench:
    """Objective fullbench test cases for chat models."""

    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
        'internlm2_5-7b-chat-hf_fullbench',
        'internlm2_5-7b-chat-turbomind_fullbench'
    ] for p2 in dataset_list('internlm2_5-7b-chat-hf_fullbench', 'objective')])
    def test_model_dataset_score(self, baseline_scores_fullbench,
                                 result_scores, model, dataset):
        base_score = baseline_scores_fullbench.get(model).get('objective').get(
            dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model, result_score, base_score, dataset)


@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.chat_sub_fullbench
class TestChatSubFullbench:
    """Subjective fullbench test cases for chat models."""

    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
        'internlm2_5-7b-chat-hf_fullbench',
        'internlm2_5-7b-chat-turbomind_fullbench'
    ] for p2 in dataset_list('internlm2_5-7b-chat-hf_fullbench', 'subjective')]
                             )
    def test_model_dataset_score(self, baseline_scores_fullbench,
                                 result_scores, model, dataset):
        base_score = baseline_scores_fullbench.get(model).get(
            'subjective').get(dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model, result_score, base_score, dataset)


@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.base_fullbench
class TestBaseFullbench:
    """Objective fullbench test cases for base models."""

    @pytest.mark.parametrize(
        'model, dataset',
        [(p1, p2) for p1 in
         ['internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench']
         for p2 in dataset_list('internlm2_5-7b-hf_fullbench', 'objective')])
    def test_model_dataset_score(self, baseline_scores_fullbench,
                                 result_scores, model, dataset):
        base_score = baseline_scores_fullbench.get(model).get('objective').get(
            dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model, result_score, base_score, dataset)


@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
@pytest.mark.api
class TestApibench:
    """Test cases for API models."""

    @pytest.mark.parametrize('model, dataset',
                             [('lmdeploy-api-test', 'race-middle_accuracy'),
                              ('lmdeploy-api-test', 'race-high_accuracy'),
                              ('lmdeploy-api-test', 'gsm8k_accuracy')])
    def test_api(self, baseline_scores, result_scores, model, dataset):
        base_score = baseline_scores.get(model).get(dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model + '_batch', result_score, base_score, dataset)


@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.volc_fullbench
class TestVolcFullbench:
    """Fullbench test cases for the volc suite (chat and base models)."""

    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
        'internlm2_5-7b-chat-turbomind', 'qwen2.5-7b-instruct-turbomind',
        'internlm2_5-7b-chat-pytorch', 'qwen2.5-7b-instruct-pytorch',
        'internlm3-8b-instruct-turbomind', 'internlm3-8b-instruct-pytorch'
    ] for p2 in dataset_list(p1, 'objective')])
    @pytest.mark.chat_objective
    def test_chat_objective(self, baseline_scores_fullbench, result_scores,
                            model, dataset):
        base_score = baseline_scores_fullbench.get(model).get('objective').get(
            dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model + '_batch', result_score, base_score, dataset)

    @pytest.mark.parametrize('model, dataset', [
        (p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
        for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'subjective')
    ])
    @pytest.mark.chat_subjective
    def test_chat_subjective(self, baseline_scores_fullbench, result_scores,
                             model, dataset):
        base_score = baseline_scores_fullbench.get(model).get(
            'subjective').get(dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model + '_batch', result_score, base_score, dataset)

    @pytest.mark.parametrize(
        'model, dataset',
        [(p1, p2) for p1 in ['internlm2_5-7b-turbomind']
         for p2 in dataset_list('internlm2_5-7b-turbomind', 'objective')])
    @pytest.mark.base_objective
    def test_base_objective(self, baseline_scores_fullbench, result_scores,
                            model, dataset):
        base_score = baseline_scores_fullbench.get(model).get('objective').get(
            dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model + '_batch', result_score, base_score, dataset)

    @pytest.mark.parametrize(
        'model, dataset',
        [(p1, p2) for p1 in ['internlm2_5-7b-turbomind']
         for p2 in dataset_list('internlm2_5-7b-turbomind', 'long_context')])
    @pytest.mark.base_long_context
    def test_base_long_context(self, baseline_scores_fullbench, result_scores,
                               model, dataset):
        base_score = baseline_scores_fullbench.get(model).get(
            'long_context').get(dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model + '_batch', result_score, base_score, dataset)

    @pytest.mark.parametrize(
        'model, dataset',
        [(p1, p2)
         for p1 in ['internlm2_5-7b-chat-1m-turbomind'] for p2 in dataset_list(
             'internlm2_5-7b-chat-1m-turbomind', 'long_context')])
    @pytest.mark.chat_long_context
    def test_chat_long_context(self, baseline_scores_fullbench, result_scores,
                               model, dataset):
        base_score = baseline_scores_fullbench.get(model).get(
            'long_context').get(dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model + '_batch', result_score, base_score, dataset)


@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
class TestCmdCase:
    """Test cases for command-line runs."""

    @pytest.mark.case1
    @pytest.mark.parametrize('model, dataset',
                             [('internlm2_5-7b-hf', 'race-middle_accuracy'),
                              ('internlm2_5-7b-hf', 'race-high_accuracy'),
                              ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy')])
    def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
        base_score = baseline_scores.get(model).get(dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model, result_score, base_score, dataset)

    @pytest.mark.case2
    @pytest.mark.parametrize(
        'model, dataset',
        [('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'),
         ('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'),
         ('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'),
         ('internlm3-8b-instruct-lmdeploy', 'race-middle_accuracy'),
         ('internlm3-8b-instruct-lmdeploy', 'race-high_accuracy'),
         ('internlm3-8b-instruct-lmdeploy', 'demo_gsm8k_accuracy')])
    def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
        base_score = baseline_scores.get(model).get(dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model + '_batch', result_score, base_score, dataset)

    @pytest.mark.case3
    @pytest.mark.parametrize('model, dataset',
                             [('internlm2_5-7b_hf', 'race-middle_accuracy'),
                              ('internlm2_5-7b_hf', 'race-high_accuracy'),
                              ('internlm2_5-7b_hf', 'demo_gsm8k_accuracy')])
    def test_cmd_case3(self, baseline_scores, result_scores, model, dataset):
        base_score = baseline_scores.get(model).get(dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model, result_score, base_score, dataset)

    @pytest.mark.case4
    @pytest.mark.parametrize(
        'model, dataset',
        [('internlm3-8b-instruct_hf-lmdeploy', 'race-middle_accuracy'),
         ('internlm3-8b-instruct_hf-lmdeploy', 'race-high_accuracy'),
         ('internlm3-8b-instruct_hf-lmdeploy', 'demo_gsm8k_accuracy')])
    def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
        base_score = baseline_scores.get(model).get(dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model + '_batch', result_score, base_score, dataset)

    @pytest.mark.case5
    @pytest.mark.parametrize(
        'model, dataset',
        [('internlm3-8b-instruct_hf-vllm', 'race-middle_accuracy'),
         ('internlm3-8b-instruct_hf-vllm', 'race-high_accuracy'),
         ('internlm3-8b-instruct_hf-vllm', 'demo_gsm8k_accuracy')])
    def test_cmd_case5(self, baseline_scores, result_scores, model, dataset):
        base_score = baseline_scores.get(model).get(dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(model + '_batch', result_score, base_score, dataset)


def assert_score(model_type, score, baseline, dataset: str = ''):
    if score is None or score == '-':
        assert False, 'score value is missing'

    if 'batch' not in model_type:
        # Non-batch runs are expected to reproduce the baseline exactly
        # (within floating-point rounding).
        if abs(float(score) - baseline) <= 0.01:
            print(' '.join([score, 'is equal', str(baseline)]))
        else:
            assert False, ' '.join([score, 'is not equal', str(baseline)])
    else:
        # Batch runs are noisier; pick a tolerance per dataset family.
        if dataset.startswith(('dingo', 'GPQA', 'high', 'mmlu_pro_',
                               'alpaca_eval', 'compassarena_')):
            threshold = 5
        elif dataset.startswith('humanevalx') or dataset == 'large_threshold':
            threshold = 10
        else:
            threshold = 3
        if abs(float(score) - baseline) <= threshold:
            print(' '.join([
                score, 'is between',
                str(baseline - threshold), 'and',
                str(baseline + threshold)
            ]))
        else:
            assert False, ' '.join([
                score, 'is not between',
                str(baseline - threshold), 'and',
                str(baseline + threshold)
            ])


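# Worked example for assert_score (illustrative numbers): a batch gsm8k run
# with baseline 70 uses the default threshold of 3, so scores in [67.0, 73.0]
# pass; a non-batch run with the same baseline must land in [69.99, 70.01].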
def find_csv_files(directory):
    csv_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.csv') and file.startswith('summary'):
                csv_files.append(os.path.join(root, file))

    if not csv_files:
        # No summary CSV was produced; callers treat None as "no results".
        return None
    # Pick the most recently created summary file.
    return max(csv_files, key=os.path.getctime)


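# The summary CSV layout is inferred from the parsing below (values here are
# illustrative placeholders only): 'dataset', 'version', 'metric', and 'mode'
# columns, followed by one score column per model, e.g.
#
#   dataset,version,metric,mode,internlm2_5-7b-chat-turbomind
#   gsm8k,xxxxxx,accuracy,gen,72.5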
def read_csv_file(file_path):
    with open(file_path, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        filtered_data = []
        for row in reader:
            metric = row['metric']
            # Drop bpb metrics and placeholder rows.
            if metric is not None and 'bpb' not in metric and metric != '_':
                filtered_row = row
                filtered_row['dataset'] = row['dataset'] + '_' + metric
                del filtered_row['version']
                del filtered_row['metric']
                del filtered_row['mode']
                filtered_data.append(filtered_row)

    # Pivot rows into {model: {dataset_metric: score}}.
    result = {}
    for data in filtered_data:
        dataset = data.get('dataset')
        for key, value in data.items():
            if key == 'dataset':
                continue
            result.setdefault(key, {})[dataset] = value
    return result

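# Typical invocations select a suite via its pytest marker, e.g. (assuming
# the markers above are registered in the project's pytest configuration):
#
#   pytest -m chat_models
#   pytest -m base_models
#   pytest -m "volc_fullbench and chat_objective"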