from datasets import Dataset, load_dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils import get_data_path

from .base import BaseDataset


@LOAD_DATASET.register_module()
class LongBenchv2Dataset(BaseDataset):
    """LongBench v2 multiple-choice QA dataset loaded from a JSON file."""

    @staticmethod
    def load(path: str):
        path = get_data_path(path)
        # load_dataset('json', ...) puts all records under the 'train' split
        dataset = load_dataset('json', data_files=path)

        split = 'train'
        raw_data = []
        for i in range(len(dataset[split])):
            question = dataset[split]['question'][i]
            context = dataset[split]['context'][i]
            answer = dataset[split]['answer'][i]
            choice_A = dataset[split]['choice_A'][i]
            choice_B = dataset[split]['choice_B'][i]
            choice_C = dataset[split]['choice_C'][i]
            choice_D = dataset[split]['choice_D'][i]
            difficulty = dataset[split]['difficulty'][i]
            length = dataset[split]['length'][i]
            raw_data.append({
                'question': question,
                'context': context,
                'answer': answer,
                'choice_A': choice_A,
                'choice_B': choice_B,
                'choice_C': choice_C,
                'choice_D': choice_D,
                'difficulty': difficulty,
                'length': length
            })
        # register the rebuilt records as the 'test' split used for evaluation
        dataset['test'] = Dataset.from_list(raw_data)
        return dataset
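

# A hypothetical per-record schema that `LongBenchv2Dataset.load` above
# assumes. The field names mirror the keys read in `load`; the concrete
# values are illustrative only and are not taken from the real data file:
#
#   {
#       "question": "...",
#       "context": "<the full long document>",
#       "answer": "B",
#       "choice_A": "...", "choice_B": "...",
#       "choice_C": "...", "choice_D": "...",
#       "difficulty": "easy",   # bucketed below as 'easy' / 'hard'
#       "length": "short"       # bucketed below as 'short' / 'medium' / 'long'
#   }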


@ICL_EVALUATORS.register_module()
class LongBenchv2Evaluator(BaseEvaluator):
    """Accuracy evaluator with per-difficulty and per-length breakdowns."""

    def __init__(self):
        super().__init__()

    def score(self, predictions, references, test_set):
        if not test_set:
            raise ValueError('test set is empty')

        # per-bucket counters: overall, by difficulty and by context length
        metrics = {
            'total': {'correct': 0, 'total': 0},
            'difficulty': {
                'easy': {'correct': 0, 'total': 0},
                'hard': {'correct': 0, 'total': 0}
            },
            'length': {
                'short': {'correct': 0, 'total': 0},
                'medium': {'correct': 0, 'total': 0},
                'long': {'correct': 0, 'total': 0}
            }
        }

        for pred, ref, sample in zip(predictions, references, test_set):
            is_correct = (pred == ref)

            metrics['total']['total'] += 1
            if is_correct:
                metrics['total']['correct'] += 1

            difficulty = sample.get('difficulty', 'unknown')
            if difficulty in metrics['difficulty']:
                metrics['difficulty'][difficulty]['total'] += 1
                if is_correct:
                    metrics['difficulty'][difficulty]['correct'] += 1

            length = sample.get('length', 'unknown')
            if length in metrics['length']:
                metrics['length'][length]['total'] += 1
                if is_correct:
                    metrics['length'][length]['correct'] += 1

        results = {
            'accuracy': (metrics['total']['correct'] /
                         metrics['total']['total'] * 100)
        }

        for diff in ['easy', 'hard']:
            if metrics['difficulty'][diff]['total'] > 0:
                acc = (metrics['difficulty'][diff]['correct'] /
                       metrics['difficulty'][diff]['total'] * 100)
                results[f'accuracy_{diff}'] = acc

        for length in ['short', 'medium', 'long']:
            if metrics['length'][length]['total'] > 0:
                acc = (metrics['length'][length]['correct'] /
                       metrics['length'][length]['total'] * 100)
                results[f'accuracy_{length}'] = acc

        return results
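

# A minimal usage sketch (illustrative values only, not part of the module).
# `score` counts a prediction as correct only when it exactly equals its
# reference, so predictions are assumed to already be a single option letter;
# here a plain list of dicts stands in for the test split loaded above:
#
#   evaluator = LongBenchv2Evaluator()
#   results = evaluator.score(
#       predictions=['A', 'B'],
#       references=['A', 'C'],
#       test_set=[
#           {'difficulty': 'easy', 'length': 'short'},
#           {'difficulty': 'hard', 'length': 'long'},
#       ])
#   # results == {'accuracy': 50.0, 'accuracy_easy': 100.0,
#   #             'accuracy_hard': 0.0, 'accuracy_short': 100.0,
#   #             'accuracy_long': 0.0}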