[Feature] Add Korbench dataset (#1713)

* first version for korbench

* first stage for korbench

* korbench_1

* korbench_1

* korbench_1

* korbench_1

* korbench_1_revised

* korbench_combined_1

* korbench_combined_1

* kor_combined

* kor_combined

* update

---------

Co-authored-by: MaiziXiao <xxllcc1993@gmail.com>
Yufeng Zhao 2024-11-25 20:11:27 +08:00 committed by GitHub
parent 5c1916ea4c
commit 300adc31e8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 1814 additions and 0 deletions

9
configs/eval_korbench.py Normal file
View File

@@ -0,0 +1,9 @@
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.korbench.korbench_single_0_shot_gen import korbench_0shot_single_datasets as zero_shot_datasets
from opencompass.configs.datasets.korbench.korbench_single_3_shot_gen import korbench_3shot_single_datasets as three_shot_datasets
from opencompass.configs.datasets.korbench.korbench_mixed_gen_d00bdd import korbench_mixed_datasets as mixed_datasets
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import models as hf_internlm2_5_7b
datasets = zero_shot_datasets + three_shot_datasets + mixed_datasets
models = hf_internlm2_5_7b
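For reference, this config composes 13 dataset entries (5 single-rule categories x 2 shot settings + 3 mixed splits) and is run with the usual `python run.py configs/eval_korbench.py`. A minimal sketch of inspecting the composed config (assumes opencompass is installed and the command is run from the repo root; not part of this commit):

from mmengine.config import Config

cfg = Config.fromfile('configs/eval_korbench.py')
print([d['abbr'] for d in cfg['datasets']])   # e.g. 'korbench_cipher_0shot', ..., 'korbench_mixed_Multi-RQ'
print(len(cfg['models']), 'model(s) configured')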

View File

@@ -0,0 +1,59 @@
from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
korbench_mixed_datasets = []
categories = ["Multi-Q", "Multi-R", "Multi-RQ"] # Define available modes for mixed mode
for category in categories:
# Prompt template
prompt_template = dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role="HUMAN",
prompt=""
)
],
round=[
dict(
role="HUMAN",
prompt="{prompt}" # f-string
)
]
)
)
# Reader configuration
reader_cfg = dict(
input_columns=["prompt"],
output_column="answer",
)
# Inference configuration
infer_cfg = dict(
prompt_template=prompt_template,
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(type=korbenchEvaluator),
pred_role="BOT",
)
korbench_dataset = dict(
type=korbenchDataset,
abbr=f"korbench_mixed_{category}",
path="opencompass/korbench",
category=category,
mode='mixed',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
korbench_mixed_datasets.append(korbench_dataset)

View File

@@ -0,0 +1,60 @@
from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
categories = ["cipher", "counterfactual", "logic", "operation", "puzzle"]
korbench_0shot_single_datasets = []
for category in categories:
# Prompt template
prompt_template = dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role="HUMAN",
prompt=""
)
],
round=[
dict(
role="HUMAN",
prompt="{prompt}" # f-string
)
]
)
)
# Reader configuration
reader_cfg = dict(
input_columns=["prompt"],
output_column="answer",
)
# Inference configuration
infer_cfg = dict(
prompt_template=prompt_template,
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(type=korbenchEvaluator),
pred_role="BOT",
)
korbench_dataset = dict(
type=korbenchDataset,
abbr=f"korbench_{category}_0shot",
path="opencompass/korbench",
mode='0_shot',
category=category,
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
korbench_0shot_single_datasets.append(korbench_dataset)

View File

@@ -0,0 +1,61 @@
from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
categories = ["cipher", "counterfactual", "logic", "operation", "puzzle"]
korbench_3shot_single_datasets = []
for category in categories:
# Prompt template
prompt_template = dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role="HUMAN",
prompt=""
)
],
round=[
dict(
role="HUMAN",
prompt="{prompt}" # f-string
)
]
)
)
# Reader configuration
reader_cfg = dict(
input_columns=["prompt"],
output_column="answer",
)
# Inference configuration
infer_cfg = dict(
prompt_template=prompt_template,
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(type=korbenchEvaluator),
pred_role="BOT",
)
korbench_dataset = dict(
type=korbenchDataset,
abbr=f"korbench_{category}_3shot",
path="opencompass/korbench",
mode='3_shot',
category=category,
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
korbench_3shot_single_datasets.append(korbench_dataset)

View File

@@ -0,0 +1,5 @@
korbench_summary_groups = []
categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle']
mixed_categories = ['Multi-Q', 'Multi-R', 'Multi-RQ']
korbench_summary_groups.append({'name': 'korbench_single', 'subsets': [f'korbench_{c}_0shot' for c in categories] + [f'korbench_{c}_3shot' for c in categories]})
korbench_summary_groups.append({'name': 'korbench_mixed', 'subsets': [f'korbench_mixed_{c}' for c in mixed_categories]})
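A summarizer config can then aggregate the per-category accuracies through these groups. A sketch following the pattern used by other OpenCompass summarizers (the relative import path is an assumption, not part of this commit):

from mmengine.config import read_base

with read_base():
    from .groups.korbench import korbench_summary_groups  # assumed location of the file above

summarizer = dict(
    dataset_abbrs=['korbench_single', 'korbench_mixed'],
    # collect every *_summary_groups list defined via read_base()
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)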

View File

@@ -65,6 +65,7 @@ from .iwslt2017 import * # noqa: F401, F403
from .jigsawmultilingual import * # noqa: F401, F403
from .jsonl import JsonlDataset # noqa: F401, F403
from .kaoshi import KaoshiDataset, KaoshiEvaluator # noqa: F401, F403
from .korbench import * # noqa: F401, F403
from .lambada import * # noqa: F401, F403
from .lawbench import * # noqa: F401, F403
from .LCBench import * # noqa: F401, F403

View File

@@ -0,0 +1,215 @@
import os
from datasets import Dataset
from opencompass.datasets.korbench.korbench_utils import (
evaluate_responses, find_file, load_json_or_jsonl,
load_json_or_jsonl_with_idx, load_yaml)
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils import get_data_path
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class korbenchDataset(BaseDataset):
"""Dataset loader for the task in KOR-Bench."""
@staticmethod
def load(path, mode, category):
"""Load the dataset using shared ."""
base_path = get_data_path(path)
rule_file = None
sample_file = None
mixed_file = None
mixed_data = None
if '0_shot' in mode or '3_shot' in mode:
rule_file = find_file(base_path, os.path.join(category, 'rule'))
sample_file = find_file(base_path,
os.path.join(category, 'sample'))
elif mode == 'mixed':
mixed_file = find_file(base_path, os.path.join('mixed', category))
mixed_data = load_json_or_jsonl(mixed_file) or []
else:
raise ValueError(f'Unsupported mode: {mode}')
three_shot_file = None
if mode == '3_shot':
ts_path = os.path.join(category, 'three-shot')
three_shot_file = find_file(base_path, ts_path)
# Load data
if mode in ['0_shot', '3_shot']:
rules = load_json_or_jsonl(rule_file) or []
samples = load_json_or_jsonl(sample_file) or []
template_path = None
if mode == '0_shot':
template_path = os.path.join(
os.path.dirname(__file__),
'korbench_dataset_config/prompt/0_shot.yaml')
elif mode == '3_shot':
template_path = os.path.join(
os.path.dirname(__file__),
'korbench_dataset_config/prompt/3_shot.yaml')
elif mode == 'mixed':
template_path = os.path.join(
os.path.dirname(__file__),
'korbench_dataset_config/prompt/mixed.yaml')
try:
template = load_yaml(template_path)
except FileNotFoundError:
print(f'[ERROR] Missing prompt template: {template_path}')
return Dataset.from_list([])
# Process data
data = []
if mode == '0_shot':
for sample in samples:
rule_id = sample['rule_id']
rule = next((r for r in rules if r['idx'] == rule_id), None)
if not rule:
print(f"[WARNING] Rule ID {sample['rule_id']} not found."
' Skipping...')
continue
prompt_key = f'{category}_prompt_format'
prompt = template[prompt_key][0].format(
rule['rule_content'], sample['question'])
# Add processed item
data.append({
'rule_content': rule['rule_content'],
'question': sample['question'],
'answer': sample['answer'],
'prompt': prompt,
'rule_id': rule['idx'],
'mode': '0_shot',
'category': category,
})
return Dataset.from_list(data)
if mode == '3_shot':
data = []
three_shot = load_json_or_jsonl(three_shot_file) or []
for sample in samples:
rule_id = sample['rule_id']
rule = next((r for r in rules if r['idx'] == rule_id), None)
three_shot_qa = [
item for fs in three_shot if fs['rule_id'] == rule_id
for item in [fs['question'], fs['answer']]
]
if not rule:
print(f"[WARNING] Rule ID {sample['rule_id']} not found."
' Skipping...')
continue
prompt_key = f'{category}_prompt_format'
prompt = template[prompt_key][0].format(
rule['rule_content'], *three_shot_qa, sample['question'])
# Add processed item
data.append({
'rule_content': rule['rule_content'],
'question': sample['question'],
'answer': sample['answer'],
'prompt': prompt,
'rule_id': rule['idx'],
'mode': '3_shot',
'category': category,
})
return Dataset.from_list(data)
if mode == 'mixed':
# Process data
data = []
for item in mixed_data:
rule_list = item['rule_list']
question_list = item['question_list']
rule_content_list = []
question_content_list = []
# Fetch rules and questions
for rule in rule_list:
category, rule_idx = rule.rsplit('_', 1)
rule_content = load_json_or_jsonl_with_idx(base_path,
os.path.join(
category,
'rule'),
idx=rule_idx)
rule_content_list.append(rule_content['rule_content'])
for question in question_list:
category, question_idx = question.rsplit('_', 1)
question_content = load_json_or_jsonl_with_idx(
base_path,
os.path.join(category, 'sample'),
idx=question_idx)
question_content_list.append(question_content['question'])
# Prepare prompt
rules_str = '\n'.join(
f'Rule {i+1}: {content}'
for i, content in enumerate(rule_content_list))
questions_str = '\n'.join(
f'Question {i+1}: {content}'
for i, content in enumerate(question_content_list))
prompt_format = [rules_str, questions_str]
prompt = template['prompt_format'][0].format(*prompt_format)
# Add processed item
data.append({
'rule_list': rule_list,
'question_list': question_list,
'prompt': prompt,
'mode': 'mixed',
'answer': '',
'base_path': base_path,
})
return Dataset.from_list(data)
@ICL_EVALUATORS.register_module()
class korbenchEvaluator(BaseEvaluator):
def __init__(self):
super().__init__()
def score(self, predictions, references, test_set):
"""Evaluate predictions for a single mode in KOR-Bench."""
if not test_set:
raise ValueError('Test set is empty.')
mode = test_set[0]['mode'] # Determine the mode from the first entry
data = {}
# Organize data for the given mode
for i in range(len(predictions)):
entry = {
'prediction': predictions[i],
'gold': references[i],
'rule_id': test_set[i].get('rule_id', None),
'category': test_set[i].get('category', None),
'rule_list': test_set[i].get('rule_list', None),
'question_list': test_set[i].get('question_list', None),
'base_path': test_set[i].get('base_path', None),
}
data[i] = entry
if not data:
raise ValueError(f"No data found for mode '{mode}'")
# Evaluate based on the mode
if mode == '0_shot':
evaluation_results = evaluate_responses(data, '0_shot')
elif mode == '3_shot':
evaluation_results = evaluate_responses(data, '3_shot')
elif mode in ['Multi-Q', 'Multi-R', 'Multi-RQ', 'mixed']:
evaluation_results = evaluate_responses(data, 'mixed',
test_set[0]['base_path'])
else:
raise ValueError(f'Unsupported mode: {mode}')
# Calculate accuracy
correct_count = sum(res['is_correct'] for res in evaluation_results)
accuracy = (correct_count / len(evaluation_results)) * 100
# Return scores
return {'accuracy': accuracy}
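A minimal round trip through the loader and evaluator, for orientation only (assumes ./data/korbench is already present locally; the prediction string is made up):

# Illustrative sketch, not part of the diff.
ds = korbenchDataset.load(path='opencompass/korbench', mode='0_shot', category='cipher')
print(ds[0]['prompt'][:80])  # rendered 0-shot cipher prompt
evaluator = korbenchEvaluator()
print(evaluator.score(predictions=['[[ABC]]'],
                      references=[ds[0]['answer']],
                      test_set=ds.select(range(1))))  # -> {'accuracy': 0.0 or 100.0}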

View File

@@ -0,0 +1,15 @@
# Necessary
response_key: 'response'
error_key: 'error'
id_key:
- 'idx'
- 'step'
prompt_key: 'prompt'
# Optional
history_key: 'history'
status_key: 'status'
save_prompt: True
max_tokens: 2000
max_rounds: 5

View File

@@ -0,0 +1,90 @@
import yaml
class ConfigWrapper:
def __init__(self, config_path):
self._config = {}
with open(config_path, 'r') as file:
self._config = yaml.safe_load(file)
for key, value in self._config.items():
setattr(self, key, value)
def __setattr__(self, key, value):
if key.startswith('_'):
super().__setattr__(key, value)
else:
self._config[key] = value
super().__setattr__(key, value)
def __getattr__(self, key):
if key in self._config:
return self._config[key]
raise AttributeError(
f"'ConfigWrapper' object has no attribute '{key}'")
def get_id(self, data):
if isinstance(self._config.get('id_key'), str):
return data.get(self._config.get('id_key'), None)
elif isinstance(self._config.get('id_key'), list):
return '_'.join([
str(data[key]) for key in self._config.get('id_key')
if key in data
])
def print_all_keys(self):
print('config keys:')
for key, value in self._config.items():
print(f' - {key}: {value}')
config_wrapper = None
def initialize_config(config_path):
global config_wrapper
config_wrapper = ConfigWrapper(config_path)
def get_config_wrapper():
global config_wrapper
if config_wrapper is None:
raise RuntimeError(
'ConfigWrapper not initialized. Call initialize_config first.')
return config_wrapper
if __name__ == '__main__':
config_path = 'config/config.yaml'
initialize_config(config_path)
data = {
'idx':
'50',
'step':
21,
'question':
('Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"\n\n'
'Please provide the decrypted answer, '
'encapsulated in double square brackets. '
'For example, the format should be: [[decrypted answer]].'),
'answer':
'[[P]]',
'category':
'Decryption',
'rule_id':
'23',
'input':
'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"',
'steps_num':
23,
'description':
('For a number c=228 in the ciphertext:\nCalculate z = c^e mod n.'
' Here ^ means multiplication.\nz is 80.\n'
'Based on the decimal number represented by z, '
'use the ascii code to find the corresponding'
' letter as the plaintext letter p.\n'
'Please give the letter p in [[...]] format.\n'),
'atom':
80
}
print(config_wrapper.get_id(data))

View File

@@ -0,0 +1,94 @@
cipher_prompt_format:
- |
You are an intelligent assistant that specializes in encryption and decryption tasks. Below are the rules for a specific cipher. When responding, please ensure that your output adheres to the specified encryption and decryption rules and format.
### Instructions:
1. Identify the relevant properties and objects specified in the rule, including the plaintext, keyword, and ciphertext.
2. Follow the specified encryption or decryption operations precisely as described in the rules.
3. Ensure your output is formatted according to the specified notation and symbols.
### Cipher Rule:
{}
### Question:
{}
### Answer:
counterfactual_prompt_format:
- |
You are an advanced assistant with expertise in storytelling and rule-based reasoning. Your task is to carefully analyze the provided story, which includes specific rules and details, and use this information to accurately answer related questions.
### Instructions:
1. Thoroughly review the story to identify and understand the relevant details and rules.
2. Use the context provided by the story to offer precise and insightful answers.
3. Ensure your responses align with the rules and information given in the story.
### Story Rule:
{}
### Question:
{}
### Answer:
logic_prompt_format:
- |
You are an intelligent assistant that helps with various logical reasoning tasks. Below is a custom-defined rule for a specific type of logic. When responding, please ensure that your output adheres to the specified logical rules and format.
### Instructions:
1. Identify the relevant properties and objects as specified in the rule.
2. Apply the given logical operations or reasoning patterns.
3. Ensure your output is formatted according to the specified notation and symbols.
### Logic Rule:
{}
### Question:
{}
### Answer:
operation_prompt_format:
- |
You are an intelligent assistant specializing in evaluating custom operations. Below is a specific rule defined for a custom operation. Your task is to apply this rule accurately to the provided question.
### Instructions:
1. Carefully read and understand the definitions of the new operations in the rule.
2. If the question does not specifically ask for it, your answer should be a number or a group of numbers.
3. Double-check your final answer to ensure it follows the rule accurately.
### Operation Rule:
{}
### Question:
{}
### Answer:
puzzle_prompt_format:
- |
You are an intelligent assistant specializing in solving custom puzzle problems. Below is a specific rule defined for a custom puzzle. Your task is to apply this rule accurately to the provided question.
### Instructions:
1. Thoroughly understand the rule provided. If needed, break down the rule into simpler components or steps.
2. Apply the rule carefully to address the question presented.
3. Verify your answer to ensure it aligns with the rule and the context of the puzzle.
### Puzzle Rule:
{}
### Question:
{}
### Answer:

View File

@@ -0,0 +1,184 @@
cipher_prompt_format:
- |
You are an intelligent assistant that specializes in encryption and decryption tasks. Below are the rules for a specific cipher. When responding, please ensure that your output adheres to the specified encryption and decryption rules and format.
### Instructions:
1. Identify the relevant properties and objects specified in the rule, including the plaintext, keyword, and ciphertext.
2. Follow the specified encryption or decryption operations precisely as described in the rules.
3. Ensure your output is formatted according to the specified notation and symbols.
### Cipher Rule:
{}
### Question:
{}
### Answer:
{}
### Question:
{}
### Answer:
{}
### Question:
{}
### Answer:
{}
### Question:
{}
### Answer:
counterfactual_prompt_format:
- |
You are an advanced assistant with expertise in storytelling and rule-based reasoning. Your task is to carefully analyze the provided story, which includes specific rules and details, and use this information to accurately answer related questions.
### Instructions:
1. Thoroughly review the story to identify and understand the relevant details and rules.
2. Use the context provided by the story to offer precise and insightful answers.
3. Ensure your responses align with the rules and information given in the story.
### Story Rule:
{}
### Question:
{}
### Answer:
{}
### Question:
{}
### Answer:
{}
### Question:
{}
### Answer:
{}
### Question:
{}
### Answer:
logic_prompt_format:
- |
You are an intelligent assistant that helps with various logical reasoning tasks. Below is a custom-defined rule for a specific type of logic. When responding, please ensure that your output adheres to the specified logical rules and format.
### Instructions:
1. Identify the relevant properties and objects as specified in the rule.
2. Apply the given logical operations or reasoning patterns.
3. Ensure your output is formatted according to the specified notation and symbols.
### Logic Rule:
{}
### Question:
{}
### Answer:
{}
### Question:
{}
### Answer:
{}
### Question:
{}
### Answer:
{}
### Question:
{}
### Answer:
operation_prompt_format:
- |
You are an intelligent assistant specializing in evaluating custom operations. Below is a specific rule defined for a custom operation. Your task is to apply this rule accurately to the provided question.
### Instructions:
1. Carefully read and understand the definitions of the new operations in the rule.
2. If the question does not specifically ask for it, your answer should be a number or a group of numbers.
3. Double-check your final answer to ensure it follows the rule accurately.
### Operation Rule:
{}
### Question:
{}
### Answer:
{}
### Question:
{}
### Answer:
{}
### Question:
{}
### Answer:
{}
### Question:
{}
### Answer:
puzzle_prompt_format:
- |
You are an intelligent assistant specializing in solving custom puzzle problems. Below is a specific rule defined for a custom puzzle. Your task is to apply this rule accurately to the provided question.
### Instructions:
1. Thoroughly understand the rule provided. If needed, break down the rule into simpler components or steps.
2. Apply the rule carefully to address the question presented.
3. Verify your answer to ensure it aligns with the rule and the context of the puzzle.
### Puzzle Rule:
{}
### Question:
{}
### Answer:
{}
### Question:
{}
### Answer:
{}
### Question:
{}
### Answer:
{}
### Question:
{}
### Answer:

View File

@@ -0,0 +1,22 @@
prompt_format:
- |
You are an intelligent assistant capable of handling all types of reasoning and problem-solving tasks. Below is the text of a set of rules. Your task is to apply the appropriate rules to solve a series of problems.
### Instructions:
1. Read each question carefully and rules to find something relevant to that question.
2. Use the relevant rules to answer each question accurately.
3. Provide the final answers to all questions in JSON format.
{{
"question1": "your answer",
"question2": "your answer",
"question3": "your answer",
}}
### Rules:
{}
### Questions:
{}
### Answers:

View File

@@ -0,0 +1,3 @@
prompt_format:
- |
Your answer is incorrect, please check your answer and provide a correct one.

View File

@@ -0,0 +1,20 @@
prompt_format:
- |
You are an intelligent assistant specializing in solving custom puzzle problems. Below is a specific rule defined for a custom puzzle. Your task is to apply this rule accurately to the provided question.
### Instructions:
1. Thoroughly understand the rule provided. If needed, break down the rule into simpler components or steps.
2. Apply the rule carefully to address the question presented.
3. Verify your answer to ensure it aligns with the rule and the context of the puzzle.
### Puzzle Rule:
{}
### Question:
{}
{}
### Answer:

View File

@@ -0,0 +1,699 @@
import json
import os
import re
import sympy as sp
import yaml
from sympy.parsing.latex import parse_latex
def load_yaml(yaml_path):
"""Load a YAML file."""
if not os.path.exists(yaml_path):
raise FileNotFoundError(f'YAML file not found: {yaml_path}')
with open(yaml_path, 'r', encoding='utf-8') as file:
return yaml.safe_load(file)
def load_json_or_jsonl(file_path):
"""Load data from a JSON or JSONL file."""
if not os.path.exists(file_path):
return None
with open(file_path, 'r', encoding='utf-8') as file:
if file_path.endswith('.json'):
return json.load(file)
elif file_path.endswith('.jsonl'):
return [json.loads(line) for line in file]
return None
def find_file(base_path, sub_path, extensions=('json', 'jsonl')):
"""Find the first available file with given extensions."""
for ext in extensions:
file_path = os.path.join(base_path, f'{sub_path}.{ext}')
if os.path.exists(file_path):
return file_path
return None
def load_json_or_jsonl_with_idx(data_path, split='', idx=None):
base_path = os.path.join(data_path, split)
if os.path.exists(f'{base_path}.json'):
file_path = f'{base_path}.json'
elif os.path.exists(f'{base_path}.jsonl'):
file_path = f'{base_path}.jsonl'
elif base_path.endswith('.json') or base_path.endswith('.jsonl'):
file_path = base_path
else:
raise FileNotFoundError('No JSON or JSONL file found.')
with open(file_path, 'r', encoding='utf-8') as file:
if file_path.endswith('.json'):
data = json.load(file)
elif file_path.endswith('.jsonl'):
data = [json.loads(line) for line in file]
if idx is not None:
try:
return next(item for item in data if item.get('idx') == idx)
except StopIteration:
raise ValueError(f'No entry found for idx {idx}')
else:
return data
def load_split_data(base_path, split_name):
"""Load the rule and sample data for a specific split."""
split_path = os.path.join(base_path, split_name)
rule_path = find_file(split_path, 'rule')
sample_path = find_file(split_path, 'sample')
rules = load_json_or_jsonl(rule_path) if rule_path else []
samples = load_json_or_jsonl(sample_path) if sample_path else []
return {'rules': rules, 'samples': samples}
def process_mixed_data(base_path, mode):
"""Load and process data for the 'mixed' split and specific mode."""
mixed_path = os.path.join(base_path, 'mixed')
file_path = find_file(mixed_path, mode)
if not file_path:
print(f'[WARNING] Missing file for mixed mode: {mode}')
return []
data = load_json_or_jsonl(file_path)
template_path = os.path.join(base_path, 'config/prompt/mixed.yaml')
template = load_yaml(template_path)
processed = []
for item in data:
rules = '\n'.join(item.get('rule_list', []))
questions = '\n'.join(item.get('question_list', []))
item['prompt'] = template['prompt_format'][0].format(rules, questions)
processed.append(item)
return processed
class ConfigWrapper:
def __init__(self, config_path):
self._config = {}
with open(config_path, 'r') as file:
self._config = yaml.safe_load(file)
for key, value in self._config.items():
setattr(self, key, value)
def __setattr__(self, key, value):
if key.startswith('_'):
super().__setattr__(key, value)
else:
self._config[key] = value
super().__setattr__(key, value)
def __getattr__(self, key):
if key in self._config:
return self._config[key]
raise AttributeError(
f"'ConfigWrapper' object has no attribute '{key}'")
def get_id(self, data):
if isinstance(self._config.get('id_key'), str):
return data.get(self._config.get('id_key'), None)
elif isinstance(self._config.get('id_key'), list):
return '_'.join([
str(data[key]) for key in self._config.get('id_key')
if key in data
])
def print_all_keys(self):
print('config keys:')
for key, value in self._config.items():
print(f' - {key}: {value}')
config_wrapper = None
def initialize_config(config_path):
global config_wrapper
config_wrapper = ConfigWrapper(config_path)
def get_config_wrapper():
global config_wrapper
if config_wrapper is None:
raise RuntimeError(
'ConfigWrapper not initialized. Call initialize_config first.')
return config_wrapper
if __name__ == '__main__':
config_path = 'config/config.yaml'
initialize_config(config_path)
data = {
'idx':
'50',
'step':
21,
'question':
('Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"\n\n'
'Please provide the decrypted answer, encapsulated in double '
'square brackets. '
'For example, the format should be: [[decrypted answer]].'),
'answer':
'[[P]]',
'category':
'Decryption',
'rule_id':
'23',
'input':
'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"',
'steps_num':
23,
'description':
('For a number c=228 in the ciphertext:\n'
'Calculate z = c^e mod n. Here ^ means multiplication.\n'
'z is 80.\nBased on the decimal number represented by z, '
'use the ascii code to find the corresponding letter '
'as the plaintext letter p.\n'
'Please give the letter p in [[...]] format.\n'),
'atom':
80
}
print(config_wrapper.get_id(data))
def read_yaml(config='default'):
if os.path.exists(f'config/prompt/{config}.yaml'):
yaml_file = f'config/prompt/{config}.yaml'
else:
yaml_file = config
with open(yaml_file, 'r') as yaml_file:
return yaml.safe_load(yaml_file)
def write_jsonl_lines(file, data):
config_wrapper = get_config_wrapper()
if config_wrapper.save_prompt:
json.dump(data, file, ensure_ascii=False)
else:
data.pop(config_wrapper.prompt_key)
json.dump(data, file, ensure_ascii=False)
file.write('\n')
file.flush()
def print_info(info):
print('-' * 100)
print('[INFO] model_name:', info['model_name'])
print('[INFO] splits:', info['splits'])
print('[INFO] modes:', info['modes'])
print('[INFO] output_dir:', info['output_dir'])
print('[INFO] Infer Limit:',
'No limit' if info['infer_limit'] is None else info['infer_limit'])
print('[INFO] Number of Workers:', info['num_workers'])
print('[INFO] Batch Size:', info['batch_size'])
print('[INFO] Use Accel:', info['use_accel'])
print('-' * 100)
def read_json_or_jsonl(data_path, split='', mapping_key=None):
base_path = os.path.join(data_path, split)
if os.path.exists(f'{base_path}.json'):
file_path = f'{base_path}.json'
elif os.path.exists(f'{base_path}.jsonl'):
file_path = f'{base_path}.jsonl'
elif base_path.endswith('.json') or base_path.endswith('.jsonl'):
file_path = base_path
else:
raise FileNotFoundError('No JSON or JSONL file found.')
with open(file_path, 'r') as file:
if file_path.endswith('.json'):
data = json.load(file)
elif file_path.endswith('.jsonl'):
data = [json.loads(line) for line in file]
if mapping_key:
return {
item[mapping_key]: item
for item in data if mapping_key in item
}
else:
return data
def read_json_or_jsonl_with_idx(data_path, split='', idx=None):
base_path = os.path.join(data_path, split)
if os.path.exists(f'{base_path}.json'):
file_path = f'{base_path}.json'
elif os.path.exists(f'{base_path}.jsonl'):
file_path = f'{base_path}.jsonl'
elif base_path.endswith('.json') or base_path.endswith('.jsonl'):
file_path = base_path
else:
raise FileNotFoundError('No JSON or JSONL file found.')
with open(file_path, 'r', encoding='utf-8') as file:
if file_path.endswith('.json'):
data = json.load(file)
elif file_path.endswith('.jsonl'):
data = [json.loads(line) for line in file]
if idx is not None:
try:
return next(item for item in data if item.get('idx') == idx)
except StopIteration:
raise ValueError(f'No entry found for idx {idx}')
else:
return data
idx_ranges = [
[18],
[73, 74, 77],
[94],
[115, 116, 117],
[121, 122, 123, 125],
[131, 132, 134, 135, 136],
[141, 143, 149],
list(range(145, 148)),
list(range(151, 157)),
[160, 161, 162],
[164, 165, 166],
[170],
[206, 209],
list(range(211, 216)),
[217, 218],
]
def clean_json_string(json_str):
json_str = re.sub(r'[\x00-\x1F\x7F]', '', json_str)
return json_str
def is_in_idx_ranges(idx, idx_ranges):
for range_list in idx_ranges:
if int(idx) in range_list:
return True
return False
def extract_json(text):
matches = re.findall(r'{.*}', text, re.DOTALL)
if matches:
json_str = matches[-1]
json_str = clean_json_string(json_str)
try:
data = json.loads(json_str)
return data
except json.JSONDecodeError as e:
print(f'Error decoding JSON: {e}')
return 'NULL'
return 'NULL'
def extract_all_responses_from_json(response_json):
results = []
for key, value in response_json.items():
results.append(str(value))
return results
def clean_latex(latex_expr):
if '=' in latex_expr:
latex_expr = latex_expr.rsplit('=', 1)[1]
latex_expr = re.sub(r'\\[()\[\]]', '', latex_expr)
latex_expr = re.sub(r'\\text\{.*?\}', '', latex_expr)
latex_expr = re.sub(r'\\(left|right|displaystyle)', '', latex_expr)
latex_expr = latex_expr.replace('\\\\', '\\')
return latex_expr
def extract_text_from_brackets(text, clean_level='basic'):
matches = re.findall(r'\[\[\s*(.*?)\s*\]\]', text, re.DOTALL)
if not matches:
matches = re.findall(r'\$\\boxed\{(.*?)\}\$', text, re.DOTALL)
if not matches:
matches = re.findall(r'\[\s*(.*?)\s*\]', text, re.DOTALL)
if matches:
match_str = matches[0].strip()
if clean_level == 'clean':
match_str = match_str.replace('"', '').replace('\n', '').replace(
' ', '').replace('[', '').replace(']', '')
elif clean_level == 'logic':
match_str = match_str.replace('"', '').replace('\n', '').replace(
' ', '').replace('.', '')
elif clean_level == 'math':
match_str = match_str.replace('"', '').replace('\n', '').replace(
'[', '').replace(']', '').replace('$', '')
return f'{clean_latex(match_str)}'
return f'[[{match_str}]]'
return 'NULL'
def extract_inner_text_from_brackets(text):
if not isinstance(text, str):
print(f'text type: {type(text)}, text value: {text}')
return 'NULL'
match = re.search(r'\[\[(.*?)\]\]', text, re.DOTALL)
return match.group(1) if match else 'NULL'
def extract_numbers(str):
numbers = re.findall(r'\d+', str)
numbers = list(map(int, numbers))
return numbers
def extract_and_sort_inequalities(latex_expr):
pattern = r'(≥|≤)\s*([-]?\d+\.?\d*)'
matches = re.findall(pattern, latex_expr)
extracted_inequalities = [''.join(match) for match in matches]
sorted_inequalities = sorted(extracted_inequalities)
return sorted_inequalities
def rule5_normalize_content(content):
parts = [part for part in content.split(';')]
sorted_parts = sorted(parts)
return sorted_parts
def normalize_string(s):
s = re.sub(r'[^0-9]', '', s)
pairs = s.split(',')
pairs.sort()
return pairs
def remove_commas_and_spaces(s):
return re.sub(r'[,\s\[\]]+', '', s)
def remove_non_alphanumeric(s):
return re.sub(r'\W+', '', s)
def contains_or(answer):
return 'or' in answer
def compare_multi_results(response, answer):
try:
response_text = extract_text_from_brackets(response, 'clean')
response_text = re.sub(r'\\text\{or\}', 'or', response_text)
if response_text == 'NULL':
return False
answer = extract_text_from_brackets(answer, 'clean')
response_split = response_text.strip('[[]]').split('or')
answer_split = answer.strip('[[]]').split('or')
response_sorted = sorted([x.strip() for x in response_split])
answer_sorted = sorted([x.strip() for x in answer_split])
return response_sorted == answer_sorted
except Exception as e:
print(f'Error during comparison: {e}')
return False
def split_or_expression(expression):
return [part.strip() for part in expression.split('or')]
def compare_math_expressions(response, answer):
response_text = extract_text_from_brackets(response, 'math')
answer_text = extract_text_from_brackets(answer, 'math')
if response_text == 'NULL':
return False
if contains_or(answer_text):
response_parts = split_or_expression(response_text)
answer_parts = split_or_expression(answer_text)
try:
response_exprs = {
sp.simplify(parse_latex(part))
for part in response_parts
}
answer_exprs = {
sp.simplify(parse_latex(part))
for part in answer_parts
}
return response_exprs == answer_exprs
except Exception as e:
print(f'Error during simplification or parsing: {e}')
return response_text == answer_text
else:
try:
response_expr = sp.simplify(parse_latex(response_text))
answer_expr = sp.simplify(parse_latex(answer_text))
return response_expr == answer_expr
except Exception as e:
print(f'Error during simplification or parsing: {e}')
return response_text == answer_text
def method_equal(response_text, answer):
return response_text == answer
def method_1(response_text, answer):
cleaned_string = re.sub(r'[^A-Za-z]', '', response_text)
cleaned_string = cleaned_string.lower()
answer = re.sub(r'[^A-Za-z]', '', answer)
answer = answer.lower()
return cleaned_string == answer
def method_2(response_text, answer):
cleaned_string = re.sub(r'[^A-Za-z]', '', response_text)
cleaned_string = cleaned_string.lower()
answer = answer.split(',')
return cleaned_string in answer
def method_3(response_text, answer):
response_text = response_text.lower()
pairs1 = re.split(r'\W+', response_text)
pairs2 = answer.split(' ')
pairs1 = [word for word in pairs1 if word]
pairs1.sort()
pairs2.sort()
return pairs1 == pairs2
def method_4(response_text, answer):
cleaned_string = re.sub(r'[^A-Za-z]', '', response_text)
cleaned_string = cleaned_string.lower()
return cleaned_string in answer
def method_5(response_text, answer):
response_text = re.sub(r'\s+', '', response_text)
response_text = response_text.split(',')
answer = answer.split(',')
response_text.sort()
answer.sort()
return response_text == answer
def method_9(response_text, answer):
response_text = response_text.replace('×', '*').replace('−', '-')
answer = answer.replace('×', '*').replace('−', '-')
def extract_operators(s):
return re.findall(r'[+\-*/]', s)
response_ops = extract_operators(response_text.split('=')[0])
answer_ops = extract_operators(answer.split('=')[0])
if response_ops != answer_ops:
return False
match = re.search(r'=\s*(-?\d+)', answer)
expected_result = int(match.group(1))
try:
left_side = response_text.split('=')[0]
result = eval(left_side)
except Exception as e:
print(f'Error during evaluation: {e}')
return False
return result == expected_result
def method_10(response_text, answer):
response_text = response_text.replace('×', '*').replace('−', '-')
response_text = response_text.split('=')[0]
answer = answer.split('\n')[0].split('=')[0]
response_ops = sorted(remove_non_alphanumeric(response_text))
answer_ops = sorted(remove_non_alphanumeric(answer))
if response_ops != answer_ops:
return False
try:
result = eval(response_text)
except Exception as e:
print(f'Error during evaluation: {e}')
return False
return result == 24
def method_18(response_text, answer):
cleaned_s1 = remove_commas_and_spaces(response_text)
cleaned_s2 = remove_commas_and_spaces(answer)
return cleaned_s1 == cleaned_s2
def method_general(response_text, answer):
cleaned_s1 = remove_non_alphanumeric(response_text)
cleaned_s2 = remove_non_alphanumeric(answer)
return cleaned_s1 == cleaned_s2
question_methods = {
'1': method_1,
'2': method_2,
'3': method_3,
'4': method_4,
'5': method_5,
'9': method_9,
'10': method_10,
'18': method_18,
}
def evaluate_response_vs_answer(response, answer, question_type, rule_id, idx):
if question_type == 'logic' and rule_id == '5':
response_text = extract_text_from_brackets(response, 'logic')
answer_text = extract_text_from_brackets(answer, 'logic')
if response_text is None:
return False
normalized_response = rule5_normalize_content(response_text)
normalized_answer = rule5_normalize_content(answer)
return normalized_response == normalized_answer
elif question_type == 'logic':
response_text = extract_text_from_brackets(response, 'logic')
answer_text = extract_text_from_brackets(answer, 'logic')
return response_text == answer_text
elif question_type == 'operation' and (idx == '178' or idx == '179'):
response_text = extract_text_from_brackets(response, 'clean')
response_text = extract_and_sort_inequalities(response_text)
answer_text = extract_and_sort_inequalities(answer)
# print(response_text, answer_text)
return response_text == answer_text
elif question_type == 'operation' and rule_id == '18':
response_text = extract_text_from_brackets(response, 'clean')
answer = extract_inner_text_from_brackets(answer)
response_text = ''.join(sorted(re.sub(r'\W+', '', response_text)))
answer = ''.join(sorted(re.sub(r'\W+', '', answer)))
return response_text == answer
elif question_type == 'operation' and rule_id in {'23', '24', '25'}:
response_text = extract_text_from_brackets(response, 'clean')
if response_text is None:
return False
response_text = extract_numbers(response_text)
answer_text = extract_numbers(answer)
return response_text == answer_text
elif question_type == 'operation' and is_in_idx_ranges(idx, idx_ranges):
return compare_math_expressions(response, answer)
elif question_type == 'operation' and contains_or(answer):
return compare_multi_results(response, answer)
elif question_type == 'puzzle':
response_text = extract_inner_text_from_brackets(response)
answer = extract_inner_text_from_brackets(answer)
method = question_methods.get(rule_id)
if method:
return method(response_text, answer)
return method_general(response_text, answer)
else:
response_text = extract_text_from_brackets(response, 'clean')
return response_text == answer
def compute_one_mixed_question_pass_rate(idx,
question_list,
response_json,
base_path=None):
if response_json == 'NULL':
result_dict = {
'idx': idx,
'response': response_json,
'details': None,
'pass_rate': 0,
'is_correct': False
}
return result_dict
response_list = extract_all_responses_from_json(response_json)
correct_num = 0
results = []
for q_idx, question in enumerate(question_list):
category, question_idx = question.rsplit('_', 1)
question_content = load_json_or_jsonl_with_idx(base_path,
os.path.join(
category, 'sample'),
idx=question_idx)
answer = question_content['answer']
if q_idx >= len(response_list):
break
response = response_list[q_idx]
response_text = extract_text_from_brackets(response)
rule_id = question_content['rule_id']
is_correct = evaluate_response_vs_answer(response, answer, category,
rule_id, q_idx)
if is_correct:
correct_num += 1
results.append({
'question': question,
'response_text': response_text,
'answer': answer,
'is_correct': is_correct
})
pass_rate = correct_num / len(question_list)
question_correct = pass_rate == 1.0
result_dict = {
'idx': idx,
'response': response_json,
'details': results,
'pass_rate': pass_rate,
'is_correct': question_correct
}
return result_dict
def evaluate_responses(data, mode, base_path=None):
results = []
# Iterate over the values of the dictionary (numerical keys)
for key, record in data.items():
idx = key # Use the dictionary key as the "idx"
response = record.get('prediction', '')
question_type = record.get('category', '')
if mode == 'mixed':
question_list = record.get('question_list')
response_json = extract_json(response)
result_dict = compute_one_mixed_question_pass_rate(
idx, question_list, response_json, base_path)
results.append(result_dict)
else:
response_text = extract_text_from_brackets(response)
answer = record.get('gold', '')
rule_id = record.get('rule_id', '')
is_correct = evaluate_response_vs_answer(response, answer,
question_type, rule_id,
idx)
result_dict = {
'idx': idx,
'response': response,
'response_text': response_text,
'answer': answer,
'is_correct': is_correct
}
if question_type == 'counterfactual':
real_life_answer = record.get('real_life_answer', '')
is_real_life = evaluate_response_vs_answer(
response, real_life_answer, question_type, rule_id, idx)
result_dict['real_life_answer'] = real_life_answer
result_dict['is_real_life'] = is_real_life
if question_type == 'cipher' and mode == 'subquestions':
result_dict['type'] = record.get('type', '')
results.append(result_dict)
return results
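For orientation, the answer-matching helpers above behave as follows on made-up strings (illustrative only; the argument values are not taken from the dataset):

print(extract_text_from_brackets('The answer is [[ B ]].'))  # -> '[[B]]'
print(extract_text_from_brackets('no brackets at all'))      # -> 'NULL'
print(evaluate_response_vs_answer('So the plaintext is [[P]].', '[[P]]',
                                  'cipher', rule_id='1', idx='0'))  # -> True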

View File

@@ -0,0 +1,267 @@
# flake8: noqa
"""KOR-Bench Evaluator."""
import json
import os
import re
from .icl_base_evaluator import BaseEvaluator
def read_json_or_jsonl(data_path, split='', mapping_key=None):
base_path = os.path.join(data_path, split)
if os.path.exists(f'{base_path}.json'):
file_path = f'{base_path}.json'
elif os.path.exists(f'{base_path}.jsonl'):
file_path = f'{base_path}.jsonl'
elif base_path.endswith('.json') or base_path.endswith('.jsonl'):
file_path = base_path
else:
raise FileNotFoundError('No JSON or JSONL file found.')
with open(file_path, 'r') as file:
if file_path.endswith('.json'):
data = json.load(file)
elif file_path.endswith('.jsonl'):
data = [json.loads(line) for line in file]
if mapping_key:
return {
item[mapping_key]: item
for item in data if mapping_key in item
}
else:
return data
def read_json_or_jsonl_with_idx(data_path, split='', idx=None):
base_path = os.path.join(data_path, split)
if os.path.exists(f'{base_path}.json'):
file_path = f'{base_path}.json'
elif os.path.exists(f'{base_path}.jsonl'):
file_path = f'{base_path}.jsonl'
elif base_path.endswith('.json') or base_path.endswith('.jsonl'):
file_path = base_path
else:
raise FileNotFoundError('No JSON or JSONL file found.')
with open(file_path, 'r', encoding='utf-8') as file:
if file_path.endswith('.json'):
data = json.load(file)
elif file_path.endswith('.jsonl'):
data = [json.loads(line) for line in file]
if idx is not None:
try:
return next(item for item in data if item.get('idx') == idx)
except StopIteration:
raise ValueError(f'No entry found for idx {idx}')
else:
return data
class korbenchEvaluator(BaseEvaluator):
"""Evaluator class for KOR-Bench tasks, inheriting from BaseEvaluator.
This class implements the `score` method to evaluate the model's
predictions against the reference answers, using the evaluation logic
specific to KOR-Bench.
"""
def __init__(self, question_type, mode):
"""Initialize the evaluator with question type and mode.
Args:
question_type (str): Type of questions (e.g., 'logic', 'operation', 'puzzle').
mode (str): Evaluation mode (e.g., 'zero-shot', 'self-correction').
"""
super().__init__()
self.question_type = question_type
self.mode = mode
# Predefined index ranges for special evaluation cases
self.idx_ranges = [
[18],
[73, 74, 77],
[94],
[115, 116, 117],
[121, 122, 123, 125],
[131, 132, 134, 135, 136],
[141, 143, 149],
list(range(145, 148)),
list(range(151, 157)),
[160, 161, 162],
[164, 165, 166],
[170],
[206, 209],
list(range(211, 216)),
[217, 218],
]
def score(self, predictions, references):
"""Evaluates the model's predictions against the references.
Args:
predictions (list): List of model predictions.
references (list): List of reference answers (each reference is a dict).
Returns:
list: Evaluation results for each prediction.
"""
if len(predictions) != len(references):
return {
'error': 'Predictions and references have different lengths'
}
data = []
for idx, (prediction,
reference) in enumerate(zip(predictions, references)):
record = {
'idx': str(idx),
'response': prediction,
'answer': reference.get('answer'),
'rule_id': reference.get('rule_id'),
'question_type': self.question_type,
# Include other necessary fields from reference if needed
}
data.append(record)
results = self.evaluate_responses(data, self.question_type, self.mode)
return results
def evaluate_responses(self, data, question_type, mode):
"""Evaluates a list of responses.
Args:
data (list): List of records containing responses and answers.
question_type (str): Type of questions.
mode (str): Evaluation mode.
Returns:
list: List of evaluation results.
"""
results = []
for record in data:
idx = record.get('idx')
response = record.get('response')
answer = record.get('answer')
rule_id = record.get('rule_id')
response_text = self.extract_text_from_brackets(response)
is_correct = self.evaluate_response_vs_answer(
response, answer, question_type, rule_id, idx)
result_dict = {
'idx': idx,
'response': response,
'response_text': response_text,
'answer': answer,
'is_correct': is_correct
}
results.append(result_dict)
return results
# Helper methods
def extract_text_from_brackets(self, text, clean_level='basic'):
"""Extracts text enclosed in double brackets [[ ]].
Args:
text (str): The text to extract from.
clean_level (str): The level of cleaning to perform.
Returns:
str: The extracted text or "NULL" if not found.
"""
matches = re.findall(r'\[\[\s*(.*?)\s*\]\]', text, re.DOTALL)
if not matches:
matches = re.findall(r'\$\\boxed\{(.*?)\}\$', text, re.DOTALL)
if not matches:
matches = re.findall(r'\[\s*(.*?)\s*\]', text, re.DOTALL)
if matches:
match_str = matches[0].strip()
if clean_level == 'clean':
match_str = match_str.replace('"', '').replace(
'\n', '').replace(' ', '').replace('[',
'').replace(']', '')
elif clean_level == 'logic':
match_str = match_str.replace('"',
'').replace('\n', '').replace(
' ', '').replace('.', '')
elif clean_level == 'math':
match_str = match_str.replace('"', '').replace(
'\n', '').replace('[', '').replace(']',
'').replace('$', '')
return f'{self.clean_latex(match_str)}'
return f'[[{match_str}]]'
return 'NULL'
def clean_latex(self, latex_expr):
"""Cleans LaTeX expressions for parsing.
Args:
latex_expr (str): The LaTeX expression to clean.
Returns:
str: The cleaned expression.
"""
if '=' in latex_expr:
latex_expr = latex_expr.rsplit('=', 1)[1]
latex_expr = re.sub(r'\\[()\[\]]', '', latex_expr)
latex_expr = re.sub(r'\\text\{.*?\}', '', latex_expr)
latex_expr = re.sub(r'\\(left|right|displaystyle)', '', latex_expr)
latex_expr = latex_expr.replace('\\\\', '\\')
return latex_expr
def evaluate_response_vs_answer(self, response, answer, question_type,
rule_id, idx):
"""Evaluates a single response against the answer.
Args:
response (str): The model's response.
answer (str): The reference answer.
question_type (str): The question type.
rule_id (str): The rule ID.
idx (str): The index of the question.
Returns:
bool: True if the response is correct, False otherwise.
"""
if question_type == 'logic' and rule_id == '5':
response_text = self.extract_text_from_brackets(response, 'logic')
answer_text = self.extract_text_from_brackets(answer, 'logic')
if response_text is None:
return False
normalized_response = self.rule5_normalize_content(response_text)
normalized_answer = self.rule5_normalize_content(answer)
return normalized_response == normalized_answer
elif question_type == 'logic':
response_text = self.extract_text_from_brackets(response, 'logic')
answer_text = self.extract_text_from_brackets(answer, 'logic')
return response_text == answer_text
else:
response_text = self.extract_text_from_brackets(response, 'clean')
return response_text == answer
def rule5_normalize_content(self, content):
"""Normalizes content for rule 5.
Args:
content (str): The content to normalize.
Returns:
list: Sorted list of content parts.
"""
parts = [part.strip() for part in content.split(';')]
sorted_parts = sorted(parts)
return sorted_parts
# Additional helper methods can be defined here
# For example: methods to handle mathematical expressions, logic comparisons, etc.
# Implement other helper functions as per your evaluation logic
# Example usage:
# evaluator = korbenchEvaluator(question_type='logic', mode='zero-shot')
# results = evaluator.score(predictions, references)

View File

@@ -151,6 +151,12 @@ DATASETS_MAPPING = {
"hf_id": "opencompass/humaneval",
"local": "./data/humaneval_cn/human-eval-cn-v2-20210705.jsonl",
},
# KOR-Bench
"opencompass/korbench": {
"ms_id": "",
"hf_id": "",
"local": "./data/korbench",
},
# Lambada
"opencompass/lambada": {
"ms_id": "opencompass/lambada",
@@ -544,4 +550,8 @@ DATASETS_URL = {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/babilong.zip",
"md5": "e400864c31bc58d29eaa3e199751f99b",
},
"/korbench": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/korbench.zip",
"md5": "9107597d137e7362eaf7d218ddef7a6d",
},
}
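With this mapping in place, the short name used by the dataset configs resolves to the local copy (or triggers a download from the URL above). An illustrative call, assuming the default local data layout:

from opencompass.utils import get_data_path

print(get_data_path('opencompass/korbench'))  # expected to point at ./data/korbench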