OpenCompass/configs/summarizers/groups/cibench.py
klein e4830a6926
Update CIBench (#1089)
* modify the requirements/runtime.txt: numpy==1.23.4 --> numpy>=1.23.4

* update cibench: dataset and evaluation

* cibench summarizer bug

* update cibench

* move extract_code import

---------

Co-authored-by: zhangchuyu@pjlab.org.cn <zhangchuyu@pjlab.org.cn>
Co-authored-by: Leymore <zfz-960727@163.com>
2024-04-26 18:46:02 +08:00

395 lines
20 KiB
Python

# Libraries covered by the CIBench "generation" split, in reporting order.
_cibench_generation_modules = [
    'pandas',
    'matplotlib',
    'opencv',
    'scipy',
    'seaborn',
    'pytorch',
]
_cibench_generation = [f'cibench_generation/{module}' for module in _cibench_generation_modules]

# Accumulates every CIBench summary group declared in this file.
cibench_summary_groups = []

# Per-library weight rows. Column 0 weights tool_rate and executable,
# column 1 numeric_correct, column 2 text_score and column 3 vis_sim.
_cibench_generation_weight = {
    'matplotlib': [223, 50, 1, 156],
    'pandas': [200, 45, 45, 38],
    'pytorch': [69, 0, 8, 11],
    'seaborn': [130, 0, 2, 106],
    'opencv': [177, 21, 6, 106],
    'scipy': [161, 94, 14, 49],
}

# One weighted group per metric, aggregated over all generation libraries.
cibench_summary_groups += [
    {
        'name': f'cibench_generation:{metric}',
        'subsets': [[dataset, metric] for dataset in _cibench_generation],
        'weights': {
            f'cibench_generation/{module}': row[column]
            for module, row in _cibench_generation_weight.items()
        },
    }
    for metric, column in (
        ('tool_rate', 0),
        ('executable', 0),
        ('numeric_correct', 1),
        ('text_score', 2),
        ('vis_sim', 3),
    )
]
# Oracle variant of the generation split. `_cibench_generation` is rebound
# here, so from this point on it holds the 'cibench_generation_oracle/...' names.
_cibench_generation = [f'cibench_generation_oracle/{module}' for module in _cibench_generation_modules]

# One weighted group per metric; the second item of each pair is the column
# of `_cibench_generation_weight` used as the weight for that metric.
cibench_summary_groups += [
    {
        'name': f'cibench_generation_oracle:{metric}',
        'subsets': [[dataset, metric] for dataset in _cibench_generation],
        'weights': {
            f'cibench_generation_oracle/{module}': row[column]
            for module, row in _cibench_generation_weight.items()
        },
    }
    for metric, column in (
        ('tool_rate', 0),
        ('executable', 0),
        ('numeric_correct', 1),
        ('text_score', 2),
        ('vis_sim', 3),
    )
]
# Libraries covered by the CIBench "template" split, in reporting order.
_cibench_template_modules = [
    'lightgbm',
    'matplotlib',
    'nltk',
    'opencv',
    'pandas',
    'pytorch',
    'scipy',
    'seaborn',
    'sklearn',
    'tensorflow',
]
_cibench_template = [f'cibench_template/{module}' for module in _cibench_template_modules]

# Per-library weight rows (number of total exec questions in this module).
# Column 0 weights tool_rate and executable, column 1 numeric_correct,
# column 2 text_score and column 3 vis_sim.
_cibench_template_weight = {
    'lightgbm': [30, 15, 0, 0],
    'matplotlib': [42, 0, 0, 36],
    'nltk': [70, 30, 20, 10],
    'opencv': [60, 10, 0, 40],
    'pandas': [60, 40, 0, 10],
    'pytorch': [28, 0, 0, 0],
    'scipy': [60, 40, 0, 0],
    'seaborn': [42, 0, 0, 35],
    'sklearn': [42, 6, 0, 18],
    'tensorflow': [36, 6, 0, 12],
}

# One weighted group per metric, aggregated over all template libraries.
cibench_summary_groups += [
    {
        'name': f'cibench_template:{metric}',
        'subsets': [[dataset, metric] for dataset in _cibench_template],
        'weights': {
            f'cibench_template/{module}': row[column]
            for module, row in _cibench_template_weight.items()
        },
    }
    for metric, column in (
        ('tool_rate', 0),
        ('executable', 0),
        ('numeric_correct', 1),
        ('text_score', 2),
        ('vis_sim', 3),
    )
]
# Oracle variant of the template split; weights come from `_cibench_template_weight`.
_cibench_template_oracle = [f'cibench_template_oracle/{module}' for module in _cibench_template_modules]

# One weighted group per metric; the second item of each pair is the column
# of `_cibench_template_weight` used as the weight for that metric.
cibench_summary_groups += [
    {
        'name': f'cibench_template_oracle:{metric}',
        'subsets': [[dataset, metric] for dataset in _cibench_template_oracle],
        'weights': {
            f'cibench_template_oracle/{module}': row[column]
            for module, row in _cibench_template_weight.items()
        },
    }
    for metric, column in (
        ('tool_rate', 0),
        ('executable', 0),
        ('numeric_correct', 1),
        ('text_score', 2),
        ('vis_sim', 3),
    )
]
## Chinese template split
# Same library list as the template split. Group names use the
# 'cibench_template_cn' prefix while the dataset abbreviations use
# 'cibench_template_chinese/'.
_cibench_template_cn_modules = [
    'lightgbm',
    'matplotlib',
    'nltk',
    'opencv',
    'pandas',
    'pytorch',
    'scipy',
    'seaborn',
    'sklearn',
    'tensorflow',
]
_cibench_template_cn = [f'cibench_template_chinese/{module}' for module in _cibench_template_cn_modules]

# One weighted group per metric; the second item of each pair is the column
# of `_cibench_template_weight` used as the weight for that metric.
cibench_summary_groups += [
    {
        'name': f'cibench_template_cn:{metric}',
        'subsets': [[dataset, metric] for dataset in _cibench_template_cn],
        'weights': {
            f'cibench_template_chinese/{module}': row[column]
            for module, row in _cibench_template_weight.items()
        },
    }
    for metric, column in (
        ('tool_rate', 0),
        ('executable', 0),
        ('numeric_correct', 1),
        ('text_score', 2),
        ('vis_sim', 3),
    )
]
# Oracle variant of the Chinese template split. Group names use the
# 'cibench_template_cn_oracle' prefix while the dataset abbreviations use
# 'cibench_template_oracle_chinese/'.
_cibench_template_cn_oracle = [
    f'cibench_template_oracle_chinese/{module}' for module in _cibench_template_cn_modules
]

# One weighted group per metric; the second item of each pair is the column
# of `_cibench_template_weight` used as the weight for that metric.
cibench_summary_groups += [
    {
        'name': f'cibench_template_cn_oracle:{metric}',
        'subsets': [[dataset, metric] for dataset in _cibench_template_cn_oracle],
        'weights': {
            f'cibench_template_oracle_chinese/{module}': row[column]
            for module, row in _cibench_template_weight.items()
        },
    }
    for metric, column in (
        ('tool_rate', 0),
        ('executable', 0),
        ('numeric_correct', 1),
        ('text_score', 2),
        ('vis_sim', 3),
    )
]
########### New summarizer for Category metric
# Every category list holds [dataset abbreviation, metric name, weight] rows.
# For each dataset the rows cover numeric_correct, text_score and vis_sim,
# weighted by columns 1, 2 and 3 of that dataset's weight row.
cibench_data_manipulation = [
    [dataset, metric, row[column]]
    for dataset, row in (
        ('cibench_generation/pandas', _cibench_generation_weight['pandas']),
        ('cibench_template/pandas', _cibench_template_weight['pandas']),
    )
    for column, metric in enumerate(('numeric_correct', 'text_score', 'vis_sim'), start=1)
]
cibench_data_visualization = [
    [dataset, metric, row[column]]
    for dataset, row in (
        ('cibench_generation/matplotlib', _cibench_generation_weight['matplotlib']),
        ('cibench_generation/seaborn', _cibench_generation_weight['seaborn']),
        ('cibench_template/matplotlib', _cibench_template_weight['matplotlib']),
        ('cibench_template/seaborn', _cibench_template_weight['seaborn']),
    )
    for column, metric in enumerate(('numeric_correct', 'text_score', 'vis_sim'), start=1)
]
cibench_modeling = [
    [dataset, metric, row[column]]
    for dataset, row in (
        ('cibench_generation/pytorch', _cibench_generation_weight['pytorch']),
        ('cibench_template/pytorch', _cibench_template_weight['pytorch']),
        ('cibench_template/sklearn', _cibench_template_weight['sklearn']),
        ('cibench_template/tensorflow', _cibench_template_weight['tensorflow']),
        ('cibench_template/lightgbm', _cibench_template_weight['lightgbm']),
    )
    for column, metric in enumerate(('numeric_correct', 'text_score', 'vis_sim'), start=1)
]
cibench_nlp = [
    [dataset, metric, row[column]]
    for dataset, row in (
        ('cibench_template/nltk', _cibench_template_weight['nltk']),
    )
    for column, metric in enumerate(('numeric_correct', 'text_score', 'vis_sim'), start=1)
]
cibench_ip = [
    [dataset, metric, row[column]]
    for dataset, row in (
        ('cibench_generation/opencv', _cibench_generation_weight['opencv']),
        ('cibench_template/opencv', _cibench_template_weight['opencv']),
    )
    for column, metric in enumerate(('numeric_correct', 'text_score', 'vis_sim'), start=1)
]
cibench_math = [
    [dataset, metric, row[column]]
    for dataset, row in (
        ('cibench_generation/scipy', _cibench_generation_weight['scipy']),
        ('cibench_template/scipy', _cibench_template_weight['scipy']),
    )
    for column, metric in enumerate(('numeric_correct', 'text_score', 'vis_sim'), start=1)
]

# One '<category>:scores' group per category list. Subsets are the
# [dataset, metric] pairs and weights are keyed by '<dataset>@<metric>'.
cibench_summary_groups += [
    {
        'name': f'{category}:scores',
        'subsets': [entry[:2] for entry in entries],
        'weights': {f'{entry[0]}@{entry[1]}': entry[-1] for entry in entries},
    }
    for category, entries in (
        ('cibench_data_manipulation', cibench_data_manipulation),
        ('cibench_data_visualization', cibench_data_visualization),
        ('cibench_modeling', cibench_modeling),
        ('cibench_nlp', cibench_nlp),
        ('cibench_ip', cibench_ip),
        ('cibench_math', cibench_math),
    )
]
########### New summarizer for Category metric oracle
# Every category list holds [dataset abbreviation, metric name, weight] rows
# for the oracle datasets. The six category names are rebound here, so after
# this section they refer to the oracle rows. For each dataset the rows cover
# numeric_correct, text_score and vis_sim, weighted by columns 1, 2 and 3 of
# that dataset's weight row.
cibench_data_manipulation = [
    [dataset, metric, row[column]]
    for dataset, row in (
        ('cibench_generation_oracle/pandas', _cibench_generation_weight['pandas']),
        ('cibench_template_oracle/pandas', _cibench_template_weight['pandas']),
    )
    for column, metric in enumerate(('numeric_correct', 'text_score', 'vis_sim'), start=1)
]
cibench_data_visualization = [
    [dataset, metric, row[column]]
    for dataset, row in (
        ('cibench_generation_oracle/matplotlib', _cibench_generation_weight['matplotlib']),
        ('cibench_generation_oracle/seaborn', _cibench_generation_weight['seaborn']),
        ('cibench_template_oracle/matplotlib', _cibench_template_weight['matplotlib']),
        ('cibench_template_oracle/seaborn', _cibench_template_weight['seaborn']),
    )
    for column, metric in enumerate(('numeric_correct', 'text_score', 'vis_sim'), start=1)
]
cibench_modeling = [
    [dataset, metric, row[column]]
    for dataset, row in (
        ('cibench_generation_oracle/pytorch', _cibench_generation_weight['pytorch']),
        ('cibench_template_oracle/pytorch', _cibench_template_weight['pytorch']),
        ('cibench_template_oracle/sklearn', _cibench_template_weight['sklearn']),
        ('cibench_template_oracle/tensorflow', _cibench_template_weight['tensorflow']),
        ('cibench_template_oracle/lightgbm', _cibench_template_weight['lightgbm']),
    )
    for column, metric in enumerate(('numeric_correct', 'text_score', 'vis_sim'), start=1)
]
cibench_nlp = [
    [dataset, metric, row[column]]
    for dataset, row in (
        ('cibench_template_oracle/nltk', _cibench_template_weight['nltk']),
    )
    for column, metric in enumerate(('numeric_correct', 'text_score', 'vis_sim'), start=1)
]
cibench_ip = [
    [dataset, metric, row[column]]
    for dataset, row in (
        ('cibench_generation_oracle/opencv', _cibench_generation_weight['opencv']),
        ('cibench_template_oracle/opencv', _cibench_template_weight['opencv']),
    )
    for column, metric in enumerate(('numeric_correct', 'text_score', 'vis_sim'), start=1)
]
cibench_math = [
    [dataset, metric, row[column]]
    for dataset, row in (
        ('cibench_generation_oracle/scipy', _cibench_generation_weight['scipy']),
        ('cibench_template_oracle/scipy', _cibench_template_weight['scipy']),
    )
    for column, metric in enumerate(('numeric_correct', 'text_score', 'vis_sim'), start=1)
]

# One '<category>_oracle:scores' group per category list. Subsets are the
# [dataset, metric] pairs and weights are keyed by '<dataset>@<metric>'.
cibench_summary_groups += [
    {
        'name': f'{category}_oracle:scores',
        'subsets': [entry[:2] for entry in entries],
        'weights': {f'{entry[0]}@{entry[1]}': entry[-1] for entry in entries},
    }
    for category, entries in (
        ('cibench_data_manipulation', cibench_data_manipulation),
        ('cibench_data_visualization', cibench_data_visualization),
        ('cibench_modeling', cibench_modeling),
        ('cibench_nlp', cibench_nlp),
        ('cibench_ip', cibench_ip),
        ('cibench_math', cibench_math),
    )
]