diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_repeat_gen_8815eb.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_repeat_gen_8815eb.py
new file mode 100644
index 00000000..226746c0
--- /dev/null
+++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_repeat_gen_8815eb.py
@@ -0,0 +1,45 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
+
+bigcodebench_full_reader_cfg = dict(
+    input_columns=['instruct_prompt'],
+    output_column='test',
+)
+
+bigcodebench_full_infer_cfg = dict(prompt_template=dict(
+    type=PromptTemplate,
+    template=dict(
+        begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
+        round=[
+            dict(role='HUMAN', prompt='{instruct_prompt}'),
+        ])),
+                                   retriever=dict(type=ZeroRetriever),
+                                   inferencer=dict(type=GenInferencer,
+                                                   max_out_len=8192))
+
+bigcodebench_full_eval_cfg = dict(
+    evaluator=dict(
+        type=BigCodeBenchEvaluator,
+        release_version='v0.1.2',
+        eval_type='instruct',
+        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        remote_execute_api=
+        'https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
+        dataset_version='full',
+    ),
+    pred_role='BOT',
+)
+
+bigcodebench_full_instruct_datasets = [
+    dict(abbr='bigcodebench_full_instruct',
+         type=BigCodeBenchDataset,
+         path='opencompass/bigcodebench',
+         reader_cfg=bigcodebench_full_reader_cfg,
+         infer_cfg=bigcodebench_full_infer_cfg,
+         eval_cfg=bigcodebench_full_eval_cfg,
+         release_version='v0.1.2',
+         n=3,
+         k=2)
+]
diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_repeat_gen_c3d5ad.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_repeat_gen_c3d5ad.py
new file mode 100644
index 00000000..b3804003
--- /dev/null
+++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_repeat_gen_c3d5ad.py
@@ -0,0 +1,48 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
+
+bigcodebench_hard_reader_cfg = dict(
+    input_columns=['instruct_prompt'],
+    output_column='test',
+)
+
+bigcodebench_hard_infer_cfg = dict(prompt_template=dict(
+    type=PromptTemplate,
+    template=dict(
+        begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
+        round=[
+            dict(role='HUMAN', prompt='{instruct_prompt}'),
+        ])),
+                                   retriever=dict(type=ZeroRetriever),
+                                   inferencer=dict(type=GenInferencer)
+)
+
+bigcodebench_hard_eval_cfg = dict(
+    evaluator=dict(
+        type=BigCodeBenchEvaluator,
+        release_version='v0.1.2',
+        eval_type='instruct',
+        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        remote_execute_api=
+        'https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
+        dataset_version='hard',
+    ),
+    pred_role='BOT',
+)
+
+bigcodebench_hard_instruct_datasets = [
+    dict(
+        abbr='bigcodebench_hard_instruct',
+        type=BigCodeBenchDataset,
+        path='opencompass/bigcodebench',
+        reader_cfg=bigcodebench_hard_reader_cfg,
+        infer_cfg=bigcodebench_hard_infer_cfg,
+        eval_cfg=bigcodebench_hard_eval_cfg,
+        release_version='v0.1.2',
+        dataset_version='hard',
+        n=3,
+        k=2
+    )
+]
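The two new configs above wire the BigCodeBench full and hard instruct splits into a repeated-generation setup: each problem is sampled n=3 times and scored as pass@k with k=2 by the remote evaluator. A minimal sketch of how such dataset configs are typically pulled into a run config via OpenCompass's `read_base` mechanism follows; the run-config file name and everything besides the two dataset imports are hypothetical and not part of this diff.

```python
# Hypothetical run config (e.g. examples/eval_bigcodebench_repeat.py).
# Only the two dataset imports correspond to files added in this diff;
# the model list / work dir would be defined elsewhere.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.bigcodebench.bigcodebench_full_instruct_repeat_gen_8815eb import \
        bigcodebench_full_instruct_datasets
    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_repeat_gen_c3d5ad import \
        bigcodebench_hard_instruct_datasets

# Evaluate both splits in a single run.
datasets = [
    *bigcodebench_full_instruct_datasets,
    *bigcodebench_hard_instruct_datasets,
]
```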
diff --git a/opencompass/datasets/bigcodebench/bigcodebench.py b/opencompass/datasets/bigcodebench/bigcodebench.py
index 9ce3d196..59c030d4 100644
--- a/opencompass/datasets/bigcodebench/bigcodebench.py
+++ b/opencompass/datasets/bigcodebench/bigcodebench.py
@@ -188,7 +188,9 @@ class BigCodeBenchEvaluator(BaseEvaluator):
         while True:
             try:
                 eval_client = Client(self.remote_execute_api,
-                                     httpx_kwargs=dict(proxies=proxies))
+                                     httpx_kwargs=dict(
+                                         proxies=proxies,
+                                         timeout=httpx.Timeout(100.0)))
                 results, pass_at_k = eval_client.predict(
                     split=self.eval_type,
                     samples=handle_file(submitted_contents_path),
@@ -196,7 +198,7 @@
                     **self.eval_kwargs)
                 break
             except (httpx.ReadTimeout, CancelledError):
-                logger.info('Read timeout error. Retrying in 4s...')
+                logger.info('Read timeout error. Retrying in 10s...')
                 time.sleep(10)
 
         if 'pass@1' in pass_at_k.keys():
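The evaluator change passes an explicit 100-second httpx timeout to the Gradio `Client` that talks to the remote BigCodeBench evaluator space, and corrects the retry log message so it matches the actual 10-second sleep. The sketch below isolates the same retry-on-read-timeout pattern in a standalone form; the helper name and parameters are illustrative, not part of the patch (the real loop additionally retries on `CancelledError`).

```python
import logging
import time

import httpx

logger = logging.getLogger(__name__)


def call_with_timeout_retry(make_call, timeout_s=100.0, retry_delay_s=10):
    """Hypothetical helper mirroring the evaluator's retry loop.

    `make_call` receives an httpx.Timeout and performs the remote request;
    on a read timeout we log, sleep, and try again until it succeeds.
    """
    while True:
        try:
            return make_call(httpx.Timeout(timeout_s))
        except httpx.ReadTimeout:
            logger.info('Read timeout error. Retrying in %ds...',
                        retry_delay_s)
            time.sleep(retry_delay_s)
```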