# HumanEval generation config. Model completions are cleaned with
# ``humaneval_internal_v2_postprocess`` before being handed to
# ``HumanEvalEvaluator``.
from opencompass.datasets import (HumanevalDataset, HumanEvalEvaluator,
                                  humaneval_internal_v2_postprocess)
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever

# Reader: the 'prompt' column feeds the template; 'task_id' is the output
# column.
humaneval_reader_cfg = {
    'input_columns': ['prompt'],
    'output_column': 'task_id',
    'train_split': 'test',
}

# TODO: allow empty output-column
# Inference: the prompt is wrapped in a one-line instruction (written as a
# Python comment) and generation is capped at 512 output tokens.
humaneval_infer_cfg = {
    'prompt_template': {
        'type': PromptTemplate,
        'template': '# Complete the following python code:\n{prompt}',
    },
    'retriever': {'type': ZeroRetriever},
    'inferencer': {'type': GenInferencer, 'max_out_len': 512},
}

# Evaluation: predictions are taken from the BOT role and post-processed
# before scoring.
humaneval_eval_cfg = {
    'evaluator': {'type': HumanEvalEvaluator},
    'pred_role': 'BOT',
    'k': [1, 10, 100],  # the parameter only for humaneval
    'pred_postprocessor': {'type': humaneval_internal_v2_postprocess},
}

humaneval_datasets = [
    {
        'abbr': 'openai_humaneval',
        'type': HumanevalDataset,
        'path': 'opencompass/humaneval',
        'reader_cfg': humaneval_reader_cfg,
        'infer_cfg': humaneval_infer_cfg,
        'eval_cfg': humaneval_eval_cfg,
    },
]
def humaneval_internal_v2_postprocess(text: str) -> str:
    """Keep only the leading indented function body of a raw completion.

    The completion is cut at the first run of two blank lines and at the
    first closing Markdown fence, then every line up to (but excluding) the
    first non-empty line that does not start with a space is returned.

    Args:
        text: Raw model completion.

    Returns:
        The retained lines joined with ``'\\n'``.
    """
    # A completion that starts with exactly three spaces is padded to four.
    # NOTE(review): in the whitespace-collapsed source both literals appear
    # as a single space, which makes this condition always false; three vs.
    # four spaces (one missing from a standard four-space body indent) is
    # the reading under which padding by one space is meaningful - confirm.
    if text.startswith('   ') and not text.startswith('    '):
        text = ' ' + text

    prediction = text.split('\n\n\n')[0]
    prediction = prediction.split('\n```')[0]

    kept = []
    for line in prediction.split('\n'):
        # A non-empty line at column 0 marks the end of the function body.
        if line and line[0] != ' ':
            break
        kept.append(line)
    return '\n'.join(kept)


def humaneval_internal_v1_postprocess(text: str) -> str:
    """Extract an indented function body from a raw completion.

    Steps, in order:

    1. If ``text`` is itself a quoted Python string literal, unwrap it.
    2. If a Markdown fence is present, keep the first fenced block (minus a
       language tag on the fence line), or everything after the first fence
       when no closed block exists.
    3. If the text starts with ``from``/``import``, drop everything up to
       and including the line containing the first ``def``.
    4. Drop empty lines, then drop a leading ``def`` line.
    5. Indent: when the text already starts with a space only the first
       line is normalised to four spaces; otherwise every line gets four
       spaces prepended.
    6. Cut the result at the first code line that is indented less than the
       first code line.

    Args:
        text: Raw model completion.

    Returns:
        The cleaned, indented function body.
    """
    import ast  # local import so the block does not rely on file imports

    try:
        # Some models (the original comment cites chatGLM) return the answer
        # wrapped as a string literal.
        # SECURITY: this used to be ``eval(text)``, which executes arbitrary
        # model output. ``ast.literal_eval`` only accepts literals, which is
        # all the unwrapping needs.
        eval_text = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        pass
    else:
        if isinstance(eval_text, str):
            text = eval_text

    text = text.lstrip('\n')
    if '```' in text:
        blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
        if blocks:
            text = blocks[0]  # fetch the first code block
            if not text.startswith('\n'):  # in case starting with ```python
                text = text[text.find('\n') + 1:]
        else:
            text = text.split('```')[1]  # fall back to default strategy

    if text.strip().startswith(('from', 'import')):
        def_idx = text.find('def')
        if def_idx != -1:
            # Skip the preamble and the ``def`` line itself.
            text = text[text.find('\n', def_idx) + 1:]

    # Remove empty lines (the result can therefore never start with '\n').
    text = '\n'.join(line for line in text.split('\n') if line != '')
    if text.strip().startswith('def'):
        text = '\n'.join(text.split('\n')[1:])

    # Deal with the indentation error.
    # NOTE(review): the four-space literals are restored here; the
    # whitespace-collapsed source shows a single space - confirm.
    if text.startswith(' '):
        text = '    ' + text.lstrip()
    else:
        text = '\n'.join('    ' + line for line in text.split('\n'))
    lines = text.split('\n')

    # If the number of leading spaces reduces, assume the code block ends.
    min_leading_space = None
    end_index = None
    for index, line in enumerate(lines):
        stripped = line.strip()
        # Blank lines and lines starting with a quote or '#' are not used to
        # measure indentation.
        if stripped == '' or stripped[0] in ("'", '"', '#'):
            continue
        current_leading_space = len(line) - len(line.lstrip())
        if min_leading_space is None:
            min_leading_space = current_leading_space
        elif current_leading_space < min_leading_space:
            end_index = index
            break

    # ``lines[:None]`` is the whole list, covering the "no cut" case.
    return '\n'.join(lines[:end_index])
logger = get_logger() logger.warning( 'To ensure the integrity of the log results, the log displayed ' @@ -106,7 +107,10 @@ class DLCRunner(BaseRunner): # Dump task config to file mmengine.mkdir_or_exist('tmp/') - param_file = f'tmp/{os.getpid()}_params.py' + # Using uuid to avoid filename conflict + import uuid + uuid_str = str(uuid.uuid4()) + param_file = f'tmp/{uuid_str}_params.py' pwd = os.getcwd() try: cfg.dump(param_file) @@ -164,20 +168,27 @@ class DLCRunner(BaseRunner): # set priority to 1 as default task_priority = self.aliyun_cfg.get('priority', 1) + # Different dlc versions has different commands + if self.aliyun_cfg.get('dlc_job_cmd') == 'create': + dlc_job_cmd = 'create job --kind PyTorchJob' + worker_cmd = ' --worker_count 1' + else: + dlc_job_cmd = 'submit pytorchjob' + worker_cmd = ' --workers 1' tmpl = ( - 'dlc submit pytorchjob' + f'dlc {dlc_job_cmd}' f" --command '{shell_cmd}'" f' --name {task_name[:512]}' f" --config {self.aliyun_cfg['dlc_config_path']}" f" --workspace_id {self.aliyun_cfg['workspace_id']}" - f" --resource_id {self.aliyun_cfg['resource_id']}" + f" --resource_id={self.aliyun_cfg['resource_id']}" f' --priority {task_priority}' - ' --workers 1' + f'{worker_cmd}' f' --worker_cpu {max(num_gpus * 8, 12)}' f' --worker_gpu {num_gpus}' f' --worker_memory {max(num_gpus * 128, 192)}Gi' f" --worker_image {self.aliyun_cfg['worker_image']}" - f" --data_sources {','.join(self.aliyun_cfg['data_sources'])}") + f" --data_sources={','.join(self.aliyun_cfg['data_sources'])}") get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl) @@ -298,7 +309,10 @@ class DLCRunner(BaseRunner): return_code = _run_within_retry() finally: # Clean up - os.remove(param_file) + if not self.keep_tmp_file: + os.remove(param_file) + else: + pass return task_name, return_code diff --git a/opencompass/runners/local.py b/opencompass/runners/local.py index 3ec1c627..8306e89e 100644 --- a/opencompass/runners/local.py +++ b/opencompass/runners/local.py @@ -56,10 
+56,12 @@ class LocalRunner(BaseRunner): debug: bool = False, max_workers_per_gpu: int = 1, lark_bot_url: str = None, + keep_tmp_file: bool = False, **kwargs): super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url) self.max_num_workers = max_num_workers self.max_workers_per_gpu = max_workers_per_gpu + self.keep_tmp_file = keep_tmp_file logger = get_logger() for k, v in kwargs.items(): logger.warning(f'Ignored argument in {self.__module__}: {k}={v}') @@ -100,7 +102,10 @@ class LocalRunner(BaseRunner): assert len(all_gpu_ids) >= num_gpus # get cmd mmengine.mkdir_or_exist('tmp/') - param_file = f'tmp/{os.getpid()}_params.py' + import uuid + uuid_str = str(uuid.uuid4()) + + param_file = f'tmp/{uuid_str}_params.py' try: task.cfg.dump(param_file) # if use torchrun, restrict it behaves the same as non @@ -140,7 +145,10 @@ class LocalRunner(BaseRunner): stdout=log_file, stderr=subprocess.STDOUT) finally: - os.remove(param_file) + if not self.keep_tmp_file: + os.remove(param_file) + else: + pass status.append((task_name, 0)) else: if len(all_gpu_ids) > 0: diff --git a/opencompass/runners/slurm_sequential.py b/opencompass/runners/slurm_sequential.py index 3b4dcad5..5dee149c 100644 --- a/opencompass/runners/slurm_sequential.py +++ b/opencompass/runners/slurm_sequential.py @@ -24,11 +24,11 @@ class SlurmSequentialRunner(BaseRunner): using `srun` command. This runner launches tasks one by one for execution. A new task will only - be launched when and only when max_num_workers is not met, and the previous - task has been successfully allocated to a machine. Therefore, unlike the - `SlurmRunner`, at most only one task will be in the PENDING status at the - same time during a run, making the random_sleep strategy no longer - necessary. In addition, this runner also includes a feature to + be launched when and only when max_num_workers is not met, and the + previous task has been successfully allocated to a machine. 
Therefore, + unlike the `SlurmRunner`, at most only one task will be in the PENDING + status at the same time during a run, making the random_sleep strategy + no longer necessary. In addition, this runner also includes a feature to automatically kill all jobs by the job_id on exit. The runner will obtain the job_id by reading the srun output similar to @@ -59,7 +59,8 @@ class SlurmSequentialRunner(BaseRunner): qos: str = None, debug: bool = False, lark_bot_url: str = None, - extra_command: Optional[List[str]] = None): + extra_command: Optional[List[str]] = None, + keep_tmp_file: bool = False): super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url) self.max_num_workers = max_num_workers self.retry = retry @@ -67,6 +68,7 @@ class SlurmSequentialRunner(BaseRunner): self.quotatype = quotatype self.qos = qos self.task_prefix = task_prefix + self.keep_tmp_file = keep_tmp_file if not extra_command: extra_command = [] assert isinstance(extra_command, list) @@ -171,7 +173,10 @@ class SlurmSequentialRunner(BaseRunner): # Dump task config to file mmengine.mkdir_or_exist('tmp/') - param_file = f'tmp/{os.getpid()}_params.py' + # Using uuid to avoid filename conflict + import uuid + uuid_str = str(uuid.uuid4()) + param_file = f'tmp/{uuid_str}_params.py' process = None try: cfg.dump(param_file) @@ -256,7 +261,11 @@ class SlurmSequentialRunner(BaseRunner): child_conn.close() if process is not None: process.kill() - os.remove(param_file) + if not self.keep_tmp_file: + os.remove(param_file) + else: + pass + return task_name, process.returncode def _job_failed(self, return_code: int, output_paths: List[str]) -> bool: diff --git a/opencompass/runners/volc.py b/opencompass/runners/volc.py index 9a94ea67..f076daa6 100644 --- a/opencompass/runners/volc.py +++ b/opencompass/runners/volc.py @@ -47,7 +47,8 @@ class VOLCRunner(BaseRunner): max_num_workers: int = 32, retry: int = 2, debug: bool = False, - lark_bot_url: str = None): + lark_bot_url: str = None, + keep_tmp_file: 
@staticmethod
def _format_md_table(table):
    """Render ``table`` as a Markdown table string.

    Args:
        table: Sequence of rows; the first row is used as the header.
            Cells are converted with ``str()`` so non-string values do not
            break the join.

    Returns:
        The header line, a ``-----`` delimiter line with one cell per
        header column, and one line per remaining row, each terminated by
        ``'\\n'``. An empty ``table`` yields ``''`` instead of raising
        ``IndexError``.
    """
    if not table:
        return ''

    def _row_to_md(row):
        # One Markdown table line: "| a | b |\n".
        return '| ' + ' | '.join(str(cell) for cell in row) + ' |\n'

    header, *body = table
    # Delimiter format is kept exactly as before ("|----- | -----|").
    delimiter = '|' + ' | '.join(['-----'] * len(header)) + '|\n'
    return _row_to_md(header) + delimiter + ''.join(
        _row_to_md(row) for row in body)
raw_txts): # output to file if output_path is None: output_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.txt') output_csv_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.csv') + output_md_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.md') else: output_csv_path = output_path.replace('.txt', '.csv') + output_md_path = output_path.replace('.txt', '.md') output_dir = osp.split(output_path)[0] mmengine.mkdir_or_exist(output_dir) + + # process md table + md_table = self._format_md_table(table) + with open(output_path, 'w', encoding='utf-8') as f: text = f'{time_str}\n' + \ 'tabulate format\n' + \ @@ -320,6 +338,10 @@ class DefaultSummarizer: '^' * 128 + '\n' + \ '\n'.join([','.join(row) for row in table]) + '\n' + \ '$' * 128 + '\n\n' + \ + 'markdown format\n' + \ + '^' * 128 + '\n' + \ + md_table + '\n' + \ + '$' * 128 + '\n' + \ '-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n' + \ 'raw format\n' + \ '^' * 128 + '\n' + \ @@ -332,6 +354,11 @@ class DefaultSummarizer: f.write('\n'.join([','.join(row) for row in table]) + '\n') self.logger.info(f'write csv to {osp.abspath(output_csv_path)}') + with open(output_md_path, 'w', encoding='utf-8') as f: + f.write(md_table) + print(f'\n\nThe markdown format results is as below:\n\n{md_table}') + self.logger.info(f'write markdown summary to {osp.abspath(output_md_path)}') + def summarize( self, output_path: str = None,