diff --git a/configs/models/codellama/hf_codellama_13b.py b/configs/models/codellama/hf_codellama_13b.py
new file mode 100644
index 00000000..2267f923
--- /dev/null
+++ b/configs/models/codellama/hf_codellama_13b.py
@@ -0,0 +1,21 @@
+from opencompass.models import HuggingFaceCausalLM
+
+models = [
+    # CodeLlama 13B
+    dict(
+        type=HuggingFaceCausalLM,
+        abbr='CodeLlama-13b',
+        path="codellama/CodeLlama-13b-hf",
+        tokenizer_path='codellama/CodeLlama-13b-hf',
+        tokenizer_kwargs=dict(
+            padding_side='left',
+            truncation_side='left',
+            trust_remote_code=True,
+        ),
+        max_out_len=1024,
+        max_seq_len=2048,
+        batch_size=8,
+        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
+        run_cfg=dict(num_gpus=2, num_procs=1),
+    ),
+]
diff --git a/configs/models/codellama/hf_codellama_13b_instruct.py b/configs/models/codellama/hf_codellama_13b_instruct.py
new file mode 100644
index 00000000..01830015
--- /dev/null
+++ b/configs/models/codellama/hf_codellama_13b_instruct.py
@@ -0,0 +1,21 @@
+from opencompass.models import HuggingFaceCausalLM
+
+models = [
+    # CodeLlama 13B Instruct
+    dict(
+        type=HuggingFaceCausalLM,
+        abbr='CodeLlama-13b-Instruct',
+        path="codellama/CodeLlama-13b-Instruct-hf",
+        tokenizer_path='codellama/CodeLlama-13b-Instruct-hf',
+        tokenizer_kwargs=dict(
+            padding_side='left',
+            truncation_side='left',
+            trust_remote_code=True,
+        ),
+        max_out_len=1024,
+        max_seq_len=2048,
+        batch_size=8,
+        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
+        run_cfg=dict(num_gpus=2, num_procs=1),
+    ),
+]
diff --git a/configs/models/codellama/hf_codellama_13b_python.py b/configs/models/codellama/hf_codellama_13b_python.py
new file mode 100644
index 00000000..4c5ea0d1
--- /dev/null
+++ b/configs/models/codellama/hf_codellama_13b_python.py
@@ -0,0 +1,21 @@
+from opencompass.models import HuggingFaceCausalLM
+
+models = [
+    # CodeLlama 13B Python
+    dict(
+        type=HuggingFaceCausalLM,
+        abbr='CodeLlama-13b-Python',
+        path="codellama/CodeLlama-13b-Python-hf",
+        tokenizer_path='codellama/CodeLlama-13b-Python-hf',
+        tokenizer_kwargs=dict(
+            padding_side='left',
+            truncation_side='left',
+            trust_remote_code=True,
+        ),
+        max_out_len=1024,
+        max_seq_len=2048,
+        batch_size=8,
+        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
+        run_cfg=dict(num_gpus=2, num_procs=1),
+    ),
+]
diff --git a/configs/models/codellama/hf_codellama_34b.py b/configs/models/codellama/hf_codellama_34b.py
new file mode 100644
index 00000000..e6dbef89
--- /dev/null
+++ b/configs/models/codellama/hf_codellama_34b.py
@@ -0,0 +1,21 @@
+from opencompass.models import HuggingFaceCausalLM
+
+models = [
+    # CodeLlama 34B
+    dict(
+        type=HuggingFaceCausalLM,
+        abbr='CodeLlama-34b',
+        path="codellama/CodeLlama-34b-hf",
+        tokenizer_path='codellama/CodeLlama-34b-hf',
+        tokenizer_kwargs=dict(
+            padding_side='left',
+            truncation_side='left',
+            trust_remote_code=True,
+        ),
+        max_out_len=1024,
+        max_seq_len=2048,
+        batch_size=8,
+        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
+        run_cfg=dict(num_gpus=4, num_procs=1),
+    ),
+]
diff --git a/configs/models/codellama/hf_codellama_34b_instruct.py b/configs/models/codellama/hf_codellama_34b_instruct.py
new file mode 100644
index 00000000..63894fd2
--- /dev/null
+++ b/configs/models/codellama/hf_codellama_34b_instruct.py
@@ -0,0 +1,21 @@
+from opencompass.models import HuggingFaceCausalLM
+
+models = [
+    # CodeLlama 34B Instruct
+    dict(
+        type=HuggingFaceCausalLM,
+        abbr='CodeLlama-34b-Instruct',
+        path="codellama/CodeLlama-34b-Instruct-hf",
+        tokenizer_path='codellama/CodeLlama-34b-Instruct-hf',
+        tokenizer_kwargs=dict(
+            padding_side='left',
+            truncation_side='left',
+            trust_remote_code=True,
+        ),
+        max_out_len=1024,
+        max_seq_len=2048,
+        batch_size=8,
+        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
+        run_cfg=dict(num_gpus=4, num_procs=1),
+    ),
+]
diff --git a/configs/models/codellama/hf_codellama_34b_python.py b/configs/models/codellama/hf_codellama_34b_python.py
new file mode 100644
index 00000000..4ac82de8
--- /dev/null
+++ b/configs/models/codellama/hf_codellama_34b_python.py
@@ -0,0 +1,21 @@
+from opencompass.models import HuggingFaceCausalLM
+
+models = [
+    # CodeLlama 34B Python
+    dict(
+        type=HuggingFaceCausalLM,
+        abbr='CodeLlama-34b-Python',
+        path="codellama/CodeLlama-34b-Python-hf",
+        tokenizer_path='codellama/CodeLlama-34b-Python-hf',
+        tokenizer_kwargs=dict(
+            padding_side='left',
+            truncation_side='left',
+            trust_remote_code=True,
+        ),
+        max_out_len=1024,
+        max_seq_len=2048,
+        batch_size=8,
+        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
+        run_cfg=dict(num_gpus=4, num_procs=1),
+    ),
+]
diff --git a/configs/models/codellama/hf_codellama_7b.py b/configs/models/codellama/hf_codellama_7b.py
new file mode 100644
index 00000000..b66f5095
--- /dev/null
+++ b/configs/models/codellama/hf_codellama_7b.py
@@ -0,0 +1,21 @@
+from opencompass.models import HuggingFaceCausalLM
+
+models = [
+    # CodeLlama 7B
+    dict(
+        type=HuggingFaceCausalLM,
+        abbr='CodeLlama-7b',
+        path="codellama/CodeLlama-7b-hf",
+        tokenizer_path='codellama/CodeLlama-7b-hf',
+        tokenizer_kwargs=dict(
+            padding_side='left',
+            truncation_side='left',
+            trust_remote_code=True,
+        ),
+        max_out_len=1024,
+        max_seq_len=2048,
+        batch_size=8,
+        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
+        run_cfg=dict(num_gpus=1, num_procs=1),
+    ),
+]
diff --git a/configs/models/codellama/hf_codellama_7b_instruct.py b/configs/models/codellama/hf_codellama_7b_instruct.py
new file mode 100644
index 00000000..1ae4ef84
--- /dev/null
+++ b/configs/models/codellama/hf_codellama_7b_instruct.py
@@ -0,0 +1,21 @@
+from opencompass.models import HuggingFaceCausalLM
+
+models = [
+    # CodeLlama 7B Instruct
+    dict(
+        type=HuggingFaceCausalLM,
+        abbr='CodeLlama-7b-Instruct',
+        path="codellama/CodeLlama-7b-Instruct-hf",
+        tokenizer_path='codellama/CodeLlama-7b-Instruct-hf',
+        tokenizer_kwargs=dict(
+            padding_side='left',
+            truncation_side='left',
+            trust_remote_code=True,
+        ),
+        max_out_len=1024,
+        max_seq_len=2048,
+        batch_size=8,
+        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
+        run_cfg=dict(num_gpus=1, num_procs=1),
+    ),
+]
diff --git a/configs/models/codellama/hf_codellama_7b_python.py b/configs/models/codellama/hf_codellama_7b_python.py
new file mode 100644
index 00000000..b0cae6da
--- /dev/null
+++ b/configs/models/codellama/hf_codellama_7b_python.py
@@ -0,0 +1,21 @@
+from opencompass.models import HuggingFaceCausalLM
+
+models = [
+    # CodeLlama 7B Python
+    dict(
+        type=HuggingFaceCausalLM,
+        abbr='CodeLlama-7b-Python',
+        path="codellama/CodeLlama-7b-Python-hf",
+        tokenizer_path='codellama/CodeLlama-7b-Python-hf',
+        tokenizer_kwargs=dict(
+            padding_side='left',
+            truncation_side='left',
+            trust_remote_code=True,
+        ),
+        max_out_len=1024,
+        max_seq_len=2048,
+        batch_size=8,
+        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
+        run_cfg=dict(num_gpus=1, num_procs=1),
+    ),
+]
diff --git a/opencompass/datasets/humanevalx.py b/opencompass/datasets/humanevalx.py
index 82055517..ef23262c 100644
--- a/opencompass/datasets/humanevalx.py
+++ b/opencompass/datasets/humanevalx.py
@@ -145,6 +145,13 @@ class HumanevalXEvaluator(BaseEvaluator):
 def _clean_up_code(text: str, language_type: str) -> str:
     """Cleans up the generated code."""
     if language_type.lower() == 'python':
+        # Take care of the first line: left-pad it with spaces so that
+        # it starts at the four-space indentation of a function body.
+        for c_index, c in enumerate(text[:5]):
+            if c != ' ':
+                text = ' ' * (4 - c_index) + text
+                break
+
         text_splits = text.split('\n')
         is_empty_line = False
         ind_empty_line = None
diff --git a/tools/collect_code_preds.py b/tools/collect_code_preds.py
new file mode 100644
index 00000000..39bc54ac
--- /dev/null
+++ b/tools/collect_code_preds.py
@@ -0,0 +1,180 @@
+import argparse
+import json
+import os
+import os.path as osp
+import re
+
+import mmengine
+from mmengine import Config
+from mmengine.utils import mkdir_or_exist
+
+from opencompass.datasets.humanevalx import _clean_up_code
+from opencompass.utils import (dataset_abbr_from_cfg, get_infer_output_path,
+                               get_logger, model_abbr_from_cfg)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Collect Humanevalx dataset predictions.')
+    parser.add_argument('config', help='Config file path')
+    parser.add_argument('-r',
+                        '--reuse',
+                        nargs='?',
+                        type=str,
+                        const='latest',
+                        help='Reuse previous outputs & results, and run any '
+                        'missing jobs presented in the config. If its '
+                        'argument is not specified, the latest results in '
+                        'the work_dir will be reused. The argument can '
+                        'also be a specific timestamp, e.g. 20230516_144254')
+    args = parser.parse_args()
+    return args
+
+
+_LANGUAGE_NAME_DICT = {
+    'cpp': 'CPP',
+    'go': 'Go',
+    'java': 'Java',
+    'js': 'JavaScript',
+    'python': 'Python',
+    'rust': 'Rust',
+}
+FAILED = 0
+SUCCEED = 1
+
+
+def gpt_python_postprocess(ori_prompt: str, text: str) -> str:
+    """Better answer postprocessor for instruction-aligned models such as
+    GPT."""
+    if '```' in text:
+        blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+        if len(blocks) == 0:
+            text = text.split('```')[1]  # fall back to default strategy
+        else:
+            text = blocks[0]  # fetch the first code block
+            if not text.startswith('\n'):  # in case starting with ```python
+                text = text[max(text.find('\n') + 1, 0):]
+
+    match_ori = re.search(r'def(.*?)\(', ori_prompt)
+    match = re.search(r'def(.*?)\(', text)
+    if match:
+        if match.group() == match_ori.group():
+            text = re.sub('def(.*?)\n', '', text, count=1)
+
+    for c_index, c in enumerate(text[:5]):
+        if c != ' ':
+            text = ' ' * (4 - c_index) + text
+            break
+
+    text = text.split('\n\n\n')[0]
+    return text
+
+
+def collect_preds(filename: str):
+    # in case the prediction is partial
+    root, ext = osp.splitext(filename)
+    partial_filename = root + '_0' + ext
+    # collect all the prediction results
+    if not osp.exists(osp.realpath(filename)) and not osp.exists(
+            osp.realpath(partial_filename)):
+        print(f'No predictions found for {filename}')
+        return FAILED, None, None
+    else:
+        if osp.exists(osp.realpath(filename)):
+            preds = mmengine.load(filename)
+            pred_strs = [
+                preds[str(i)]['prediction'] for i in range(len(preds))
+            ]
+            ori_prompt_strs = [
+                preds[str(i)]['origin_prompt'] for i in range(len(preds))
+            ]
+        else:
+            filename = partial_filename
+            pred_strs = []
+            ori_prompt_strs = []
+            i = 1
+            while osp.exists(osp.realpath(filename)):
+                preds = mmengine.load(filename)
+                filename = root + f'_{i}' + ext
+                i += 1
+                pred_strs += [
+                    preds[str(i)]['prediction'] for i in range(len(preds))
+                ]
+                ori_prompt_strs += [
+                    preds[str(i)]['origin_prompt'] for i in range(len(preds))
+                ]
+    return SUCCEED, ori_prompt_strs, pred_strs
+
+
+def main():
+    args = parse_args()
+    # initialize logger
+    logger = get_logger(log_level='INFO')
+    cfg = Config.fromfile(args.config)
+    cfg.setdefault('work_dir', './outputs/default/')
+
+    assert args.reuse, 'Please provide the experiment work dir.'
+    if args.reuse:
+        if args.reuse == 'latest':
+            if not os.path.exists(cfg.work_dir) or not os.listdir(
+                    cfg.work_dir):
+                logger.warning('No previous results to reuse!')
+            else:
+                dirs = os.listdir(cfg.work_dir)
+                dir_time_str = sorted(dirs)[-1]
+        else:
+            dir_time_str = args.reuse
+        logger.info(f'Reusing experiments from {dir_time_str}')
+        # update "actual" work_dir
+        cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str)
+
+    for model in cfg.models:
+        model_abbr = model_abbr_from_cfg(model)
+        for dataset in cfg.datasets:
+            dataset_abbr = dataset_abbr_from_cfg(dataset)
+            filename = get_infer_output_path(
+                model, dataset, osp.join(cfg.work_dir, 'predictions'))
+
+            succeed, ori_prompt_strs, pred_strs = collect_preds(filename)
+            if not succeed:
+                continue
+
+            # infer the language type
+            for k, v in _LANGUAGE_NAME_DICT.items():
+                if k in dataset_abbr:
+                    lang = k
+                    task = v
+                    break
+
+            # special postprocess for instruction-aligned models like GPT
+            if 'CodeLlama' not in model_abbr and lang == 'python':
+                predictions = [{
+                    'task_id':
+                    f'{task}/{i}',
+                    'generation':
+                    gpt_python_postprocess(ori_prompt, pred),
+                } for i, (ori_prompt,
+                          pred) in enumerate(zip(ori_prompt_strs, pred_strs))]
+            else:
+                predictions = [{
+                    'task_id': f'{task}/{i}',
+                    'generation': _clean_up_code(pred, lang),
+                } for i, pred in enumerate(pred_strs)]
+
+            # save processed results if not exists
+            result_file_path = os.path.join(cfg['work_dir'], 'humanevalx',
+                                            model_abbr,
+                                            f'humanevalx_{lang}.json')
+            if osp.exists(result_file_path):
+                logger.info(
+                    f'File exists for {model_abbr}, skip copy from predictions.'  # noqa
+                )
+            else:
+                mkdir_or_exist(osp.split(result_file_path)[0])
+                with open(result_file_path, 'w') as f:
+                    for pred in predictions:
+                        f.write(json.dumps(pred) + '\n')
+
+
+if __name__ == '__main__':
+    main()
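
A note on the `_clean_up_code` change above: for Python, the new loop left-pads the first line of a completion so that it begins at the four-space indentation expected inside a function body; input that is already indented by four or more spaces is left untouched. A minimal standalone sketch of that behaviour (the helper name `_pad_first_line` is introduced here for illustration only and is not part of the patch):

def _pad_first_line(text: str) -> str:
    # Scan the first five characters; at the first non-space character,
    # left-pad the whole string so the code starts at column 4.
    for c_index, c in enumerate(text[:5]):
        if c != ' ':
            return ' ' * (4 - c_index) + text
    return text  # first five chars are all spaces: already indented enough

assert _pad_first_line('return x') == '    return x'
assert _pad_first_line('  return x') == '    return x'
assert _pad_first_line('    return x') == '    return x'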
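Similarly, `gpt_python_postprocess` in tools/collect_code_preds.py is meant to recover a clean function body from chat-style output. Tracing the code above on a hypothetical prompt/reply pair (the values below are illustrative, not taken from the patch):

prompt = 'def add(a, b):\n'
reply = 'Here you go:\n```python\ndef add(a, b):\n    return a + b\n```'
# 1. fetch the first ```-fenced block and drop its leading "python" line;
# 2. strip the repeated 'def add(' header, since the prompt already has it;
# 3. left-pad the first line to four spaces (a no-op here) and truncate at
#    the first '\n\n\n'.
# Expected: gpt_python_postprocess(prompt, reply) == '    return a + b\n'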
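For completeness, the collector is driven by the same config that produced the predictions; the config path below is a placeholder:

python tools/collect_code_preds.py configs/eval_codellama.py -r latest

With `-r latest` it picks the newest timestamped run under work_dir (a specific run can be passed instead, e.g. `-r 20230516_144254`), gathers both complete and sharded (`*_0.json`, `*_1.json`, ...) prediction files, and writes one JSON-lines file per language to {work_dir}/humanevalx/{model_abbr}/humanevalx_{lang}.json.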