diff --git a/.pre-commit-config-zh-cn.yaml b/.pre-commit-config-zh-cn.yaml index a464f527..e114dc61 100644 --- a/.pre-commit-config-zh-cn.yaml +++ b/.pre-commit-config-zh-cn.yaml @@ -2,7 +2,8 @@ exclude: | (?x)^( tests/data/| opencompass/models/internal/| - opencompass/utils/internal/ + opencompass/utils/internal/| + opencompass/openicl/icl_evaluator/hf_metrics/ ) repos: - repo: https://gitee.com/openmmlab/mirrors-flake8 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8158381e..a4c9e237 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,8 @@ exclude: | (?x)^( tests/data/| opencompass/models/internal/| - opencompass/utils/internal/ + opencompass/utils/internal/| + opencompass/openicl/icl_evaluator/hf_metrics/ ) repos: - repo: https://github.com/PyCQA/flake8 diff --git a/opencompass/openicl/icl_evaluator/hf_metrics/accuracy.py b/opencompass/openicl/icl_evaluator/hf_metrics/accuracy.py new file mode 100644 index 00000000..aa5a0732 --- /dev/null +++ b/opencompass/openicl/icl_evaluator/hf_metrics/accuracy.py @@ -0,0 +1,106 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Accuracy metric.""" + +import datasets +from sklearn.metrics import accuracy_score + +import evaluate + + +_DESCRIPTION = """ +Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with: +Accuracy = (TP + TN) / (TP + TN + FP + FN) + Where: +TP: True positive +TN: True negative +FP: False positive +FN: False negative +""" + + +_KWARGS_DESCRIPTION = """ +Args: + predictions (`list` of `int`): Predicted labels. + references (`list` of `int`): Ground truth labels. + normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True. + sample_weight (`list` of `float`): Sample weights Defaults to None. + +Returns: + accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy. + +Examples: + + Example 1-A simple example + >>> accuracy_metric = evaluate.load("accuracy") + >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0]) + >>> print(results) + {'accuracy': 0.5} + + Example 2-The same as Example 1, except with `normalize` set to `False`. + >>> accuracy_metric = evaluate.load("accuracy") + >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], normalize=False) + >>> print(results) + {'accuracy': 3.0} + + Example 3-The same as Example 1, except with `sample_weight` set. 
+ >>> accuracy_metric = evaluate.load("accuracy") + >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], sample_weight=[0.5, 2, 0.7, 0.5, 9, 0.4]) + >>> print(results) + {'accuracy': 0.8778625954198473} +""" + + +_CITATION = """ +@article{scikit-learn, + title={Scikit-learn: Machine Learning in {P}ython}, + author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. + and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. + and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and + Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, + journal={Journal of Machine Learning Research}, + volume={12}, + pages={2825--2830}, + year={2011} +} +""" + + +@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class Accuracy(evaluate.Metric): + def _info(self): + return evaluate.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=datasets.Features( + { + "predictions": datasets.Sequence(datasets.Value("int32")), + "references": datasets.Sequence(datasets.Value("int32")), + } + if self.config_name == "multilabel" + else { + "predictions": datasets.Value("int32"), + "references": datasets.Value("int32"), + } + ), + reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html"], + ) + + def _compute(self, predictions, references, normalize=True, sample_weight=None): + return { + "accuracy": float( + accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight) + ) + } diff --git a/opencompass/openicl/icl_evaluator/hf_metrics/rouge.py b/opencompass/openicl/icl_evaluator/hf_metrics/rouge.py new file mode 100644 index 00000000..353301cc --- /dev/null +++ b/opencompass/openicl/icl_evaluator/hf_metrics/rouge.py @@ -0,0 +1,158 @@ +# Copyright 2020 The HuggingFace Evaluate Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" ROUGE metric from Google Research github repo. 
""" + +# The dependencies in https://github.com/google-research/google-research/blob/master/rouge/requirements.txt +import absl # Here to have a nice missing dependency error message early on +import datasets +import nltk # Here to have a nice missing dependency error message early on +import numpy # Here to have a nice missing dependency error message early on +import six # Here to have a nice missing dependency error message early on +from rouge_score import rouge_scorer, scoring + +import evaluate + + +_CITATION = """\ +@inproceedings{lin-2004-rouge, + title = "{ROUGE}: A Package for Automatic Evaluation of Summaries", + author = "Lin, Chin-Yew", + booktitle = "Text Summarization Branches Out", + month = jul, + year = "2004", + address = "Barcelona, Spain", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/W04-1013", + pages = "74--81", +} +""" + +_DESCRIPTION = """\ +ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics and a software package used for +evaluating automatic summarization and machine translation software in natural language processing. +The metrics compare an automatically produced summary or translation against a reference or a set of references (human-produced) summary or translation. + +Note that ROUGE is case insensitive, meaning that upper case letters are treated the same way as lower case letters. + +This metrics is a wrapper around Google Research reimplementation of ROUGE: +https://github.com/google-research/google-research/tree/master/rouge +""" + +_KWARGS_DESCRIPTION = """ +Calculates average rouge scores for a list of hypotheses and references +Args: + predictions: list of predictions to score. Each prediction + should be a string with tokens separated by spaces. + references: list of reference for each prediction. Each + reference should be a string with tokens separated by spaces. + rouge_types: A list of rouge types to calculate. + Valid names: + `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring, + `"rougeL"`: Longest common subsequence based scoring. + `"rougeLsum"`: rougeLsum splits text using `"\n"`. + See details in https://github.com/huggingface/datasets/issues/617 + use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes. 
+ use_aggregator: Return aggregates if this is set to True +Returns: + rouge1: rouge_1 (f1), + rouge2: rouge_2 (f1), + rougeL: rouge_l (f1), + rougeLsum: rouge_lsum (f1) +Examples: + + >>> rouge = evaluate.load('rouge') + >>> predictions = ["hello there", "general kenobi"] + >>> references = ["hello there", "general kenobi"] + >>> results = rouge.compute(predictions=predictions, references=references) + >>> print(results) + {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0} +""" + + +class Tokenizer: + """Helper class to wrap a callable into a class with a `tokenize` method as used by rouge-score.""" + + def __init__(self, tokenizer_func): + self.tokenizer_func = tokenizer_func + + def tokenize(self, text): + return self.tokenizer_func(text) + + +@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class Rouge(evaluate.Metric): + def _info(self): + return evaluate.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=[ + datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Sequence(datasets.Value("string", id="sequence")), + } + ), + datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Value("string", id="sequence"), + } + ), + ], + codebase_urls=["https://github.com/google-research/google-research/tree/master/rouge"], + reference_urls=[ + "https://en.wikipedia.org/wiki/ROUGE_(metric)", + "https://github.com/google-research/google-research/tree/master/rouge", + ], + ) + + def _compute( + self, predictions, references, rouge_types=None, use_aggregator=True, use_stemmer=False, tokenizer=None + ): + if rouge_types is None: + rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"] + + multi_ref = isinstance(references[0], list) + + if tokenizer is not None: + tokenizer = Tokenizer(tokenizer) + + scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer, tokenizer=tokenizer) + if use_aggregator: + aggregator = scoring.BootstrapAggregator() + else: + scores = [] + + for ref, pred in zip(references, predictions): + if multi_ref: + score = scorer.score_multi(ref, pred) + else: + score = scorer.score(ref, pred) + if use_aggregator: + aggregator.add_scores(score) + else: + scores.append(score) + + if use_aggregator: + result = aggregator.aggregate() + for key in result: + result[key] = result[key].mid.fmeasure + + else: + result = {} + for key in scores[0]: + result[key] = list(score[key].fmeasure for score in scores) + + return result diff --git a/opencompass/openicl/icl_evaluator/hf_metrics/sacrebleu.py b/opencompass/openicl/icl_evaluator/hf_metrics/sacrebleu.py new file mode 100644 index 00000000..6e756f4d --- /dev/null +++ b/opencompass/openicl/icl_evaluator/hf_metrics/sacrebleu.py @@ -0,0 +1,178 @@ +# Copyright 2020 The HuggingFace Evaluate Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" SACREBLEU metric. 
""" + +import datasets +import sacrebleu as scb +from packaging import version + +import evaluate + + +_CITATION = """\ +@inproceedings{post-2018-call, + title = "A Call for Clarity in Reporting {BLEU} Scores", + author = "Post, Matt", + booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers", + month = oct, + year = "2018", + address = "Belgium, Brussels", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/W18-6319", + pages = "186--191", +} +""" + +_DESCRIPTION = """\ +SacreBLEU provides hassle-free computation of shareable, comparable, and reproducible BLEU scores. +Inspired by Rico Sennrich's `multi-bleu-detok.perl`, it produces the official WMT scores but works with plain text. +It also knows all the standard test sets and handles downloading, processing, and tokenization for you. + +See the [README.md] file at https://github.com/mjpost/sacreBLEU for more information. +""" + +_KWARGS_DESCRIPTION = """ +Produces BLEU scores along with its sufficient statistics +from a source against one or more references. + +Args: + predictions (`list` of `str`): list of translations to score. Each translation should be tokenized into a list of tokens. + references (`list` of `list` of `str`): A list of lists of references. The contents of the first sub-list are the references for the first prediction, the contents of the second sub-list are for the second prediction, etc. Note that there must be the same number of references for each prediction (i.e. all sub-lists must be of the same length). + smooth_method (`str`): The smoothing method to use, defaults to `'exp'`. Possible values are: + - `'none'`: no smoothing + - `'floor'`: increment zero counts + - `'add-k'`: increment num/denom by k for n>1 + - `'exp'`: exponential decay + smooth_value (`float`): The smoothing value. Only valid when `smooth_method='floor'` (in which case `smooth_value` defaults to `0.1`) or `smooth_method='add-k'` (in which case `smooth_value` defaults to `1`). + tokenize (`str`): Tokenization method to use for BLEU. If not provided, defaults to `'zh'` for Chinese, `'ja-mecab'` for Japanese and `'13a'` (mteval) otherwise. Possible values are: + - `'none'`: No tokenization. + - `'zh'`: Chinese tokenization. + - `'13a'`: mimics the `mteval-v13a` script from Moses. + - `'intl'`: International tokenization, mimics the `mteval-v14` script from Moses + - `'char'`: Language-agnostic character-level tokenization. + - `'ja-mecab'`: Japanese tokenization. Uses the [MeCab tokenizer](https://pypi.org/project/mecab-python3). + lowercase (`bool`): If `True`, lowercases the input, enabling case-insensitivity. Defaults to `False`. + force (`bool`): If `True`, insists that your tokenized input is actually detokenized. Defaults to `False`. + use_effective_order (`bool`): If `True`, stops including n-gram orders for which precision is 0. This should be `True`, if sentence-level BLEU will be computed. Defaults to `False`. 
+ +Returns: + 'score': BLEU score, + 'counts': Counts, + 'totals': Totals, + 'precisions': Precisions, + 'bp': Brevity penalty, + 'sys_len': predictions length, + 'ref_len': reference length, + +Examples: + + Example 1: + >>> predictions = ["hello there general kenobi", "foo bar foobar"] + >>> references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]] + >>> sacrebleu = evaluate.load("sacrebleu") + >>> results = sacrebleu.compute(predictions=predictions, references=references) + >>> print(list(results.keys())) + ['score', 'counts', 'totals', 'precisions', 'bp', 'sys_len', 'ref_len'] + >>> print(round(results["score"], 1)) + 100.0 + + Example 2: + >>> predictions = ["hello there general kenobi", + ... "on our way to ankh morpork"] + >>> references = [["hello there general kenobi", "hello there !"], + ... ["goodbye ankh morpork", "ankh morpork"]] + >>> sacrebleu = evaluate.load("sacrebleu") + >>> results = sacrebleu.compute(predictions=predictions, + ... references=references) + >>> print(list(results.keys())) + ['score', 'counts', 'totals', 'precisions', 'bp', 'sys_len', 'ref_len'] + >>> print(round(results["score"], 1)) + 39.8 +""" + + +@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class Sacrebleu(evaluate.Metric): + def _info(self): + if version.parse(scb.__version__) < version.parse("1.4.12"): + raise ImportWarning( + "To use `sacrebleu`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n" + 'You can install it with `pip install "sacrebleu>=1.4.12"`.' + ) + return evaluate.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + homepage="https://github.com/mjpost/sacreBLEU", + inputs_description=_KWARGS_DESCRIPTION, + features=[ + datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"), + } + ), + datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Value("string", id="sequence"), + } + ), + ], + codebase_urls=["https://github.com/mjpost/sacreBLEU"], + reference_urls=[ + "https://github.com/mjpost/sacreBLEU", + "https://en.wikipedia.org/wiki/BLEU", + "https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213", + ], + ) + + def _compute( + self, + predictions, + references, + smooth_method="exp", + smooth_value=None, + force=False, + lowercase=False, + tokenize=None, + use_effective_order=False, + ): + # if only one reference is provided make sure we still use list of lists + if isinstance(references[0], str): + references = [[ref] for ref in references] + + references_per_prediction = len(references[0]) + if any(len(refs) != references_per_prediction for refs in references): + raise ValueError("Sacrebleu requires the same number of references for each prediction") + transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)] + output = scb.corpus_bleu( + predictions, + transformed_references, + smooth_method=smooth_method, + smooth_value=smooth_value, + force=force, + lowercase=lowercase, + use_effective_order=use_effective_order, + **(dict(tokenize=tokenize) if tokenize else {}), + ) + output_dict = { + "score": output.score, + "counts": output.counts, + "totals": output.totals, + "precisions": output.precisions, + "bp": output.bp, + "sys_len": output.sys_len, + "ref_len": output.ref_len, + } 
+ return output_dict diff --git a/opencompass/openicl/icl_evaluator/hf_metrics/squad.py b/opencompass/openicl/icl_evaluator/hf_metrics/squad.py new file mode 100644 index 00000000..84658b12 --- /dev/null +++ b/opencompass/openicl/icl_evaluator/hf_metrics/squad.py @@ -0,0 +1,111 @@ +# Copyright 2020 The HuggingFace Evaluate Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" SQuAD metric. """ + +import datasets + +import evaluate + +from .compute_score import compute_score + + +_CITATION = """\ +@inproceedings{Rajpurkar2016SQuAD10, + title={SQuAD: 100, 000+ Questions for Machine Comprehension of Text}, + author={Pranav Rajpurkar and Jian Zhang and Konstantin Lopyrev and Percy Liang}, + booktitle={EMNLP}, + year={2016} +} +""" + +_DESCRIPTION = """ +This metric wrap the official scoring script for version 1 of the Stanford Question Answering Dataset (SQuAD). + +Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by +crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, +from the corresponding reading passage, or the question might be unanswerable. +""" + +_KWARGS_DESCRIPTION = """ +Computes SQuAD scores (F1 and EM). +Args: + predictions: List of question-answers dictionaries with the following key-values: + - 'id': id of the question-answer pair as given in the references (see below) + - 'prediction_text': the text of the answer + references: List of question-answers dictionaries with the following key-values: + - 'id': id of the question-answer pair (see above), + - 'answers': a Dict in the SQuAD dataset format + { + 'text': list of possible texts for the answer, as a list of strings + 'answer_start': list of start positions for the answer, as a list of ints + } + Note that answer_start values are not taken into account to compute the metric. 
+Returns:
+    'exact_match': Exact match (the normalized answer exactly match the gold answer)
+    'f1': The F-score of predicted tokens versus the gold answer
+Examples:
+
+    >>> predictions = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22'}]
+    >>> references = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}]
+    >>> squad_metric = evaluate.load("squad")
+    >>> results = squad_metric.compute(predictions=predictions, references=references)
+    >>> print(results)
+    {'exact_match': 100.0, 'f1': 100.0}
+"""
+
+
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class Squad(evaluate.Metric):
+    def _info(self):
+        return evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "predictions": {"id": datasets.Value("string"), "prediction_text": datasets.Value("string")},
+                    "references": {
+                        "id": datasets.Value("string"),
+                        "answers": datasets.features.Sequence(
+                            {
+                                "text": datasets.Value("string"),
+                                "answer_start": datasets.Value("int32"),
+                            }
+                        ),
+                    },
+                }
+            ),
+            codebase_urls=["https://rajpurkar.github.io/SQuAD-explorer/"],
+            reference_urls=["https://rajpurkar.github.io/SQuAD-explorer/"],
+        )
+
+    def _compute(self, predictions, references):
+        pred_dict = {prediction["id"]: prediction["prediction_text"] for prediction in predictions}
+        dataset = [
+            {
+                "paragraphs": [
+                    {
+                        "qas": [
+                            {
+                                "answers": [{"text": answer_text} for answer_text in ref["answers"]["text"]],
+                                "id": ref["id"],
+                            }
+                            for ref in references
+                        ]
+                    }
+                ]
+            }
+        ]
+        score = compute_score(dataset=dataset, predictions=pred_dict)
+        return score
diff --git a/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py b/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py
index 9d40aaf3..92e6797f 100644
--- a/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py
+++ b/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py
@@ -1,3 +1,4 @@
+import os
 import random
 from typing import List
 
@@ -72,7 +73,13 @@ class HuggingfaceEvaluator(BaseEvaluator):
                 f'length. len(predictions): {len(predictions)}, '
                 f'len(references): {len(references)}'
             }
-        metric = evaluate.load(self.metric)
+        # use codes pre-downloaded to opencompass repo, avoid downloading
+        local_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                                  'hf_metrics', self.metric + '.py')
+        if os.path.exists(local_path):
+            metric = evaluate.load(local_path)
+        else:
+            metric = evaluate.load(self.metric)
         scores = metric.compute(**self._preprocess(predictions, references))
         result = self._postprocess(scores)
         random.setstate(random_state)
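
For reference, a minimal usage sketch of the fallback added in icl_hf_evaluator.py above. This is a sketch only, not part of the patch: the checkout path and the sample labels are illustrative, while `evaluate.load`, the `hf_metrics` directory, and the `accuracy` script come from the diff.

    import os

    import evaluate

    # Illustrative checkout location; adjust to wherever opencompass is cloned.
    EVALUATOR_DIR = 'opencompass/openicl/icl_evaluator'
    metric_name = 'accuracy'

    # Same resolution order as HuggingfaceEvaluator.score() after this patch:
    # prefer the vendored metric script, otherwise fall back to downloading it.
    local_path = os.path.join(EVALUATOR_DIR, 'hf_metrics', metric_name + '.py')
    if os.path.exists(local_path):
        metric = evaluate.load(local_path)   # offline, no download needed
    else:
        metric = evaluate.load(metric_name)  # fetches the metric script

    print(metric.compute(references=[0, 1, 2, 0], predictions=[0, 1, 1, 0]))
    # {'accuracy': 0.75}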