[Sync] add OC16 entry (#1171)

Fengzhe Zhou 2024-05-17 16:50:58 +08:00 committed by GitHub
parent 94eb90569f
commit 5de85406ce
15 changed files with 321 additions and 27 deletions

View File

@@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator
sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2')
sanitized_mbpp_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the similar elements from the given two tuple lists.\nYour code should pass these tests:\n\nassert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)\nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)\n',),
dict(role='BOT', prompt='```python\ndef similar_elements(test_tup1, test_tup2):\n res = tuple(set(test_tup1) & set(test_tup2))\n return (res)```',),
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a python function to identify non-prime numbers.\nYour code should pass these tests:\n\nassert is_not_prime(2) == False\nassert is_not_prime(10) == True\nassert is_not_prime(35) == True\n',),
dict(role='BOT', prompt='```python\nimport math\ndef is_not_prime(n):\n result = False\n for i in range(2,int(math.sqrt(n)) + 1):\n if n %% i == 0:\n result = True\n return result```',),
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the largest integers from a given list of numbers using heap queue algorithm.\nYour code should pass these tests:\n\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]\n',),
dict(role='BOT', prompt='```python\nimport heapq as hq\ndef heap_queue_largest(nums,n):\n largest_nums = hq.nlargest(n, nums)\n return largest_nums```',),
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\n{text}\nYour code should pass these tests:\n\n{test_list}\n',),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT')
sanitized_mbpp_datasets = [
dict(
type=SanitizedMBPPDataset,
abbr='sanitized_mbpp',
path='./data/mbpp/sanitized-mbpp.jsonl',
reader_cfg=sanitized_mbpp_reader_cfg,
infer_cfg=sanitized_mbpp_infer_cfg,
eval_cfg=sanitized_mbpp_eval_cfg,
)
]
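
For context, a dataset config like the one above is usually consumed from a run config through OpenCompass's `read_base` mechanism. The sketch below is illustrative only; the import path is a placeholder, since the new file's name is not shown in this diff.

```python
# Sketch only: pull the sanitized MBPP dataset entries into a run config.
from mmengine.config import read_base

with read_base():
    # placeholder module path; substitute the actual config file added here
    from .datasets.mbpp.sanitized_mbpp_gen import sanitized_mbpp_datasets

datasets = [*sanitized_mbpp_datasets]
```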

View File

@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import TACODataset, TACOEvaluator
TACO_difficulties_list = ['EASY', 'MEDIUM', 'MEDIUM_HARD', 'HARD', 'VERY_HARD']
TACO_reader_cfg = dict(input_columns=['question', 'starter'], output_column='problem_id', train_split='test')
TACO_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. \nQUESTION:\n{question} {starter}\nANSWER:\n'),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024),
)
TACO_eval_cfg = dict(evaluator=dict(type=TACOEvaluator), pred_role='BOT')
TACO_datasets = []
for difficulty in TACO_difficulties_list:
TACO_datasets.append(
dict(
type=TACODataset,
abbr='TACO-' + difficulty,
path='./data/BAAI-TACO',
difficulty=difficulty,
reader_cfg=TACO_reader_cfg,
infer_cfg=TACO_infer_cfg,
eval_cfg=TACO_eval_cfg,
)
)
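
Spelled out, the loop above yields one benchmark entry per difficulty tier, so each tier is reported separately downstream:

```python
# Derived from the config above (illustration only): the abbreviations of
# the five generated TACO dataset entries.
print(['TACO-' + d for d in ['EASY', 'MEDIUM', 'MEDIUM_HARD', 'HARD', 'VERY_HARD']])
# ['TACO-EASY', 'TACO-MEDIUM', 'TACO-MEDIUM_HARD', 'TACO-HARD', 'TACO-VERY_HARD']
```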

View File

@@ -13,6 +13,6 @@ models = [
max_memory={i: '75GB' for i in range(8)},
attn_implementation='eager'
),
run_cfg=dict(num_gpus=4),
run_cfg=dict(num_gpus=8),
)
]

View File

@@ -13,6 +13,6 @@ models = [
max_memory={i: '75GB' for i in range(8)},
attn_implementation='eager'
),
run_cfg=dict(num_gpus=4),
run_cfg=dict(num_gpus=8),
)
]

View File

@@ -0,0 +1,12 @@
from opencompass.models import HuggingFacewithChatTemplate
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='yi-1.5-34b-chat-hf',
path='01-ai/Yi-1.5-34B-Chat',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=2),
)
]

View File

@@ -0,0 +1,12 @@
from opencompass.models import HuggingFacewithChatTemplate
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='yi-1.5-6b-chat-hf',
path='01-ai/Yi-1.5-6B-Chat',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
)
]

View File

@@ -0,0 +1,12 @@
from opencompass.models import HuggingFacewithChatTemplate
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='yi-1.5-9b-chat-hf',
path='01-ai/Yi-1.5-9B-Chat',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
)
]

View File

@@ -288,6 +288,7 @@ class MBPPEvaluator(BaseEvaluator):
r'(.*)\s*```.*',
r"\[BEGIN\]\s*'(.*)",
r'\[BEGIN\](.*)',
r"'(.*)'\s*\[DONE\]",
]
for p in patterns:
match = re.search(p, text, re.DOTALL)
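
The added pattern presumably catches completions that close with the `[DONE]` marker of the MBPP few-shot format without repeating `[BEGIN]`; a quick illustration of what it extracts:

```python
import re

# Illustration only: a completion that wraps the code in quotes and ends
# with [DONE]; the new pattern captures just the quoted program text.
text = "'def first(lst):\n    return lst[0]' [DONE]"
match = re.search(r"'(.*)'\s*\[DONE\]", text, re.DOTALL)
print(match.group(1))  # def first(lst): ...
```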

View File

@@ -37,13 +37,16 @@ TIMEOUT = 10
class TACODataset(BaseDataset):
@staticmethod
def load(path: str, num_repeats: int = 1):
def load(path: str, num_repeats: int = 1, difficulty='ALL'):
dataset = load_from_disk(path)
new_dataset = DatasetDict()
# add new column "starter" in the prompt
for split in dataset.keys():
new_samples = []
for idx, sample in enumerate(dataset[split]):
if 'ALL' not in difficulty:
if not sample['difficulty'] == difficulty:
continue
starter_code = None if len(
sample['starter_code']) == 0 else sample['starter_code']
try:
@@ -71,7 +74,6 @@ class TACODataset(BaseDataset):
for key in new_samples[0].keys()
}
new_dataset[split] = Dataset.from_dict(new_data)
# num_repeats duplicate
# train_repeated = []
test_repeated = []
@@ -84,7 +86,6 @@
# train_repeated
# )
dataset_test_repeated = new_dataset['test'].from_list(test_repeated)
return DatasetDict({
# 'train': dataset_train_repeated,
'test': dataset_test_repeated
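
With the new `difficulty` argument, the loader can be called per tier, which is what the per-difficulty TACO configs earlier in this change rely on. A hedged usage sketch:

```python
# Sketch only: load just the MEDIUM problems of TACO from a local snapshot;
# passing difficulty='ALL' (the default) keeps every problem.
taco_medium = TACODataset.load(path='./data/BAAI-TACO', difficulty='MEDIUM')
```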

View File

@@ -42,5 +42,6 @@ from .vllm import VLLM # noqa: F401
from .vllm_with_tf_above_v4_33 import VLLMwithChatTemplate # noqa: F401
from .xunfei_api import XunFei, XunFeiSpark # noqa: F401
from .yayi_api import Yayi # noqa: F401
from .yi_api import YiAPI # noqa: F401
from .zhipuai_api import ZhiPuAI # noqa: F401
from .zhipuai_v2_api import ZhiPuV2AI # noqa: F401

View File

@@ -64,7 +64,7 @@ def _convert_chat_messages(inputs):
for _input in inputs:
messages = []
if isinstance(_input, str):
messages.append({'role': 'HUMAN', 'content': _input})
messages.append({'role': 'user', 'content': _input})
else:
for item in _input:
role = {
@@ -165,7 +165,7 @@ class HuggingFacewithChatTemplate(BaseModel):
def _load_tokenizer(self, path: Optional[str], kwargs: dict, pad_token_id: Optional[int] = None):
from transformers import AutoTokenizer, GenerationConfig
DEFAULT_TOKENIZER_KWARGS = dict(padding_side='left', truncation_side='left', use_fast=False, trust_remote_code=True)
DEFAULT_TOKENIZER_KWARGS = dict(padding_side='left', truncation_side='left', trust_remote_code=True)
tokenizer_kwargs = DEFAULT_TOKENIZER_KWARGS
tokenizer_kwargs.update(kwargs)
self.tokenizer = AutoTokenizer.from_pretrained(path, **tokenizer_kwargs)
@@ -199,6 +199,7 @@ class HuggingFacewithChatTemplate(BaseModel):
model_kwargs = DEFAULT_MODEL_KWARGS
model_kwargs.update(kwargs)
model_kwargs = _set_model_kwargs_torch_dtype(model_kwargs)
self.logger.debug(f'using model_kwargs: {model_kwargs}')
try:
self.model = AutoModelForCausalLM.from_pretrained(path, **model_kwargs)
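
A small illustration of the `_convert_chat_messages` fix above (a sketch, not the repo helper): a bare string prompt must be mapped to the lower-case `user` role that Hugging Face chat templates expect, rather than OpenCompass's internal `HUMAN` role name.

```python
# Minimal sketch: mapping a plain string prompt into the OpenAI-style
# message list that tokenizer.apply_chat_template expects.
def to_chat_messages(prompt: str) -> list:
    return [{'role': 'user', 'content': prompt}]

print(to_chat_messages('Write a haiku about GPUs.'))
# [{'role': 'user', 'content': 'Write a haiku about GPUs.'}]
```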

View File

@@ -68,6 +68,7 @@ class LightllmAPI(BaseModel):
self.wait()
header = {'content-type': 'application/json'}
try:
self.logger.debug(f'input: {input}')
data = dict(inputs=input, parameters=self.generation_kwargs)
raw_response = requests.post(self.url,
headers=header,
@@ -80,6 +81,7 @@
generated_text = response['generated_text']
if isinstance(generated_text, list):
generated_text = generated_text[0]
self.logger.debug(f'generated_text: {generated_text}')
return generated_text
except requests.JSONDecodeError:
self.logger.error('JsonDecode error, got',

View File

@@ -0,0 +1,178 @@
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union
import requests
from opencompass.utils.prompt import PromptList
from .base_api import BaseAPIModel
PromptType = Union[PromptList, str]
class YiAPI(BaseAPIModel):
"""Model wrapper around YiAPI.
Documentation:
Args:
path (str): The name of YiAPI model.
e.g. `moonshot-v1-32k`
key (str): Authorization key.
query_per_second (int): The maximum queries allowed per second
between two consecutive calls of the API. Defaults to 1.
max_seq_len (int): Unused here.
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the requirement of injecting or
wrapping of any meta instructions.
retry (int): Number of retires if the API call fails. Defaults to 2.
"""
def __init__(
self,
path: str,
key: str,
url: str,
query_per_second: int = 2,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
retry: int = 2,
system_prompt: str = '',
):
super().__init__(path=path,
max_seq_len=max_seq_len,
query_per_second=query_per_second,
meta_template=meta_template,
retry=retry)
self.headers = {
'Content-Type': 'application/json',
'Authorization': 'Bearer ' + key,
}
self.url = url
self.model = path
self.system_prompt = system_prompt
def generate(
self,
inputs: List[PromptType],
max_out_len: int = 512,
) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[PromptType]): A list of strings or PromptDicts.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
with ThreadPoolExecutor() as executor:
results = list(
executor.map(self._generate, inputs,
[max_out_len] * len(inputs)))
self.flush()
return results
def _generate(
self,
input: PromptType,
max_out_len: int = 512,
) -> str:
"""Generate results given an input.
Args:
inputs (PromptType): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
str: The generated string.
"""
assert isinstance(input, (str, PromptList))
if isinstance(input, str):
messages = [{'role': 'user', 'content': input}]
else:
messages = []
msg_buffer, last_role = [], None
for item in input:
item['role'] = 'assistant' if item['role'] == 'BOT' else 'user'
if item['role'] != last_role and last_role is not None:
messages.append({
'content': '\n'.join(msg_buffer),
'role': last_role
})
msg_buffer = []
msg_buffer.append(item['prompt'])
last_role = item['role']
messages.append({
'content': '\n'.join(msg_buffer),
'role': last_role
})
if self.system_prompt:
system = {'role': 'system', 'content': self.system_prompt}
messages.insert(0, system)
data = {'model': self.model, 'messages': messages}
max_num_retries = 0
while max_num_retries < self.retry:
self.acquire()
try:
raw_response = requests.request('POST',
url=self.url,
headers=self.headers,
json=data)
except Exception as err:
print('Request Error:{}'.format(err))
time.sleep(2)
continue
try:
response = raw_response.json()
except Exception as err:
print('Response Error:{}'.format(err))
response = None
self.release()
if response is None:
print('Connection error, reconnect.')
# If a connection error occurs, frequent retries will cause
# continued network instability, so wait here to slow down
# the requests.
self.wait()
continue
if raw_response.status_code == 200:
# msg = json.load(response.text)
# response
msg = response['choices'][0]['message']['content']
self.logger.debug(f'Generated: {msg}')
return msg
if raw_response.status_code == 401:
print('Request rejected: invalid api_key')
continue
elif raw_response.status_code == 400:
print(messages, response)
print('Request failed, status code:', raw_response)
msg = 'The request was rejected due to high-risk content'
return msg
elif raw_response.status_code == 429:
print(messages, response)
print('Request failed, status code:', raw_response)
time.sleep(5)
continue
else:
print(messages, response)
print('Request failed, status code:', raw_response)
time.sleep(1)
max_num_retries += 1
raise RuntimeError(raw_response)
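
A minimal sketch (not part of this commit) of wiring the new wrapper into a model config; the `abbr`, `key`, and `url` values are placeholders you would supply yourself, following the usual OpenCompass API-model config pattern.

```python
from opencompass.models import YiAPI

models = [
    dict(
        type=YiAPI,
        abbr='yi-api',                # placeholder abbreviation
        path='yi-large',              # name of the served Yi model
        key='YOUR_API_KEY',           # placeholder credential
        url='https://api.lingyiwanwu.com/v1/chat/completions',  # placeholder endpoint
        query_per_second=2,
        max_seq_len=2048,
        batch_size=8,
    )
]
```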

View File

@@ -161,17 +161,19 @@ class DLCRunner(BaseRunner):
shell_cmd += 'umask 0000; '
shell_cmd += '{task_cmd}'
tmpl = ('dlc create job'
f" --command '{shell_cmd}'"
f' --name {task_name[:512]}'
' --kind BatchJob'
f" -c {self.aliyun_cfg['dlc_config_path']}"
f" --workspace_id {self.aliyun_cfg['workspace_id']}"
' --worker_count 1'
f' --worker_cpu {max(num_gpus * 8, 12)}'
f' --worker_gpu {num_gpus}'
f' --worker_memory {max(num_gpus * 128, 192)}'
f" --worker_image {self.aliyun_cfg['worker_image']}")
tmpl = (
'dlc submit pytorchjob'
f" --command '{shell_cmd}'"
f' --name {task_name[:512]}'
f" --config {self.aliyun_cfg['dlc_config_path']}"
f" --workspace_id {self.aliyun_cfg['workspace_id']}"
f" --resource_id {self.aliyun_cfg['resource_id']}"
' --workers 1'
f' --worker_cpu {max(num_gpus * 8, 12)}'
f' --worker_gpu {num_gpus}'
f' --worker_memory {max(num_gpus * 128, 192)}Gi'
f" --worker_image {self.aliyun_cfg['worker_image']}"
f" --data_sources {','.join(self.aliyun_cfg['data_sources'])}")
get_cmd = partial(task.get_command,
cfg_path=param_file,
template=tmpl)
@@ -219,14 +221,9 @@
pri_time = None
initial_time = datetime.datetime.now()
url = 'http://pai-console.cb210e3f99cd7403f8de2a630dcc99fc3.cn-wulanchabu.alicontainer.com' # noqa: E501
url = f"https://pai.console.aliyun.com/?regionId=cn-wulanchabu&workspaceId={self.aliyun_cfg['workspace_id']}#/dlc/jobs/{job_id}" # noqa: E501
logger = get_logger()
logger.debug('')
logger.debug('*' * 168)
logger.debug(
f'{url}/index?workspaceId={self.aliyun_cfg["workspace_id"]}#/dlc2/job/{job_id}/detail' # noqa: E501
)
logger.debug('*' * 168)
logger.debug('\n' + '*' * 168 + '\n' + url + '\n' + '*' * 168)
while True:
# 1. Avoid to request dlc too frequently.
@@ -264,7 +261,7 @@
cur_time = (pod_create_time +
elasped_time).strftime('%Y-%m-%dT%H:%M:%SZ')
logs_cmd = ('dlc logs'
f' {job_id} {job_id}-worker-0'
f' {job_id} {job_id}-master-0'
f" -c {self.aliyun_cfg['dlc_config_path']}"
f' --start_time {pri_time}'
f' --end_time {cur_time}')
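
For a sense of the resources a task requests under the new `dlc submit pytorchjob` template, the `max()` expressions above resolve as follows for an 8-GPU job (illustration only):

```python
# Illustration of the worker resource flags computed in the template above.
num_gpus = 8
worker_cpu = max(num_gpus * 8, 12)        # 64 cores
worker_memory = max(num_gpus * 128, 192)  # 1024, rendered as '1024Gi'
print(worker_cpu, f'{worker_memory}Gi')   # 64 1024Gi
```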

View File

@@ -84,7 +84,7 @@ def get_config_from_arg(args) -> Config:
# set infer accelerator if needed
if args.accelerator in ['vllm', 'lmdeploy']:
config['models'] = change_accelerator(config['models'], args.accelerator)
if 'eval' in config and 'partitioner' in config['eval']:
if config.get('eval', {}).get('partitioner', {}).get('models') is not None:
config['eval']['partitioner']['models'] = change_accelerator(config['eval']['partitioner']['models'], args.accelerator)
if config.get('eval', {}).get('partitioner', {}).get('judge_models') is not None:
config['eval']['partitioner']['judge_models'] = change_accelerator(config['eval']['partitioner']['judge_models'], args.accelerator)
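
The guard change in `get_config_from_arg` replaces key-membership checks with chained `.get()` lookups, so an eval partitioner without an explicit `models` (or `judge_models`) list no longer triggers a KeyError; a small sketch of the behaviour:

```python
# Illustration only: an eval partitioner that defines judge_models but no
# models. The chained .get() returns None instead of raising KeyError, so
# the accelerator substitution for 'models' is simply skipped.
config = {'eval': {'partitioner': {'judge_models': [dict(abbr='judge')]}}}
print(config.get('eval', {}).get('partitioner', {}).get('models'))        # None
print(config.get('eval', {}).get('partitioner', {}).get('judge_models'))  # [{'abbr': 'judge'}]
```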