vllm_hairuo/ihp/zoo/modeling_flash_attention_utils.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2024 AI. Inspur Inc.
#
# @author: jiangzhs <jiangzhs@inspur.com>
# @date: 2024/10/08
#

import inspect
import os
from typing import Optional
from typing import Tuple

import torch
import torch.nn.functional as F
from transformers.utils import is_flash_attn_2_available
from transformers.utils import is_flash_attn_greater_or_equal


# flash-attn is treated as an optional dependency: its kernels and padding
# helpers are imported only when `is_flash_attn_2_available()` returns true.
if is_flash_attn_2_available():
    from flash_attn import flash_attn_func
    from flash_attn import flash_attn_varlen_func
    from flash_attn.bert_padding import index_first_axis  # noqa
    from flash_attn.bert_padding import pad_input
    from flash_attn.bert_padding import unpad_input

    # True when the installed `flash_attn_func` accepts a `window_size` argument.
    # NOTE(review): this flag is defined only inside this branch, so it does not
    # exist at module level when flash-attn is unavailable.
    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)


def _get_unpad_data(
    attention_mask: torch.Tensor, cu_seqlens: torch.Tensor = None
) -> Tuple[torch.Tensor, torch.Tensor, int]:
    if cu_seqlens is not None:
        max_seqlen_in_batch = torch.max(cu_seqlens[1:] - cu_seqlens[:-1]).item()
        indices = torch.arange(0, cu_seqlens[-1].item(), device=cu_seqlens.device)
    else:
        seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
        indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
        max_seqlen_in_batch = seqlens_in_batch.max().item()
        cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return (indices, cu_seqlens, max_seqlen_in_batch)


def _upad_input(
    query_layer: torch.Tensor,
    key_layer: torch.Tensor,
    value_layer: torch.Tensor,
    attention_mask: torch.Tensor,
    query_length: int,
    cu_seqlens,
):
    """Remove padding from query/key/value ahead of the variable-length kernel.

    Args:
        query_layer: Query tensor.
        key_layer: Key tensor; its shape is unpacked as
            ``(batch_size, kv_seq_len, num_key_value_heads, head_dim)``.
        value_layer: Value tensor, reshaped the same way as ``key_layer``.
        attention_mask: Padding mask, forwarded to ``_get_unpad_data``. It must
            not be ``None`` when ``query_length`` is neither ``kv_seq_len`` nor
            ``1``, because that branch slices it.
        query_length: Number of query positions per batch row.
        cu_seqlens: Optional cumulative sequence lengths, forwarded to
            ``_get_unpad_data``.

    Returns:
        ``(query_layer, key_layer, value_layer, indices_q,
        (cu_seqlens_q, cu_seqlens_k),
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k))``.
    """
    indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask, cu_seqlens)
    batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

    # Collapse (batch, seq) into one axis and gather only the kept tokens.
    key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k)
    value_layer = index_first_axis(
        value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
    )
    if query_length == kv_seq_len:
        # Queries cover the same positions as keys, so the key metadata is reused.
        query_layer = index_first_axis(query_layer.reshape(batch_size * kv_seq_len, -1, head_dim), indices_k)
        cu_seqlens_q = cu_seqlens_k
        max_seqlen_in_batch_q = max_seqlen_in_batch_k
        indices_q = indices_k
    elif query_length == 1:
        # One query token per batch row: each row is its own length-1 sequence.
        max_seqlen_in_batch_q = 1
        cu_seqlens_q = torch.arange(
            batch_size + 1, dtype=torch.int32, device=query_layer.device
        )  # There is a memcpy here, that is very bad.
        indices_q = cu_seqlens_q[:-1]
        query_layer = query_layer.squeeze(1)
    else:
        # The -q_len: slice assumes left padding.
        attention_mask = attention_mask[:, -query_length:]
        # `*_` swallows any extra trailing return values: newer flash-attn
        # releases return more than four values from `unpad_input`, which would
        # otherwise raise "too many values to unpack". With a four-value return
        # the starred target is simply empty.
        query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q, *_ = unpad_input(query_layer, attention_mask)

    return (
        query_layer,
        key_layer,
        value_layer,
        indices_q,
        (cu_seqlens_q, cu_seqlens_k),
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
    )


def prepare_fa2_from_position_ids(query, key, value, position_ids):
    """Flatten packed sequences and derive variable-length metadata from position ids.

    Each position id equal to ``0`` is treated as the start of a new sequence.

    Args:
        query: Query tensor; all leading dimensions are merged so the result is
            ``(total_tokens, query.size(-2), query.size(-1))``.
        key: Key tensor, flattened the same way.
        value: Value tensor, flattened the same way.
        position_ids: Integer position ids; flattened to one dimension.

    Returns:
        ``(query, key, value, indices_q, (cu_seq_lens, cu_seq_lens),
        (max_length, max_length))`` where ``indices_q`` is an int32 ``arange``
        over all tokens, ``cu_seq_lens`` holds the start offset of every
        sequence followed by the total token count, and ``max_length`` is
        ``position_ids.max() + 1`` (a 0-dim tensor).
    """
    # `reshape` rather than `view`: inputs produced by e.g. `transpose(1, 2)` are
    # non-contiguous, and `view` raises when the leading dimensions cannot be
    # merged without a copy. For contiguous inputs the result is the same view.
    query = query.reshape(-1, query.size(-2), query.size(-1))
    key = key.reshape(-1, key.size(-2), key.size(-1))
    value = value.reshape(-1, value.size(-2), value.size(-1))
    position_ids = position_ids.flatten()
    indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32)

    # Sequence boundaries: every index whose position id is 0, plus the total
    # token count as the closing offset.
    cu_seq_lens = torch.cat(
        (indices_q[position_ids == 0], torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32))
    )

    max_length = position_ids.max() + 1

    return (query, key, value, indices_q, (cu_seq_lens, cu_seq_lens), (max_length, max_length))


def _flash_attention_forward(
    query_states: torch.Tensor,
    key_states: torch.Tensor,
    value_states: torch.Tensor,
    attention_mask: torch.Tensor,
    query_length: int,
    is_causal: bool,
    dropout: float = 0.0,
    position_ids: Optional[torch.Tensor] = None,
    softmax_scale: Optional[float] = None,
    sliding_window: Optional[int] = None,
    use_top_left_mask: bool = False,
    softcap: Optional[float] = None,
    deterministic: bool = None,
    cu_seqlens=None,
):
    """Run flash attention, choosing the kernel that matches the input layout.

    Three paths are tried in order:

    1. ``attention_mask`` or ``cu_seqlens`` is given: inputs are un-padded via
       ``_upad_input``, passed to ``flash_attn_varlen_func`` and the result is
       re-padded with ``pad_input``.
    2. ``position_ids`` is given, ``query_length != 1`` and the ids are not
       non-decreasing along the last dimension: the batch is treated as packed
       sequences and ``flash_attn_varlen_func`` is called with metadata from
       ``prepare_fa2_from_position_ids``.
    3. Otherwise ``flash_attn_func`` is called directly.

    Args:
        query_states: Query tensor.
        key_states: Key tensor; ``shape[1]`` is taken as the key/value length.
        value_states: Value tensor.
        attention_mask: Optional padding mask.
        query_length: Number of query positions.
        is_causal: Whether causal masking is requested.
        dropout: Dropout probability handed to the kernel.
        position_ids: Optional position ids used to detect packed sequences.
        softmax_scale: Optional softmax scaling factor handed to the kernel.
        sliding_window: Optional window size; only applied when the kernel
            supports it and the key length exceeds it.
        use_top_left_mask: When true, causal masking is disabled for
            ``query_length == 1``.
        softcap: Optional soft-capping value, forwarded when not ``None``.
        deterministic: Forwarded when ``is_flash_attn_greater_or_equal("2.4.1")``
            holds; ``None`` falls back to the ``FLASH_ATTENTION_DETERMINISTIC``
            environment variable (enabled when it equals ``"1"``).
        cu_seqlens: Optional cumulative sequence lengths for the un-pad path.

    Returns:
        The attention output tensor.
    """
    # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__.
    causal = (is_causal and query_length != 1) if use_top_left_mask else is_causal

    # Optional kernel arguments, added only when they apply.
    extra_kwargs = {}

    # Assuming 4D tensors, key_states.shape[1] is the key/value sequence length (source length).
    if _flash_supports_window_size and sliding_window is not None and key_states.shape[1] > sliding_window:
        extra_kwargs["window_size"] = (sliding_window, sliding_window)

    if is_flash_attn_greater_or_equal("2.4.1"):
        if deterministic is None:
            deterministic = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"
        extra_kwargs["deterministic"] = deterministic

    if softcap is not None:
        extra_kwargs["softcap"] = softcap

    # Keyword arguments shared by both variable-length calls below.
    varlen_common = dict(dropout_p=dropout, softmax_scale=softmax_scale, causal=causal, **extra_kwargs)

    # Path 1: an explicit mask / cumulative lengths -> un-pad, attend, re-pad.
    if attention_mask is not None or cu_seqlens is not None:
        n_rows = query_states.shape[0]
        q_unpad, k_unpad, v_unpad, q_indices, (cu_q, cu_k), (longest_q, longest_k) = _upad_input(
            query_states, key_states, value_states, attention_mask, query_length, cu_seqlens
        )
        unpadded_out = flash_attn_varlen_func(
            q_unpad,
            k_unpad,
            v_unpad,
            cu_seqlens_q=cu_q,
            cu_seqlens_k=cu_k,
            max_seqlen_q=longest_q,
            max_seqlen_k=longest_k,
            **varlen_common,
        )
        return pad_input(unpadded_out, q_indices, n_rows, query_length)

    # Path 2: position ids that are not non-decreasing indicate several sequences
    # packed together; the varlen kernel prevents attention across them and allows
    # a padding-free layout. Only relevant outside single-token decoding.
    # Note: the `torch.diff(...)` condition is last to use short-circuit and avoid
    # the cuda synchronization it incurs during inference (query_length == 1 always)
    looks_packed = (
        position_ids is not None and query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all()
    )
    if looks_packed:
        n_rows = query_states.size(0)
        q_flat, k_flat, v_flat, _, (cu_q, cu_k), (longest_q, longest_k) = prepare_fa2_from_position_ids(
            query_states, key_states, value_states, position_ids
        )
        packed_out = flash_attn_varlen_func(
            q_flat,
            k_flat,
            v_flat,
            cu_seqlens_q=cu_q,
            cu_seqlens_k=cu_k,
            max_seqlen_q=longest_q,
            max_seqlen_k=longest_k,
            **varlen_common,
        )
        # Restore the leading batch dimension.
        return packed_out.view(n_rows, -1, packed_out.size(-2), packed_out.size(-1))

    # Path 3: dense batch with no padding information.
    return flash_attn_func(
        query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal, **extra_kwargs
    )
feat: first submit 2024-10-25 17:16:26 +08:00			`#!/usr/bin/env python`
			`# -*- coding: utf-8 -*-`
			`#`
			`# Copyright @2024 AI. Inspur Inc.`
			`#`
			`# @author: jiangzhs <jiangzhs@inspur.com>`
			`# @date: 2024/10/08`
			`#`

			`import inspect`
			`import os`
			`from typing import Optional`
			`from typing import Tuple`

			`import torch`
			`import torch.nn.functional as F`
			`from transformers.utils import is_flash_attn_2_available`
			`from transformers.utils import is_flash_attn_greater_or_equal`


			`if is_flash_attn_2_available():`
			`from flash_attn import flash_attn_func`
			`from flash_attn import flash_attn_varlen_func`
			`from flash_attn.bert_padding import index_first_axis # noqa`
			`from flash_attn.bert_padding import pad_input`
			`from flash_attn.bert_padding import unpad_input`

			`_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)`


			`def _get_unpad_data(`
			`attention_mask: torch.Tensor, cu_seqlens: torch.Tensor = None`
			`) -> Tuple[torch.Tensor, torch.Tensor, int]:`
			`if cu_seqlens is not None:`
			`max_seqlen_in_batch = torch.max(cu_seqlens[1:] - cu_seqlens[:-1]).item()`
			`indices = torch.arange(0, cu_seqlens[-1].item(), device=cu_seqlens.device)`
			`else:`
			`seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)`
			`indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()`
			`max_seqlen_in_batch = seqlens_in_batch.max().item()`
			`cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))`
			`return (indices, cu_seqlens, max_seqlen_in_batch)`


			`def _upad_input(`
			`query_layer: torch.Tensor,`
			`key_layer: torch.Tensor,`
			`value_layer: torch.Tensor,`
			`attention_mask: torch.Tensor,`
			`query_length: int,`
			`cu_seqlens,`
			`):`
			`indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask, cu_seqlens)`
			`batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape`

			`key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k)`
			`value_layer = index_first_axis(`
			`value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k`
			`)`
			`if query_length == kv_seq_len:`
			`query_layer = index_first_axis(query_layer.reshape(batch_size * kv_seq_len, -1, head_dim), indices_k)`
			`cu_seqlens_q = cu_seqlens_k`
			`max_seqlen_in_batch_q = max_seqlen_in_batch_k`
			`indices_q = indices_k`
			`elif query_length == 1:`
			`max_seqlen_in_batch_q = 1`
			`cu_seqlens_q = torch.arange(`
			`batch_size + 1, dtype=torch.int32, device=query_layer.device`
			`) # There is a memcpy here, that is very bad.`
			`indices_q = cu_seqlens_q[:-1]`
			`query_layer = query_layer.squeeze(1)`
			`else:`
			`# The -q_len: slice assumes left padding.`
			`attention_mask = attention_mask[:, -query_length:]`
			`query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)`

			`return (`
			`query_layer,`
			`key_layer,`
			`value_layer,`
			`indices_q,`
			`(cu_seqlens_q, cu_seqlens_k),`
			`(max_seqlen_in_batch_q, max_seqlen_in_batch_k),`
			`)`


			`def prepare_fa2_from_position_ids(query, key, value, position_ids):`
			`query = query.view(-1, query.size(-2), query.size(-1))`
			`key = key.view(-1, key.size(-2), key.size(-1))`
			`value = value.view(-1, value.size(-2), value.size(-1))`
			`position_ids = position_ids.flatten()`
			`indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32)`

			`cu_seq_lens = torch.cat(`
			`(indices_q[position_ids == 0], torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32))`
			`)`

			`max_length = position_ids.max() + 1`

			`return (query, key, value, indices_q, (cu_seq_lens, cu_seq_lens), (max_length, max_length))`


			`def _flash_attention_forward(`
			`query_states: torch.Tensor,`
			`key_states: torch.Tensor,`
			`value_states: torch.Tensor,`
			`attention_mask: torch.Tensor,`
			`query_length: int,`
			`is_causal: bool,`
			`dropout: float = 0.0,`
			`position_ids: Optional[torch.Tensor] = None,`
			`softmax_scale: Optional[float] = None,`
			`sliding_window: Optional[int] = None,`
			`use_top_left_mask: bool = False,`
			`softcap: Optional[float] = None,`
			`deterministic: bool = None,`
			`cu_seqlens=None,`
			`):`
			`if not use_top_left_mask:`
			`causal = is_causal`
			`else:`
			# TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__.
			`causal = is_causal and query_length != 1`

			`# Assuming 4D tensors, key_states.shape[1] is the key/value sequence length (source length).`
			`use_sliding_windows = (`
			`_flash_supports_window_size and sliding_window is not None and key_states.shape[1] > sliding_window`
			`)`
			`flash_kwargs = {"window_size": (sliding_window, sliding_window)} if use_sliding_windows else {}`

			`if is_flash_attn_greater_or_equal("2.4.1"):`
			`if deterministic is None:`
			`deterministic = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"`
			`flash_kwargs["deterministic"] = deterministic`

			`if softcap is not None:`
			`flash_kwargs["softcap"] = softcap`

			`# Contains at least one padding token in the sequence`
			`if attention_mask is not None or cu_seqlens is not None:`
			`batch_size = query_states.shape[0]`
			`query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = _upad_input(`
			`query_states, key_states, value_states, attention_mask, query_length, cu_seqlens`
			`)`
			`cu_seqlens_q, cu_seqlens_k = cu_seq_lens`
			`max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens`

			`attn_output_unpad = flash_attn_varlen_func(`
			`query_states,`
			`key_states,`
			`value_states,`
			`cu_seqlens_q=cu_seqlens_q,`
			`cu_seqlens_k=cu_seqlens_k,`
			`max_seqlen_q=max_seqlen_in_batch_q,`
			`max_seqlen_k=max_seqlen_in_batch_k,`
			`dropout_p=dropout,`
			`softmax_scale=softmax_scale,`
			`causal=causal,`
			`**flash_kwargs,`
			`)`
			`attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)`

			`# If position_ids is provided and check all examples do not contain only 1 sequence, If tensor in increasing`
			`# then we probably have one sequence, otherwise it is packed. Additionally check we are in pre-fill/training stage.`
			# Use `flash_attn_varlen_func` to prevent cross-example attention and also allow padding free approach
			# Note: the `torch.diff(...)` condition is last to use short-circuit and avoid the cuda synchronization it incurs during inference (query_length == 1 always)
			`elif position_ids is not None and query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all():`
			`batch_size = query_states.size(0)`
			`query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = prepare_fa2_from_position_ids(`
			`query_states, key_states, value_states, position_ids`
			`)`

			`cu_seqlens_q, cu_seqlens_k = cu_seq_lens`
			`max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens`

			`attn_output = flash_attn_varlen_func(`
			`query_states,`
			`key_states,`
			`value_states,`
			`cu_seqlens_q=cu_seqlens_q,`
			`cu_seqlens_k=cu_seqlens_k,`
			`max_seqlen_q=max_seqlen_in_batch_q,`
			`max_seqlen_k=max_seqlen_in_batch_k,`
			`dropout_p=dropout,`
			`softmax_scale=softmax_scale,`
			`causal=causal,`
			`**flash_kwargs,`
			`)`

			`attn_output = attn_output.view(batch_size, -1, attn_output.size(-2), attn_output.size(-1))`

			`else:`
			`attn_output = flash_attn_func(`
			`query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal, **flash_kwargs`
			`)`

			`return attn_output`