feat: chunkable mamba2 model code

Browse files

Files changed (5) hide show

.gitattributes +35 -0
README.md +63 -0
chunkable_ssd_combined.py +511 -0
configuration_chunkable_mamba2.py +12 -0
modeling_chunkable_mamba2.py +131 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,63 @@

+---
+license: apache-2.0
+library_name: transformers
+tags:
+- transformers
+- mamba2
+- vertical-chunking
+---
+# chunkable-mamba2
+Custom [Mamba2](https://arxiv.org/abs/2405.21060) model and configuration classes for 🤗 Transformers that add support for **vertically chunked inference**, which processes input sequences in fixed-size vertical chunks through all model layers with constant memory usage, regardless of sequence length.
+## What this repository provides
+- **`ChunkableMamba2Config`:** extends `Mamba2Config` with a `use_mem_eff_path` option for the memory-efficient CUDA kernel path.
+- **`ChunkableMamba2Model`:** extends `Mamba2Model` with a chunkable mixer and cache that correctly propagate the recurrent states across vertical chunks (simultaneous `seq_idx` + `initial_states` support).
+- **`chunkable_mamba_split_conv1d_scan_combined`:** modified `mamba_split_conv1d_scan_combined` kernel wrapper that passes cache parameters through the SSD scan so that conv and SSM states are properly initialized and exported during chunked inference.
+## Usage
+This repository is designed to be referenced directly from Hugging Face model configs via `auto_map`, so that models can be loaded with `trust_remote_code=True` without any local installation:
+```json
+"auto_map": {
+    "AutoConfig": "dynatrace-oss/chunkable-mamba2--configuration_chunkable_mamba2.ChunkableMamba2Config",
+    "AutoModel": "dynatrace-oss/chunkable-mamba2--modeling_chunkable_mamba2.ChunkableMamba2Model"
+}
+```
+## Models
+This code was created for the following embedding models:
+- [dynatrace-oss/llama-embed-mamba2-7b](https://huggingface.co/dynatrace-oss/llama-embed-mamba2-7b)
+- [dynatrace-oss/llama-embed-mamba2-1.3b](https://huggingface.co/dynatrace-oss/llama-embed-mamba2-1.3b)
+## Requirements
+> [!IMPORTANT]
+> Requires `transformers>=5.5.0` due to a breaking change to the cache of Mamba2 introduced in `v5.5.0` ([transformers#44950](https://github.com/huggingface/transformers/pull/44950)).
+```bash
+pip install "transformers>=5.5.0" kernels einops
+```
+## Open Source Integration Roadmap
+Our goal is to integrate all necessary changes to simplify the adoption of vertically chunked inference for other models:
+> [!NOTE]
+> ⚪ Planned | 🟡 In Progress | 🟢 Integrated
+- ⚪ **causal-conv1d:** Enable simultaneous `seq_idx` + `initial_states` (required for recurrent processing of chunks with left padding)
+- ⚪ **mamba-ssm:** Use `seq_idx` + `initial_states` in `mamba_split_conv1d_scan_combined` and export final states
+- ⚪ **kernels-community:** Propagate changes in `causal-conv1d` and `mamba-ssm` to their kernel hub equivalents in the `kernels-community` repositories
+- ⚪ **transformers:** Use updated `mamba_split_conv1d_scan_combined` with cache params during inference (currently only used during training, not configurable, problems with left padding)
+*This list will be updated as integration progresses.*
+## License
+Apache-2.0

chunkable_ssd_combined.py ADDED Viewed

	@@ -0,0 +1,511 @@

+# Copyright (c) 2024, Tri Dao, Albert Gu.
+"""We want triton==2.1.0 or 2.2.0 for this"""
+from packaging import version
+import torch
+import torch.nn.functional as F
+import triton
+from einops import rearrange
+from transformers.integrations.hub_kernels import get_kernel
+# Fixed revisions because kernels after 2026-04-14 do not expose the functions we need anymore.
+causal_conv1d = get_kernel("kernels-community/causal-conv1d", revision="dc7072f0e9d799b247a2517a909ebb209d50bea0")
+mamba_ssm = get_kernel("kernels-community/mamba-ssm", revision="00b2ecd499379f9bcf969b6796e53bc867f4ad38")
+causal_conv1d_fwd_function = causal_conv1d.cpp_functions.causal_conv1d_fwd_function
+causal_conv1d_bwd_function = causal_conv1d.cpp_functions.causal_conv1d_bwd_function
+custom_fwd = mamba_ssm.utils.torch.custom_fwd
+custom_bwd = mamba_ssm.utils.torch.custom_bwd
+_layer_norm_fwd = mamba_ssm.ops.triton.layernorm_gated._layer_norm_fwd
+_layer_norm_bwd = mamba_ssm.ops.triton.layernorm_gated._layer_norm_bwd
+_swiglu_fwd = mamba_ssm.ops.triton.k_activations._swiglu_fwd
+_swiglu_bwd = mamba_ssm.ops.triton.k_activations._swiglu_bwd
+rearrange_and_update_stride = mamba_ssm.ops.triton.ssd_combined.rearrange_and_update_stride
+_mamba_chunk_scan_combined_fwd = mamba_ssm.ops.triton.ssd_combined._mamba_chunk_scan_combined_fwd
+_mamba_chunk_scan_combined_bwd = mamba_ssm.ops.triton.ssd_combined._mamba_chunk_scan_combined_bwd
+TRITON_22 = version.parse(triton.__version__) >= version.parse("2.2.0")
+class ChunkableMambaSplitConv1dScanCombinedFn(torch.autograd.Function):
+    @staticmethod
+    @custom_fwd
+    def forward(
+        ctx,
+        zxbcdt,
+        conv1d_weight,
+        conv1d_bias,
+        dt_bias,
+        A,
+        D,
+        chunk_size,
+        initial_conv_states=None,
+        initial_ssm_states=None,
+        seq_idx=None,
+        dt_limit=(0.0, float("inf")),
+        return_final_states=False,
+        activation="silu",
+        rmsnorm_weight=None,
+        rmsnorm_eps=1e-6,
+        outproj_weight=None,
+        outproj_bias=None,
+        headdim=None,
+        ngroups=1,
+        norm_before_gate=True,
+    ):
+        assert activation in [None, "silu", "swish"]
+        if D.dim() == 1:
+            assert headdim is not None
+            (nheads,) = D.shape
+        else:
+            nheads, headdim = D.shape
+        batch, seqlen, _ = zxbcdt.shape
+        dim = nheads * headdim
+        assert nheads % ngroups == 0
+        dstate = (conv1d_weight.shape[0] - dim) // ngroups // 2
+        d_nonssm = (zxbcdt.shape[-1] - 2 * dim - 2 * ngroups * dstate - nheads) // 2
+        assert d_nonssm >= 0
+        assert zxbcdt.shape == (
+            batch,
+            seqlen,
+            2 * d_nonssm + 2 * dim + 2 * ngroups * dstate + nheads,
+        )
+        assert dt_bias.shape == (nheads,)
+        assert A.shape == (nheads,)
+        zx0, z, xBC, dt = torch.split(
+            zxbcdt, [2 * d_nonssm, dim, dim + ngroups * dstate * 2, nheads], dim=-1
+        )
+        seq_idx = seq_idx.contiguous() if seq_idx is not None else None
+        final_conv_states = (
+            torch.empty(
+                (batch, conv1d_weight.shape[1] - 1, dim + ngroups * dstate * 2),
+                device=xBC.device,
+                dtype=xBC.dtype,
+            ).transpose(1, 2)
+            if return_final_states
+            else None
+        )
+        # Workaround because causal_conv1d_fwd_function currently does not support seq_idx when initial_conv_states is not None.
+        # Additionally, there is a bug in causal_conv1d_fwd_function when seq_idx is used causing illegal memory access:
+        # - Issue: https://github.com/Dao-AILab/causal-conv1d/issues/67
+        # - PR: https://github.com/Dao-AILab/causal-conv1d/pull/101
+        if seq_idx is not None and initial_conv_states is not None:
+            xBC = xBC * (seq_idx.unsqueeze(-1) >= 0).to(xBC.dtype)
+        xBC_conv = rearrange(
+            causal_conv1d_fwd_function(
+                rearrange_and_update_stride(xBC, "b s d -> b d s"),
+                conv1d_weight,
+                conv1d_bias,
+                None,
+                initial_conv_states,
+                final_conv_states,
+                activation in ["silu", "swish"],
+            ),
+            "b d s -> b s d",
+        )
+        if seq_idx is not None and initial_conv_states is not None:
+            xBC_conv = xBC_conv * (seq_idx.unsqueeze(-1) >= 0).to(xBC_conv.dtype)
+        x, B, C = torch.split(
+            xBC_conv, [dim, ngroups * dstate, ngroups * dstate], dim=-1
+        )
+        x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
+        B = rearrange(B, "b l (g n) -> b l g n", g=ngroups)
+        C = rearrange(C, "b l (g n) -> b l g n", g=ngroups)
+        z = rearrange(z, "b l (h p) -> b l h p", h=nheads) if z is not None else None
+        if rmsnorm_weight is None:
+            out, out_x, dt_out, dA_cumsum, states, final_ssm_states = (
+                _mamba_chunk_scan_combined_fwd(
+                    x,
+                    dt,
+                    A,
+                    B,
+                    C,
+                    chunk_size=chunk_size,
+                    D=D,
+                    z=z,
+                    dt_bias=dt_bias,
+                    initial_states=initial_ssm_states,
+                    seq_idx=seq_idx,
+                    dt_softplus=True,
+                    dt_limit=dt_limit,
+                )
+            )
+            out = rearrange(out, "b s h p -> b s (h p)")
+            rstd = None
+            if d_nonssm > 0:
+                out = torch.cat([_swiglu_fwd(zx0), out], dim=-1)
+        else:
+            out_x, _, dt_out, dA_cumsum, states, final_ssm_states = (
+                _mamba_chunk_scan_combined_fwd(
+                    x,
+                    dt,
+                    A,
+                    B,
+                    C,
+                    chunk_size=chunk_size,
+                    D=D,
+                    z=None,
+                    dt_bias=dt_bias,
+                    initial_states=initial_ssm_states,
+                    seq_idx=seq_idx,
+                    dt_softplus=True,
+                    dt_limit=dt_limit,
+                )
+            )
+            # reshape input data into 2D tensor
+            x_rms = rearrange(out_x, "b s h p -> (b s) (h p)")
+            z_rms = rearrange(z, "b s h p -> (b s) (h p)")
+            rmsnorm_weight = rmsnorm_weight.contiguous()
+            if d_nonssm == 0:
+                out = None
+            else:
+                out01 = torch.empty(
+                    (batch, seqlen, d_nonssm + dim),
+                    dtype=x_rms.dtype,
+                    device=x_rms.device,
+                )
+                out = rearrange(out01[..., d_nonssm:], "b s d -> (b s) d")
+                _swiglu_fwd(zx0, out=out01[..., :d_nonssm])
+            out, _, rstd = _layer_norm_fwd(
+                x_rms,
+                rmsnorm_weight,
+                None,
+                rmsnorm_eps,
+                z_rms,
+                out=out,
+                group_size=dim // ngroups,
+                norm_before_gate=norm_before_gate,
+                is_rms_norm=True,
+            )
+            if d_nonssm == 0:
+                out = rearrange(out, "(b s) d -> b s d", b=batch)
+            else:
+                out = out01
+        ctx.outproj_weight_dtype = (
+            outproj_weight.dtype if outproj_weight is not None else None
+        )
+        if outproj_weight is not None:
+            if torch.is_autocast_enabled():
+                dtype = torch.get_autocast_gpu_dtype()
+                out, outproj_weight = out.to(dtype), outproj_weight.to(dtype)
+                outproj_bias = (
+                    outproj_bias.to(dtype) if outproj_bias is not None else None
+                )
+            out = F.linear(out, outproj_weight, outproj_bias)
+        else:
+            assert outproj_bias is None
+        if out is not None and seq_idx is not None:
+            out = out * (seq_idx.unsqueeze(-1) >= 0).to(out.dtype)
+        ctx.save_for_backward(
+            zxbcdt,
+            conv1d_weight,
+            conv1d_bias,
+            out_x,
+            A,
+            D,
+            dt_bias,
+            initial_conv_states,
+            initial_ssm_states,
+            seq_idx,
+            rmsnorm_weight,
+            rstd,
+            outproj_weight,
+            outproj_bias,
+        )
+        ctx.dt_limit = dt_limit
+        ctx.return_final_states = return_final_states
+        ctx.activation = activation
+        ctx.rmsnorm_eps = rmsnorm_eps
+        ctx.norm_before_gate = norm_before_gate
+        ctx.chunk_size = chunk_size
+        ctx.headdim = headdim
+        ctx.ngroups = ngroups
+        return (
+            out
+            if not return_final_states
+            else (out, final_conv_states, final_ssm_states)
+        )
+    @staticmethod
+    @custom_bwd
+    def backward(ctx, dout, *args):
+        (
+            zxbcdt,
+            conv1d_weight,
+            conv1d_bias,
+            out,
+            A,
+            D,
+            dt_bias,
+            initial_conv_states,
+            initial_ssm_states,
+            seq_idx,
+            rmsnorm_weight,
+            rstd,
+            outproj_weight,
+            outproj_bias,
+        ) = ctx.saved_tensors
+        dfinal_states = args[0] if ctx.return_final_states else None
+        headdim = ctx.headdim
+        nheads = D.shape[0]
+        dim = nheads * headdim
+        assert nheads % ctx.ngroups == 0
+        dstate = (conv1d_weight.shape[0] - dim) // ctx.ngroups // 2
+        d_nonssm = (zxbcdt.shape[-1] - 2 * dim - 2 * ctx.ngroups * dstate - nheads) // 2
+        assert d_nonssm >= 0
+        recompute_output = outproj_weight is not None
+        if recompute_output:
+            out_recompute = torch.empty(
+                *out.shape[:2], d_nonssm + dim, device=out.device, dtype=out.dtype
+            )
+            out0_recompute, out1_recompute = out_recompute.split(
+                [d_nonssm, dim], dim=-1
+            )
+        zx0, z, xBC, dt = torch.split(
+            zxbcdt, [2 * d_nonssm, dim, dim + 2 * ctx.ngroups * dstate, nheads], dim=-1
+        )
+        # Recompute x, B, C
+        xBC_conv = rearrange(
+            causal_conv1d_fwd_function(
+                rearrange_and_update_stride(xBC, "b s d -> b d s"),
+                conv1d_weight,
+                conv1d_bias,
+                None,
+                initial_conv_states,
+                None,
+                ctx.activation in ["silu", "swish"],
+            ),
+            "b d s -> b s d",
+        )
+        x, B, C = torch.split(
+            xBC_conv, [dim, ctx.ngroups * dstate, ctx.ngroups * dstate], dim=-1
+        )
+        x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
+        B = rearrange(B, "b l (g n) -> b l g n", g=ctx.ngroups)
+        C = rearrange(C, "b l (g n) -> b l g n", g=ctx.ngroups)
+        dzxbcdt = torch.empty_like(zxbcdt)
+        dzx0, dz, dxBC_given, ddt_given = torch.split(
+            dzxbcdt, [2 * d_nonssm, dim, dim + 2 * ctx.ngroups * dstate, nheads], dim=-1
+        )
+        dxBC = torch.empty_like(xBC)
+        dx, dB, dC = torch.split(
+            dxBC, [dim, ctx.ngroups * dstate, ctx.ngroups * dstate], dim=-1
+        )
+        z = rearrange(z, "b l (h p) -> b l h p", h=nheads)
+        dx = rearrange(dx, "b l (h p) -> b l h p", h=nheads)
+        dB = rearrange(dB, "b l (g n) -> b l g n", g=ctx.ngroups)
+        dC = rearrange(dC, "b l (g n) -> b l g n", g=ctx.ngroups)
+        if outproj_weight is not None:
+            dout_og = dout
+            dout = F.linear(dout, outproj_weight.t())
+        if d_nonssm > 0:
+            dout0, dout = dout.split([d_nonssm, dim], dim=-1)
+            _swiglu_bwd(zx0, dout0, dxy=dzx0, recompute_output=True, out=out0_recompute)
+        dout = rearrange(dout, "b s (h p) -> b s h p", p=headdim)
+        if rmsnorm_weight is None:
+            dz = rearrange(dz, "b l (h p) -> b l h p", h=nheads)
+            dx, ddt, dA, dB, dC, dD, dz, ddt_bias, dinitial_ssm_states, *rest = (
+                _mamba_chunk_scan_combined_bwd(
+                    dout,
+                    x,
+                    dt,
+                    A,
+                    B,
+                    C,
+                    out,
+                    ctx.chunk_size,
+                    D=D,
+                    z=z,
+                    dt_bias=dt_bias,
+                    initial_states=initial_ssm_states,
+                    dfinal_states=dfinal_states,
+                    seq_idx=seq_idx,
+                    dt_softplus=True,
+                    dt_limit=ctx.dt_limit,
+                    dx=dx,
+                    ddt=ddt_given,
+                    dB=dB,
+                    dC=dC,
+                    dz=dz,
+                    recompute_output=recompute_output,
+                )
+            )
+            out_for_linear = (
+                rearrange(rest[0], "b s h p -> b s (h p)") if recompute_output else None
+            )
+            drmsnorm_weight = None
+        else:
+            batch = dout.shape[0]
+            dy_rms = rearrange(dout, "b s h p -> (b s) (h p)")
+            dz = rearrange(dz, "b l d -> (b l) d")
+            x_rms = rearrange(out, "b s h p -> (b s) (h p)")
+            z_rms = rearrange(z, "b s h p -> (b s) (h p)")
+            out1_recompute = (
+                rearrange(out1_recompute, "b s d -> (b s) d")
+                if recompute_output
+                else None
+            )
+            dout, drmsnorm_weight, _, dz, *rest = _layer_norm_bwd(
+                dy_rms,
+                x_rms,
+                rmsnorm_weight,
+                None,
+                ctx.rmsnorm_eps,
+                None,
+                rstd,
+                z_rms,
+                group_size=dim // ctx.ngroups,
+                norm_before_gate=ctx.norm_before_gate,
+                is_rms_norm=True,
+                recompute_output=recompute_output,
+                dz=dz,
+                out=out1_recompute if recompute_output else None,
+            )
+            out_for_linear = out_recompute if recompute_output else None
+            dout = rearrange(dout, "(b s) (h p) -> b s h p", b=batch, p=headdim)
+            dx, ddt, dA, dB, dC, dD, _, ddt_bias, dinitial_ssm_states = (
+                _mamba_chunk_scan_combined_bwd(
+                    dout,
+                    x,
+                    dt,
+                    A,
+                    B,
+                    C,
+                    out,
+                    ctx.chunk_size,
+                    D=D,
+                    z=None,
+                    dt_bias=dt_bias,
+                    initial_states=initial_ssm_states,
+                    dfinal_states=dfinal_states,
+                    seq_idx=seq_idx,
+                    dt_softplus=True,
+                    dt_limit=ctx.dt_limit,
+                    dx=dx,
+                    ddt=ddt_given,
+                    dB=dB,
+                    dC=dC,
+                )
+            )
+        if outproj_weight is not None:
+            doutproj_weight = torch.einsum("bso,bsd->od", dout_og, out_for_linear)
+            doutproj_bias = (
+                dout_og.sum(dim=(0, 1)) if outproj_bias is not None else None
+            )
+        else:
+            doutproj_weight, doutproj_bias = None, None
+        dxBC_given = rearrange(dxBC_given, "b s d -> b d s")
+        dxBC_given_update, dweight, dbias, dinitial_conv_states, *_ = (
+            causal_conv1d_bwd_function(
+                rearrange_and_update_stride(xBC, "b s d -> b d s"),
+                conv1d_weight,
+                conv1d_bias,
+                rearrange(dxBC, "b s d -> b d s"),
+                # seq_idx,
+                seq_idx if initial_conv_states is None else None,
+                initial_conv_states,
+                None,
+                rearrange_and_update_stride(dxBC_given),
+                True,
+                ctx.activation in ["silu", "swish"],
+            )
+        )
+        if dxBC_given.stride() != dxBC_given_update.stride():
+            dxBC_given.copy_(dxBC_given_update)
+        else:
+            dxBC_given = dxBC_given_update
+        dxBC_given = rearrange(dxBC_given, "b d s -> b s d")
+        return (
+            dzxbcdt,
+            dweight,
+            dbias,
+            ddt_bias,
+            dA,
+            dD,
+            None,
+            dinitial_conv_states,
+            dinitial_ssm_states,
+            None,
+            None,
+            None,
+            None,
+            drmsnorm_weight,
+            None,
+            doutproj_weight,
+            doutproj_bias,
+            None,
+            None,
+            None,
+        )
+def chunkable_mamba_split_conv1d_scan_combined(
+    zxbcdt,
+    conv1d_weight,
+    conv1d_bias,
+    dt_bias,
+    A,
+    D,
+    chunk_size,
+    initial_conv_states=None,
+    initial_ssm_states=None,
+    seq_idx=None,
+    dt_limit=(0.0, float("inf")),
+    return_final_states=False,
+    activation="silu",
+    rmsnorm_weight=None,
+    rmsnorm_eps=1e-6,
+    outproj_weight=None,
+    outproj_bias=None,
+    headdim=None,
+    ngroups=1,
+    norm_before_gate=True,
+):
+    """
+    Argument:
+        zxbcdt: (batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads) where dim == nheads * headdim
+        conv1d_weight: (dim + 2 * ngroups * dstate, width)
+        conv1d_bias: (dim + 2 * ngroups * dstate,)
+        dt_bias: (nheads,)
+        A: (nheads)
+        D: (nheads, headdim) or (nheads,)
+        initial_states: (batch, nheads, headdim, dstate)
+        seq_idx: (batch, seqlen), int32
+        rmsnorm_weight: (dim,)
+        outproj_weight: (out_dim, dim)
+        outproj_bias: (out_dim,)
+        headdim: if D is 1D, headdim must be passed in
+        norm_before_gate: if True, we do RMSNorm(x) * F.silu(z). If False, we do RMSNorm(x * F.silu(z))
+    Return:
+        out: (batch, seqlen, dim)
+    """
+    return ChunkableMambaSplitConv1dScanCombinedFn.apply(
+        zxbcdt,
+        conv1d_weight,
+        conv1d_bias,
+        dt_bias,
+        A,
+        D,
+        chunk_size,
+        initial_conv_states,
+        initial_ssm_states,
+        seq_idx,
+        dt_limit,
+        return_final_states,
+        activation,
+        rmsnorm_weight,
+        rmsnorm_eps,
+        outproj_weight,
+        outproj_bias,
+        headdim,
+        ngroups,
+        norm_before_gate,
+    )

configuration_chunkable_mamba2.py ADDED Viewed

	@@ -0,0 +1,12 @@

+from transformers.models.mamba2.configuration_mamba2 import Mamba2Config
+class ChunkableMamba2Config(Mamba2Config):
+    def __init__(
+        self,
+        *args,
+        use_mem_eff_path: bool = True,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.use_mem_eff_path = use_mem_eff_path

modeling_chunkable_mamba2.py ADDED Viewed

	@@ -0,0 +1,131 @@

+from .configuration_chunkable_mamba2 import ChunkableMamba2Config
+from transformers.cache_utils import Cache, is_torchdynamo_compiling
+from transformers.models.mamba2.modeling_mamba2 import (
+    Mamba2Block,
+    Mamba2Mixer,
+    Mamba2Model,
+    Mamba2RMSNorm,
+    apply_mask_to_padding_states,
+)
+import torch
+from torch import nn
+mamba_split_conv1d_scan_combined = None
+class ChunkableMamba2Mixer(Mamba2Mixer):
+    def __init__(self, config: ChunkableMamba2Config, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.use_mem_eff_path = config.use_mem_eff_path
+        global mamba_split_conv1d_scan_combined
+        if self.use_mem_eff_path and mamba_split_conv1d_scan_combined is None:
+            from .chunkable_ssd_combined import chunkable_mamba_split_conv1d_scan_combined
+            mamba_split_conv1d_scan_combined = chunkable_mamba_split_conv1d_scan_combined
+    def cuda_kernels_forward(
+        self,
+        hidden_states: torch.Tensor,
+        cache_params: Cache | None = None,
+        attention_mask: torch.Tensor | None = None,
+    ):
+        if (
+            cache_params is not None
+            and cache_params.has_previous_state(self.layer_idx)
+        ) and not self.use_mem_eff_path:
+            return super().cuda_kernels_forward(
+                hidden_states=hidden_states,
+                cache_params=cache_params,
+                attention_mask=attention_mask,
+            )
+        # 1. Gated MLP's linear projection
+        hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask[:, -hidden_states.size(1):])
+        projected_states = self.in_proj(hidden_states)
+        A = -torch.exp(self.A_log.float())  # (num_heads) or (intermediate_size, state_size)
+        dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit}
+        seq_idx = (
+            (attention_mask[:, -hidden_states.size(1) :] - 1).to(torch.int32)
+            if attention_mask is not None
+            else None
+        )
+        # 2-4. Fused kernel for conv1d, SSM, and the final projection
+        out = mamba_split_conv1d_scan_combined(
+            projected_states,
+            self.conv1d.weight.squeeze(1),
+            self.conv1d.bias,
+            self.dt_bias,
+            A,
+            D=self.D,
+            chunk_size=self.chunk_size,
+            seq_idx=seq_idx,
+            activation=self.activation,
+            rmsnorm_weight=self.norm.weight,
+            rmsnorm_eps=self.norm.variance_epsilon,
+            outproj_weight=self.out_proj.weight,
+            outproj_bias=self.out_proj.bias,
+            headdim=self.head_dim,
+            ngroups=self.n_groups,
+            norm_before_gate=False,
+            initial_conv_states=cache_params.layers[self.layer_idx].conv_states
+            if cache_params is not None
+            else None,
+            initial_ssm_states=cache_params.layers[self.layer_idx].recurrent_states
+            if cache_params is not None
+            else None,
+            return_final_states=cache_params is not None,
+            **dt_limit_kwargs,
+        )
+        if cache_params is not None:
+            out, conv_states, ssm_state = out
+            cache_params.layers[self.layer_idx].has_previous_state = False
+            cache_params.update_conv_state(conv_states, layer_idx=self.layer_idx)
+            cache_params.update_recurrent_state(ssm_state, layer_idx=self.layer_idx)
+        return out
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cache_params: Cache | None = None,
+        attention_mask: torch.Tensor | None = None,
+    ):
+        if "cuda" in self.in_proj.weight.device.type and not is_torchdynamo_compiling():
+            return self.cuda_kernels_forward(hidden_states, cache_params, attention_mask)
+        return self.torch_forward(hidden_states, cache_params, attention_mask)
+class ChunkableMamba2Block(Mamba2Block):
+    """Mamba2 block that uses a ``ChunkableMamba2Mixer`` as its mixer."""
+    def __init__(self, config, layer_idx):
+        # super(Mamba2Block, self) starts the lookup *after* Mamba2Block, i.e. the __init__
+        # of Mamba2Block itself is skipped on purpose and its attributes are set up here,
+        # so that only the chunkable mixer is constructed.
+        super(Mamba2Block, self).__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.residual_in_fp32 = config.residual_in_fp32
+        self.norm = Mamba2RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+        self.mixer = ChunkableMamba2Mixer(config, layer_idx=layer_idx)
+class ChunkableMamba2Model(Mamba2Model):
+    """Mamba2 backbone built from ``ChunkableMamba2Block`` layers."""
+    # Configuration class associated with this model.
+    config_class = ChunkableMamba2Config
+    def __init__(self, config):
+        # super(Mamba2Model, self) skips Mamba2Model.__init__ on purpose; the submodules are
+        # created here so that the layers are ChunkableMamba2Block instances.
+        super(Mamba2Model, self).__init__(config)
+        self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.layers = nn.ModuleList(
+            [
+                ChunkableMamba2Block(config, layer_idx=idx)
+                for idx in range(config.num_hidden_layers)
+            ]
+        )
+        self.gradient_checkpointing = False
+        self.norm_f = Mamba2RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+        # Initialize weights and apply final processing
+        # load_hook is not defined in this class; presumably inherited from Mamba2Model.
+        self._register_load_state_dict_pre_hook(self.load_hook)
+        self.post_init()