AutoRig/Dl/UniRig/PerceiverEncoder.cs

Neural network encoder for a point-cloud Perceiver model configured by UniRig. It projects points and normals with Fourier features, runs a cross-attention block from sampled queries to all data, then multiple self-attention blocks and a final layer norm. Includes utilities to build input tensors and de-interleave fused per-head QKV/KV projections.

Native Interop
using AutoRig.Dl.Nn;

namespace AutoRig.Dl.UniRig;

using AutoRig.Dl;
using Vector3 = System.Numerics.Vector3;

/// <summary>
/// One pre-LN residual self-attention block of the point-cloud perceiver
/// encoder: layer norm → bias-free fused QKV attention → output projection,
/// followed by layer norm → GELU MLP, each added back onto the input.
/// </summary>
/// <remarks>
/// Encoder context: Michelangelo point-cloud perceiver encoder as configured
/// by UniRig (sal_perceiver.py, query_method=False): fourier-embedded points
/// through an input projection, one cross-attention block (sampled points ←
/// all points), a 16-layer pre-LN self-attention stack, and a post layer norm.
/// IMPORTANT: fused qkv/kv projections are PER-HEAD INTERLEAVED
/// (view(n, heads, 3·headDim).split) — de-interleaved here into the
/// head-major layout TransformerOps.Attention expects.
/// </remarks>
public sealed class PerceiverSelfBlock
{
    // Attention parameters.
    public required Tensor QkvWeight;   // [3·width, width], NO bias (qkv_bias=False)
    public required Tensor ProjWeight;  // c_proj weight
    public required Tensor ProjBias;    // c_proj bias

    // Layer norms: Ln1 precedes attention, Ln2 precedes the MLP.
    public required Tensor Ln1Gamma;
    public required Tensor Ln1Beta;
    public required Tensor Ln2Gamma;
    public required Tensor Ln2Beta;

    // MLP parameters.
    public required Tensor FcWeight;    // c_fc (width·4)
    public required Tensor FcBias;
    public required Tensor Fc2Weight;   // mlp c_proj
    public required Tensor Fc2Bias;

    /// <summary>
    /// Runs the block over <paramref name="x"/> and returns the updated activations.
    /// </summary>
    /// <param name="x">Activations to refine; also the residual input.</param>
    /// <param name="heads">Number of attention heads the fused QKV projection is split into.</param>
    /// <returns><paramref name="x"/> plus the attention and MLP residual updates.</returns>
    public Tensor Forward( Tensor x, int heads )
    {
        // Attention branch: the fused projection is per-head interleaved, so it has
        // to be regrouped into head-major q / k / v before attention runs.
        var fused = TransformerOps
            .LayerNorm( x, Ln1Gamma, Ln1Beta )
            .MatMul( QkvWeight.Transposed );
        var parts = PerceiverEncoder.SplitInterleaved( fused, heads, 3 );
        var context = TransformerOps.Attention( parts.A, parts.B, parts.C, heads, causal: false );
        var attentionUpdate = context.MatMul( ProjWeight.Transposed ).Add( ProjBias );
        var afterAttention = x.Add( attentionUpdate );

        // MLP branch: LN → c_fc → GELU → c_proj, added onto the attention output.
        var mlpInput = TransformerOps.LayerNorm( afterAttention, Ln2Gamma, Ln2Beta );
        var expanded = mlpInput.MatMul( FcWeight.Transposed ).Add( FcBias );
        var activated = TransformerOps.Gelu( expanded );
        var mlpUpdate = activated.MatMul( Fc2Weight.Transposed ).Add( Fc2Bias );
        return afterAttention.Add( mlpUpdate );
    }
}

/// <summary>
/// Pre-LN residual cross-attention block: queries attend over a separate data
/// sequence, then pass through a GELU MLP. Queries and data each get their own
/// layer norm and bias-free projection; the fused kv projection is per-head
/// interleaved and is de-interleaved before attention.
/// </summary>
public sealed class PerceiverCrossBlock
{
    // Attention parameters.
    public required Tensor QWeight;     // [width, width], no bias
    public required Tensor KvWeight;    // [2·width, dataWidth], no bias
    public required Tensor ProjWeight;
    public required Tensor ProjBias;

    // Layer norms.
    public required Tensor Ln1Gamma;    // on queries
    public required Tensor Ln1Beta;
    public required Tensor Ln2Gamma;    // on data
    public required Tensor Ln2Beta;
    public required Tensor Ln3Gamma;    // pre-MLP
    public required Tensor Ln3Beta;

    // MLP parameters.
    public required Tensor FcWeight;
    public required Tensor FcBias;
    public required Tensor Fc2Weight;
    public required Tensor Fc2Bias;

    /// <summary>
    /// Updates the queries <paramref name="x"/> by attending over <paramref name="data"/>.
    /// </summary>
    /// <param name="x">Query activations; also the residual input.</param>
    /// <param name="data">Sequence the keys and values are computed from.</param>
    /// <param name="heads">Number of attention heads.</param>
    /// <returns><paramref name="x"/> plus the cross-attention and MLP residual updates.</returns>
    public Tensor Forward( Tensor x, Tensor data, int heads )
    {
        // Queries and data are normalised and projected independently.
        var normedQueries = TransformerOps.LayerNorm( x, Ln1Gamma, Ln1Beta );
        var queries = normedQueries.MatMul( QWeight.Transposed );

        var normedData = TransformerOps.LayerNorm( data, Ln2Gamma, Ln2Beta );
        var fusedKv = normedData.MatMul( KvWeight.Transposed );

        // Two groups (k, v): the third tuple slot is unused here.
        var keyValue = PerceiverEncoder.SplitInterleaved( fusedKv, heads, 2 );
        var context = TransformerOps.Attention( queries, keyValue.A, keyValue.B, heads, causal: false );
        var attentionUpdate = context.MatMul( ProjWeight.Transposed ).Add( ProjBias );
        var afterAttention = x.Add( attentionUpdate );

        // MLP branch: LN → c_fc → GELU → c_proj, added onto the attention output.
        var mlpInput = TransformerOps.LayerNorm( afterAttention, Ln3Gamma, Ln3Beta );
        var expanded = mlpInput.MatMul( FcWeight.Transposed ).Add( FcBias );
        var activated = TransformerOps.Gelu( expanded );
        var mlpUpdate = activated.MatMul( Fc2Weight.Transposed ).Add( Fc2Bias );
        return afterAttention.Add( mlpUpdate );
    }
}

/// <summary>
/// Point-cloud perceiver encoder: embeds points (fourier features) and normals
/// through the input projection, runs one cross-attention block from the
/// sampled subset to the full cloud, then every self-attention block, and
/// finishes with a layer norm.
/// </summary>
public sealed class PerceiverEncoder
{
    /// <summary>Fourier frequencies per coordinate; frequency f scales the coordinate by 2^f.</summary>
    public const int NumFreqs = 8;

    // Per-row feature layout. Every offset is derived from NumFreqs so the sin
    // and cos bands cannot drift apart if the frequency count is ever changed.
    private const int SinOffset = 3;                          // after raw x, y, z
    private const int CosOffset = SinOffset + 3 * NumFreqs;   // after the three sin bands
    private const int FourierDim = 3 * (NumFreqs * 2 + 1);    // 51 for NumFreqs = 8
    private const int RowWidth = FourierDim + 3;              // + normal = 54 for NumFreqs = 8

    /// <summary>UniRig/SkinTokens use 8; MagicArticulate's ShapeVAE uses 12.</summary>
    public int Heads { get; init; } = 8;

    public required Tensor InputProjWeight, InputProjBias;   // [width, 54]
    public required PerceiverCrossBlock Cross;
    public required PerceiverSelfBlock[] SelfBlocks;
    public required Tensor LnPostGamma, LnPostBeta;

    /// <summary>
    /// Encodes an already-subsampled cloud: <paramref name="prePoints"/> = the
    /// seeded 4096-point subset, <paramref name="sampledIndices"/> = fps picks
    /// (1024) into it. Returns [1024, width] latents.
    /// </summary>
    /// <param name="prePoints">Points of the subsampled cloud.</param>
    /// <param name="preNormals">Normals, one per entry of <paramref name="prePoints"/> (extra trailing normals are ignored).</param>
    /// <param name="sampledIndices">Indices into <paramref name="prePoints"/> selecting the query points.</param>
    /// <returns>One latent row per sampled index, after the final layer norm.</returns>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    /// <exception cref="ArgumentException">There are fewer normals than points.</exception>
    /// <exception cref="ArgumentOutOfRangeException">A sampled index does not address a point.</exception>
    public Tensor Encode( Vector3[] prePoints, Vector3[] preNormals, int[] sampledIndices )
    {
        ArgumentNullException.ThrowIfNull( prePoints );
        ArgumentNullException.ThrowIfNull( preNormals );
        ArgumentNullException.ThrowIfNull( sampledIndices );

        // Project checks that every point has a normal, so the gather below only
        // has to range-check the indices against the point count.
        var data = Project( prePoints, preNormals );

        var sampledPoints = new Vector3[sampledIndices.Length];
        var sampledNormals = new Vector3[sampledIndices.Length];
        for ( var i = 0; i < sampledIndices.Length; i++ )
        {
            var index = sampledIndices[i];
            if ( (uint)index >= (uint)prePoints.Length )
            {
                throw new ArgumentOutOfRangeException( nameof( sampledIndices ), index,
                    $"Sampled index at position {i} is outside the {prePoints.Length}-point cloud." );
            }

            sampledPoints[i] = prePoints[index];
            sampledNormals[i] = preNormals[index];
        }
        var query = Project( sampledPoints, sampledNormals );

        var latents = Cross.Forward( query, data, Heads );
        foreach ( var block in SelfBlocks )
            latents = block.Forward( latents, Heads );
        return TransformerOps.LayerNorm( latents, LnPostGamma, LnPostBeta );
    }

    /// <summary>input_proj(cat[fourier(points), normals]): fourier layout is
    /// [x,y,z, sin(x·f0..f7), sin(y·..), sin(z·..), cos(...)] (51) + normals (3).</summary>
    /// <param name="points">Points to embed; one output row per point.</param>
    /// <param name="normals">Normals, one per point (extra trailing normals are ignored).</param>
    /// <returns>The embedded rows multiplied by the transposed input projection weight, plus its bias.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    /// <exception cref="ArgumentException">There are fewer normals than points.</exception>
    internal Tensor Project( Vector3[] points, Vector3[] normals )
    {
        ArgumentNullException.ThrowIfNull( points );
        ArgumentNullException.ThrowIfNull( normals );
        if ( normals.Length < points.Length )
        {
            throw new ArgumentException(
                $"Expected one normal per point: got {normals.Length} normals for {points.Length} points.",
                nameof( normals ) );
        }

        var rows = points.Length;
        var input = new float[rows * RowWidth];
        for ( var r = 0; r < rows; r++ )
        {
            var at = r * RowWidth;
            var point = points[r];
            var normal = normals[r];

            input[at + 0] = point.X;
            input[at + 1] = point.Y;
            input[at + 2] = point.Z;

            WriteHarmonics( input, at, 0, point.X );
            WriteHarmonics( input, at, 1, point.Y );
            WriteHarmonics( input, at, 2, point.Z );

            input[at + FourierDim + 0] = normal.X;
            input[at + FourierDim + 1] = normal.Y;
            input[at + FourierDim + 2] = normal.Z;
        }
        return Tensor.From( input, rows, RowWidth )
            .MatMul( InputProjWeight.Transposed ).Add( InputProjBias );
    }

    /// <summary>
    /// Writes sin/cos of one coordinate at every frequency into the row starting at
    /// <paramref name="at"/>; <paramref name="axis"/> (0..2) selects the slot inside each band.
    /// </summary>
    private static void WriteHarmonics( float[] input, int at, int axis, float value )
    {
        var slot = axis * NumFreqs;
        for ( var f = 0; f < NumFreqs; f++ )
        {
            var scaled = value * (1 << f);   // 2^f, include_pi = False
            input[at + SinOffset + slot + f] = MathF.Sin( scaled );
            input[at + CosOffset + slot + f] = MathF.Cos( scaled );
        }
    }

    /// <summary>
    /// De-interleaves a fused projection [n, groups·width] whose column layout is
    /// per-head (head, group, headDim) — as produced by view(n, heads, groups·hd)
    /// + split — into `groups` head-major [n, width] tensors (third slot unused
    /// when groups is 2).
    /// </summary>
    /// <param name="fused">Fused projection output; its second dimension must be divisible by groups·heads.</param>
    /// <param name="heads">Number of attention heads (positive).</param>
    /// <param name="groups">2 for a fused kv projection, 3 for a fused qkv projection.</param>
    /// <returns>The de-interleaved tensors; <c>C</c> is null when <paramref name="groups"/> is 2.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="fused"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="groups"/> is not 2 or 3, or <paramref name="heads"/> is not positive.</exception>
    /// <exception cref="ArgumentException">The fused width is not divisible by groups·heads.</exception>
    internal static (Tensor A, Tensor B, Tensor C) SplitInterleaved(
        Tensor fused, int heads, int groups )
    {
        ArgumentNullException.ThrowIfNull( fused );
        if ( groups < 2 || groups > 3 )
        {
            throw new ArgumentOutOfRangeException( nameof( groups ), groups,
                "groups must be 2 (kv) or 3 (qkv)." );
        }
        if ( heads <= 0 )
        {
            throw new ArgumentOutOfRangeException( nameof( heads ), heads,
                "heads must be positive." );
        }

        var rows = fused.Shape[0];
        var fusedWidth = fused.Shape[1];

        // Integer division below would otherwise silently drop the remainder and
        // copy from misaligned offsets, yielding wrong q/k/v without any error.
        if ( fusedWidth % (groups * heads) != 0 )
        {
            throw new ArgumentException(
                $"Fused width {fusedWidth} is not divisible by groups·heads ({groups}·{heads}).",
                nameof( fused ) );
        }

        var width = fusedWidth / groups;
        var headDim = width / heads;
        var outputs = new float[3][];
        for ( var g = 0; g < groups; g++ )
            outputs[g] = new float[rows * width];

        for ( var r = 0; r < rows; r++ )
        {
            var sourceRow = r * groups * width;
            var targetRow = r * width;
            for ( var h = 0; h < heads; h++ )
            {
                // Source: head h owns a contiguous run of `groups` chunks of headDim.
                // Target: each group tensor stores its heads back to back.
                var sourceHead = sourceRow + h * groups * headDim;
                var targetHead = targetRow + h * headDim;
                for ( var g = 0; g < groups; g++ )
                {
                    Array.Copy(
                        fused.Data, sourceHead + g * headDim,
                        outputs[g], targetHead,
                        headDim );
                }
            }
        }

        return (
            Tensor.From( outputs[0], rows, width ),
            Tensor.From( outputs[1], rows, width ),
            groups > 2 ? Tensor.From( outputs[2], rows, width ) : null);
    }
}