Neural network encoder for a point-cloud Perceiver used by UniRig. Projects Fourier-embedded point positions and normals, runs a cross-attention block from sampled queries to all data, then applies a stack of self-attention blocks and a final layer norm. Includes utilities for projecting inputs and de-interleaving fused per-head qkv/kv tensors.
using AutoRig.Dl.Nn;
namespace AutoRig.Dl.UniRig;
using AutoRig.Dl;
using Vector3 = System.Numerics.Vector3;
/// <summary>
/// One pre-LN self-attention block of the Michelangelo point-cloud perceiver
/// encoder as configured by UniRig (sal_perceiver.py, query_method=False).
/// The full encoder feeds fourier-embedded points through an input projection,
/// one cross-attention block (sampled points ← all points), a 16-layer stack
/// of these self-attention blocks, and a post layer norm.
/// </summary>
/// <remarks>
/// IMPORTANT: fused qkv/kv projections are PER-HEAD INTERLEAVED
/// (view(n, heads, 3·headDim).split) — they are de-interleaved through
/// PerceiverEncoder.SplitInterleaved into the head-major layout
/// TransformerOps.Attention expects.
/// </remarks>
public sealed class PerceiverSelfBlock
{
    // Fused attention projection: [3·width, width], NO bias (qkv_bias=False).
    public required Tensor QkvWeight;

    // Attention output projection (c_proj).
    public required Tensor ProjWeight, ProjBias;

    // Layer norm applied before attention.
    public required Tensor Ln1Gamma, Ln1Beta;

    // Layer norm applied before the MLP.
    public required Tensor Ln2Gamma, Ln2Beta;

    // MLP expansion, c_fc (width·4).
    public required Tensor FcWeight, FcBias;

    // MLP contraction (mlp c_proj).
    public required Tensor Fc2Weight, Fc2Bias;

    /// <summary>
    /// Runs the block on <paramref name="x"/>: a residual non-causal
    /// self-attention branch followed by a residual GELU MLP branch, each fed
    /// by its own layer norm.
    /// </summary>
    /// <param name="x">Input activations; also the residual stream.</param>
    /// <param name="heads">Number of attention heads used to split the fused qkv.</param>
    /// <returns>The activations after both residual branches.</returns>
    public Tensor Forward( Tensor x, int heads )
    {
        // --- attention branch -------------------------------------------------
        var attentionInput = TransformerOps.LayerNorm( x, Ln1Gamma, Ln1Beta );
        var fused = attentionInput.MatMul( QkvWeight.Transposed );

        // Per-head interleaved (q, k, v) -> three head-major tensors.
        var parts = PerceiverEncoder.SplitInterleaved( fused, heads, 3 );
        var context = TransformerOps.Attention( parts.A, parts.B, parts.C, heads, causal: false );

        var attentionOut = context.MatMul( ProjWeight.Transposed ).Add( ProjBias );
        var afterAttention = x.Add( attentionOut );

        // --- MLP branch -------------------------------------------------------
        var mlpInput = TransformerOps.LayerNorm( afterAttention, Ln2Gamma, Ln2Beta );
        var expanded = mlpInput.MatMul( FcWeight.Transposed ).Add( FcBias );
        var activated = TransformerOps.Gelu( expanded );
        var mlpOut = activated.MatMul( Fc2Weight.Transposed ).Add( Fc2Bias );

        return afterAttention.Add( mlpOut );
    }
}
/// <summary>
/// Residual cross-attention block: queries attend (non-causally) to a separate
/// data tensor, followed by a residual GELU MLP. Queries, data and the MLP
/// input each get their own layer norm.
/// </summary>
public sealed class PerceiverCrossBlock
{
    // Query projection: [width, width], no bias.
    public required Tensor QWeight;

    // Fused key/value projection: [2·width, dataWidth], no bias.
    public required Tensor KvWeight;

    // Attention output projection.
    public required Tensor ProjWeight, ProjBias;

    // Layer norm on queries.
    public required Tensor Ln1Gamma, Ln1Beta;

    // Layer norm on data.
    public required Tensor Ln2Gamma, Ln2Beta;

    // Layer norm applied before the MLP.
    public required Tensor Ln3Gamma, Ln3Beta;

    // MLP expansion and contraction weights.
    public required Tensor FcWeight, FcBias, Fc2Weight, Fc2Bias;

    /// <summary>
    /// Updates the query stream <paramref name="x"/> by attending to
    /// <paramref name="data"/>, then applies the residual MLP.
    /// </summary>
    /// <param name="x">Query activations; also the residual stream.</param>
    /// <param name="data">Activations that supply keys and values.</param>
    /// <param name="heads">Number of attention heads used to split the fused kv.</param>
    /// <returns>The query stream after both residual branches.</returns>
    public Tensor Forward( Tensor x, Tensor data, int heads )
    {
        // Query path.
        var normedQueries = TransformerOps.LayerNorm( x, Ln1Gamma, Ln1Beta );
        var queries = normedQueries.MatMul( QWeight.Transposed );

        // Key/value path: the fused projection is per-head interleaved (k, v),
        // so only the first two slots of the split are meaningful here.
        var normedData = TransformerOps.LayerNorm( data, Ln2Gamma, Ln2Beta );
        var fusedKv = normedData.MatMul( KvWeight.Transposed );
        var keyValue = PerceiverEncoder.SplitInterleaved( fusedKv, heads, 2 );

        var context = TransformerOps.Attention( queries, keyValue.A, keyValue.B, heads, causal: false );
        var attentionOut = context.MatMul( ProjWeight.Transposed ).Add( ProjBias );
        var afterAttention = x.Add( attentionOut );

        // MLP path.
        var mlpInput = TransformerOps.LayerNorm( afterAttention, Ln3Gamma, Ln3Beta );
        var expanded = mlpInput.MatMul( FcWeight.Transposed ).Add( FcBias );
        var activated = TransformerOps.Gelu( expanded );
        var mlpOut = activated.MatMul( Fc2Weight.Transposed ).Add( Fc2Bias );

        return afterAttention.Add( mlpOut );
    }
}
/// <summary>
/// Point-cloud perceiver encoder: projects fourier-embedded points (plus
/// normals) through an input projection, runs one cross-attention block from
/// the sampled points to all points, then every block in
/// <see cref="SelfBlocks"/>, and finishes with a layer norm.
/// </summary>
public sealed class PerceiverEncoder
{
    /// <summary>Number of fourier frequencies per coordinate (2^0 … 2^(NumFreqs-1)).</summary>
    public const int NumFreqs = 8;

    /// <summary>UniRig/SkinTokens use 8; MagicArticulate's ShapeVAE uses 12.</summary>
    public int Heads { get; init; } = 8;

    public required Tensor InputProjWeight, InputProjBias; // [width, 54]
    public required PerceiverCrossBlock Cross;
    public required PerceiverSelfBlock[] SelfBlocks;
    public required Tensor LnPostGamma, LnPostBeta;

    /// <summary>
    /// Encodes an already-subsampled cloud: <paramref name="prePoints"/> = the
    /// seeded 4096-point subset, <paramref name="sampledIndices"/> = fps picks
    /// (1024) into it.
    /// </summary>
    /// <param name="prePoints">Positions of the full (data) point set.</param>
    /// <param name="preNormals">Normals matching <paramref name="prePoints"/> index for index.</param>
    /// <param name="sampledIndices">Indices into <paramref name="prePoints"/> selecting the query points.</param>
    /// <returns>One latent row per sampled index (e.g. [1024, width]).</returns>
    public Tensor Encode( Vector3[] prePoints, Vector3[] preNormals, int[] sampledIndices )
    {
        // Keys/values come from the whole cloud.
        var data = Project( prePoints, preNormals );

        // Queries come from the sampled subset, embedded with the same projection.
        var sampledPoints = new Vector3[sampledIndices.Length];
        var sampledNormals = new Vector3[sampledIndices.Length];
        for ( var i = 0; i < sampledIndices.Length; i++ )
        {
            var source = sampledIndices[i];
            sampledPoints[i] = prePoints[source];
            sampledNormals[i] = preNormals[source];
        }

        var query = Project( sampledPoints, sampledNormals );
        var latents = Cross.Forward( query, data, Heads );
        foreach ( var block in SelfBlocks )
        {
            latents = block.Forward( latents, Heads );
        }

        return TransformerOps.LayerNorm( latents, LnPostGamma, LnPostBeta );
    }

    /// <summary>input_proj(cat[fourier(points), normals]): fourier layout is
    /// [x,y,z, sin(x·f0..f7), sin(y·..), sin(z·..), cos(...)] (51) + normals (3).</summary>
    /// <param name="points">Point positions, one input row each.</param>
    /// <param name="normals">Normals; must provide an entry for every point.</param>
    /// <returns>The projected rows, one per point.</returns>
    /// <exception cref="ArgumentException">
    /// <paramref name="normals"/> has fewer entries than <paramref name="points"/>.
    /// </exception>
    internal Tensor Project( Vector3[] points, Vector3[] normals )
    {
        // Column layout of one input row, all derived from NumFreqs so the
        // offsets stay consistent if the frequency count ever changes.
        const int sinOffset = 3;                         // right after raw x,y,z
        const int cosOffset = sinOffset + 3 * NumFreqs;  // right after the whole sin block
        const int fourierDim = 3 * (NumFreqs * 2 + 1);
        const int rowWidth = fourierDim + 3;             // + normal x,y,z

        if ( normals.Length < points.Length )
        {
            throw new ArgumentException(
                $"Expected a normal for each of the {points.Length} points but got {normals.Length}.",
                nameof( normals ) );
        }

        var rows = points.Length;
        var input = new float[rows * rowWidth];
        var coords = new float[3]; // scratch reused for every row

        for ( var r = 0; r < rows; r++ )
        {
            var at = r * rowWidth;
            coords[0] = points[r].X;
            coords[1] = points[r].Y;
            coords[2] = points[r].Z;

            input[at + 0] = coords[0];
            input[at + 1] = coords[1];
            input[at + 2] = coords[2];

            for ( var d = 0; d < 3; d++ )
            {
                for ( var f = 0; f < NumFreqs; f++ )
                {
                    var scaled = coords[d] * (1 << f); // 2^f, include_pi = False
                    var slot = d * NumFreqs + f;
                    input[at + sinOffset + slot] = MathF.Sin( scaled );
                    input[at + cosOffset + slot] = MathF.Cos( scaled );
                }
            }

            input[at + fourierDim + 0] = normals[r].X;
            input[at + fourierDim + 1] = normals[r].Y;
            input[at + fourierDim + 2] = normals[r].Z;
        }

        return Tensor.From( input, rows, rowWidth )
            .MatMul( InputProjWeight.Transposed ).Add( InputProjBias );
    }

    /// <summary>
    /// De-interleaves a fused projection [n, groups·width] whose column layout is
    /// per-head (head, group, headDim) — as produced by view(n, heads, groups·hd)
    /// + split — into `groups` head-major [n, width] tensors (third slot unused
    /// when groups is 2).
    /// </summary>
    /// <param name="fused">Fused projection with groups·width columns per row.</param>
    /// <param name="heads">Number of attention heads; must be positive.</param>
    /// <param name="groups">Number of fused tensors: 2 (k, v) or 3 (q, k, v).</param>
    /// <returns>
    /// The de-interleaved tensors in group order; <c>C</c> is null when
    /// <paramref name="groups"/> is 2.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="groups"/> is not 2 or 3, or <paramref name="heads"/> is not positive.
    /// </exception>
    /// <exception cref="ArgumentException">
    /// The column count of <paramref name="fused"/> is not a multiple of groups·heads.
    /// </exception>
    internal static (Tensor A, Tensor B, Tensor C) SplitInterleaved(
        Tensor fused, int heads, int groups )
    {
        if ( groups is < 2 or > 3 )
        {
            throw new ArgumentOutOfRangeException(
                nameof( groups ), groups, "Only 2 (kv) or 3 (qkv) fused groups are supported." );
        }

        if ( heads <= 0 )
        {
            throw new ArgumentOutOfRangeException(
                nameof( heads ), heads, "Head count must be positive." );
        }

        var rows = fused.Shape[0];
        var columns = fused.Shape[1];

        // Without this check the integer divisions below would truncate and the
        // copies would silently read misaligned columns.
        if ( columns % (groups * heads) != 0 )
        {
            throw new ArgumentException(
                $"Fused column count {columns} is not divisible by groups·heads ({groups}·{heads}).",
                nameof( fused ) );
        }

        var width = columns / groups;
        var headDim = width / heads;
        var headStride = groups * headDim; // columns occupied by one head in the fused row

        var outputs = new float[3][];
        for ( var g = 0; g < groups; g++ )
        {
            outputs[g] = new float[rows * width];
        }

        var source = fused.Data;
        for ( var r = 0; r < rows; r++ )
        {
            var sourceRow = r * columns;
            var targetRow = r * width;
            for ( var h = 0; h < heads; h++ )
            {
                for ( var g = 0; g < groups; g++ )
                {
                    Array.Copy(
                        source, sourceRow + h * headStride + g * headDim,
                        outputs[g], targetRow + h * headDim,
                        headDim );
                }
            }
        }

        return (
            Tensor.From( outputs[0], rows, width ),
            Tensor.From( outputs[1], rows, width ),
            groups > 2 ? Tensor.From( outputs[2], rows, width ) : null);
    }
}