// Code/SceneGaussianSplatSystem.cs
using Sandbox.Engine.Settings;
using Sandbox.Rendering;
using System.Runtime.InteropServices;

namespace Sandbox;

/// <summary>
/// A single-instance-per-scene-world renderer that collects all <see cref="SceneGaussianSplatObject"/> instances and draws them in one unified pipeline.
/// <br />
/// cull → sort → billboard → draw
/// <br />
/// This ensures correct global sort order across all splat clouds and avoids redundant GPU work (one sort pass instead of N).
/// </summary>
public class SceneGaussianSplatSystem : SceneCustomObject
{
	// Per-SceneWorld singleton map, avoids creating duplicate systems
	private static readonly Dictionary<SceneWorld, SceneGaussianSplatSystem> _instances = new();

	/// <summary>
	/// Return the splat system bound to <paramref name="sceneWorld"/>, creating and caching
	/// a new one when none exists yet.
	/// Every call first evicts cached entries whose SceneWorld or system is no longer valid
	/// (eg. left over from a previous play session) and releases their GPU resources,
	/// so stale systems do not keep GPU buffers alive.
	/// </summary>
	/// <param name="sceneWorld">The scene world the system should render into.</param>
	/// <returns>The cached system for the world, or a freshly created one.</returns>
	public static SceneGaussianSplatSystem GetOrCreate( SceneWorld sceneWorld )
	{
		// Pass 1: gather dead keys. The list is allocated lazily because the common
		// case is "nothing to purge", and the dictionary can't be mutated mid-enumeration.
		List<SceneWorld> deadWorlds = null;
		foreach ( var entry in _instances )
		{
			bool alive = entry.Key.IsValid() && entry.Value.IsValid();
			if ( alive )
				continue;

			deadWorlds ??= new();
			deadWorlds.Add( entry.Key );
		}

		// Pass 2: drop the dead entries and free whatever GPU memory they still hold.
		if ( deadWorlds is not null )
		{
			foreach ( var world in deadWorlds )
			{
				if ( !_instances.Remove( world, out var orphan ) )
					continue;

				orphan.ReleaseGpuResources();
				orphan.ForceFlushAllDisposals();
			}
		}

		// Miss (or an invalid cached entry): build a new system and cache it.
		if ( !_instances.TryGetValue( sceneWorld, out var current ) || !current.IsValid() )
		{
			current = new SceneGaussianSplatSystem( sceneWorld );
			_instances[sceneWorld] = current;
			return current;
		}

		// Hit: use this main-thread call to flush any disposals that have expired.
		current.FlushPendingDisposals();
		return current;
	}

	// Compute shaders, shared across all instances (static).
	// Each is loaded by shader name; the names suggest one shader per pipeline stage
	// (cull, indirect args, depth keys, sort prefix/scatter, billboard, chunk LOD).
	// NOTE(review): the per-stage roles are inferred from the names — confirm against the dispatch code.
	private static readonly ComputeShader _cullShader = new( "gaussian_splat_cull_cs" );
	private static readonly ComputeShader _argsShader = new( "gaussian_splat_args_cs" );
	private static readonly ComputeShader _depthShader = new( "gaussian_splat_depth_cs" );
	private static readonly ComputeShader _sortPrefixShader = new( "gaussian_splat_sort_prefix_cs" );
	private static readonly ComputeShader _sortScatterShader = new( "gaussian_splat_sort_scatter_cs" );
	private static readonly ComputeShader _computeShader = new( "gaussian_splat_cs" );
	private static readonly ComputeShader _chunkLodShader = new( "gaussian_splat_chunk_lod_cs" );
	// Material built from the "gaussian_splat" shader, shared by all instances.
	private static readonly Material _defaultMaterial = Material.FromShader( "gaussian_splat" );

	// All-zero arrays with the same element counts as the histogram buffer (65536) and the
	// visible-count buffer (1). Presumably uploaded to reset those buffers — usage is not
	// visible in this part of the file; confirm before relying on it.
	private static readonly uint[] ZeroHistogram = new uint[65536];
	private static readonly uint[] ZeroCount = new uint[1];

	/// <summary>
	/// Stable slot allocation for splat objects. Each slot has a fixed buffer offset that
	/// never moves, so adding/removing objects doesn't shift existing data. Dead slots
	/// (Object=null) keep their buffer range reserved — the cull shader skips them via
	/// the inactive flag in ObjectDataGpu. This eliminates the O(total_scene_splats)
	/// full re-merge that previously occurred on every object add/remove.
	/// </summary>
	private readonly List<ObjectSlot> _slots = new();
	private readonly Dictionary<SceneGaussianSplatObject, int> _objectSlotIndex = new();

	/// <summary>
	/// Total splat buffer positions allocated across all slots (including dead/free ones).
	/// This is the effective "used length" of the splat buffers.
	/// </summary>
	private int _allocatedSplatCount;

	// One entry in the stable slot table. A slot owns a fixed range of the unified splat
	// buffers; the range is kept reserved even after the occupant is unregistered.
	private struct ObjectSlot
	{
		// Current occupant; set to null when the slot is freed.
		public SceneGaussianSplatObject Object;
		// Index of the slot's first splat in the unified splat buffers / scratch arrays.
		public int BufferOffset;
		// Number of splat positions reserved for this slot (the occupant's SplatCount at assignment time).
		public int AllocatedCount;
		// True once the occupant has been removed; a free slot can be reused by an
		// object with exactly the same splat count.
		public bool IsFree;
		/// <summary>
		/// References to the data arrays last uploaded into this slot's scratch range.
		/// When a new occupant has the same references (shared cache), the scratch already
		/// contains correct data and the upload can be skipped entirely.
		/// </summary>
		public SceneGaussianSplatObject.SplatPosition[] LastPositionData;
		public SceneGaussianSplatObject.SplatData[] LastSplatData;
	}

	// Unified GPU buffers, sized to hold the merged splat data from all objects
	private GpuBuffer<SceneGaussianSplatObject.SplatPosition> _positionBuffer;
	private GpuBuffer<SceneGaussianSplatObject.SplatData> _splatDataBuffer;
	private GpuBuffer<uint> _objectIdBuffer;        // Per-splat object index
	private GpuBuffer<ObjectDataGpu> _objectDataBuffer; // Per-object transform + params combined

	// Chunked LOD buffers
	private GpuBuffer<uint> _chunkIdBuffer;          // Per-splat chunk index (uint16 packed into uint for structured buffer)
	private GpuBuffer<ChunkDataGpu> _chunkDataBuffer; // Per-chunk center + flags
	private GpuBuffer<float> _chunkLodBuffer;        // Per-chunk LOD factor (written by chunk LOD compute shader)
	private int _chunkCapacity;
	private int _chunkIdCapacity;
	private int _totalChunkCount;                    // Total chunks across all objects this frame

	private GpuBuffer<SplatVertex> _vertexBuffer;
	private GpuBuffer<uint> _indexBuffer;
	private GpuBuffer<uint> _sortKeysBuffer;
	private GpuBuffer<uint> _sortValuesBuffer;
	private GpuBuffer<uint> _histogramBuffer;
	private GpuBuffer<uint> _visibleIndicesBuffer;
	private GpuBuffer<uint> _visibleCountBuffer;
	private GpuBuffer<GpuBuffer.IndirectDispatchArguments> _indirectDispatchBuffer;
	private GpuBuffer<GpuBuffer.IndirectDrawIndexedArguments> _indirectDrawBuffer;
	private GpuBuffer<GaussianSplatVolume.VolumeGpuData> _volumeBuffer; // Add/Subtract volume descriptors
	private GpuBuffer<GaussianSplatVolume.VolumeGpuData> _colorVolumeBuffer; // Color volume descriptors (separate to avoid 32-bit mask limit)

	private int _splatCapacity;
	private int _vertexCapacity;
	private int _indexCapacity;
	private int _sortCapacity;
	private int _objectCapacity;
	private int _volumeCapacity;
	private int _colorVolumeCapacity;

	/// <summary>
	/// Registered boolean volumes that control splat visibility.
	/// </summary>
	private readonly List<GaussianSplatVolume> _volumes = new();

	/// <summary>
	/// Maps unique tag strings to bit indices (0..31) for GPU-side volume↔object matching.
	/// Rebuilt every frame from the tags referenced by active volumes. This lets us support
	/// unlimited volumes — the only limit is 32 unique tag strings, which is very generous.
	/// </summary>
	private readonly Dictionary<string, int> _tagBitRegistry = new();

	/// <summary>
	/// Shared render attributes passed to all compute shader dispatches and the final draw call.
	/// Rebuilt each frame in <see cref="RenderPipeline"/>.
	/// </summary>
	private readonly RenderAttributes _sharedAttrs = new();

	/// <summary>
	/// Set when buffer reallocation or compaction requires a full re-upload of all
	/// scratch data. Does NOT trigger a re-merge — scratch arrays are always valid.
	/// </summary>
	private bool _layoutDirty;

	/// <summary>
	/// Counts down the number of frames to skip compute after buffer reallocation or large
	/// data re-uploads. Multiple frames are skipped because the Vulkan pipeline may have
	/// 2-3 frames of GPU work in-flight — a single skip isn't enough to drain the pipeline
	/// before issuing heavy compute dispatches against new buffers.
	/// </summary>
	private int _skipComputeFrames;
	private int _framesWithoutObjects;

	/// <summary>
	/// Buffers queued for disposal with frame-delay to avoid destroying resources still
	/// referenced by in-flight GPU commands. Each entry tracks the frame it was queued on;
	/// actual disposal happens only after <see cref="DisposalDelayFrames"/> frames have passed.
	/// </summary>
	private readonly List<(IDisposable Resource, int QueuedFrame)> _pendingDisposals = new();
	private int _frameCounter;

	/// <summary>
	/// Number of frames to wait before disposing GPU buffers. Must be >= the Vulkan
	/// frame sync latency (typically 2-3) to ensure the GPU has fully drained all
	/// commands referencing the old buffer before it is destroyed.
	/// </summary>
	private const int DisposalDelayFrames = 4;

	/// <summary>
	/// Reusable scratch buffers for merging per-object data before a single GPU upload.
	/// Avoids relying on GpuBuffer.SetData offset parameter (which may not work correctly)
	/// and avoids allocating multi-MB arrays on every frame inside the render callback.
	/// </summary>
	private SceneGaussianSplatObject.SplatPosition[] _positionScratch = Array.Empty<SceneGaussianSplatObject.SplatPosition>();
	private SceneGaussianSplatObject.SplatData[] _dataScratch = Array.Empty<SceneGaussianSplatObject.SplatData>();
	private uint[] _objectIdScratch = Array.Empty<uint>();

	/// <summary>
	/// Reusable scratch arrays for per-frame GPU uploads, avoiding allocations every frame.
	/// </summary>
	private ObjectDataGpu[] _objectDataScratch = Array.Empty<ObjectDataGpu>();
	private GaussianSplatVolume.VolumeGpuData[] _volumeScratch = Array.Empty<GaussianSplatVolume.VolumeGpuData>();
	private readonly List<ChunkDataGpu> _chunkDataScratch = new();
	private GaussianSplatVolume.VolumeGpuData[] _colorVolumeScratch = Array.Empty<GaussianSplatVolume.VolumeGpuData>();
	private GaussianSplatVolume[] _activeVolumesScratch = Array.Empty<GaussianSplatVolume>();
	private GaussianSplatVolume[] _activeColorVolumesScratch = Array.Empty<GaussianSplatVolume>();

	/// <summary>
	/// GPU-side vertex output, must match the compute shader's SplatVertex and the VS VertexInput.
	/// The vertex buffer is sized at four of these per splat (one quad).
	/// Field order is part of the GPU contract (sequential layout) — do not reorder.
	/// </summary>
	[StructLayout( LayoutKind.Sequential )]
	private struct SplatVertex
	{
		[VertexLayout.Position] public Vector3 Position;
		// NOTE(review): what the shader stores in Normal / TexCoord is not visible here — check the compute shader.
		[VertexLayout.Normal] public Vector3 Normal;
		[VertexLayout.Color] public Color Color;
		[VertexLayout.TexCoord] public Vector2 TexCoord;
	}

	/// <summary>
	/// Per-object data combining transform matrix and rendering parameters into one struct.
	/// Merged into a single GPU buffer to stay within the 16 storage buffer limit.
	/// Must match the shader's ObjectData struct layout exactly.
	/// One entry is written per slot (including dead slots), indexed by slot index.
	/// </summary>
	[StructLayout( LayoutKind.Sequential )]
	private struct ObjectDataGpu
	{
		// Object transform, built as scale * rotation * translation and then transposed before upload.
		public Matrix Transform;
		public float SplatSize;
		// Bit flags as written by the render pipeline:
		//   1 = object has covariance data
		//   2 = receives lighting (only set together with covariance)
		//   4 = inactive — also the sole flag written for dead/free slots
		//   8 = receives shadows while lighting is off
		public uint Flags;
		/// <summary>
		/// Bitmask of which tag groups this object belongs to (bit N = has tag N).
		/// Volumes carry their own include/exclude group bitmasks and the GPU
		/// evaluates the match per-volume, removing the 32-volume limit.
		/// </summary>
		public uint TagBits;
		/// <summary>
		/// Per-object tint color packed as RGBA8 (matches shader UnpackColor layout).
		/// </summary>
		public uint TintColor;
		/// <summary>
		/// Maximum LOD distance in world units. 0 = LOD disabled for this object.
		/// </summary>
		public float LODMaxDistance;
		/// <summary>
		/// Shadow tint color packed as RGBA8. Only used when Flags bit 3 (ReceiveShadows) is set.
		/// Lerps toward this color in shadowed regions.
		/// </summary>
		public uint ShadowTintColor;
		// Never assigned by the render pipeline. Presumably padding so the two Vector4
		// curve fields below start on a 16-byte boundary — TODO confirm against the shader struct.
		public float _lodPad1;
		public float _lodPad2;
		/// <summary>
		/// 8-sample LOD curve LUT: fraction of splats to keep at evenly spaced distances.
		/// Packed as two float4s for efficient GPU access.
		/// </summary>
		public Vector4 LODCurveLow;  // samples 0..3
		public Vector4 LODCurveHigh; // samples 4..7
	}

	/// <summary>
	/// GPU-side per-chunk data for the chunk LOD compute shader.
	/// One entry is built per chunk of every live object that has chunked LOD enabled.
	/// Field order is part of the GPU contract (sequential layout) — do not reorder.
	/// </summary>
	[StructLayout( LayoutKind.Sequential )]
	private struct ChunkDataGpu
	{
		/// <summary>Local-space center of this chunk.</summary>
		public Vector3 Center;
		/// <summary>Object index this chunk belongs to (for transform lookup).</summary>
		public uint ObjectId;
		/// <summary>Packed flags: bit 0 = exempt from LOD.</summary>
		public uint Flags;
		// Never assigned when entries are built. Presumably padding that rounds the struct
		// up to a 16-byte multiple for the structured buffer — TODO confirm against the shader struct.
		public float _pad0;
		public float _pad1;
		public float _pad2;
	}

	/// <summary>
	/// Create a splat system inside <paramref name="sceneWorld"/>.
	/// Callers that want the one-system-per-world behaviour should go through
	/// <see cref="GetOrCreate"/>, which caches instances per SceneWorld.
	/// </summary>
	/// <param name="sceneWorld">Scene world passed to the base SceneCustomObject.</param>
	public SceneGaussianSplatSystem( SceneWorld sceneWorld ) : base( sceneWorld )
	{
		// Flag the object as translucent rather than opaque.
		Flags.IsOpaque = false;
		Flags.IsTranslucent = true;
	}

	/// <summary>
	/// Register a splat object to be rendered by this system.
	/// The object is only tracked at this point: it gets no buffer slot until the render
	/// pipeline sees it with a non-zero SplatCount, so registering never shifts existing data.
	/// Registering an already-registered object is a no-op and keeps its current slot.
	/// </summary>
	/// <param name="obj">The splat object to track.</param>
	public void Register( SceneGaussianSplatObject obj )
	{
		FlushPendingDisposals();

		// -1 is the "registered, no slot yet" sentinel. TryAdd leaves an existing entry
		// (and therefore an already-assigned slot index) untouched, using a single lookup
		// instead of ContainsKey followed by the indexer.
		_objectSlotIndex.TryAdd( obj, -1 );
	}

	/// <summary>
	/// Unregister a splat object from this system.
	/// The object's buffer slot is marked free (not removed), so no data shifts and the
	/// layout is not marked dirty. The slot's splat data stays in the buffers; on the next
	/// frame the slot is uploaded with only the inactive flag (Flags = 4) set.
	/// Unregistering an object that was never registered is a no-op.
	/// </summary>
	/// <param name="obj">The splat object to stop tracking.</param>
	public void Unregister( SceneGaussianSplatObject obj )
	{
		FlushPendingDisposals();

		// Look up and remove in one dictionary operation.
		if ( !_objectSlotIndex.Remove( obj, out int slotIdx ) )
			return;

		// Registered but never given a slot (sentinel -1), or an out-of-range index:
		// there is no slot to release.
		if ( slotIdx < 0 || slotIdx >= _slots.Count )
			return;

		// ObjectSlot is a struct, so modify a copy and write it back.
		// BufferOffset / AllocatedCount are kept so the range can be reused later.
		var slot = _slots[slotIdx];
		slot.Object = null;
		slot.IsFree = true;
		_slots[slotIdx] = slot;
	}

	/// <summary>
	/// Request a full re-upload of all scratch data on the next frame.
	/// Intended for use after buffer reallocation or compaction; ordinary object
	/// add/remove does not need it.
	/// </summary>
	public void MarkLayoutDirty() => _layoutDirty = true;

	/// <summary>
	/// Pre-grow GPU buffers to accommodate additional splats without reallocation at spawn time.
	/// Call this during loading screens before instantiating splat prefabs at runtime.
	/// Room is reserved for the current slots plus one more object.
	/// If existing splat buffers have to be replaced, compute is skipped for
	/// <see cref="DisposalDelayFrames"/> frames; a first-time allocation or a request that
	/// already fits causes no skip.
	/// </summary>
	/// <param name="additionalSplats">
	/// Number of splats to reserve on top of what is already allocated.
	/// Negative values are treated as zero.
	/// </param>
	public void ReserveCapacity( int additionalSplats )
	{
		// A negative request would ask for fewer splats than are already allocated. On a
		// first allocation that sizes the buffers too small and forces a reallocation
		// (plus the compute cooldown) as soon as the pipeline runs.
		int extra = Math.Max( additionalSplats, 0 );

		int needed = _allocatedSplatCount + extra;
		EnsureBufferCapacity( needed, _slots.Count + 1 );
	}

	/// <summary>
	/// Give <paramref name="obj"/> a stable buffer slot once its splat data is available.
	/// The first free slot whose reserved size equals the object's splat count is reused
	/// (typical for pooled objects sharing one source file); otherwise a new slot is
	/// appended at the end of the allocated range. Either way the object is flagged
	/// DataChanged so its data is uploaded into the slot.
	/// </summary>
	/// <param name="obj">Object to place; its SplatCount determines the slot size.</param>
	private void AssignSlot( SceneGaussianSplatObject obj )
	{
		int splatCount = obj.SplatCount;

		// Look for the first free slot with exactly the right size (no wasted space).
		int slotIndex = -1;
		for ( int idx = 0; idx < _slots.Count && slotIndex < 0; idx++ )
		{
			var candidate = _slots[idx];
			if ( candidate.IsFree && candidate.AllocatedCount == splatCount )
				slotIndex = idx;
		}

		if ( slotIndex >= 0 )
		{
			// Reuse: keep the slot's offset and size, just swap the occupant.
			var reused = _slots[slotIndex];
			reused.Object = obj;
			reused.IsFree = false;
			_slots[slotIndex] = reused;
		}
		else
		{
			// Append: the new range starts where the allocated region currently ends.
			slotIndex = _slots.Count;
			_slots.Add( new ObjectSlot
			{
				Object = obj,
				BufferOffset = _allocatedSplatCount,
				AllocatedCount = splatCount,
				IsFree = false
			} );
			_allocatedSplatCount += splatCount;
		}

		_objectSlotIndex[obj] = slotIndex;
		obj.DataChanged = true; // Force upload into the assigned slot
	}

	/// <summary>
	/// Add a boolean volume to the set that affects splat visibility.
	/// A volume that is already registered is not added a second time.
	/// </summary>
	/// <param name="vol">The volume to register.</param>
	public void RegisterVolume( GaussianSplatVolume vol )
	{
		if ( _volumes.Contains( vol ) )
			return;

		_volumes.Add( vol );
	}

	/// <summary>
	/// Remove a boolean volume from this system. Does nothing if it was not registered.
	/// </summary>
	/// <param name="vol">The volume to unregister.</param>
	public void UnregisterVolume( GaussianSplatVolume vol ) => _volumes.Remove( vol );

	/// <summary>
	/// Sum of the current SplatCount of every slot that is occupied by a valid object.
	/// Free slots and slots whose object is null or invalid contribute nothing.
	/// </summary>
	public int TotalSplatCount
	{
		get
		{
			int liveSplats = 0;
			for ( int i = 0; i < _slots.Count; i++ )
			{
				var entry = _slots[i];
				if ( entry.IsFree || entry.Object == null || !entry.Object.IsValid() )
					continue;

				liveSplats += entry.Object.SplatCount;
			}
			return liveSplats;
		}
	}

	/// <summary>
	/// Grow (never shrink) the GPU buffers so they can hold <paramref name="totalSplats"/>
	/// splats and <paramref name="objectCount"/> object entries, and create the fixed-size
	/// helper buffers the first time through.
	/// Buffers that get replaced are passed to DeferDispose instead of being destroyed on
	/// the spot. Replacing existing splat buffers also starts the compute-skip cooldown.
	/// </summary>
	/// <param name="totalSplats">Number of splat positions the buffers must be able to hold.</param>
	/// <param name="objectCount">Number of per-object entries the object data buffer must hold.</param>
	private void EnsureBufferCapacity( int totalSplats, int objectCount )
	{
		// Splat-sized buffers grow with 50% headroom to keep reallocations rare.
		static int WithHeadroom( int required ) => (int)(required * 1.5);

		var structured = GpuBuffer.UsageFlags.Structured;
		var indirectArgs = GpuBuffer.UsageFlags.Structured | GpuBuffer.UsageFlags.IndirectDrawArguments;

		// --- Position / data / object-id / visible-index buffers ---
		// Reallocating these invalidates all uploaded splat data.
		if ( totalSplats > _splatCapacity )
		{
			// Remember whether we are replacing live buffers: only then can the GPU still
			// have in-flight work that references the old ones. A first-time allocation can't.
			bool replacingLiveBuffers = _positionBuffer.IsValid();

			DeferDispose( _positionBuffer );
			DeferDispose( _splatDataBuffer );
			DeferDispose( _objectIdBuffer );
			DeferDispose( _visibleIndicesBuffer );

			_splatCapacity = WithHeadroom( totalSplats );
			_positionBuffer = new GpuBuffer<SceneGaussianSplatObject.SplatPosition>( _splatCapacity, structured );
			_splatDataBuffer = new GpuBuffer<SceneGaussianSplatObject.SplatData>( _splatCapacity, structured );
			_objectIdBuffer = new GpuBuffer<uint>( _splatCapacity, structured );
			_visibleIndicesBuffer = new GpuBuffer<uint>( _splatCapacity, structured );

			// Skip compute for several frames so the GPU can drain work on the old buffers.
			if ( replacingLiveBuffers )
				_skipComputeFrames = Math.Max( _skipComputeFrames, DisposalDelayFrames );
		}

		// --- Per-object data (transform + params): doubles, with a floor of 8 entries ---
		if ( objectCount > _objectCapacity )
		{
			DeferDispose( _objectDataBuffer );
			_objectCapacity = Math.Max( objectCount * 2, 8 );
			_objectDataBuffer = new GpuBuffer<ObjectDataGpu>( _objectCapacity, structured );
		}

		// --- Vertex buffer: 4 vertices per splat ---
		int vertexCount = totalSplats * 4;
		if ( vertexCount > _vertexCapacity )
		{
			DeferDispose( _vertexBuffer );
			_vertexCapacity = WithHeadroom( vertexCount );
			_vertexBuffer = new GpuBuffer<SplatVertex>( _vertexCapacity, GpuBuffer.UsageFlags.Vertex | structured );
		}

		// --- Index buffer: 6 indices per splat ---
		// The contents are a static quad pattern — quad `i` uses
		// [i*4, i*4+1, i*4+2, i*4+2, i*4+1, i*4+3] — so they are generated once per
		// allocation and the billboard shader never has to write indices.
		int indexCount = totalSplats * 6;
		if ( indexCount > _indexCapacity )
		{
			DeferDispose( _indexBuffer );
			_indexCapacity = WithHeadroom( indexCount );
			_indexBuffer = new GpuBuffer<uint>( _indexCapacity, GpuBuffer.UsageFlags.Index | structured );
			GenerateStaticIndexBuffer();
		}

		// --- Sort key / value buffers ---
		if ( totalSplats > _sortCapacity )
		{
			DeferDispose( _sortKeysBuffer );
			DeferDispose( _sortValuesBuffer );
			_sortCapacity = WithHeadroom( totalSplats );
			_sortKeysBuffer = new GpuBuffer<uint>( _sortCapacity, structured );
			_sortValuesBuffer = new GpuBuffer<uint>( _sortCapacity, structured );
		}

		// --- Fixed-size buffers, created once and never resized ---
		if ( !_histogramBuffer.IsValid() )
			_histogramBuffer = new GpuBuffer<uint>( 65536, structured );

		if ( !_visibleCountBuffer.IsValid() )
			_visibleCountBuffer = new GpuBuffer<uint>( 1, structured );

		if ( !_indirectDispatchBuffer.IsValid() )
			_indirectDispatchBuffer = new GpuBuffer<GpuBuffer.IndirectDispatchArguments>( 2, indirectArgs );

		if ( !_indirectDrawBuffer.IsValid() )
			_indirectDrawBuffer = new GpuBuffer<GpuBuffer.IndirectDrawIndexedArguments>( 1, indirectArgs );
	}

	/// <summary>
	/// Reset the tag → bit-index registry and refill it from the include and exclude tags
	/// of every valid, active volume in <paramref name="volumes"/>.
	/// Bit indices are handed out in encounter order, so assignments can change from one
	/// rebuild to the next as volumes come and go.
	/// </summary>
	/// <param name="volumes">Volumes to scan; invalid or inactive ones are ignored.</param>
	private void RebuildTagRegistry( List<GaussianSplatVolume> volumes )
	{
		_tagBitRegistry.Clear();

		foreach ( var volume in volumes )
		{
			if ( volume.IsValid() && volume.Active )
			{
				RegisterTags( volume.IncludeTags );
				RegisterTags( volume.ExcludeTags );
			}
		}
	}

	/// <summary>
	/// Give each not-yet-seen tag in <paramref name="tags"/> the next free bit index.
	/// Stops as soon as the registry holds 32 tags (one per bit of a uint mask);
	/// tags beyond that limit are silently left unregistered.
	/// </summary>
	/// <param name="tags">Tag set to register; null is ignored.</param>
	private void RegisterTags( TagSet tags )
	{
		if ( tags is null )
			return;

		foreach ( var tagName in tags.TryGetAll() )
		{
			// Hard limit: 32 unique tags
			if ( _tagBitRegistry.Count >= 32 )
				return;

			// Already-known tags keep their bit; new ones take the current count as index.
			_tagBitRegistry.TryAdd( tagName, _tagBitRegistry.Count );
		}
	}

	/// <summary>
	/// Translate a TagSet into a bitmask using the current tag registry: every tag that
	/// has a registered bit index sets that bit. Tags missing from the registry
	/// contribute nothing.
	/// </summary>
	/// <param name="tags">Tag set to convert; null yields 0.</param>
	/// <returns>Bitmask with one bit per registered tag present in the set.</returns>
	private uint ComputeTagBits( TagSet tags )
	{
		uint mask = 0;
		if ( tags is null )
			return mask;

		foreach ( var tagName in tags.TryGetAll() )
		{
			if ( !_tagBitRegistry.TryGetValue( tagName, out int bit ) )
				continue;

			mask |= 1u << bit;
		}

		return mask;
	}

	private void RenderPipeline()
	{
		// Assign slots to any objects that have loaded data but don't have a slot yet.
		// Also count live objects and detect lighting needs.
		int liveSlotCount = 0;
		bool anyLighting = false;

		// Collect objects that need slot (re)assignment — can't modify dictionary during iteration
		List<SceneGaussianSplatObject> needsSlot = null;

		foreach ( var kvp in _objectSlotIndex )
		{
			var obj = kvp.Key;
			if ( obj == null || !obj.IsValid() || obj.SplatCount == 0 )
				continue;

			int slotIdx = kvp.Value;

			if ( slotIdx < 0 )
			{
				// No slot yet — needs assignment
				needsSlot ??= new();
				needsSlot.Add( obj );
			}
			else if ( slotIdx < _slots.Count && _slots[slotIdx].AllocatedCount != obj.SplatCount )
			{
				// SplatCount changed — free old slot and reassign
				var oldSlot = _slots[slotIdx];
				oldSlot.Object = null;
				oldSlot.IsFree = true;
				_slots[slotIdx] = oldSlot;
				needsSlot ??= new();
				needsSlot.Add( obj );
			}

			liveSlotCount++;
			if ( obj.ReceiveLighting && obj.HasCovariance )
				anyLighting = true;
		}

		// Assign slots outside the dictionary iteration
		if ( needsSlot != null )
		{
			foreach ( var obj in needsSlot )
				AssignSlot( obj );
		}

		if ( _allocatedSplatCount == 0 || liveSlotCount == 0 )
			return;

		EnsureBufferCapacity( _allocatedSplatCount, _slots.Count );

		// Validate all buffers
		if ( !_positionBuffer.IsValid() || !_splatDataBuffer.IsValid() || !_objectIdBuffer.IsValid() )
			return;
		if ( !_vertexBuffer.IsValid() || !_indexBuffer.IsValid() )
			return;
		if ( !_sortKeysBuffer.IsValid() || !_sortValuesBuffer.IsValid() || !_histogramBuffer.IsValid() )
			return;
		if ( !_visibleIndicesBuffer.IsValid() || !_visibleCountBuffer.IsValid() )
			return;
		if ( !_indirectDispatchBuffer.IsValid() || !_indirectDrawBuffer.IsValid() )
			return;
		if ( !_objectDataBuffer.IsValid() )
			return;

		// Reuse pooled array for per-object GPU data (one entry per slot, including dead)
		int slotCount = _slots.Count;
		if ( _objectDataScratch.Length < slotCount )
			_objectDataScratch = new ObjectDataGpu[Math.Max( slotCount * 2, 8 )];
		var objectDataArray = _objectDataScratch;

		// Prune dead volumes, then split active volumes into visibility (Add/Subtract)
		// and Color lists. Color volumes are stored in a separate GPU buffer so they
		// don't consume bits in the tag-group bitmask, allowing unlimited Color volumes.
		for ( int i = _volumes.Count - 1; i >= 0; i-- )
		{
			if ( !_volumes[i].IsValid() )
				_volumes.RemoveAt( i );
		}

		int visibilityVolumeCount = 0;
		int colorVolumeCount = 0;
		int addVolumeCount = 0;

		foreach ( var vol in _volumes )
		{
			if ( !vol.Active ) continue;
			if ( vol.Mode == SplatVolumeMode.Color )
			{
				colorVolumeCount++;
			}
			else
			{
				visibilityVolumeCount++;
				if ( vol.Mode == SplatVolumeMode.Add )
					addVolumeCount++;
			}
		}

		// Reuse pooled arrays for active volume lists to avoid per-frame allocation
		if ( _activeVolumesScratch.Length < visibilityVolumeCount )
			_activeVolumesScratch = new GaussianSplatVolume[Math.Max( visibilityVolumeCount * 2, 8 )];
		if ( _activeColorVolumesScratch.Length < colorVolumeCount )
			_activeColorVolumesScratch = new GaussianSplatVolume[Math.Max( colorVolumeCount * 2, 8 )];
		var activeVolumes = visibilityVolumeCount > 0 ? _activeVolumesScratch : null;
		var activeColorVolumes = colorVolumeCount > 0 ? _activeColorVolumesScratch : null;

		if ( visibilityVolumeCount > 0 || colorVolumeCount > 0 )
		{
			int vi = 0, ci = 0;
			foreach ( var vol in _volumes )
			{
				if ( !vol.Active ) continue;
				if ( vol.Mode == SplatVolumeMode.Color )
					activeColorVolumes[ci++] = vol;
				else
					activeVolumes[vi++] = vol;
			}
		}

		// Rebuild the tag bit registry from all active volumes' include/exclude tags.
		// This maps each unique tag string to a bit index (0..31) so the GPU can
		// evaluate volume↔object matching without a per-volume bitmask limit.
		RebuildTagRegistry( _volumes );

		// Incremental splat data upload. Scratch arrays are maintained as a persistent
		// mirror of GPU buffer contents. Only objects with DataChanged get their range
		// patched — no full re-merge of all objects on every add/remove.
		bool forceUpload = _layoutDirty || _skipComputeFrames > 0;
		_layoutDirty = false;

		// Only skip compute during buffer reallocation cooldown.
		bool skipCompute = _skipComputeFrames > 0;
		if ( _skipComputeFrames > 0 )
			_skipComputeFrames--;

		// Grow scratch arrays to match allocated buffer size.
		// Array.Resize preserves existing data — critical because only objects with
		// DataChanged get re-patched; existing objects' data must survive the resize.
		if ( _positionScratch.Length < _allocatedSplatCount )
			Array.Resize( ref _positionScratch, _allocatedSplatCount );
		if ( _dataScratch.Length < _allocatedSplatCount )
			Array.Resize( ref _dataScratch, _allocatedSplatCount );
		if ( _objectIdScratch.Length < _allocatedSplatCount )
			Array.Resize( ref _objectIdScratch, _allocatedSplatCount );

		// Patch only objects whose data changed into their stable scratch positions
		bool anyPatched = false;
		for ( int i = 0; i < _slots.Count; i++ )
		{
			var slot = _slots[i];
			if ( slot.IsFree || slot.Object == null || !slot.Object.IsValid() )
				continue;

			var obj = slot.Object;
			if ( !obj.DataChanged && !forceUpload )
				continue;

			int count = obj.SplatCount;
			int offset = slot.BufferOffset;

			// Skip upload if this slot already has the correct data in scratch.
			// Blood particles (and other pooled objects) all share the same cached arrays,
			// so when a slot is reused by the same .sog file, the data is already there.
			if ( obj.PositionData == slot.LastPositionData
				&& obj.SplatDataArray == slot.LastSplatData
				&& !forceUpload )
			{
				obj.DataChanged = false;
				continue;
			}

			Array.Copy( obj.PositionData, 0, _positionScratch, offset, count );
			Array.Copy( obj.SplatDataArray, 0, _dataScratch, offset, count );
			Array.Fill( _objectIdScratch, (uint)i, offset, count );

			// Track what's in scratch so future occupants can skip upload
			slot.LastPositionData = obj.PositionData;
			slot.LastSplatData = obj.SplatDataArray;
			_slots[i] = slot;

			obj.DataChanged = false;
			anyPatched = true;
		}

		// Upload splat buffers only if something actually changed
		if ( anyPatched || forceUpload )
		{
			_positionBuffer.SetData( _positionScratch.AsSpan( 0, _allocatedSplatCount ) );
			_splatDataBuffer.SetData( _dataScratch.AsSpan( 0, _allocatedSplatCount ) );
			_objectIdBuffer.SetData( _objectIdScratch.AsSpan( 0, _allocatedSplatCount ) );
		}

		// Upload chunk data only when splat data actually changed — chunk assignments
		// are stable per-object and don't need rebuilding in steady state.
		if ( anyPatched || forceUpload )
		{
			bool anyChunkedLOD = false;
			_chunkDataScratch.Clear();
			for ( int i = 0; i < _slots.Count; i++ )
			{
				var slot = _slots[i];
				if ( slot.IsFree || slot.Object == null || !slot.Object.IsValid() )
					continue;

				var obj = slot.Object;
				if ( obj.EnableChunkedLOD && obj.ChunkIds is not null && obj.ChunkCount > 0 )
				{
					anyChunkedLOD = true;
					for ( int c = 0; c < obj.ChunkCount; c++ )
					{
						_chunkDataScratch.Add( new ChunkDataGpu
						{
							Center = obj.ChunkCenters[c],
							ObjectId = (uint)i,
							Flags = obj.ChunkExempt[c] ? 1u : 0u
						} );
					}
				}
			}

			_totalChunkCount = _chunkDataScratch.Count;
			if ( anyChunkedLOD && _totalChunkCount > 0 )
			{
				UploadChunkData( _allocatedSplatCount, _chunkDataScratch );
			}
			else
			{
				_totalChunkCount = 0;
			}
		}

		// Build ObjectDataGpu for ALL slots (live get real data, dead get inactive flag).
		// Indexed by slot index — matches the objectId stored per-splat in _objectIdScratch.
		for ( int i = 0; i < _slots.Count; i++ )
		{
			var slot = _slots[i];
			if ( slot.IsFree || slot.Object == null || !slot.Object.IsValid() || slot.Object.SplatCount == 0 )
			{
				// Dead/free slot: cull shader will early-exit on Flags bit 2
				objectDataArray[i] = new ObjectDataGpu { Flags = 4u };
				continue;
			}

			var obj = slot.Object;
			var tx = obj.Transform;
			var matrix = (Matrix.CreateScale( new Vector3( tx.Scale ) )
				* Matrix.CreateRotation( tx.Rotation )
				* Matrix.CreateTranslation( tx.Position )).Transpose();

			uint tagBits = ComputeTagBits( obj.Tags );
			var lodSamples = obj.LODCurveSamples;

			objectDataArray[i] = new ObjectDataGpu
			{
				Transform = matrix,
				SplatSize = obj.SplatSize,
				Flags = (obj.HasCovariance ? 1u : 0u)
					| (obj.ReceiveLighting && obj.HasCovariance ? 2u : 0u)
					| (!obj.IsActive ? 4u : 0u)
					| (obj.ReceiveShadows && !obj.ReceiveLighting ? 8u : 0u),
				TagBits = tagBits,
				TintColor = PackColorRGBA8( obj.Tint ),
				LODMaxDistance = obj.LODMaxDistance,
				ShadowTintColor = PackColorRGBA8( obj.ShadowTint ),
				LODCurveLow = new Vector4( lodSamples[0], lodSamples[1], lodSamples[2], lodSamples[3] ),
				LODCurveHigh = new Vector4( lodSamples[4], lodSamples[5], lodSamples[6], lodSamples[7] )
			};
		}

		// Upload per-object data every frame (objects may move)
		_objectDataBuffer.SetData( objectDataArray.AsSpan( 0, slotCount ) );

		// Upload Add/Subtract volume data (volumes may move/change every frame).
		// Each volume's GroupFilter is populated here from the tag registry so that
		// the GPU can evaluate include/exclude tag matching per-volume.
		if ( visibilityVolumeCount > 0 )
		{
			if ( visibilityVolumeCount > _volumeCapacity )
			{
				DeferDispose( _volumeBuffer );
				_volumeCapacity = Math.Max( visibilityVolumeCount * 2, 8 );
				_volumeBuffer = new GpuBuffer<GaussianSplatVolume.VolumeGpuData>( _volumeCapacity, GpuBuffer.UsageFlags.Structured );
			}

			if ( _volumeScratch.Length < visibilityVolumeCount )
				_volumeScratch = new GaussianSplatVolume.VolumeGpuData[Math.Max( visibilityVolumeCount * 2, 8 )];
			var volumeArray = _volumeScratch;
			for ( int i = 0; i < visibilityVolumeCount; i++ )
			{
				var vol = activeVolumes[i];
				var data = vol.BuildGpuData();
				data.GroupFilter = new Vector4(
					BitConverter.UInt32BitsToSingle( ComputeTagBits( vol.IncludeTags ) ),
					BitConverter.UInt32BitsToSingle( ComputeTagBits( vol.ExcludeTags ) ),
					0f, 0f
				);
				volumeArray[i] = data;
			}

			_volumeBuffer.SetData( volumeArray.AsSpan( 0, visibilityVolumeCount ) );
		}

		// Upload Color volume data into a separate buffer (unlimited count).
		// Color volumes also carry GroupFilter for tag-based filtering.
		if ( colorVolumeCount > 0 )
		{
			if ( colorVolumeCount > _colorVolumeCapacity )
			{
				DeferDispose( _colorVolumeBuffer );
				_colorVolumeCapacity = Math.Max( colorVolumeCount * 2, 8 );
				_colorVolumeBuffer = new GpuBuffer<GaussianSplatVolume.VolumeGpuData>( _colorVolumeCapacity, GpuBuffer.UsageFlags.Structured );
			}

			if ( _colorVolumeScratch.Length < colorVolumeCount )
				_colorVolumeScratch = new GaussianSplatVolume.VolumeGpuData[Math.Max( colorVolumeCount * 2, 8 )];
			var colorArray = _colorVolumeScratch;
			for ( int i = 0; i < colorVolumeCount; i++ )
			{
				var vol = activeColorVolumes[i];
				var data = vol.BuildGpuData();
				data.GroupFilter = new Vector4(
					BitConverter.UInt32BitsToSingle( ComputeTagBits( vol.IncludeTags ) ),
					BitConverter.UInt32BitsToSingle( ComputeTagBits( vol.ExcludeTags ) ),
					0f, 0f
				);
				colorArray[i] = data;
			}

			_colorVolumeBuffer.SetData( colorArray.AsSpan( 0, colorVolumeCount ) );
		}

		// Skip the compute pipeline when buffers were just (re)allocated or a large data
		// re-upload occurred. Stacking massive CPU→GPU transfers with compute dispatches
		// in the same frame exceeds the Vulkan fence timeout (250ms) and causes swap chain
		// present stalls. The pipeline will resume once the cooldown expires.
		if ( skipCompute )
			return;

		// Map video quality settings to shader attributes.
		// High = current visual fidelity, lower settings trade quality for performance.
		var (splatDensity, shadowQuality, lowPassFilter) = GetQualitySettings();

		// Build shared render attributes for all compute dispatches
		_sharedAttrs.Set( "SplatDensity", splatDensity );
		_sharedAttrs.Set( "SplatShadowQuality", shadowQuality );
		_sharedAttrs.Set( "SplatLowPassFilter", lowPassFilter );
		_sharedAttrs.Set( "SplatPositions", (GpuBuffer)_positionBuffer );
		_sharedAttrs.Set( "SplatObjectIds", (GpuBuffer)_objectIdBuffer );
		_sharedAttrs.Set( "ObjectData", (GpuBuffer)_objectDataBuffer );
		_sharedAttrs.Set( "VisibleIndices", (GpuBuffer)_visibleIndicesBuffer );
		_sharedAttrs.Set( "VisibleCount", (GpuBuffer)_visibleCountBuffer );
		_sharedAttrs.Set( "SplatCount", _allocatedSplatCount );
		_sharedAttrs.Set( "TotalSplatCount", _allocatedSplatCount );
		_sharedAttrs.Set( "SortKeys", (GpuBuffer)_sortKeysBuffer );
		_sharedAttrs.Set( "Histogram", (GpuBuffer)_histogramBuffer );
		_sharedAttrs.Set( "IndirectDispatch", (GpuBuffer)_indirectDispatchBuffer );
		_sharedAttrs.Set( "IndirectDraw", (GpuBuffer)_indirectDrawBuffer );
		_sharedAttrs.Set( "SortOutput", (GpuBuffer)_sortValuesBuffer );
		_sharedAttrs.Set( "SortIndices", (GpuBuffer)_sortValuesBuffer );
		_sharedAttrs.Set( "SplatDataBuffer", (GpuBuffer)_splatDataBuffer );
		_sharedAttrs.Set( "VertexBuffer", (GpuBuffer)_vertexBuffer );

		// Volume attributes — always set counts (shaders check VolumeCount == 0 to skip)
		_sharedAttrs.Set( "VolumeCount", visibilityVolumeCount );
		_sharedAttrs.Set( "AddVolumeCount", addVolumeCount );
		if ( visibilityVolumeCount > 0 && _volumeBuffer.IsValid() )
			_sharedAttrs.Set( "Volumes", (GpuBuffer)_volumeBuffer );

		// Color volume attributes — separate buffer, no bitmask, unlimited count
		_sharedAttrs.Set( "ColorVolumeCount", colorVolumeCount );
		if ( colorVolumeCount > 0 && _colorVolumeBuffer.IsValid() )
			_sharedAttrs.Set( "ColorVolumes", (GpuBuffer)_colorVolumeBuffer );

		// Chunked LOD attributes
		bool hasChunkedLOD = _totalChunkCount > 0 && _chunkIdBuffer.IsValid() && _chunkDataBuffer.IsValid() && _chunkLodBuffer.IsValid();
		_sharedAttrs.Set( "ChunkCount", hasChunkedLOD ? _totalChunkCount : 0 );
		if ( hasChunkedLOD )
		{
			_sharedAttrs.Set( "ChunkIds", (GpuBuffer)_chunkIdBuffer );
			_sharedAttrs.Set( "ChunkData", (GpuBuffer)_chunkDataBuffer );
			_sharedAttrs.Set( "ChunkLOD", (GpuBuffer)_chunkLodBuffer );
		}

		// --- Phase -1: Compute per-chunk LOD factors (if any objects use chunked LOD) ---
		if ( hasChunkedLOD )
		{
			_chunkLodShader.DispatchWithAttributes( _sharedAttrs, _totalChunkCount, 1, 1 );
			Graphics.ResourceBarrierTransition( (GpuBuffer)_chunkLodBuffer, ResourceState.UnorderedAccess, ResourceState.UnorderedAccess );
		}

		// --- Phase 0: Frustum cull all splats globally ---
		// Clear VisibleCount via resource barrier + SetData. This is a 4-byte upload
		// so the implicit sync cost is negligible.
		Graphics.ResourceBarrierTransition( (GpuBuffer)_visibleCountBuffer, ResourceState.UnorderedAccess, ResourceState.CopyDestination );
		_visibleCountBuffer.SetData( ZeroCount.AsSpan() );
		Graphics.ResourceBarrierTransition( (GpuBuffer)_visibleCountBuffer, ResourceState.CopyDestination, ResourceState.UnorderedAccess );
		_cullShader.DispatchWithAttributes( _sharedAttrs, _allocatedSplatCount, 1, 1 );

		// UAV barriers
		Graphics.ResourceBarrierTransition( (GpuBuffer)_visibleIndicesBuffer, ResourceState.UnorderedAccess, ResourceState.UnorderedAccess );
		Graphics.ResourceBarrierTransition( (GpuBuffer)_visibleCountBuffer, ResourceState.UnorderedAccess, ResourceState.UnorderedAccess );

		// --- Phase 0.5: Setup indirect args ---
		_argsShader.DispatchWithAttributes( _sharedAttrs, 1, 1, 1 );

		Graphics.ResourceBarrierTransition( (GpuBuffer)_indirectDispatchBuffer, ResourceState.UnorderedAccess, ResourceState.UnorderedAccess );
		Graphics.ResourceBarrierTransition( (GpuBuffer)_indirectDrawBuffer, ResourceState.UnorderedAccess, ResourceState.UnorderedAccess );

		// --- Counting sort (depth + prefix + scatter) ---
		// Clear histogram via barrier + SetData. The barrier ensures the GPU is done reading
		// the histogram from the previous frame's scatter pass before we overwrite it.
		Graphics.ResourceBarrierTransition( (GpuBuffer)_histogramBuffer, ResourceState.UnorderedAccess, ResourceState.CopyDestination );
		_histogramBuffer.SetData( ZeroHistogram.AsSpan() );
		Graphics.ResourceBarrierTransition( (GpuBuffer)_histogramBuffer, ResourceState.CopyDestination, ResourceState.UnorderedAccess );
		_depthShader.DispatchIndirectWithAttributes( _sharedAttrs, (GpuBuffer)_indirectDispatchBuffer, 0 );

		Graphics.ResourceBarrierTransition( (GpuBuffer)_sortKeysBuffer, ResourceState.UnorderedAccess, ResourceState.UnorderedAccess );
		Graphics.ResourceBarrierTransition( (GpuBuffer)_histogramBuffer, ResourceState.UnorderedAccess, ResourceState.UnorderedAccess );

		_sortPrefixShader.DispatchWithAttributes( _sharedAttrs, 256, 1, 1 );
		Graphics.ResourceBarrierTransition( (GpuBuffer)_histogramBuffer, ResourceState.UnorderedAccess, ResourceState.UnorderedAccess );

		_sortScatterShader.DispatchIndirectWithAttributes( _sharedAttrs, (GpuBuffer)_indirectDispatchBuffer, 0 );
		Graphics.ResourceBarrierTransition( (GpuBuffer)_sortValuesBuffer, ResourceState.UnorderedAccess, ResourceState.UnorderedAccess );

		// --- Phase 4: Billboard compute ---
		// Set up per-object lighting data (writes light cluster/shadow info to attributes)
		if ( anyLighting )
		{
			Graphics.SetupLighting( this, _sharedAttrs );
		}

		// Dispatch for ALL splats — the shader writes degenerate zero-area quads for non-visible
		// slots so the direct draw doesn't render stale/garbage vertices. Index buffer is static
		// (pre-generated), so the shader only writes vertex data.
		_computeShader.DispatchWithAttributes( _sharedAttrs, _allocatedSplatCount, 1, 1 );

		// Transition vertex buffer for draw
		Graphics.ResourceBarrierTransition( (GpuBuffer)_vertexBuffer, ResourceState.UnorderedAccess, ResourceState.VertexOrIndexBuffer );

		// Draw all splats — non-visible slots have degenerate (zero-area) quads that the GPU
		// culls for free at triangle setup. The pre-generated index buffer avoids per-frame writes.
		Graphics.Draw( _vertexBuffer, (GpuBuffer)_indexBuffer, _defaultMaterial, 0, _allocatedSplatCount * 6, Attributes );
	}

	/// <summary>
	/// Per-frame render entry point. Advances the disposal frame counter, frees slots whose
	/// objects are no longer valid, then either runs the splat pipeline or, when nothing is
	/// live, counts idle frames and releases the GPU buffers after a short grace period.
	/// </summary>
	public override void RenderSceneObject()
	{
		base.RenderSceneObject();

		// Only the counter that drives the deferred-disposal delay is advanced here.
		// The disposal itself is left to the main thread (Register/Unregister/GetOrCreate):
		// this method runs on the render thread and GpuBuffer.Dispose requires the main thread.
		_frameCounter++;

		// Gather objects that became invalid without being unregistered (eg. GC'd components).
		// The list is only allocated when at least one such object is found.
		List<SceneGaussianSplatObject> invalidObjects = null;
		foreach ( var entry in _objectSlotIndex )
		{
			if ( entry.Key.IsValid() )
				continue;

			invalidObjects ??= new();
			invalidObjects.Add( entry.Key );
		}

		// Drop them from the index and flag their slots as free — no layout shift needed.
		if ( invalidObjects != null )
		{
			foreach ( var deadObject in invalidObjects )
			{
				if ( !_objectSlotIndex.TryGetValue( deadObject, out int freedIndex ) )
					continue;

				_objectSlotIndex.Remove( deadObject );

				// The index entry may point outside the slot list; in that case only the
				// index entry is removed.
				if ( freedIndex < 0 || freedIndex >= _slots.Count )
					continue;

				var freedSlot = _slots[freedIndex];
				freedSlot.Object = null;
				freedSlot.IsFree = true;
				_slots[freedIndex] = freedSlot;
			}
		}

		if ( HasLiveObjects() )
		{
			_framesWithoutObjects = 0;
			RenderPipeline();
			return;
		}

		// Idle: after a short run of empty frames, give the GPU memory back so large
		// transient buffers don't linger after objects are deleted/disabled in editor workflows.
		_framesWithoutObjects++;
		if ( _framesWithoutObjects > 30 )
			ReleaseGpuResources();

		// True when an occupied slot holds a valid object, or when a registered object
		// with splats is still waiting for its first slot assignment.
		bool HasLiveObjects()
		{
			foreach ( var candidate in _slots )
			{
				if ( !candidate.IsFree && candidate.Object != null && candidate.Object.IsValid() )
					return true;
			}

			foreach ( var registered in _objectSlotIndex )
			{
				if ( registered.Key.IsValid() && registered.Key.SplatCount > 0 )
					return true;
			}

			return false;
		}
	}

	/// <summary>
	/// Tear the system down: unregister it from the per-SceneWorld singleton map, clear all
	/// slot bookkeeping, queue every GPU buffer for disposal and then dispose the whole
	/// queue immediately.
	/// </summary>
	/// <remarks>
	/// The final flush ignores the usual frame delay, so this must only be called when no
	/// GPU work referencing these buffers is still in flight.
	/// </remarks>
	public void DestroySystem()
	{
		// Locate our key first and remove it after the loop, so the dictionary is never
		// mutated while it is being enumerated.
		SceneWorld ownerWorld = null;
		foreach ( var kvp in _instances )
		{
			if ( kvp.Value == this )
			{
				ownerWorld = kvp.Key;
				break;
			}
		}

		// Dictionary keys are never null, so a non-null key means we found our entry.
		if ( ownerWorld is not null )
			_instances.Remove( ownerWorld );

		_slots.Clear();
		_objectSlotIndex.Clear();
		_allocatedSplatCount = 0;

		// Queue every buffer, then dispose the queue right away (teardown path).
		ReleaseGpuResources();
		ForceFlushAllDisposals();
	}

	/// <summary>
	/// Hand every GPU buffer to the deferred-disposal queue and reset the system to its
	/// unallocated state: buffer fields, capacity counters, CPU scratch arrays and slot
	/// bookkeeping. Safe to call multiple times.
	/// </summary>
	private void ReleaseGpuResources()
	{
		// Splat source data
		DeferDispose( _positionBuffer );
		_positionBuffer = default;
		DeferDispose( _splatDataBuffer );
		_splatDataBuffer = default;
		DeferDispose( _objectIdBuffer );
		_objectIdBuffer = default;
		DeferDispose( _objectDataBuffer );
		_objectDataBuffer = default;

		// Draw geometry
		DeferDispose( _vertexBuffer );
		_vertexBuffer = default;
		DeferDispose( _indexBuffer );
		_indexBuffer = default;

		// Sort working set
		DeferDispose( _sortKeysBuffer );
		_sortKeysBuffer = default;
		DeferDispose( _sortValuesBuffer );
		_sortValuesBuffer = default;
		DeferDispose( _histogramBuffer );
		_histogramBuffer = default;

		// Culling output and indirect arguments
		DeferDispose( _visibleIndicesBuffer );
		_visibleIndicesBuffer = default;
		DeferDispose( _visibleCountBuffer );
		_visibleCountBuffer = default;
		DeferDispose( _indirectDispatchBuffer );
		_indirectDispatchBuffer = default;
		DeferDispose( _indirectDrawBuffer );
		_indirectDrawBuffer = default;

		// Volumes
		DeferDispose( _volumeBuffer );
		_volumeBuffer = default;
		DeferDispose( _colorVolumeBuffer );
		_colorVolumeBuffer = default;

		// Chunked LOD
		DeferDispose( _chunkIdBuffer );
		_chunkIdBuffer = default;
		DeferDispose( _chunkDataBuffer );
		_chunkDataBuffer = default;
		DeferDispose( _chunkLodBuffer );
		_chunkLodBuffer = default;

		// With no buffers left, every capacity counter goes back to zero so the next
		// upload sees "nothing allocated".
		_splatCapacity = 0;
		_vertexCapacity = 0;
		_indexCapacity = 0;
		_sortCapacity = 0;
		_objectCapacity = 0;
		_volumeCapacity = 0;
		_colorVolumeCapacity = 0;
		_chunkCapacity = 0;
		_chunkIdCapacity = 0;
		_totalChunkCount = 0;

		// Let go of the CPU-side staging arrays as well.
		_objectIdScratch = Array.Empty<uint>();
		_positionScratch = Array.Empty<SceneGaussianSplatObject.SplatPosition>();
		_dataScratch = Array.Empty<SceneGaussianSplatObject.SplatData>();

		// Forget the slot layout.
		_slots.Clear();
		_objectSlotIndex.Clear();
		_allocatedSplatCount = 0;
	}

	/// <summary>
	/// Write the fixed two-triangle index pattern for every quad slot into the index buffer:
	/// quad <c>i</c> gets [i*4, i*4+1, i*4+2, i*4+2, i*4+1, i*4+3].
	/// Called once when the index buffer is allocated/grown, so the billboard shader never
	/// has to write indices per frame.
	/// </summary>
	private void GenerateStaticIndexBuffer()
	{
		int wholeQuads = _indexCapacity / 6;
		var pattern = new uint[_indexCapacity];

		// Running cursors: 'write' advances six indices per quad, 'firstVertex' four vertices.
		// Entries past the last whole quad (capacity not a multiple of 6) stay zero.
		int write = 0;
		uint firstVertex = 0;
		for ( int quad = 0; quad < wholeQuads; quad++ )
		{
			// Triangle A: 0, 1, 2
			pattern[write++] = firstVertex;
			pattern[write++] = firstVertex + 1;
			pattern[write++] = firstVertex + 2;

			// Triangle B: 2, 1, 3
			pattern[write++] = firstVertex + 2;
			pattern[write++] = firstVertex + 1;
			pattern[write++] = firstVertex + 3;

			firstVertex += 4;
		}

		_indexBuffer.SetData( pattern.AsSpan() );
	}

	/// <summary>
	/// Upload per-splat chunk IDs and per-chunk data buffers for chunked LOD.
	/// </summary>
	/// <param name="totalSplats">Number of per-splat chunk ID entries to build and upload.</param>
	/// <param name="chunkDataList">Per-chunk records to upload into the chunk data buffer.</param>
	/// <remarks>
	/// Buffers grow with headroom when the incoming counts exceed the current capacity; the
	/// old buffers are queued for deferred disposal. Empty inputs skip the corresponding
	/// upload, because in that case the buffer may never have been allocated.
	/// </remarks>
	private void UploadChunkData( int totalSplats, List<ChunkDataGpu> chunkDataList )
	{
		// Grow chunk ID buffer if needed
		if ( totalSplats > _chunkIdCapacity )
		{
			DeferDispose( _chunkIdBuffer );
			_chunkIdCapacity = (int)(totalSplats * 1.5);
			_chunkIdBuffer = new GpuBuffer<uint>( _chunkIdCapacity, GpuBuffer.UsageFlags.Structured );
		}

		// Grow chunk data/LOD buffers if needed
		int chunkCount = chunkDataList.Count;
		if ( chunkCount > _chunkCapacity )
		{
			DeferDispose( _chunkDataBuffer );
			DeferDispose( _chunkLodBuffer );
			_chunkCapacity = Math.Max( (int)(chunkCount * 1.5), 64 );
			_chunkDataBuffer = new GpuBuffer<ChunkDataGpu>( _chunkCapacity, GpuBuffer.UsageFlags.Structured );
			_chunkLodBuffer = new GpuBuffer<float>( _chunkCapacity, GpuBuffer.UsageFlags.Structured );
		}

		// With zero splats the ID buffer may not exist (capacity 0 never triggers the grow
		// above), so only build and upload when there is something to write.
		if ( totalSplats > 0 )
		{
			// Build merged chunk ID array — each splat gets a global chunk index.
			// Objects without chunked LOD keep 0xFFFFFFFF (sentinel for "use per-object LOD").
			var chunkIds = new uint[totalSplats];
			Array.Fill( chunkIds, 0xFFFFFFFFu );

			int globalChunkOffset = 0;
			foreach ( var slot in _slots )
			{
				if ( slot.IsFree || slot.Object == null || !slot.Object.IsValid() || slot.Object.SplatCount == 0 )
					continue;

				var obj = slot.Object;
				if ( !obj.EnableChunkedLOD || obj.ChunkIds is null || obj.ChunkCount <= 0 )
					continue;

				// Read the per-object values once instead of on every loop iteration.
				var localIds = obj.ChunkIds;
				var splatCount = obj.SplatCount;
				int offset = slot.BufferOffset;

				// Local chunk IDs become global by adding the number of chunks contributed
				// by the chunked objects visited before this one.
				for ( int i = 0; i < splatCount; i++ )
				{
					chunkIds[offset + i] = (uint)(globalChunkOffset + localIds[i]);
				}
				globalChunkOffset += obj.ChunkCount;
			}

			_chunkIdBuffer.SetData( chunkIds.AsSpan( 0, totalSplats ) );
		}

		// Same reasoning for the per-chunk data. The list's backing storage is viewed
		// directly as a span, so no temporary array copy is made for the upload.
		if ( chunkCount > 0 )
			_chunkDataBuffer.SetData( CollectionsMarshal.AsSpan( chunkDataList ) );
	}

	/// <summary>
	/// Pack a Color into a uint matching the shader's UnpackColor layout (RGBA8):
	/// red in bits 0-7, green in bits 8-15, blue in bits 16-23, alpha in bits 24-31.
	/// </summary>
	/// <param name="c">Color to pack; each channel is clamped to [0, 1] before quantization.</param>
	/// <returns>The four 8-bit channels packed into one 32-bit value.</returns>
	private static uint PackColorRGBA8( Color c )
	{
		// Round to the nearest 8-bit step instead of truncating. Truncation biases every
		// channel downward and turns values that land a hair below an integer after scaling
		// (eg. a byte that went through a float round trip) into the next lower byte.
		// The clamp keeps the scaled value within [0.5, 255.5), so the cast yields 0..255.
		static uint Quantize( float channel ) => (uint)(channel.Clamp( 0f, 1f ) * 255f + 0.5f);

		uint r = Quantize( c.r );
		uint g = Quantize( c.g );
		uint b = Quantize( c.b );
		uint a = Quantize( c.a );
		return r | (g << 8) | (b << 16) | (a << 24);
	}

	/// <summary>
	/// Map the game's video quality settings to shader-consumable values.
	/// At High settings this returns (1.0, 2, 1) — identical to pre-quality behavior.
	/// Reads ConVars directly so it works in both standalone and editor contexts.
	/// </summary>
	/// <returns>
	/// Splat density multiplier (0.5, 0.75 or 1.0), per-splat shadow mode (0, 1 or 2) and
	/// low-pass filter toggle (0 or 1).
	/// </returns>
	/// <remarks>
	/// A ConVar whose value is missing or not numeric falls back to its High default
	/// instead of throwing.
	/// </remarks>
	private static (float SplatDensity, int ShadowQuality, int LowPassFilter) GetQualitySettings()
	{
		// TextureQuality → splat density (fewer splats at lower detail)
		// r_max_anisotropy: 1=Low, 2=Medium, 4=High (set by quality_profiles.json)
		int aniso = ReadIntConVar( "r_max_anisotropy", 4 );
		float density = aniso switch
		{
			<= 1 => 0.5f,
			<= 2 => 0.75f,
			_ => 1.0f
		};

		// ShadowQuality → per-splat shadow evaluation
		// r.shadows.quality: 1=Low, 2=Medium, 3=High (set by quality_profiles.json)
		int shadowConVar = ReadIntConVar( "r.shadows.quality", 3 );
		int shadows = shadowConVar switch
		{
			<= 1 => 0,   // Low: no per-splat shadows
			2 => 1,       // Medium: sun shadow only
			_ => 2        // High: all light shadows
		};

		// AntiAliasing → low-pass filter (prevents sub-pixel flicker)
		// When MSAA is disabled or AA quality is minimal, skip the low-pass filter.
		// r_ao_quality serves as a proxy: 1=Low postprocess, meaning minimal AA too.
		int aoQuality = ReadIntConVar( "r_ao_quality", 3 );
		int lowPass = aoQuality <= 1 ? 0 : 1;

		return (density, shadows, lowPass);

		// Reads a ConVar as an integer without ever throwing. Parsing is culture-invariant;
		// a float-formatted value such as "4.0" is accepted and truncated; anything else
		// yields the supplied fallback.
		static int ReadIntConVar( string name, int fallback )
		{
			var invariant = System.Globalization.CultureInfo.InvariantCulture;
			var raw = ConsoleSystem.GetValue( name, fallback.ToString( invariant ) );

			if ( int.TryParse( raw, System.Globalization.NumberStyles.Integer, invariant, out int whole ) )
				return whole;

			// The magnitude check keeps the float-to-int cast well inside int range.
			if ( float.TryParse( raw, System.Globalization.NumberStyles.Float, invariant, out float real )
				&& float.IsFinite( real ) && MathF.Abs( real ) < 1e9f )
				return (int)real;

			return fallback;
		}
	}

	/// <summary>
	/// Add a resource to the pending-disposal queue, stamped with the current frame counter.
	/// The stamp lets <see cref="FlushPendingDisposals"/> hold the resource back until
	/// <see cref="DisposalDelayFrames"/> frames have passed, giving the GPU time to finish
	/// any in-flight commands that reference it. Null is ignored.
	/// </summary>
	private void DeferDispose( IDisposable disposable )
	{
		// Nothing to queue for a buffer that was never allocated.
		if ( disposable is null )
			return;

		_pendingDisposals.Add( (disposable, _frameCounter) );
	}

	/// <summary>
	/// Dispose the queued resources whose frame delay has elapsed and drop them from the
	/// queue; younger entries are left in place.
	/// Must be called from the main thread. Called from Register/Unregister/GetOrCreate
	/// and from <see cref="GaussianSplatRenderer.OnPreRender"/> to ensure regular flushing
	/// even when no objects are registering/unregistering.
	/// </summary>
	internal void FlushPendingDisposals()
	{
		// Walk from the tail so RemoveAt never shifts an entry that is still to be visited.
		// An empty queue simply skips the loop.
		int index = _pendingDisposals.Count;
		while ( --index >= 0 )
		{
			var (pending, enqueuedFrame) = _pendingDisposals[index];

			// Not old enough yet: keep it queued for a later flush.
			if ( _frameCounter - enqueuedFrame < DisposalDelayFrames )
				continue;

			pending.Dispose();
			_pendingDisposals.RemoveAt( index );
		}
	}

	/// <summary>
	/// Dispose every queued resource right now, regardless of how recently it was queued,
	/// and empty the queue.
	/// Only safe during full system teardown when no GPU work is in-flight.
	/// </summary>
	private void ForceFlushAllDisposals()
	{
		// Oldest entry first, in queue order.
		for ( int i = 0; i < _pendingDisposals.Count; i++ )
		{
			var (queued, _) = _pendingDisposals[i];
			queued.Dispose();
		}

		_pendingDisposals.Clear();
	}
}