while
loop. WebAssembly allows you to jump around in a much more flexible fashion. It's still limited to structured control flow between blocks, but that only helps so much.

(local_1 + local_2) / 2           // type f32
global_1 - 64                     // type i32
memory[memory[local_1] * 8] + 3   // type i64 (the inner expression is type i32)
local_3 = (local_1 + local_2) / 2                    // type f32
global_1 = global_1 - 64                             // type i32
memory[local_0] = memory[memory[local_1] * 8] + 3    // type i64 (the inner expression is type i32)
call foo
D0
...DF
, Num
, and Neg
structs similar to the BrainFlood article.

SetR3_F32< Op_F32_Div< Op_F32_Add<GetR1_F32,GetR2_F32>, Const_F32< /* 2 */> > >

// Globals use constant indices.
SetGlobal_I32<D1, Op_I32_Sub< GetGlobal_I32<D1>, Const_I32< /* 64 */> > >

// Memory operations have a constant offset as their last parameter.
Memory_I64_Store< Op_I64_Add< Memory_I64_Load< Op_I32_Mul< Memory_I32_Load< GetR1_I32, D0 >, Const_I32< /* 8 */> >, D0 >, Const_I64< /* 3 */> > , GetR0_I32, D0 >

// Calls will be explained in more depth later.
StaticCall<D8,DC>
// Example expression definition.
// An Expr<T> is a stateless struct whose Run method evaluates to a T.
// Because the concrete type is a generic argument, Run can be resolved
// and inlined statically by the JIT.
interface Expr<T>
{
    T Run( ref Registers reg, Span<long> frame, WasmInstance inst );
}

// 64-bit integer addition: evaluates A, then B, and returns their sum.
struct Op_I64_Add<A, B> : Expr<long>
    where A : struct, Expr<long>
    where B : struct, Expr<long>
{
    [MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )]
    public long Run( ref Registers reg, Span<long> frame, WasmInstance inst )
    {
        // The operand structs carry no state, so default instances are
        // enough to dispatch their statically-known Run implementations.
        long lhs = default( A ).Run( ref reg, frame, inst );
        long rhs = default( B ).Run( ref reg, frame, inst );
        return lhs + rhs;
    }
}

// Example statement definition.
// A Stmt runs purely for its side effects and produces no value.
interface Stmt
{
    void Run( ref Registers reg, Span<long> frame, WasmInstance inst );
}

// Stores a 32-bit expression result into the global at a constant index.
struct SetGlobal_I32<INDEX, VALUE> : Stmt
    where INDEX : struct, Const
    where VALUE : struct, Expr<int>
{
    [MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )]
    public void Run( ref Registers reg, Span<long> frame, WasmInstance inst )
    {
        int slot = (int)default( INDEX ).Run();
        // Cast through uint so the i32 result is zero-extended (not
        // sign-extended) into the wider global slot.
        inst.Globals[slot] = (uint)default( VALUE ).Run( ref reg, frame, inst );
    }
}
F12 = I32_Add(F21,I32_Mul(F20,24))
F24 = M_I64[F12 + 16]
F15 = M_I32[F12 + 12]
F25 = M_I32[F12 + 8]
F26 = M_I32[F12 + 4]
F27 = M_I32_I8_U[F12 + 1]
F23 = M_I32_I8_U[F12]
M_I64[F1 + 112] = M_I64[F16 + 4]
M_I32[F1 + 108] = F22
F35 = F3
F36 = I32_Add(F1,108)
call _ZN4core4hash11BuildHasher8hash_one17hab058952bc790a7aE[34]
F11 = F34
F28 = M_I32[F1 + 4]
F12 = I32_Wrap_I64(F11)
F21 = I32_And(F28,F12)
F29 = I64_ShiftRight_U(F11,25)
F10 = I64_Mul(I64_And(F29,127),72340172838076673)
F30 = 0
F20 = M_I32[F1 + 116]
F22 = M_I32[F1 + 112]
F16 = M_I32[F1]
// Virtual register bank for generated code. Registers are untyped 64-bit
// slots; the typed accessor structs below reinterpret the raw bits.
// NextBlock selects the next case inside the dispatch loop.
struct Registers
{
    public long R0;
    public long R1;
    public long R2;
    public long R3;
    public long R4;
    public long R5;
    public long R6;
    public int NextBlock;
}

// Sample f32 register operations

// Reads R0 and reinterprets its low 32 bits as an f32.
struct GetR0_F32 : Expr<float>
{
    // Fix: every other Expr/Stmt in the file marks Run with these inlining
    // flags; this accessor was missing the attribute, risking an actual
    // (un-inlined) call on a very hot path.
    [MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )]
    public float Run( ref Registers reg, Span<long> frame, WasmInstance inst ) => BitConverter.Int32BitsToSingle( (int)reg.R0 );
}

// Evaluates an f32 expression and stores its raw bit pattern into R2.
struct SetR2_F32<VALUE> : Stmt where VALUE : struct, Expr<float>
{
    [MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )]
    public void Run( ref Registers reg, Span<long> frame, WasmInstance inst )
    {
        // SingleToUInt32Bits yields a uint, so the 64-bit register is
        // zero-extended rather than sign-extended.
        reg.R2 = BitConverter.SingleToUInt32Bits( default( VALUE ).Run( ref reg, frame, inst ) );
    }
}
long
s. The operations work exactly how you'd expect.
// Stores an f32 expression result into the stack-frame slot at a constant
// index. The frame is a Span<long>; the float is bit-cast to its raw 32-bit
// pattern (and zero-extended into the 64-bit slot), not converted numerically.
struct SetFrame_F32<INDEX, VALUE> : Stmt where INDEX : struct, Const where VALUE : struct, Expr<float> { [MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )] public void Run( ref Registers reg, Span<long> frame, WasmInstance inst ) { frame[(int)default( INDEX ).Run()] = BitConverter.SingleToUInt32Bits( default( VALUE ).Run( ref reg, frame, inst ) ); } }
stackalloc
ing a frame for each function, but doing this made function calls more complicated.F0
stores the return value, F1
stores the first argument, and F2
stores the second argument.
// Direct call to a statically-indexed function. The callee's argument window
// is a tail slice of the caller's frame starting at the constant FRAME_INDEX,
// so arguments are passed by writing them into the frame before the call.
// NOTE(review): the callee appears to operate directly on this shared slice
// rather than a freshly allocated frame — confirm against Func.Call.
struct StaticCall<FUNC_INDEX, FRAME_INDEX> : Stmt where FUNC_INDEX : struct, Const where FRAME_INDEX : struct, Const { [MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )] public void Run( ref Registers reg, Span<long> frame, WasmInstance inst ) { var arg_span = frame.Slice( (int)default( FRAME_INDEX ).Run() ); var func = inst.Functions[default( FUNC_INDEX ).Run()]; func.Call( arg_span, inst ); } }
// Loads an i32 from linear memory at (dynamic address + constant offset).
// The checked addition traps on 32-bit wrap-around; AsSpan performs the
// bounds check against the memory's length.
struct Memory_I32_Load<ADDR, OFFSET> : Expr<int>
    where ADDR : struct, Expr<int>
    where OFFSET : struct, Const
{
    [MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )]
    public int Run( ref Registers reg, Span<long> frame, WasmInstance inst )
    {
        uint baseAddr = (uint)default( ADDR ).Run( ref reg, frame, inst );
        uint staticOffset = (uint)default( OFFSET ).Run();
        int effective = (int)checked(baseAddr + staticOffset);
        return BitConverter.ToInt32( inst.Memory.AsSpan( effective ) );
    }
}

// Stores an f64 into linear memory at (dynamic address + constant offset).
// The value operand is evaluated before the address operand.
struct Memory_F64_Store<VALUE, ADDR, OFFSET> : Stmt
    where VALUE : struct, Expr<double>
    where ADDR : struct, Expr<int>
    where OFFSET : struct, Const
{
    [MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )]
    public void Run( ref Registers reg, Span<long> frame, WasmInstance inst )
    {
        double payload = default( VALUE ).Run( ref reg, frame, inst );
        uint baseAddr = (uint)default( ADDR ).Run( ref reg, frame, inst );
        uint staticOffset = (uint)default( OFFSET ).Run();
        int effective = (int)checked(baseAddr + staticOffset);
        // TryWriteBytes fails when fewer than 8 bytes remain; surface that
        // as an out-of-bounds access.
        if ( !BitConverter.TryWriteBytes( inst.Memory.AsSpan( effective ), payload ) )
        {
            throw new IndexOutOfRangeException();
        }
    }
}
NEXT
parameter which a statement calls when it is finished. In many cases, a statement would fail to inline another statement of the same type. After some hacking on the dotnet source code, I discovered the culprit. The inliner has rules against recursion on large generic types. I don't fully understand the rationale for this, but it wasn't hard to fix. First, I use trees for statements instead of chains. I created a Stmts
bundle type that holds four statements.if
statements and while
loops, it would be easy. But in real code, we can enter several nested loops and conditionals, then break
, continue
, or return
from inside them. The Relooper and Stackifier algorithms were invented to solve these types of problems, but both require a break
construct. We could throw exceptions to handle these constructs, but that would be extremely expensive.memcpy
// Dispatch loop over up to ten basic blocks. Each Terminator updates
// reg.NextBlock before returning to the loop; any value outside 0..9
// (for example -1) falls through to default and exits the function body.
struct DispatchLoop10< B0, B1, B2, B3, B4, B5, B6, B7, B8, B9 > : Stmt
    where B0 : struct, Terminator
    where B1 : struct, Terminator
    where B2 : struct, Terminator
    where B3 : struct, Terminator
    where B4 : struct, Terminator
    where B5 : struct, Terminator
    where B6 : struct, Terminator
    where B7 : struct, Terminator
    where B8 : struct, Terminator
    where B9 : struct, Terminator
{
    [MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )]
    public void Run( ref Registers reg, Span<long> frame, WasmInstance inst )
    {
        while ( true )
        {
            // Select and execute the current block, then loop back and
            // re-dispatch on whatever NextBlock it wrote.
            switch ( reg.NextBlock )
            {
                case 0: default( B0 ).Run( ref reg, frame, inst ); break;
                case 1: default( B1 ).Run( ref reg, frame, inst ); break;
                case 2: default( B2 ).Run( ref reg, frame, inst ); break;
                case 3: default( B3 ).Run( ref reg, frame, inst ); break;
                case 4: default( B4 ).Run( ref reg, frame, inst ); break;
                case 5: default( B5 ).Run( ref reg, frame, inst ); break;
                case 6: default( B6 ).Run( ref reg, frame, inst ); break;
                case 7: default( B7 ).Run( ref reg, frame, inst ); break;
                case 8: default( B8 ).Run( ref reg, frame, inst ); break;
                case 9: default( B9 ).Run( ref reg, frame, inst ); break;
                default: return;
            }
        }
    }
}
Registers.NextBlock
to tell the loop what to do. Returns indicate a return by writing -1
.DispatchLoopArray
, which can hold an arbitrary number of terminators in an array.
// Terminator implementing a jump table (wasm br_table): runs the block body,
// evaluates the selector, clamps it to COUNT - 1 (so the last entry doubles
// as the default target), and writes BASE + clamped selector into NextBlock
// for the enclosing dispatch loop to act on.
struct TermJumpTable<SEL, BASE, COUNT, BODY> : Terminator where SEL : struct, Expr<int> where BASE : struct, Const where COUNT : struct, Const where BODY : struct, Stmt { [MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )] public void Run( ref Registers reg, Span<long> frame, WasmInstance inst ) { default( BODY ).Run( ref reg, frame, inst ); uint sel = (uint)default( SEL ).Run( ref reg, frame, inst ); uint base_block = (uint)default( BASE ).Run(); uint max = (uint)default( COUNT ).Run() - 1; reg.NextBlock = (int)(base_block + uint.Min( sel, max )); } }
memcpy
function from earlier, with the previous section marked in green.if
need to lead to the same place. I was not able to summon the motivation to do this after many other failed optimization efforts. Which brings us to:Registers
struct to lower to native registers, while the control flow strategy might help or hinder inlining performance. In this section I'll list some ideas I had that didn't work out.MethodImplOptions.NoInlining
. I spent a lot of time on this, and it never seemed to help. Combining it with the structured control flow recovery only made it more difficult. I never considered the outer dispatch loop, which I suspect was a big mistake.System.Numerics.Vector
, and how fast they would run if so.long
. Maybe SIMD could use a pair of longs, but garbage collection actually needs reference types.