

C# control flow is built from structured constructs like the if statement and the while loop. WebAssembly allows you to jump around in a much more flexible fashion. It's still limited to structured control flow between blocks, but that only helps so much.

Let's start with something simpler. Expressions are typed:

(local_1 + local_2) / 2            // type f32
global_1 - 64                      // type i32
memory[memory[local_1] * 8] + 3    // type i64 (the inner expression is type i32)

Statements wrap expressions and produce no value of their own:

local_3 = (local_1 + local_2) / 2                   // type f32
global_1 = global_1 - 64                            // type i32
memory[local_0] = memory[memory[local_1] * 8] + 3   // type i64 (the inner expression is type i32)
call foo
Both are encoded as generic types. Constants are built from D0...DF, Num, and Neg structs similar to the BrainFlood article; in the examples below, their encodings are elided as comments.

SetR3_F32< Op_F32_Div< Op_F32_Add<GetR1_F32,GetR2_F32>, Const_F32< /* 2 */> > >
// Globals use constant indices.
SetGlobal_I32<D1, Op_I32_Sub< GetGlobal_I32<D1>, Const_I32< /* 64 */> > >
// Memory operations have a constant offset as their last parameter.
Memory_I64_Store<
Op_I64_Add< Memory_I64_Load<
Op_I32_Mul< Memory_I32_Load< GetR1_I32, D0 >, Const_I32< /* 8 */> >,
D0 >, Const_I64< /* 3 */> >
, GetR0_I32, D0 >
// Calls will be explained in more depth later.
StaticCall<D8,DC>
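The constant structs themselves aren't shown above, so here's a rough sketch of the encoding, assuming it follows the BrainFlood scheme: one struct per hex digit, Num to concatenate digits, and Neg for negative values. These shapes are a reconstruction, not the exact definitions.

// Constant encoding sketch (assumed, in the style of BrainFlood).
interface Const { long Run(); }
struct D0 : Const { public long Run() => 0x0; }
struct D1 : Const { public long Run() => 0x1; }
// ... D2 through DE ...
struct DF : Const { public long Run() => 0xF; }
// Num concatenates two encoded values as hex digits; Neg negates one.
struct Num<HI, LO> : Const where HI : struct, Const where LO : struct, Const
{
    public long Run() => (default( HI ).Run() << 4) | default( LO ).Run();
}
struct Neg<T> : Const where T : struct, Const
{
    public long Run() => -default( T ).Run();
}

// Example expression definition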
interface Expr<T>
{
T Run( ref Registers reg, Span<long> frame, WasmInstance inst );
}
struct Op_I64_Add<A, B> : Expr<long> where A : struct, Expr<long> where B : struct, Expr<long>
{
[MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )]
public long Run( ref Registers reg, Span<long> frame, WasmInstance inst ) =>
default( A ).Run( ref reg, frame, inst ) + default( B ).Run( ref reg, frame, inst );
}
// Example statement definition
interface Stmt
{
void Run( ref Registers reg, Span<long> frame, WasmInstance inst );
}
struct SetGlobal_I32<INDEX, VALUE> : Stmt where INDEX : struct, Const where VALUE : struct, Expr<int>
{
[MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )]
public void Run( ref Registers reg, Span<long> frame, WasmInstance inst )
{
inst.Globals[(int)default( INDEX ).Run()] =
(uint)default( VALUE ).Run( ref reg, frame, inst );
}
}

Here's an excerpt of a compiled function's intermediate code (note the call to core::hash::BuildHasher::hash_one):

F12 = I32_Add(F21,I32_Mul(F20,24))
F24 = M_I64[F12 + 16]
F15 = M_I32[F12 + 12]
F25 = M_I32[F12 + 8]
F26 = M_I32[F12 + 4]
F27 = M_I32_I8_U[F12 + 1]
F23 = M_I32_I8_U[F12]
M_I64[F1 + 112] = M_I64[F16 + 4]
M_I32[F1 + 108] = F22
F35 = F3
F36 = I32_Add(F1,108)
call _ZN4core4hash11BuildHasher8hash_one17hab058952bc790a7aE[34]
F11 = F34
F28 = M_I32[F1 + 4]
F12 = I32_Wrap_I64(F11)
F21 = I32_And(F28,F12)
F29 = I64_ShiftRight_U(F11,25)
F10 = I64_Mul(I64_And(F29,127),72340172838076673)
F30 = 0
F20 = M_I32[F1 + 116]
F22 = M_I32[F1 + 112]
F16 = M_I32[F1]
struct Registers
{
public long R0;
public long R1;
public long R2;
public long R3;
public long R4;
public long R5;
public long R6;
public int NextBlock;
}
// Sample f32 register operations
struct GetR0_F32 : Expr<float> {
public float Run( ref Registers reg, Span<long> frame, WasmInstance inst ) =>
BitConverter.Int32BitsToSingle( (int)reg.R0 );
}
struct SetR2_F32<VALUE> : Stmt where VALUE : struct, Expr<float>
{
[MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )]
public void Run( ref Registers reg, Span<long> frame, WasmInstance inst )
{
reg.R2 = BitConverter.SingleToUInt32Bits( default( VALUE ).Run( ref reg, frame, inst ) );
}
}

The frame is a span of longs. The operations work exactly how you'd expect.

struct SetFrame_F32<INDEX, VALUE> : Stmt where INDEX : struct, Const where VALUE : struct, Expr<float>
{
[MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )]
public void Run( ref Registers reg, Span<long> frame, WasmInstance inst )
{
frame[(int)default( INDEX ).Run()] =
BitConverter.SingleToUInt32Bits( default( VALUE ).Run( ref reg, frame, inst ) );
}
}

I initially tried stackallocing a frame for each function, but doing this made function calls more complicated. Instead, a call hands the callee a slice of the caller's frame. The calling convention is simple: F0 stores the return value, F1 stores the first argument, and F2 stores the second argument.

struct StaticCall<FUNC_INDEX, FRAME_INDEX> : Stmt
where FUNC_INDEX : struct, Const
where FRAME_INDEX : struct, Const
{
[MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )]
public void Run( ref Registers reg, Span<long> frame, WasmInstance inst )
{
var arg_span = frame.Slice( (int)default( FRAME_INDEX ).Run() );
var func = inst.Functions[default( FUNC_INDEX ).Run()];
func.Call( arg_span, inst );
}
}
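To make the convention concrete, here's a hypothetical encoding of local_4 = foo(local_2, local_3), where foo is function 5 and the callee's frame begins at slot 8. GetFrame_I64 and SetFrame_I64 are assumed analogues of the f32 frame accessors above; all of the indices are made up for illustration.

SetFrame_I64<D9, GetFrame_I64<D2>>   // callee F1 (slot 9) = first argument
SetFrame_I64<DA, GetFrame_I64<D3>>   // callee F2 (slot 10) = second argument
StaticCall<D5, D8>                   // run function 5 on the slice starting at slot 8
SetFrame_I64<D4, GetFrame_I64<D8>>   // local_4 = callee F0, the return value

Memory operations bounds-check their address as they go:

struct Memory_I32_Load<ADDR, OFFSET> : Expr<int>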
where ADDR : struct, Expr<int>
where OFFSET : struct, Const
{
[MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )]
public int Run( ref Registers reg, Span<long> frame, WasmInstance inst )
{
uint addr = (uint)default( ADDR ).Run( ref reg, frame, inst );
uint offset = (uint)default( OFFSET ).Run();
return BitConverter.ToInt32( inst.Memory.AsSpan( (int)checked(addr + offset) ) );
}
}
struct Memory_F64_Store<VALUE, ADDR, OFFSET> : Stmt
where VALUE : struct, Expr<double>
where ADDR : struct, Expr<int>
where OFFSET : struct, Const
{
[MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )]
public void Run( ref Registers reg, Span<long> frame, WasmInstance inst )
{
double value = default( VALUE ).Run( ref reg, frame, inst );
uint addr = (uint)default( ADDR ).Run( ref reg, frame, inst );
uint offset = (uint)default( OFFSET ).Run();
if ( !BitConverter.TryWriteBytes( inst.Memory.AsSpan( (int)checked(addr + offset) ), value ) )
{
throw new IndexOutOfRangeException();
}
}
}
Originally, statements were chained together: each took a NEXT parameter which a statement calls when it is finished. In many cases, a statement would fail to inline another statement of the same type. After some hacking on the dotnet source code, I discovered the culprit: the inliner has rules against recursion on large generic types. I don't fully understand the rationale for this, but it wasn't hard to fix. First, I use trees for statements instead of chains. I created a Stmts bundle type that holds four statements.
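The bundle isn't shown above; here's a minimal sketch of one plausible definition, with an assumed no-op struct for padding out groups smaller than four:

struct StmtNop : Stmt
{
    public void Run( ref Registers reg, Span<long> frame, WasmInstance inst ) { }
}
struct Stmts<A, B, C, D> : Stmt
    where A : struct, Stmt where B : struct, Stmt where C : struct, Stmt where D : struct, Stmt
{
    [MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )]
    public void Run( ref Registers reg, Span<long> frame, WasmInstance inst )
    {
        // Four children per node keeps the generic nesting depth logarithmic
        // in the statement count, rather than linear as with a NEXT chain.
        default( A ).Run( ref reg, frame, inst );
        default( B ).Run( ref reg, frame, inst );
        default( C ).Run( ref reg, frame, inst );
        default( D ).Run( ref reg, frame, inst );
    }
}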

If we only had to deal with simple if statements and while loops, it would be easy. But in real code, we can enter several nested loops and conditionals, then break, continue, or return from inside them. The Relooper and Stackifier algorithms were invented to solve these types of problems, but both require a break construct. We could throw exceptions to handle these constructs, but that would be extremely expensive.

Instead, each function's blocks run inside a dispatch loop. Here's the one used for memcpy:
struct DispatchLoop10<
B0, B1, B2, B3, B4, B5, B6, B7, B8, B9
> : Stmt
where B0 : struct, Terminator where B1 : struct, Terminator where B2 : struct, Terminator where B3 : struct, Terminator where B4 : struct, Terminator
where B5 : struct, Terminator where B6 : struct, Terminator where B7 : struct, Terminator where B8 : struct, Terminator where B9 : struct, Terminator
{
[MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )]
public void Run( ref Registers reg, Span<long> frame, WasmInstance inst )
{
for (; ; )
{
switch ( reg.NextBlock )
{
case 0: default( B0 ).Run( ref reg, frame, inst ); break;
case 1: default( B1 ).Run( ref reg, frame, inst ); break;
case 2: default( B2 ).Run( ref reg, frame, inst ); break;
case 3: default( B3 ).Run( ref reg, frame, inst ); break;
case 4: default( B4 ).Run( ref reg, frame, inst ); break;
case 5: default( B5 ).Run( ref reg, frame, inst ); break;
case 6: default( B6 ).Run( ref reg, frame, inst ); break;
case 7: default( B7 ).Run( ref reg, frame, inst ); break;
case 8: default( B8 ).Run( ref reg, frame, inst ); break;
case 9: default( B9 ).Run( ref reg, frame, inst ); break;
default: return;
}
}
}
}

Each terminator writes to Registers.NextBlock to tell the loop what to do. Returns indicate a return by writing -1, which hits the default case and exits the loop. Functions with too many blocks for the fixed-size loops fall back to a DispatchLoopArray, which can hold an arbitrary number of terminators in an array.
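DispatchLoopArray isn't shown above; here's one plausible shape, assuming the terminators are boxed behind the Terminator interface and looked up through a per-function table (inst.BlockArrays is a made-up name):

// Hypothetical fallback: boxed terminators trade inlining for unbounded size.
struct DispatchLoopArray<FUNC> : Stmt where FUNC : struct, Const
{
    public void Run( ref Registers reg, Span<long> frame, WasmInstance inst )
    {
        Terminator[] blocks = inst.BlockArrays[(int)default( FUNC ).Run()];
        // -1 (or any out-of-range index) ends the loop, mirroring the default case above.
        while ( (uint)reg.NextBlock < (uint)blocks.Length )
        {
            blocks[reg.NextBlock].Run( ref reg, frame, inst );
        }
    }
}

Jump tables clamp their selector into range and pick a block relative to a base index:

struct TermJumpTable<SEL, BASE, COUNT, BODY> : Terminator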
where SEL : struct, Expr<int>
where BASE : struct, Const
where COUNT : struct, Const
where BODY : struct, Stmt
{
[MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )]
public void Run( ref Registers reg, Span<long> frame, WasmInstance inst )
{
default( BODY ).Run( ref reg, frame, inst );
uint sel = (uint)default( SEL ).Run( ref reg, frame, inst );
uint base_block = (uint)default( BASE ).Run();
uint max = (uint)default( COUNT ).Run() - 1;
reg.NextBlock = (int)(base_block + uint.Min( sel, max ));
}
}
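Other terminators follow the same pattern. They aren't shown above, so the two below are reconstructions based on the NextBlock convention rather than the exact definitions:

struct TermJump<TARGET, BODY> : Terminator
    where TARGET : struct, Const
    where BODY : struct, Stmt
{
    [MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )]
    public void Run( ref Registers reg, Span<long> frame, WasmInstance inst )
    {
        // Run the block's body, then hand control to a fixed successor.
        default( BODY ).Run( ref reg, frame, inst );
        reg.NextBlock = (int)default( TARGET ).Run();
    }
}
struct TermReturn<BODY> : Terminator where BODY : struct, Stmt
{
    [MethodImpl( MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization )]
    public void Run( ref Registers reg, Span<long> frame, WasmInstance inst )
    {
        default( BODY ).Run( ref reg, frame, inst );
        reg.NextBlock = -1; // -1 tells the dispatch loop to return
    }
}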
Here is the memcpy function from earlier, with the previous section marked in green.
In principle, structured control flow could be recovered from these blocks, but there are a lot of fiddly conditions: both sides of an if need to lead to the same place, for instance. I was not able to summon the motivation to do this after many other failed optimization efforts. Which brings us to:

Performance here lives and dies by the JIT. I hoped for the Registers struct to lower to native registers, while the control flow strategy might help or hinder inlining performance. In this section I'll list some ideas I had that didn't work out.
The first idea: marking selected statements with MethodImplOptions.NoInlining. I spent a lot of time on this, and it never seemed to help. Combining it with the structured control flow recovery only made it more difficult. I never considered the outer dispatch loop, which I suspect was a big mistake.

I also don't know whether WebAssembly's SIMD instructions could be implemented with System.Numerics.Vector, and how fast they would run if so. SIMD values break the assumption that everything fits in a long. Maybe SIMD could use a pair of longs, but garbage collection actually needs reference types.