Editor/AutoRig/Vast/VastRigSession.cs

Editor utility that manages a single Vast.ai rig session lifecycle. It can rent an instance, write a local ledger, wait for the remote server to boot, upload a mesh, trigger a remote rig/provision, poll status, download results, and finally destroy the instance (with verified destruction), or operate on an existing instance without renting.

Networking · File Access · Http Calls
using System;
using System.IO;
using System.Net.Http;
using System.Threading.Tasks;
using AutoRig.Analyze;
using AutoRig.Rig;
using AutoRig.Vast;
using global::Editor;
using Sandbox;

namespace Editor.AutoRig.Vast;

/// <summary>
/// One cloud rig, cradle to grave: rent → ledger → boot → upload → rig →
/// download → DESTROY (verified, in finally). The ledger records exactly the
/// instance we created; if verified destruction fails the ledger survives so
/// the stale-rental prompt can finish the job next time - the user's other
/// instances are never enumerated, never touched.
/// </summary>
public sealed class VastRigSession
{
    /// <summary>Pause between two consecutive polls (instance state or remote /status).</summary>
    static readonly TimeSpan PollInterval = TimeSpan.FromSeconds( 10 );

    /// <summary>Per-request timeout for upload / status / result calls to the remote server.</summary>
    static readonly TimeSpan RequestTimeout = TimeSpan.FromMinutes( 5 );

    /// <summary>Per-request timeout for the /health probe while waiting for boot.</summary>
    static readonly TimeSpan HealthProbeTimeout = TimeSpan.FromSeconds( 10 );

    /// <summary>
    /// Location of the ownership ledger: <c>autorig_dl/vast_rented.json</c> under the
    /// current project's assets path, or under <c>"."</c> when that path is unavailable.
    /// </summary>
    public static string LedgerPath => Path.Combine(
        Project.Current?.GetAssetsPath() ?? ".", "autorig_dl", "vast_rented.json" );

    /// <summary>Adjustable via the settings dialog (VastSettings.Apply on load).</summary>
    public static TimeSpan BootTimeout { get; set; } = TimeSpan.FromMinutes( 12 );

    /// <summary>Upper bound for the remote rig poll loop; also used for provisioning.</summary>
    public static TimeSpan RigTimeout { get; set; } = TimeSpan.FromMinutes( 30 );

    readonly VastClient _client;
    readonly Action<string> _progress;

    /// <summary>Creates a session bound to one API client.</summary>
    /// <param name="client">Vast.ai API client used to rent, query and destroy instances.</param>
    /// <param name="progress">Receives human-readable progress lines; null means "discard".</param>
    /// <exception cref="ArgumentNullException"><paramref name="client"/> is null.</exception>
    public VastRigSession( VastClient client, Action<string> progress )
    {
        // Every instance method needs the client, so fail here rather than mid-rental.
        _client = client ?? throw new ArgumentNullException( nameof( client ) );
        _progress = progress ?? (_ => { });
    }

    /// <summary>The rental recorded in the ledger; null when no ledger file exists.</summary>
    public static VastRental StaleRental()
        => File.Exists( LedgerPath ) ? VastRental.Parse( File.ReadAllText( LedgerPath ) ) : null;

    /// <summary>Destroys a stale rental (verified); clears the ledger on success.</summary>
    /// <returns>True when destruction was verified, false otherwise.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="rental"/> is null.</exception>
    public async Task<bool> DestroyStale( VastRental rental )
    {
        if ( rental is null )
            throw new ArgumentNullException( nameof( rental ) );

        var gone = await _client.DestroyVerified( rental.InstanceId );
        if ( gone )
            ClearLedger();
        return gone;
    }

    /// <summary>
    /// Rents the given offer, rigs the analysed mesh on it and always attempts a
    /// verified destroy of the rented instance afterwards, whether or not the rig
    /// succeeded.
    /// </summary>
    /// <param name="analysis">Analysis whose mesh is serialised and uploaded.</param>
    /// <param name="modelId">Model identifier sent to the on-start script and the /rig endpoint.</param>
    /// <param name="modelTitle">Display title; stored as the ledger label and passed to the result conversion.</param>
    /// <param name="offer">The offer to rent.</param>
    /// <returns>The rig result converted from the remote response.</returns>
    /// <exception cref="TimeoutException">The instance did not boot, or the rig did not finish, in time.</exception>
    public async Task<RigResult> RigAsync(
        AnalysisResult analysis, string modelId, string modelTitle, VastOffer offer )
    {
        // Serialise before renting: a mesh that cannot be written must not cost money.
        var objBytes = ObjWriter.Write( analysis.Mesh );

        _progress( $"renting {offer.GpuName} (${offer.DollarsPerHour:0.00}/hr)…" );
        var instanceId = await _client.CreateInstance(
            offer.Id, VastProtocol.DockerImage,
            VastProtocol.BuildOnStart( modelId ), VastProtocol.DiskGb );

        // From here on the instance exists and is billed. EVERYTHING else - including
        // the ledger write itself, which can fail on disk errors - runs under the
        // finally so the instance is never left running unrecorded.
        var ledgerWritten = false;
        try
        {
            // Ownership ledger BEFORE anything else can fail.
            WriteLedger( new VastRental
            {
                InstanceId = instanceId,
                OfferId = offer.Id,
                CreatedUtc = DateTime.UtcNow.ToString( "o" ),
                Label = modelTitle,
            } );
            ledgerWritten = true;

            var endpoint = await WaitForBoot( instanceId );
            return await RigOnEndpoint(
                endpoint, analysis, modelId, modelTitle, objBytes, "rigging remotely…" );
        }
        finally
        {
            _progress( $"destroying instance {instanceId}…" );
            if ( await TryDestroyVerified( instanceId ) )
            {
                // Only clear a ledger that is known to describe THIS instance; if the
                // write failed the file (if any) may belong to an older rental.
                if ( ledgerWritten )
                    TryClearLedger();
                _progress( $"instance {instanceId} DESTROYED (verified)." );
            }
            else if ( ledgerWritten )
            {
                _progress( $"WARNING: could not verify destruction of instance {instanceId} - "
                    + "it stays in the ledger; you will be prompted to retry. "
                    + "Check console.vast.ai to avoid charges." );
            }
            else
            {
                _progress( $"WARNING: could not verify destruction of instance {instanceId} and it "
                    + "could not be recorded in the ledger - destroy it manually at "
                    + "console.vast.ai to avoid charges." );
            }
        }
    }

    /// <summary>Offloads a rig onto an instance the user ALREADY has running:
    /// upload → rig → download. By default it NEVER rents, NEVER destroys, and
    /// NEVER writes the ledger - the box keeps running afterwards (the point of
    /// offload is to reuse a machine that already has the model installed).
    /// When <paramref name="destroyAfter"/> is set the caller has explicitly
    /// opted in to destroying THIS instance (verified) once the rig is done -
    /// still only ever the exact id passed here.</summary>
    /// <exception cref="TimeoutException">The instance did not answer, or the rig did not finish, in time.</exception>
    public async Task<RigResult> RigOnExisting(
        AnalysisResult analysis, string modelId, string modelTitle, long instanceId,
        bool destroyAfter = false )
    {
        var objBytes = ObjWriter.Write( analysis.Mesh );
        try
        {
            _progress( $"connecting to instance {instanceId}…" );
            var endpoint = await WaitForBoot( instanceId );   // health-poll, reused as-is

            // The box auto-provisions the model if it isn't installed yet.
            return await RigOnEndpoint(
                endpoint, analysis, modelId, modelTitle, objBytes, "rigging on your instance…" );
        }
        finally
        {
            if ( destroyAfter )
            {
                _progress( $"destroying instance {instanceId} (destroy-after-rig)…" );
                _progress( await TryDestroyVerified( instanceId )
                    ? $"instance {instanceId} destroyed (verified)."
                    : $"WARNING: could not verify destruction of {instanceId} - check console.vast.ai." );
            }
            else
            {
                _progress( $"done - instance {instanceId} left running (offload never destroys it)." );
            }
        }
    }

    /// <summary>Installs a model onto an instance the user already has running
    /// (POST /provision), polling until the box reports it provisioned. Never
    /// rents, never destroys - it just makes the box ready to offload that model
    /// (a box can hold several). Downloads can be slow, so it waits up to
    /// <see cref="RigTimeout"/>.</summary>
    /// <exception cref="TimeoutException">The box did not report a final state in time.</exception>
    /// <exception cref="FormatException">The box reported an error while provisioning.</exception>
    public async Task ProvisionOnExisting( long instanceId, string modelId )
    {
        _progress( $"connecting to instance {instanceId}…" );
        var endpoint = await WaitForBoot( instanceId );
        using var http = new HttpClient { Timeout = RequestTimeout };

        _progress( $"provisioning {modelId} (cloning repo + downloading checkpoints)…" );
        using ( var emptyBody = new ByteArrayContent( Array.Empty<byte>() ) )
        using ( var post = await http.PostAsync(
            $"{endpoint}/provision?model={Uri.EscapeDataString( modelId )}", emptyBody ) )
        {
            post.EnsureSuccessStatusCode();
        }

        var status = await PollStatus( http, endpoint, "remote provision timed out.",
            s => s.Contains( "\"provisioned", StringComparison.Ordinal )
                || s.Contains( "\"error\"", StringComparison.Ordinal ) );

        // "provisioned" wins over "error" when a status mentions both.
        if ( !status.Contains( "\"provisioned", StringComparison.Ordinal ) )
            throw new FormatException( $"remote provision failed: {status}" );

        _progress( $"{modelId} provisioned - you can offload rigs to this box now." );
    }

    /// <summary>
    /// Shared upload → poll → download pipeline against an already-reachable server.
    /// </summary>
    /// <param name="endpoint">Base URL returned by <see cref="WaitForBoot"/>.</param>
    /// <param name="objBytes">Serialised mesh sent as the POST /rig body.</param>
    /// <param name="riggingMessage">Progress line shown while the remote rig runs.</param>
    async Task<RigResult> RigOnEndpoint(
        string endpoint, AnalysisResult analysis, string modelId, string modelTitle,
        byte[] objBytes, string riggingMessage )
    {
        using var http = new HttpClient { Timeout = RequestTimeout };

        _progress( "uploading mesh…" );
        using ( var body = new ByteArrayContent( objBytes ) )
        using ( var upload = await http.PostAsync(
            $"{endpoint}/rig?model={Uri.EscapeDataString( modelId )}", body ) )
        {
            upload.EnsureSuccessStatusCode();
        }

        _progress( riggingMessage );
        // Both outcomes end the wait; an "error" status is surfaced by ParseResult below.
        await PollStatus( http, endpoint, "remote rig timed out.",
            s => s.Contains( "\"done\"", StringComparison.Ordinal )
                || s.Contains( "\"error\"", StringComparison.Ordinal ) );

        _progress( "downloading result…" );
        var resultJson = await http.GetStringAsync( $"{endpoint}/result" );
        var remote = VastProtocol.ParseResult( resultJson );   // throws w/ remote log on error
        return VastProtocol.ToRigResult( analysis, remote, modelTitle );
    }

    /// <summary>
    /// Polls GET /status every <see cref="PollInterval"/> until
    /// <paramref name="isFinal"/> accepts the response body, for at most
    /// <see cref="RigTimeout"/>.
    /// </summary>
    /// <returns>The first status body accepted by <paramref name="isFinal"/>.</returns>
    /// <exception cref="TimeoutException">The deadline passed; carries <paramref name="timeoutMessage"/>.</exception>
    static async Task<string> PollStatus(
        HttpClient http, string endpoint, string timeoutMessage, Func<string, bool> isFinal )
    {
        var deadline = DateTime.UtcNow + RigTimeout;
        while ( true )
        {
            if ( DateTime.UtcNow > deadline )
                throw new TimeoutException( timeoutMessage );

            await Task.Delay( PollInterval );
            var status = await http.GetStringAsync( $"{endpoint}/status" );
            if ( isFinal( status ) )
                return status;
        }
    }

    /// <summary>
    /// Waits until the instance reports "running" with a public IP and a mapped
    /// server port, and its /health endpoint answers with "ready".
    /// </summary>
    /// <returns>The server's base URL (<c>http://ip:port</c>).</returns>
    /// <exception cref="TimeoutException"><see cref="BootTimeout"/> elapsed first.</exception>
    async Task<string> WaitForBoot( long instanceId )
    {
        _progress( "waiting for the instance to boot…" );
        var deadline = DateTime.UtcNow + BootTimeout;
        using var http = new HttpClient { Timeout = HealthProbeTimeout };
        while ( DateTime.UtcNow < deadline )
        {
            await Task.Delay( PollInterval );
            var state = await _client.GetInstance( instanceId );

            // A record that is missing, not running yet, or only partially populated
            // (no IP / no port map) just means "not ready" - keep polling.
            if ( state is null || state.ActualStatus != "running"
                || string.IsNullOrEmpty( state.PublicIp )
                || state.Ports is null
                || !state.Ports.TryGetValue( VastProtocol.ServerPort, out var hostPort ) )
                continue;

            var endpoint = $"http://{state.PublicIp}:{hostPort}";
            try
            {
                if ( (await http.GetStringAsync( $"{endpoint}/health" )).Contains( "ready" ) )
                {
                    _progress( "instance is up." );
                    return endpoint;
                }
            }
            catch
            {
                // Deliberate best-effort: connection refused / probe timeout simply
                // means the server is not up yet; the deadline bounds the retries.
            }
        }
        throw new TimeoutException( "vast.ai instance did not become ready in time." );
    }

    /// <summary>
    /// Verified destroy that never throws, for use inside <c>finally</c>: a failure
    /// of the destroy call is reported and treated as "not verified" so it cannot
    /// replace the exception that is already propagating.
    /// </summary>
    async Task<bool> TryDestroyVerified( long instanceId )
    {
        try
        {
            return await _client.DestroyVerified( instanceId );
        }
        catch ( Exception ex )
        {
            _progress( $"destroy request for instance {instanceId} failed: {ex.Message}" );
            return false;
        }
    }

    /// <summary>Writes the rental to <see cref="LedgerPath"/>, creating the folder if needed.</summary>
    static void WriteLedger( VastRental rental )
    {
        var path = LedgerPath;
        Directory.CreateDirectory( Path.GetDirectoryName( path )! );
        File.WriteAllText( path, rental.Serialize() );
    }

    /// <summary>
    /// Clears the ledger without throwing, for use inside <c>finally</c>; a file-system
    /// failure is reported through the progress callback instead.
    /// </summary>
    void TryClearLedger()
    {
        try
        {
            ClearLedger();
        }
        catch ( Exception ex ) when ( ex is IOException or UnauthorizedAccessException )
        {
            _progress( $"WARNING: the ledger could not be cleared ({ex.Message}); "
                + "you may be prompted about this rental again." );
        }
    }

    /// <summary>Deletes the ledger file if it exists.</summary>
    static void ClearLedger()
    {
        if ( File.Exists( LedgerPath ) )
            File.Delete( LedgerPath );
    }
}