// Editor utility that manages the lifecycle of a single Vast.ai rig session. It can rent an instance, record it in a local ledger, wait for the remote server to boot, upload a mesh, trigger a remote rig or provision, poll status, download results, and finally destroy the instance (with verified destruction); it can also operate on an existing instance without renting.
using System;
using System.IO;
using System.Net.Http;
using System.Threading.Tasks;
using AutoRig.Analyze;
using AutoRig.Rig;
using AutoRig.Vast;
using global::Editor;
using Sandbox;
namespace Editor.AutoRig.Vast;
/// <summary>
/// One cloud rig, cradle to grave: rent → ledger → boot → upload → rig →
/// download → DESTROY (verified, in finally). The ledger records exactly the
/// instance we created; if verified destruction fails the ledger survives so
/// the stale-rental prompt can finish the job next time - the user's other
/// instances are never enumerated, never touched.
/// </summary>
public sealed class VastRigSession
{
    /// <summary>Pause between two consecutive polls (instance state or remote /status).</summary>
    static readonly TimeSpan PollInterval = TimeSpan.FromSeconds( 10 );

    /// <summary>Per-request timeout for upload / status / result / provision calls.</summary>
    static readonly TimeSpan TransferTimeout = TimeSpan.FromMinutes( 5 );

    /// <summary>Per-request timeout for the /health probe issued while waiting for boot.</summary>
    static readonly TimeSpan HealthTimeout = TimeSpan.FromSeconds( 10 );

    /// <summary>Location of the ownership ledger: "autorig_dl/vast_rented.json" under the
    /// current project's assets path, or under "." when no project/assets path is available.</summary>
    public static string LedgerPath => Path.Combine(
        Project.Current?.GetAssetsPath() ?? ".", "autorig_dl", "vast_rented.json" );

    /// <summary>Adjustable via the settings dialog (VastSettings.Apply on load).</summary>
    public static TimeSpan BootTimeout { get; set; } = TimeSpan.FromMinutes( 12 );

    /// <summary>Upper bound for the remote rig / provision polling loops.</summary>
    public static TimeSpan RigTimeout { get; set; } = TimeSpan.FromMinutes( 30 );

    readonly VastClient _client;
    readonly Action<string> _progress;

    /// <summary>Creates a session bound to one API client.</summary>
    /// <param name="client">Client used for every Vast API call made by this session.</param>
    /// <param name="progress">Receives human-readable status lines; null means "discard them".</param>
    public VastRigSession( VastClient client, Action<string> progress )
    {
        _client = client;
        _progress = progress ?? (_ => { });
    }

    /// <summary>Null when no rental is outstanding.</summary>
    public static VastRental StaleRental()
        => File.Exists( LedgerPath ) ? VastRental.Parse( File.ReadAllText( LedgerPath ) ) : null;

    /// <summary>Destroys a stale rental (verified); clears the ledger on success.</summary>
    /// <param name="rental">The ledger entry to destroy; must not be null.</param>
    /// <returns>True when the client verified the destruction, false otherwise.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="rental"/> is null.</exception>
    public async Task<bool> DestroyStale( VastRental rental )
    {
        // StaleRental() legitimately returns null; fail with a clear message instead of an NRE.
        ArgumentNullException.ThrowIfNull( rental );

        var gone = await _client.DestroyVerified( rental.InstanceId );
        if ( gone )
            ClearLedger();
        return gone;
    }

    /// <summary>Rents the given offer, rigs the analysed mesh on it and always attempts a
    /// verified destroy of that instance afterwards, whether the rig succeeded or not.</summary>
    /// <param name="analysis">Analysis whose mesh is exported and uploaded.</param>
    /// <param name="modelId">Model identifier sent to the remote server and the on-start script.</param>
    /// <param name="modelTitle">Label stored in the ledger and passed on to the result conversion.</param>
    /// <param name="offer">The offer to rent.</param>
    /// <returns>The converted remote rig result.</returns>
    /// <exception cref="TimeoutException">Boot or remote rig exceeded its timeout.</exception>
    /// <exception cref="HttpRequestException">A request to the remote server failed.</exception>
    public async Task<RigResult> RigAsync(
        AnalysisResult analysis, string modelId, string modelTitle, VastOffer offer )
    {
        var objBytes = ObjWriter.Write( analysis.Mesh );

        _progress( $"renting {offer.GpuName} (${offer.DollarsPerHour:0.00}/hr)…" );
        var instanceId = await _client.CreateInstance(
            offer.Id, VastProtocol.DockerImage,
            VastProtocol.BuildOnStart( modelId ), VastProtocol.DiskGb );

        // From here on the instance exists and costs money, so EVERYTHING - including the
        // ledger write itself - runs under the finally that destroys it.
        try
        {
            // Ownership ledger BEFORE anything else can fail.
            Directory.CreateDirectory( Path.GetDirectoryName( LedgerPath )! );
            File.WriteAllText( LedgerPath, new VastRental
            {
                InstanceId = instanceId,
                OfferId = offer.Id,
                CreatedUtc = DateTime.UtcNow.ToString( "o" ),
                Label = modelTitle,
            }.Serialize() );

            var endpoint = await WaitForBoot( instanceId );
            return await RunRemoteRig(
                endpoint, objBytes, analysis, modelId, modelTitle, "rigging remotely…" );
        }
        finally
        {
            _progress( $"destroying instance {instanceId}…" );
            if ( await TryDestroyVerified( instanceId ) )
            {
                ClearLedger();
                _progress( $"instance {instanceId} DESTROYED (verified)." );
            }
            else
            {
                _progress( $"WARNING: could not verify destruction of instance {instanceId} - "
                    + "it stays in the ledger; you will be prompted to retry. "
                    + "Check console.vast.ai to avoid charges." );
            }
        }
    }

    /// <summary>Offloads a rig onto an instance the user ALREADY has running:
    /// upload → rig → download. By default it NEVER rents, NEVER destroys, and
    /// NEVER writes the ledger - the box keeps running afterwards (the point of
    /// offload is to reuse a machine that already has the model installed).
    /// When <paramref name="destroyAfter"/> is set the caller has explicitly
    /// opted in to destroying THIS instance (verified) once the rig is done -
    /// still only ever the exact id passed here.</summary>
    /// <param name="analysis">Analysis whose mesh is exported and uploaded.</param>
    /// <param name="modelId">Model identifier sent to the remote server.</param>
    /// <param name="modelTitle">Title passed on to the result conversion.</param>
    /// <param name="instanceId">Id of the already-running instance to use.</param>
    /// <param name="destroyAfter">When true, attempt a verified destroy of the instance afterwards.</param>
    /// <returns>The converted remote rig result.</returns>
    /// <exception cref="TimeoutException">Connecting or remote rig exceeded its timeout.</exception>
    /// <exception cref="HttpRequestException">A request to the remote server failed.</exception>
    public async Task<RigResult> RigOnExisting(
        AnalysisResult analysis, string modelId, string modelTitle, long instanceId,
        bool destroyAfter = false )
    {
        var objBytes = ObjWriter.Write( analysis.Mesh );
        try
        {
            _progress( $"connecting to instance {instanceId}…" );
            var endpoint = await WaitForBoot( instanceId ); // health-poll, reused as-is

            // The box auto-provisions the model if it isn't installed yet.
            return await RunRemoteRig(
                endpoint, objBytes, analysis, modelId, modelTitle, "rigging on your instance…" );
        }
        finally
        {
            if ( destroyAfter )
            {
                _progress( $"destroying instance {instanceId} (destroy-after-rig)…" );
                _progress( await TryDestroyVerified( instanceId )
                    ? $"instance {instanceId} destroyed (verified)."
                    : $"WARNING: could not verify destruction of {instanceId} - check console.vast.ai." );
            }
            else
            {
                _progress( $"done - instance {instanceId} left running (offload never destroys it)." );
            }
        }
    }

    /// <summary>Installs a model onto an instance the user already has running
    /// (POST /provision), polling until the box reports it provisioned. Never
    /// rents, never destroys - it just makes the box ready to offload that model
    /// (a box can hold several). Downloads can be slow, so it waits generously.</summary>
    /// <param name="instanceId">Id of the already-running instance.</param>
    /// <param name="modelId">Model identifier to provision.</param>
    /// <exception cref="TimeoutException">Connecting or provisioning exceeded its timeout.</exception>
    /// <exception cref="FormatException">The remote status reported an error.</exception>
    /// <exception cref="HttpRequestException">A request to the remote server failed.</exception>
    public async Task ProvisionOnExisting( long instanceId, string modelId )
    {
        _progress( $"connecting to instance {instanceId}…" );
        var endpoint = await WaitForBoot( instanceId );

        using var http = new HttpClient { Timeout = TransferTimeout };

        _progress( $"provisioning {modelId} (cloning repo + downloading checkpoints)…" );
        using ( var emptyBody = new ByteArrayContent( Array.Empty<byte>() ) )
        using ( var post = await http.PostAsync(
            $"{endpoint}/provision?model={Uri.EscapeDataString( modelId )}", emptyBody ) )
        {
            post.EnsureSuccessStatusCode();
        }

        var finalStatus = await WaitForStatus( http, endpoint, "remote provision timed out.",
            s => s.Contains( "\"provisioned" ) || s.Contains( "\"error\"" ) );

        // "provisioned" wins over "error" when a status happens to contain both.
        if ( !finalStatus.Contains( "\"provisioned" ) )
            throw new FormatException( $"remote provision failed: {finalStatus}" );

        _progress( $"{modelId} provisioned - you can offload rigs to this box now." );
    }

    /// <summary>Shared upload → poll → download sequence against a server that is already up.</summary>
    /// <param name="endpoint">Base URL returned by <see cref="WaitForBoot"/>.</param>
    /// <param name="objBytes">Exported mesh, sent as the body of POST /rig.</param>
    /// <param name="analysis">Forwarded to the result conversion.</param>
    /// <param name="modelId">Sent (URL-escaped) as the "model" query parameter.</param>
    /// <param name="modelTitle">Forwarded to the result conversion.</param>
    /// <param name="riggingMessage">Progress line shown while the remote rig is running.</param>
    async Task<RigResult> RunRemoteRig(
        string endpoint, byte[] objBytes, AnalysisResult analysis,
        string modelId, string modelTitle, string riggingMessage )
    {
        using var http = new HttpClient { Timeout = TransferTimeout };

        _progress( "uploading mesh…" );
        using ( var meshBody = new ByteArrayContent( objBytes ) )
        using ( var upload = await http.PostAsync(
            $"{endpoint}/rig?model={Uri.EscapeDataString( modelId )}", meshBody ) )
        {
            upload.EnsureSuccessStatusCode();
        }

        _progress( riggingMessage );
        // Both outcomes end the wait; an error surfaces below when the result is parsed.
        await WaitForStatus( http, endpoint, "remote rig timed out.",
            s => s.Contains( "\"done\"" ) || s.Contains( "\"error\"" ) );

        _progress( "downloading result…" );
        var resultJson = await http.GetStringAsync( $"{endpoint}/result" );
        var remote = VastProtocol.ParseResult( resultJson ); // throws w/ remote log on error
        return VastProtocol.ToRigResult( analysis, remote, modelTitle );
    }

    /// <summary>Polls GET /status every <see cref="PollInterval"/> until
    /// <paramref name="isTerminal"/> accepts the response body, bounded by <see cref="RigTimeout"/>.</summary>
    /// <returns>The first status body accepted by <paramref name="isTerminal"/>.</returns>
    /// <exception cref="TimeoutException">The deadline passed before a terminal status was seen.</exception>
    static async Task<string> WaitForStatus(
        HttpClient http, string endpoint, string timeoutMessage, Func<string, bool> isTerminal )
    {
        var deadline = DateTime.UtcNow + RigTimeout;
        while ( true )
        {
            if ( DateTime.UtcNow > deadline )
                throw new TimeoutException( timeoutMessage );

            await Task.Delay( PollInterval );

            var status = await http.GetStringAsync( $"{endpoint}/status" );
            if ( isTerminal( status ) )
                return status;
        }
    }

    /// <summary>Polls the instance until it reports "running", exposes the server port and its
    /// /health endpoint answers with "ready"; bounded by <see cref="BootTimeout"/>.</summary>
    /// <returns>The server's base URL ("http://ip:port").</returns>
    /// <exception cref="TimeoutException">The instance did not become ready in time.</exception>
    async Task<string> WaitForBoot( long instanceId )
    {
        _progress( "waiting for the instance to boot…" );
        var deadline = DateTime.UtcNow + BootTimeout;
        using var http = new HttpClient { Timeout = HealthTimeout };

        while ( DateTime.UtcNow < deadline )
        {
            await Task.Delay( PollInterval );

            var state = await _client.GetInstance( instanceId );

            // A missing IP or port map just means "not ready yet" - keep polling
            // rather than dereferencing null.
            if ( state is null || state.ActualStatus != "running"
                || state.PublicIp is not { Length: > 0 }
                || state.Ports is null
                || !state.Ports.TryGetValue( VastProtocol.ServerPort, out var hostPort ) )
                continue;

            var endpoint = $"http://{state.PublicIp}:{hostPort}";
            try
            {
                if ( (await http.GetStringAsync( $"{endpoint}/health" )).Contains( "ready" ) )
                {
                    _progress( "instance is up." );
                    return endpoint;
                }
            }
            catch
            {
                // Deliberate best-effort: any failure here (refused connection, probe
                // timeout, …) is treated as "server not up yet" and retried until the deadline.
            }
        }

        throw new TimeoutException( "vast.ai instance did not become ready in time." );
    }

    /// <summary>Verified destroy that never throws, for use inside finally blocks: an
    /// exception from the client is reported through progress and counted as "not verified",
    /// so it cannot replace the rig's own exception or discard a finished result.</summary>
    async Task<bool> TryDestroyVerified( long instanceId )
    {
        try
        {
            return await _client.DestroyVerified( instanceId );
        }
        catch ( Exception e )
        {
            _progress( $"destroy request for instance {instanceId} failed: {e.Message}" );
            return false;
        }
    }

    /// <summary>Deletes the ledger file if it exists.</summary>
    static void ClearLedger()
    {
        if ( File.Exists( LedgerPath ) )
            File.Delete( LedgerPath );
    }
}