// SharpTalkSpeaker.cs
using System;
using System.Threading.Tasks;
using SharpTalk;

namespace Sandbox;

/// <summary>
/// Component that synthesizes speech with a SharpTalk <c>TtsEngine</c> and streams the
/// resulting samples through a <c>SoundStream</c> parented to this component's GameObject.
/// Phoneme timing events are replayed from <see cref="OnUpdate"/> via <see cref="OnPhoneme"/>.
/// </summary>
public sealed class SharpTalkSpeaker : Component
{
	/// <summary>Text spoken once the engine has been initialized; ignored when null or whitespace.</summary>
	[Property] public string SpeakOnStart { get; set; } = "";

	/// <summary>Value copied to the voice's <c>Rate</c> by <see cref="ApplyVoice"/>, <see cref="SetVoice"/> and on start.</summary>
	[Property, Range( 40, 600 )] public int Rate { get; set; } = 200;

	/// <summary>Value copied to the voice's <c>PitchHz</c> by <see cref="ApplyVoice"/>, <see cref="SetVoice"/> and on start.</summary>
	[Property, Range( 40, 500 )] public int PitchHz { get; set; } = 122;

	/// <summary>Value copied to the voice's <c>VGain</c> by <see cref="ApplyVoice"/>, <see cref="SetVoice"/> and on start.</summary>
	[Property, Range( 0, 100 )] public int VoiceVolume { get; set; } = 80;

	/// <summary>Volume passed to the sound stream when playback of an utterance starts.</summary>
	[Property, Range( 0f, 2f )] public float AudioVolume { get; set; } = 1f;

	/// <summary>Fired on the main thread as each phoneme starts playing.</summary>
	public event Action<PhonemeEvent> OnPhoneme;

	TtsEngine _engine;
	SoundHandle _handle;
	bool _speaking;
	PhonemeEvent[] _phonemeEvents = Array.Empty<PhonemeEvent>();
	int _nextPhonemeIndex;

	// Incremented by every Stop() (and therefore every Speak()). An in-flight Speak()
	// compares its captured value against this to detect that it has been superseded.
	int _speakGeneration;

	/// <summary>True from the start of <see cref="Speak"/> until its samples have all been queued, or until <see cref="Stop"/>.</summary>
	public bool IsSpeaking => _speaking;

	/// <summary>Initializes the engine; failures are logged instead of propagating.</summary>
	protected override void OnStart()
	{
		try { InitEngine(); }
		catch ( Exception e ) { Log.Error( $"SharpTalkSpeaker: OnStart threw — {e}" ); }
	}

	/// <summary>Stops playback and cancels any in-flight <see cref="Speak"/> when the component goes away.</summary>
	protected override void OnDestroy()
	{
		Stop();
	}

	/// <summary>
	/// Loads the lexicon and symbol tables from the mounted filesystem, builds the engine with the
	/// baseline voice configured from the current properties, then speaks <see cref="SpeakOnStart"/>.
	/// Leaves the engine null (and logs) when either data file is empty.
	/// </summary>
	void InitEngine()
	{
		var dict    = FileSystem.Mounted.ReadAllBytes( "sharptalk/english_lex.bin" ).ToArray();
		var symbols = FileSystem.Mounted.ReadAllBytes( "sharptalk/symbols.bin" ).ToArray();

		if ( dict.Length == 0 )    { Log.Error( "SharpTalkSpeaker: english_lex.bin not found or empty" ); return; }
		if ( symbols.Length == 0 ) { Log.Error( "SharpTalkSpeaker: symbols.bin not found or empty" ); return; }

		var voice = VoiceData.BaselineVoice;
		voice.Rate    = (short)Rate;
		voice.PitchHz = (short)PitchHz;
		voice.VGain   = (short)VoiceVolume;

		_engine = new TtsEngine( voice, dict, symbols );
		Log.Info( $"SharpTalkSpeaker: engine initialized (dict={dict.Length}b, symbols={symbols.Length}b)" );

		if ( !string.IsNullOrWhiteSpace( SpeakOnStart ) )
			_ = Speak( SpeakOnStart );
	}

	/// <summary>
	/// Synthesizes <paramref name="text"/> on a worker thread and streams the samples to a new sound.
	/// Any previous utterance is stopped first. If <see cref="Stop"/> or another <see cref="Speak"/>
	/// is called while this one is still synthesizing or streaming, this call ends early without
	/// touching the newer call's state.
	/// </summary>
	/// <param name="text">Text to speak.</param>
	/// <returns>A task that completes once all samples are queued, or the utterance is cancelled or fails to synthesize.</returns>
	public async Task Speak( string text )
	{
		if ( _engine is null ) { Log.Error( "SharpTalkSpeaker: Speak() called but engine is null — was OnStart run?" ); return; }

		Stop();
		int generation = _speakGeneration; // captured after Stop() bumped it
		_speaking = true;

		// NOTE(review): a superseded call may still be synthesizing on its worker thread when this
		// one starts; whether TtsEngine tolerates concurrent SpeakWithEvents calls is not visible here.
		Log.Info( $"SharpTalkSpeaker: synthesizing \"{text}\"" );
		short[] samples;
		PhonemeEvent[] events;
		try
		{
			(samples, events) = await GameTask.RunInThreadAsync( () => _engine.SpeakWithEvents( text ) );
		}
		catch ( Exception e )
		{
			Log.Error( $"SharpTalkSpeaker: synthesis threw — {e}" );
			if ( generation == _speakGeneration ) _speaking = false;
			return;
		}

		// Stop() or a newer Speak() ran while we were synthesizing: discard this result.
		if ( generation != _speakGeneration ) return;

		_phonemeEvents = events;
		_nextPhonemeIndex = 0;

		// Track the peak as int: |short.MinValue| is 32768 and does not fit in a short.
		int peak = 0;
		foreach ( var s in samples ) peak = Math.Max( peak, Math.Abs( (int)s ) );
		Log.Info( $"SharpTalkSpeaker: got {samples.Length} samples, {events.Length} phoneme events, peak={peak}, streaming…" );

		try
		{
			using var stream = new SoundStream( TtsEngine.SampleRate, 1 );
			_handle = stream.Play( AudioVolume, 1f );
			_handle.SpacialBlend = 0f;
			_handle.DistanceAttenuation = false;
			_handle.Occlusion = false;
			_handle.SetParent( GameObject );
			_handle.FollowParent = true;
			_handle.Update();

			int offset = 0;
			while ( offset < samples.Length )
			{
				// Superseded or stopped: bail out instead of polling a stream nobody is draining.
				if ( generation != _speakGeneration ) return;

				int space = stream.MaxWriteSampleCount - stream.QueuedSampleCount;
				if ( space <= 0 )
				{
					await GameTask.Delay( 5 );
					continue;
				}
				int count = Math.Min( space, samples.Length - offset );
				stream.WriteData( samples.AsSpan( offset, count ) );
				offset += count;
			}

			stream.Close();
			Log.Info( "SharpTalkSpeaker: done" );
		}
		finally
		{
			// Only clear the flag if no newer Speak() has taken over in the meantime.
			if ( generation == _speakGeneration ) _speaking = false;
		}
	}

	/// <summary>
	/// Raises <see cref="OnPhoneme"/> for every pending phoneme event whose timestamp is at or
	/// before the current sound's elapsed time. Does nothing without subscribers or a valid handle.
	/// </summary>
	protected override void OnUpdate()
	{
		if ( OnPhoneme is null || _nextPhonemeIndex >= _phonemeEvents.Length ) return;
		if ( _handle is null || !_handle.IsValid ) return;

		float t = _handle.ElapsedTime;
		while ( _nextPhonemeIndex < _phonemeEvents.Length && _phonemeEvents[_nextPhonemeIndex].TimeSeconds <= t )
			OnPhoneme?.Invoke( _phonemeEvents[_nextPhonemeIndex++] );
	}

	/// <summary>
	/// Stops the current sound immediately, cancels any in-flight <see cref="Speak"/> call,
	/// and discards phoneme events that have not been raised yet.
	/// </summary>
	public void Stop()
	{
		_speakGeneration++; // invalidates every Speak() currently synthesizing or streaming

		if ( _handle != null && _handle.IsValid && _handle.IsPlaying )
			_handle.Stop( 0f );

		// Drop the stopped utterance's remaining events so they cannot fire after Stop().
		_phonemeEvents = Array.Empty<PhonemeEvent>();
		_nextPhonemeIndex = 0;
		_speaking = false;
	}

	/// <summary>
	/// Switches the engine to the whisper or baseline voice, configured from the current
	/// <see cref="Rate"/>, <see cref="PitchHz"/> and <see cref="VoiceVolume"/>. No-op before the engine exists.
	/// </summary>
	/// <param name="whisper">True selects <c>VoiceData.WhisperVoice</c>; false selects <c>VoiceData.BaselineVoice</c>.</param>
	public void SetVoice( bool whisper )
	{
		if ( _engine is null ) return;

		var voice = whisper ? VoiceData.WhisperVoice : VoiceData.BaselineVoice;
		voice.Rate = (short)Rate;
		voice.PitchHz = (short)PitchHz;
		voice.VGain = (short)VoiceVolume;

		_engine.Voice = voice;
	}

	/// <summary>
	/// Copies the current <see cref="Rate"/>, <see cref="PitchHz"/> and <see cref="VoiceVolume"/>
	/// onto the engine's active voice. No-op before the engine exists.
	/// </summary>
	public void ApplyVoice()
	{
		if ( _engine is null ) return;
		var v = _engine.Voice;
		v.Rate    = (short)Rate;
		v.PitchHz = (short)PitchHz;
		v.VGain   = (short)VoiceVolume;
		_engine.Voice = v;
	}
}