// Code/SharpTalkSpeaker.cs
using System;
using System.Threading.Tasks;
using Sandbox;
using SharpTalk;

namespace SharpTalk;

/// <summary>
/// Selects where a speaker's voice parameters come from.
/// </summary>
public enum VoicePreset
{
	/// <summary>Use the built-in baseline voice.</summary>
	Baseline,

	/// <summary>Use the built-in whisper voice.</summary>
	Whisper,

	/// <summary>Use the individually edited voice parameters.</summary>
	Custom,
}

public sealed class SharpTalkSpeaker : Component
{
	// Spoken automatically by InitEngine once the engine exists; blank or whitespace disables it.
	[Property]
	public string SpeakOnStart { get; set; } = "";

	// Copied (cast to short) into the voice's Rate field.
	[Property]
	[Range( 40, 600 )]
	public int Rate { get; set; } = 200;

	// Copied (cast to short) into the voice's PitchHz field.
	[Property]
	[Range( 40, 500 )]
	public int PitchHz { get; set; } = 122;

	// Copied into the voice's TractScale field; also overwritten when a non-custom Preset is applied.
	[Property]
	[Range( 0.5f, 2.0f )]
	public float TractScale { get; set; } = 1.0f;

	// First argument handed to SoundStream.Play when speech starts.
	[Property]
	[Range( 0f, 2f  )]
	public float AudioVolume { get; set; } = 1f;

	// When true, Speak wraps the text in "[:klattsch on] ... [:klattsch off]" and prefixes the Kl* settings below.
	[Property]
	[Group( "Klattsch Mode" )]
	public bool KlattschMode { get; set; } = false;

	// Emitted as the "b" token (no decimals).
	[Property]
	[Group( "Klattsch Mode" )]
	[Range( 70, 320 )]
	public float KlBaseF0 { get; set; } = 120f;

	// Emitted as the "r" token (no decimals).
	[Property]
	[Group( "Klattsch Mode" )]
	[Range( 60, 500 )]
	public float KlRate { get; set; } = 110f;

	// Emitted as the "v" token (one decimal).
	[Property]
	[Group( "Klattsch Mode" )]
	[Range( 0, 15 )]
	public float KlVibrato { get; set; } = 0f;

	// Emitted as the "w" token (one decimal).
	[Property]
	[Group( "Klattsch Mode" )]
	[Range( 2, 10 )]
	public float KlVibRate { get; set; } = 5f;

	// Emitted as the "h" token (two decimals).
	[Property]
	[Group( "Klattsch Mode" )]
	[Range( 0, 1 )]
	public float KlAsp { get; set; } = 0f;

	// Emitted as the "t" token (two decimals).
	[Property]
	[Group( "Klattsch Mode" )]
	[Range( -0.9f, 0.9f )]
	public float KlTilt { get; set; } = 0f;

	// Emitted as the "g" token (two decimals).
	[Property]
	[Group( "Klattsch Mode" )]
	[Range( 0, 1 )]
	public float KlEffort { get; set; } = 0.5f;


	// True only while the Preset setter is copying preset values into the properties,
	// so that MarkCustom() does not flip the preset to Custom mid-copy.
	bool _applyingPreset;

	VoicePreset _preset = VoicePreset.Baseline;

	/// <summary>
	/// Currently selected voice preset. Assigning Baseline or Whisper copies that preset's
	/// values into the individual voice properties; assigning Custom only records the choice.
	/// </summary>
	[Property, Group( "1. Voice Definition" )]
	public VoicePreset Preset
	{
		get { return _preset; }
		set
		{
			_preset = value;

			if ( value != VoicePreset.Custom )
			{
				// Anything that is not Whisper falls back to the baseline voice.
				var source = value switch
				{
					VoicePreset.Whisper => VoiceData.WhisperVoice,
					_ => VoiceData.BaselineVoice,
				};

				_applyingPreset = true;

				// Voice definition
				Female          = source.VoiceType == 1;
				TractScale      = source.TractScale;
				VoicingGain     = source.VGain;
				AspirationGain  = source.AGain;
				AspirationCycle = source.ACycle;
				TremoloDepth    = source.TremoloDepth;
				TremoloRate     = source.TremoloRate;

				// Formants
				F4Freq  = source.F4Freq;
				F4BW    = source.F4BW;
				F5Freq  = source.F5Freq;
				F5BW    = source.F5BW;
				F4pFreq = source.F4pFreq;
				F4pBW   = source.F4pBW;
				F5pFreq = source.F5pFreq;
				F5pBW   = source.F5pBW;
				F6pFreq = source.F6pFreq;
				F6pBW   = source.F6pBW;
				BwGain1 = source.BwGain1;
				BwGain2 = source.BwGain2;
				BwGain3 = source.BwGain3;

				// Nasal
				NasalBase = source.NasalBase;
				NasalTarg = source.NasalTarg;
				NasalBW   = source.NasalBW;
				NGain     = source.NGain;

				// Intonation
				PitchRange   = source.PitchRange;
				StressGain   = source.StressGain;
				Intonation   = source.Intonation;
				RiseAmt      = source.RiseAmt;
				FallAmt      = source.FallAmt;
				BaselineFall = source.BaselineFall;

				_applyingPreset = false;
			}
		}
	}

	// Called by every voice-parameter setter: a manual edit switches the preset to Custom,
	// unless the edit comes from the Preset setter itself.
	void MarkCustom()
	{
		if ( _applyingPreset )
			return;

		_preset = VoicePreset.Custom;
	}

	bool _female = false;

	[Property, Group( "1. Voice Definition" )]
	[Description( "Shifts the vocal tract character towards a female voice." )]
	public bool Female
	{
		get { return _female; }
		set { _female = value; MarkCustom(); }
	}

	int _voicingGain = 100;

	[Property, Group( "1. Voice Definition" ), Range( 0, 100 )]
	[Description( "Strength of vocal cord vibration. Lower values make the voice sound weaker or breathier." )]
	public int VoicingGain
	{
		get { return _voicingGain; }
		set { _voicingGain = value; MarkCustom(); }
	}

	int _aspirationGain = 0;

	[Property, Group( "1. Voice Definition" ), Range( 0, 500 )]
	[Description( "Amount of breathy air noise in the voice. High values produce a whisper-like effect." )]
	public int AspirationGain
	{
		get { return _aspirationGain; }
		set { _aspirationGain = value; MarkCustom(); }
	}

	int _aspirationCycle = 192;

	[Property, Group( "1. Voice Definition" ), Range( 0, 255 )]
	[Description( "Rhythm of the breathiness. Lower values make the breath noise more continuous and even." )]
	public int AspirationCycle
	{
		get { return _aspirationCycle; }
		set { _aspirationCycle = value; MarkCustom(); }
	}

	int _tremoloDepth = 0;

	[Property, Group( "1. Voice Definition" ), Range( 0, 100 )]
	[Description( "Depth of amplitude tremolo. Adds vocal growl/gravel." )]
	public int TremoloDepth
	{
		get { return _tremoloDepth; }
		set { _tremoloDepth = value; MarkCustom(); }
	}

	int _tremoloRate = 0;

	[Property, Group( "1. Voice Definition" ), Range( 0, 200 )]
	[Description( "Rate of tremolo oscillation (x0.1 Hz)." )]
	public int TremoloRate
	{
		get { return _tremoloRate; }
		set { _tremoloRate = value; MarkCustom(); }
	}


	int _f5Freq = 4500;

	[Property, Group( "2. Formants" ), Range( 2000, 8000 )]
	public int F5Freq
	{
		get { return _f5Freq; }
		set { _f5Freq = value; MarkCustom(); }
	}

	int _f5BW = 250;

	[Property, Group( "2. Formants" ), Range( 50, 2048 )]
	public int F5BW
	{
		get { return _f5BW; }
		set { _f5BW = value; MarkCustom(); }
	}

	int _f4Freq = 3000;

	[Property, Group( "2. Formants" ), Range( 1000, 6000 )]
	[Description( "Brightness or 'ring' of the voice. Higher values sound sharper and more present." )]
	public int F4Freq
	{
		get { return _f4Freq; }
		set { _f4Freq = value; MarkCustom(); }
	}

	int _f4BW = 200;

	[Property, Group( "2. Formants" ), Range( 10, 1000 )]
	[Description( "Focus of the brightness peak. Lower values sound more resonant; higher values are softer and more diffuse." )]
	public int F4BW
	{
		get { return _f4BW; }
		set { _f4BW = value; MarkCustom(); }
	}

	int _f4pFreq = 3600;

	[Property, Group( "2. Formants" ), Range( 1000, 6000 )]
	[Description( "High-frequency 'sheen' of the voice. Shapes the upper brightness character." )]
	public int F4pFreq
	{
		get { return _f4pFreq; }
		set { _f4pFreq = value; MarkCustom(); }
	}

	int _f4pBW = 150;

	[Property, Group( "2. Formants" ), Range( 10, 500 )]
	[Description( "Focus of the upper brightness sheen." )]
	public int F4pBW
	{
		get { return _f4pBW; }
		set { _f4pBW = value; MarkCustom(); }
	}

	int _f5pFreq = 3750;

	[Property, Group( "2. Formants" ), Range( 1000, 6000 )]
	[Description( "Airy high-frequency resonance. Contributes to openness and air in the upper range." )]
	public int F5pFreq
	{
		get { return _f5pFreq; }
		set { _f5pFreq = value; MarkCustom(); }
	}

	int _f5pBW = 100;

	[Property, Group( "2. Formants" ), Range( 10, 500 )]
	[Description( "Focus of the airy high-frequency resonance." )]
	public int F5pBW
	{
		get { return _f5pBW; }
		set { _f5pBW = value; MarkCustom(); }
	}

	int _f6pFreq = 4500;

	[Property, Group( "2. Formants" ), Range( 1000, 8000 )]
	[Description( "Very high resonance that adds subtle air and presence at the top of the spectrum." )]
	public int F6pFreq
	{
		get { return _f6pFreq; }
		set { _f6pFreq = value; MarkCustom(); }
	}

	int _f6pBW = 150;

	[Property, Group( "2. Formants" ), Range( 10, 500 )]
	[Description( "Focus of the very high resonance." )]
	public int F6pBW
	{
		get { return _f6pBW; }
		set { _f6pBW = value; MarkCustom(); }
	}

	int _bwGain1 = 150;

	[Property, Group( "2. Formants" ), Range( 0, 300 )]
	[Description( "Resonance damping in the low range. Affects how open and warm low vowels sound." )]
	public int BwGain1
	{
		get { return _bwGain1; }
		set { _bwGain1 = value; MarkCustom(); }
	}

	int _bwGain2 = 100;

	[Property, Group( "2. Formants" ), Range( 0, 300 )]
	[Description( "Resonance damping in the mid range. Affects clarity of mid vowels." )]
	public int BwGain2
	{
		get { return _bwGain2; }
		set { _bwGain2 = value; MarkCustom(); }
	}

	int _bwGain3 = 100;

	[Property, Group( "2. Formants" ), Range( 0, 300 )]
	[Description( "Resonance damping in the upper-mid range." )]
	public int BwGain3
	{
		get { return _bwGain3; }
		set { _bwGain3 = value; MarkCustom(); }
	}


	int _nasalBase = 330;

	[Property, Group( "3. Nasal" ), Range( 100, 600 )]
	[Description( "Starting character of nasal sounds (m, n, ng). Shapes how nasals begin." )]
	public int NasalBase
	{
		get { return _nasalBase; }
		set { _nasalBase = value; MarkCustom(); }
	}

	int _nasalTarg = 400;

	[Property, Group( "3. Nasal" ), Range( 100, 600 )]
	[Description( "Target character of nasal sounds. Shapes how fully-developed nasals feel." )]
	public int NasalTarg
	{
		get { return _nasalTarg; }
		set { _nasalTarg = value; MarkCustom(); }
	}

	int _nasalBW = 60;

	[Property, Group( "3. Nasal" ), Range( 10, 200 )]
	[Description( "How sharp or diffuse the nasal resonance sounds." )]
	public int NasalBW
	{
		get { return _nasalBW; }
		set { _nasalBW = value; MarkCustom(); }
	}

	int _nGain = 100;

	[Property, Group( "3. Nasal" ), Range( 0, 500 )]
	[Description( "Overall nasal character of the voice. Higher values make the voice sound more nasal." )]
	public int NGain
	{
		get { return _nGain; }
		set { _nGain = value; MarkCustom(); }
	}


	int _pitchRange = 100;

	[Property, Group( "4. Intonation" ), Range( 0, 200 )]
	[Description( "How much the pitch varies while speaking. 0 = monotone; higher = more expressive and dynamic." )]
	public int PitchRange
	{
		get { return _pitchRange; }
		set { _pitchRange = value; MarkCustom(); }
	}

	int _stressGain = 60;

	[Property, Group( "4. Intonation" ), Range( 0, 100 )]
	[Description( "How strongly stressed syllables stand out from unstressed ones." )]
	public int StressGain
	{
		get { return _stressGain; }
		set { _stressGain = value; MarkCustom(); }
	}

	int _intonation = 100;

	[Property, Group( "4. Intonation" ), Range( 0, 200 )]
	[Description( "Overall strength of sentence-level pitch patterns." )]
	public int Intonation
	{
		get { return _intonation; }
		set { _intonation = value; MarkCustom(); }
	}

	int _riseAmt = 29;

	[Property, Group( "4. Intonation" ), Range( -100, 100 )]
	[Description( "How much pitch rises at the start of a stressed syllable." )]
	public int RiseAmt
	{
		get { return _riseAmt; }
		set { _riseAmt = value; MarkCustom(); }
	}

	int _fallAmt = -29;

	[Property, Group( "4. Intonation" ), Range( -100, 0 )]
	[Description( "How much pitch drops at the end of a stressed syllable." )]
	public int FallAmt
	{
		get { return _fallAmt; }
		set { _fallAmt = value; MarkCustom(); }
	}

	int _baselineFall = 51;

	[Property, Group( "4. Intonation" ), Range( 0, 100 )]
	[Description( "How much the overall pitch drifts downward towards the end of a sentence." )]
	public int BaselineFall
	{
		get { return _baselineFall; }
		set { _baselineFall = value; MarkCustom(); }
	}


	/// <summary>
	/// Raised from OnUpdate for each phoneme event of the current utterance once its
	/// TimeSeconds has elapsed since playback started.
	/// </summary>
	public event Action<PhonemeEvent> OnPhoneme;

	// Synthesis engine; stays null until InitEngine has succeeded.
	TtsEngine _engine;

	// Handle of the most recently started playback.
	SoundHandle _handle;

	// Backing flag for IsSpeaking.
	bool _speaking;

	// Events of the current utterance and the index of the next one to dispatch.
	PhonemeEvent[] _phonemeEvents = Array.Empty<PhonemeEvent>();
	int _nextPhonemeIndex;

	// Time.Now at which playback began; negative means "no playback to time events against".
	float _speakStartTime = -1f;

	/// <summary>True from the moment Speak begins until it finishes, fails, or Stop is called.</summary>
	public bool IsSpeaking
	{
		get { return _speaking; }
	}

	/// <summary>
	/// Creates the engine when the component starts. Any failure is logged rather than
	/// propagated, which leaves the engine null.
	/// </summary>
	protected override void OnStart()
	{
		try
		{
			InitEngine();
		}
		catch ( Exception failure )
		{
			Log.Error( $"SharpTalkSpeaker: OnStart threw — {failure}" );
		}
	}

	/// <summary>
	/// Builds the TTS engine from the bundled lexicon and symbol data using the current
	/// voice settings, then speaks SpeakOnStart if it is not blank.
	/// </summary>
	void InitEngine()
	{
		var lexicon     = LibraryData.EnglishLex;
		var symbolTable = LibraryData.Symbols;

		_engine = new TtsEngine( BuildVoice(), lexicon, symbolTable );
		Log.Info( $"SharpTalkSpeaker: engine initialized (dict={lexicon.Length}b, symbols={symbolTable.Length}b)" );

		if ( string.IsNullOrWhiteSpace( SpeakOnStart ) )
			return;

		// Fire-and-forget: the returned task is intentionally not awaited.
		_ = Speak( SpeakOnStart );
	}

	/// <summary>
	/// Produces the voice for the current preset: the built-in whisper or baseline voice,
	/// or, for Custom, a voice assembled from the individual properties. Rate, PitchHz,
	/// TractScale and the Female flag are always applied on top.
	/// </summary>
	VoiceData BuildVoice()
	{
		VoiceData voice;

		switch ( _preset )
		{
			case VoicePreset.Whisper:
				voice = VoiceData.WhisperVoice;
				break;

			case VoicePreset.Custom:
				voice = new VoiceData
				{
					VGain        = (short)VoicingGain,
					AGain        = (short)AspirationGain,
					ACycle       = (short)AspirationCycle,
					TremoloDepth = (short)TremoloDepth,
					TremoloRate  = (short)TremoloRate,
					TractScale   = TractScale,
					F4Freq       = (short)F4Freq,
					F4BW         = (short)F4BW,
					F5Freq       = (short)F5Freq,
					F5BW         = (short)F5BW,
					F4pFreq      = (short)F4pFreq,
					F4pBW        = (short)F4pBW,
					F5pFreq      = (short)F5pFreq,
					F5pBW        = (short)F5pBW,
					F6pFreq      = (short)F6pFreq,
					F6pBW        = (short)F6pBW,
					BwGain1      = (short)BwGain1,
					BwGain2      = (short)BwGain2,
					BwGain3      = (short)BwGain3,
					NasalBase    = (short)NasalBase,
					NasalTarg    = (short)NasalTarg,
					NasalBW      = (short)NasalBW,
					NGain        = (short)NGain,
					PitchRange   = (short)PitchRange,
					StressGain   = (short)StressGain,
					Intonation   = (short)Intonation,
					RiseAmt      = (short)RiseAmt,
					FallAmt      = (short)FallAmt,
					BaselineFall = (short)BaselineFall,
				};
				break;

			default:
				voice = VoiceData.BaselineVoice;
				break;
		}

		// Settings shared by every preset.
		voice.Rate       = (short)Rate;
		voice.PitchHz    = (short)PitchHz;
		voice.TractScale = TractScale;
		voice.VoiceType  = (short)( Female ? 1 : 0 );

		return voice;
	}

	// Bumped by every Speak() call so that an older call which is still awaiting
	// (synthesis or sample feeding) can tell that it has been superseded.
	int _speakGeneration;

	/// <summary>
	/// Synthesizes <paramref name="text"/> off the calling thread and streams the resulting
	/// samples through a <see cref="SoundStream"/> whose handle is parented to this
	/// component's GameObject.
	/// </summary>
	/// <remarks>
	/// Speech already in progress is stopped first. If <see cref="Stop"/> or a newer
	/// <see cref="Speak"/> is called while this call is still synthesizing or feeding samples,
	/// this call abandons its work instead of playing late, looping on a stopped stream, or
	/// resetting the newer call's state.
	/// </remarks>
	/// <param name="text">
	/// Text to speak. When KlattschMode is on, it is wrapped in the klattsch control tags
	/// together with the Kl* settings.
	/// </param>
	/// <returns>
	/// A task that completes once every sample has been written to the stream, or earlier
	/// when the call was cancelled or synthesis failed (failures are logged, not thrown).
	/// </returns>
	public async Task Speak( string text )
	{
		if ( _engine is null ) { Log.Error( "SharpTalkSpeaker: Speak() called but engine is null — was OnStart run?" ); return; }

		Stop();
		int generation = ++_speakGeneration;
		_speaking = true;

		// Stop() clears _speaking; a newer Speak() advances _speakGeneration.
		bool IsCurrent() => _speaking && generation == _speakGeneration;

		short[] samples;
		PhonemeEvent[] events;
		try
		{
			string synText = text;
			if ( KlattschMode )
			{
				string defs = string.Format( System.Globalization.CultureInfo.InvariantCulture,
					"b{0:F0} r{1:F0} v{2:F1} w{3:F1} h{4:F2} t{5:F2} g{6:F2}",
					KlBaseF0, KlRate, KlVibrato, KlVibRate, KlAsp, KlTilt, KlEffort );
				synText = $"[:klattsch on] {defs} {text} [:klattsch off]";
			}

			(samples, events) = await GameTask.RunInThreadAsync( () => _engine.SpeakWithEvents( synText ) );
		}
		catch ( Exception e )
		{
			Log.Error( $"SharpTalkSpeaker: synthesis threw — {e}" );
			// Only reset the flag if no newer Speak() has taken over in the meantime.
			if ( generation == _speakGeneration ) _speaking = false;
			return;
		}

		// Stopped or superseded while synthesizing: do not start audio that nobody asked for any more.
		if ( !IsCurrent() ) return;

		// Guard against a null result so OnUpdate and the feed loop cannot dereference null.
		samples ??= Array.Empty<short>();
		_phonemeEvents = events ?? Array.Empty<PhonemeEvent>();
		_nextPhonemeIndex = 0;
		_speakStartTime = -1f;

		try
		{
			using var stream = new SoundStream( _engine.SampleRate, 1 );
			_handle = stream.Play( AudioVolume, 1f );
			_handle.SetParent( GameObject );
			_handle.FollowParent = true;
			_handle.Update();
			_speakStartTime = Time.Now;

			int offset = 0;
			while ( offset < samples.Length )
			{
				// Once the handle has been stopped the queue may never drain,
				// so waiting for free space here could otherwise go on forever.
				if ( !IsCurrent() ) return;

				int space = stream.MaxWriteSampleCount - stream.QueuedSampleCount;
				if ( space <= 0 ) { await GameTask.Delay( 5 ); continue; }
				int count = Math.Min( space, samples.Length - offset );
				stream.WriteData( samples.AsSpan( offset, count ) );
				offset += count;
			}

			stream.Close();
		}
		finally
		{
			// Runs on success, cancellation and exceptions alike; never touches a newer call's state.
			if ( generation == _speakGeneration ) _speaking = false;
		}
	}

	/// <summary>
	/// Dispatches every phoneme event whose TimeSeconds has been reached since playback
	/// started. Does nothing while there are no subscribers, no pending events, or no
	/// playback start time.
	/// </summary>
	protected override void OnUpdate()
	{
		if ( OnPhoneme is null || _nextPhonemeIndex >= _phonemeEvents.Length ) return;
		if ( _speakStartTime < 0f ) return;

		float t = Time.Now - _speakStartTime;
		while ( _nextPhonemeIndex < _phonemeEvents.Length && _phonemeEvents[_nextPhonemeIndex].TimeSeconds <= t )
		{
			// Snapshot the delegate: with `OnPhoneme?.Invoke( events[i++] )` the increment is
			// skipped when the event is null, so a handler that removes the last subscription
			// while another event is already due would leave this loop spinning forever.
			var handler = OnPhoneme;
			if ( handler is null ) return;

			handler( _phonemeEvents[_nextPhonemeIndex++] );
		}
	}

	/// <summary>
	/// Stops the current playback handle immediately, if it is valid and playing, and resets
	/// the speaking flag, the start time and the phoneme cursor so no further events fire.
	/// </summary>
	public void Stop()
	{
		bool audible = _handle != null && _handle.IsValid && _handle.IsPlaying;
		if ( audible )
		{
			_handle.Stop( 0f );
		}

		// Moving the cursor to the end marks every remaining event as consumed.
		_nextPhonemeIndex = _phonemeEvents.Length;
		_speakStartTime = -1f;
		_speaking = false;
	}

	/// <summary>
	/// Installs <paramref name="voice"/> on the engine after overriding its Rate, PitchHz and
	/// TractScale with this component's values. Does nothing when the engine does not exist.
	/// </summary>
	/// <param name="voice">Voice to install.</param>
	public void SetVoice( VoiceData voice )
	{
		if ( _engine is not null )
		{
			voice.Rate       = (short)Rate;
			voice.PitchHz    = (short)PitchHz;
			voice.TractScale = TractScale;

			_engine.Voice = voice;
		}
	}

	/// <summary>
	/// Pushes this component's Rate, PitchHz and TractScale onto the engine's current voice,
	/// leaving its other fields as they are. Does nothing when the engine does not exist.
	/// </summary>
	public void ApplyVoice()
	{
		if ( _engine is not null )
		{
			var current = _engine.Voice;

			current.Rate       = (short)Rate;
			current.PitchHz    = (short)PitchHz;
			current.TractScale = TractScale;

			_engine.Voice = current;
		}
	}
}