// s&box
//
// TtsEngine.cs
#nullable enable
using System;
using System.Collections.Generic;

namespace SharpTalk
{

    /// <summary>
    /// One entry of a phoneme timeline: a phoneme identifier paired with a time stamp.
    /// </summary>
    /// <remarks>
    /// <see cref="TtsEngine.SpeakWithEvents"/> emits one of these per phoneme, with
    /// <see cref="TimeSeconds"/> measured from the start of the audio it returns.
    /// </remarks>
    public readonly struct PhonemeEvent
    {
        /// <summary>Phoneme identifier.</summary>
        public readonly short Phoneme;

        /// <summary>Time of the event, in seconds.</summary>
        public readonly float TimeSeconds;

        /// <summary>Creates an event for <paramref name="phoneme"/> at <paramref name="timeSeconds"/>.</summary>
        /// <param name="phoneme">Phoneme identifier to store.</param>
        /// <param name="timeSeconds">Event time in seconds.</param>
        public PhonemeEvent(short phoneme, float timeSeconds)
        {
            (Phoneme, TimeSeconds) = (phoneme, timeSeconds);
        }
    }

    /// <summary>
    /// Text-to-speech entry point. Chains a <see cref="Phonemizer"/>, an
    /// <see cref="AudioProcessor"/>, a <see cref="SpeechRenderer"/> and a
    /// <see cref="FormantSynth"/> to turn text into buffers of 16-bit samples.
    /// </summary>
    /// <remarks>
    /// Embedded voice commands found in the spoken text write into the current
    /// <see cref="Voice"/>; those changes remain in effect after the call returns.
    /// </remarks>
    public sealed class TtsEngine
    {
        /// <summary>Sample rate, in Hz, used to convert sample offsets into seconds.</summary>
        public const int SampleRate = 22050;

        // Clamp limits applied to the values carried by embedded voice commands.
        private const int MinRate = 40;
        private const int MaxRate = 600;
        private const int MinPitchHz = 40;
        private const int MaxPitchHz = 500;
        private const int MinVolume = 0;
        private const int MaxVolume = 100;

        private readonly Phonemizer _fe;
        private VoiceData _voice;
        private AudioProcessor _be = null!;
        private SpeechRenderer _renderer = null!;
        private FormantSynth _synth = null!;

        /// <summary>Creates an engine that uses <see cref="VoiceData.BaselineVoice"/>.</summary>
        /// <param name="dictData">Dictionary data handed to the phonemizer.</param>
        /// <param name="symbolsData">Symbol data handed to the phonemizer.</param>
        public TtsEngine(byte[] dictData, byte[] symbolsData)
            : this(VoiceData.BaselineVoice, dictData, symbolsData) { }

        /// <summary>Creates an engine for the given voice and builds its synthesis pipeline.</summary>
        /// <param name="voice">Voice used to construct the pipeline stages.</param>
        /// <param name="dictData">Dictionary data handed to the phonemizer.</param>
        /// <param name="symbolsData">Symbol data handed to the phonemizer.</param>
        public TtsEngine(VoiceData voice, byte[] dictData, byte[] symbolsData)
        {
            _voice = voice;
            _fe = new Phonemizer(dictData, symbolsData);
            RebuildPipeline();
        }

        /// <summary>
        /// The voice currently in use. Assigning a new value rebuilds the whole pipeline.
        /// </summary>
        public VoiceData Voice
        {
            get => _voice;
            set
            {
                _voice = value;
                RebuildPipeline();
            }
        }

        /// <summary>
        /// Rebuilds the pipeline from the current voice.
        /// </summary>
        /// <remarks>
        /// Presumably meant to be called after the voice has been modified in place;
        /// confirm against callers.
        /// </remarks>
        public void ApplyVoice() => RebuildPipeline();

        /// <summary>Synthesizes <paramref name="text"/> and returns all samples as one array.</summary>
        /// <param name="text">Text to speak; may contain embedded commands.</param>
        /// <returns>The concatenation of every buffer produced for the text.</returns>
        public short[] Speak(string text)
        {
            var samples = new List<short>();
            Speak(text, buf => samples.AddRange(buf));
            return samples.ToArray();
        }

        /// <summary>
        /// Synthesizes <paramref name="text"/>, invoking <paramref name="onBuffer"/> once
        /// per rendered sentence (or singing segment) with that sentence's samples.
        /// </summary>
        /// <param name="text">Text to speak; may contain embedded commands.</param>
        /// <param name="onBuffer">Receives each freshly allocated sample buffer.</param>
        /// <exception cref="ArgumentNullException"><paramref name="onBuffer"/> is null.</exception>
        public void Speak(string text, Action<short[]> onBuffer)
        {
            // Fail before any embedded command has had a chance to modify the voice.
            if (onBuffer is null)
            {
                throw new ArgumentNullException(nameof(onBuffer));
            }

            RenderSegments(text, onBuffer, null);
        }

        /// <summary>
        /// Like Speak, but also returns a timeline of phoneme events with start times
        /// in seconds relative to the start of the returned audio.
        /// </summary>
        /// <param name="text">Text to speak; may contain embedded commands.</param>
        /// <returns>The synthesized samples and the phoneme events collected for them.</returns>
        public (short[] audio, PhonemeEvent[] events) SpeakWithEvents(string text)
        {
            var samples = new List<short>();
            var events = new List<PhonemeEvent>();
            RenderSegments(text, buf => samples.AddRange(buf), events);
            return (samples.ToArray(), events.ToArray());
        }

        // Internal helpers

        /// <summary>
        /// Walks the segments parsed from <paramref name="text"/>: applies embedded
        /// commands, and renders singing and plain-text segments through
        /// <see cref="ProcessSentence"/>.
        /// </summary>
        /// <param name="text">Text to parse into segments.</param>
        /// <param name="onBuffer">Receives each rendered sample buffer.</param>
        /// <param name="events">Optional sink for phoneme events of plain-text sentences.</param>
        private void RenderSegments(string text, Action<short[]> onBuffer, List<PhonemeEvent>? events)
        {
            // Samples emitted so far in this call. Kept local so that concurrent or
            // successive calls never share (or keep accumulating into) the same counter.
            int sampleOffset = 0;

            foreach (var seg in EmbeddedCmd.ParseSegments(text))
            {
                if (seg.IsCommand)
                {
                    ApplyCommand(seg.Cmd!.Value);
                }
                else if (seg.IsSinging)
                {
                    // NOTE(review): singing segments advance the sample offset but never
                    // contribute phoneme events, even when a sink is supplied. This
                    // preserves existing behavior; confirm whether it is intentional.
                    ProcessSentence(seg.Singing!.ToArray(), AudioProcessor._Period_, onBuffer, null, ref sampleOffset);
                }
                else
                {
                    foreach (var (tokens, endPunct) in _fe.TextToSentenceTokens(seg.PlainText!))
                    {
                        ProcessSentence(tokens, endPunct, onBuffer, events, ref sampleOffset);
                    }
                }
            }
        }

        /// <summary>
        /// Runs one token sequence through the audio processor, renderer and synthesizer,
        /// hands the resulting samples to <paramref name="onBuffer"/>, and advances
        /// <paramref name="sampleOffset"/> by the number of samples produced.
        /// </summary>
        /// <param name="tokens">Phoneme tokens to process.</param>
        /// <param name="endPunct">Sentence-ending punctuation passed to the audio processor.</param>
        /// <param name="onBuffer">Receives the synthesized samples.</param>
        /// <param name="events">When non-null, receives one event per processed phoneme.</param>
        /// <param name="sampleOffset">Running sample count; read for event times, then incremented.</param>
        private void ProcessSentence(PhonemeToken[] tokens, short endPunct, Action<short[]> onBuffer,
                                     List<PhonemeEvent>? events, ref int sampleOffset)
        {
            var dump = _be.Process(tokens, endPunct);

            // Events are collected before rendering, from the processor output as returned.
            if (events != null)
            {
                // Durations are accumulated as frame counts and converted to samples below.
                int frameOffset = 0;
                for (int i = 0; i < dump.PhonBuf2InIndex; i++)
                {
                    float t = (float)(sampleOffset + frameOffset * FormantSynth.KSampFrameLen) / SampleRate;
                    events.Add(new PhonemeEvent(dump.PhonBuf2[i], t));
                    frameOffset += dump.DurBuf[i];
                }
            }

            var frames = _renderer.Render(dump);
            var audio = new short[frames.Length * FormantSynth.KSampFrameLen];
            int offset = 0;
            foreach (var frame in frames)
            {
                // Each frame fills the next KSampFrameLen samples of the buffer.
                _synth.SynthesizeFrame(frame, audio, offset);
                offset += FormantSynth.KSampFrameLen;
            }

            onBuffer(audio);
            sampleOffset += audio.Length;
        }

        /// <summary>
        /// Applies an embedded voice command to the current voice, clamping its value
        /// to the supported range. Unrecognized command kinds are ignored.
        /// </summary>
        /// <param name="cmd">Command parsed from the input text.</param>
        private void ApplyCommand(EmbeddedCmd.VoiceCommand cmd)
        {
            switch (cmd.Type)
            {
                case EmbeddedCmd.VoiceCommand.Kind.Rate:
                    _voice.Rate = (short)Math.Clamp(cmd.Value, MinRate, MaxRate);
                    // Recreate the audio processor so it is constructed from the updated voice.
                    // NOTE(review): _renderer was also built from _voice and is not rebuilt
                    // here; confirm it does not depend on the rate.
                    _be = new AudioProcessor(_voice);
                    break;

                case EmbeddedCmd.VoiceCommand.Kind.Pitch:
                    _voice.PitchHz = (short)Math.Clamp(cmd.Value, MinPitchHz, MaxPitchHz);
                    // Same rebuild as for Rate; the same renderer caveat applies.
                    _be = new AudioProcessor(_voice);
                    break;

                case EmbeddedCmd.VoiceCommand.Kind.Volume:
                    _voice.VGain = (short)Math.Clamp(cmd.Value, MinVolume, MaxVolume);
                    // Re-run the synthesizer's InvDFT step with the new gain.
                    _synth.InvDFT(_voice.VWave, _voice.VWave1, (short)_voice.VGain);
                    break;
            }
        }

        /// <summary>
        /// Recreates the audio processor, renderer and synthesizer from the current voice
        /// and configures the synthesizer with the voice's parameters.
        /// </summary>
        private void RebuildPipeline()
        {
            _be = new AudioProcessor(_voice);
            _renderer = new SpeechRenderer(_voice);
            _synth = new FormantSynth();
            _synth.SetVoice(_voice.NGain, true,
                _voice.F4Freq, _voice.F4BW,
                _voice.F4pFreq, _voice.F4pBW,
                _voice.F5pFreq, _voice.F5pBW,
                _voice.F6pFreq, _voice.F6pBW,
                _voice.NasalBase, _voice.NasalBW,
                _voice.AGain, _voice.ACycle);
            _synth.InvDFT(_voice.VWave, _voice.VWave1, (short)_voice.VGain);
        }
    }
}  // namespace