// Code/TtsEngine.cs
#nullable enable
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
using static SharpTalk.AudioProcessor;
namespace SharpTalk
{
/// <summary>
/// Immutable pair of a phoneme code and a time stamp in seconds.
/// </summary>
public readonly struct PhonemeEvent
{
    /// <summary>Phoneme code exactly as passed to the constructor.</summary>
    public readonly short Phoneme;

    /// <summary>Time in seconds exactly as passed to the constructor.</summary>
    public readonly float TimeSeconds;

    /// <summary>Creates an event for <paramref name="phoneme"/> at <paramref name="timeSeconds"/>.</summary>
    /// <param name="phoneme">Phoneme code to store.</param>
    /// <param name="timeSeconds">Time stamp, in seconds, to store.</param>
    public PhonemeEvent(short phoneme, float timeSeconds)
        => (Phoneme, TimeSeconds) = (phoneme, timeSeconds);
}
public sealed class TtsEngine
{
// Sample rate used when a constructor is given none, or a value that is zero or negative.
public const int DefaultSampleRate = 22050;
// Sample rate chosen at construction; passed to FormantSynth whenever the pipeline is rebuilt.
public int SampleRate { get; private set; }
// Text front end: source of sentence tokens, lookup statistics and the dictionary reader.
private readonly Phonemizer _fe;
// Current voice parameters; embedded Rate/Pitch/Volume commands write into it.
private VoiceData _voice;
// The three pipeline stages below are assigned by RebuildPipeline(), which every
// constructor calls, hence the null! initializers.
private AudioProcessor _be = null!;
private SpeechRenderer _renderer = null!;
private FormantSynth _synth = null!;
#if !SANDBOX
/// <summary>
/// Creates an engine that uses <see cref="VoiceData.BaselineVoice"/> at
/// <see cref="DefaultSampleRate"/>. Only compiled when SANDBOX is not defined.
/// </summary>
public TtsEngine()
    : this(VoiceData.BaselineVoice, DefaultSampleRate)
{
}

/// <summary>
/// Creates an engine for <paramref name="voice"/> whose phonemizer is built from
/// <c>LibraryData.EnglishLex</c> and <c>LibraryData.Symbols</c>.
/// Only compiled when SANDBOX is not defined.
/// </summary>
/// <param name="voice">Voice to start with.</param>
/// <param name="sampleRate">Requested sample rate; zero or negative values fall back to <see cref="DefaultSampleRate"/>.</param>
public TtsEngine(VoiceData voice, int sampleRate = DefaultSampleRate)
{
    SampleRate = sampleRate > 0 ? sampleRate : DefaultSampleRate;
    _voice = voice;
    _fe = new Phonemizer(LibraryData.EnglishLex, LibraryData.Symbols);

    // Needs both _voice and SampleRate, so it runs last.
    RebuildPipeline();
}
#endif
/// <summary>
/// Creates an engine that uses <see cref="VoiceData.BaselineVoice"/> and builds its
/// phonemizer from caller-supplied dictionary and symbol data.
/// </summary>
/// <param name="dictData">Dictionary bytes handed to the phonemizer.</param>
/// <param name="symbolsData">Symbol-table bytes handed to the phonemizer.</param>
/// <param name="sampleRate">Requested sample rate; zero or negative values fall back to <see cref="DefaultSampleRate"/>.</param>
public TtsEngine(byte[] dictData, byte[] symbolsData, int sampleRate = DefaultSampleRate)
    : this(VoiceData.BaselineVoice, dictData, symbolsData, sampleRate)
{
}

/// <summary>
/// Creates an engine for <paramref name="voice"/> and builds its phonemizer from
/// caller-supplied dictionary and symbol data.
/// </summary>
/// <param name="voice">Voice to start with.</param>
/// <param name="dictData">Dictionary bytes handed to the phonemizer.</param>
/// <param name="symbolsData">Symbol-table bytes handed to the phonemizer.</param>
/// <param name="sampleRate">Requested sample rate; zero or negative values fall back to <see cref="DefaultSampleRate"/>.</param>
public TtsEngine(VoiceData voice, byte[] dictData, byte[] symbolsData, int sampleRate = DefaultSampleRate)
{
    SampleRate = sampleRate > 0 ? sampleRate : DefaultSampleRate;
    _voice = voice;
    _fe = new Phonemizer(dictData, symbolsData);

    // Needs both _voice and SampleRate, so it runs last.
    RebuildPipeline();
}
/// <summary>
/// The voice currently in use. Assigning a value stores it and rebuilds the
/// whole pipeline from it.
/// </summary>
public VoiceData Voice
{
    get
    {
        return _voice;
    }
    set
    {
        _voice = value;
        RebuildPipeline();
    }
}
/// <summary>
/// The phonemizer's StatDict, StatMorph and StatLts counters, in that order.
/// NOTE(review): presumably counts of dictionary, morphology and letter-to-sound lookups; confirm in Phonemizer.
/// </summary>
public (int dict, int morph, int lts) LookupStats
{
    get { return (_fe.StatDict, _fe.StatMorph, _fe.StatLts); }
}

/// <summary>Asks the phonemizer to reset its lookup counters.</summary>
public void ResetLookupStats()
{
    _fe.ResetStats();
}

/// <summary>The dictionary reader exposed by the phonemizer.</summary>
public DictReader Dict
{
    get { return _fe.Dict; }
}

/// <summary>Rebuilds the pipeline from the current voice.</summary>
public void ApplyVoice()
{
    RebuildPipeline();
}
/// <summary>
/// Synthesizes <paramref name="text"/> and returns all produced samples as one array.
/// </summary>
/// <param name="text">Text to speak.</param>
/// <returns>Every buffer produced by the callback overload, concatenated in order.</returns>
public short[] Speak(string text)
{
    var collected = new List<short>();

    // Gather each buffer as the callback overload delivers it.
    void Append(short[] chunk) => collected.AddRange(chunk);

    Speak(text, Append);
    return collected.ToArray();
}
/// <summary>
/// Synthesizes <paramref name="text"/> segment by segment and hands each rendered
/// sentence to <paramref name="onBuffer"/>.
/// </summary>
/// <param name="text">Text to speak; parsed into segments by EmbeddedCmd.ParseSegments.</param>
/// <param name="onBuffer">Receives one sample buffer per rendered sentence.</param>
public void Speak(string text, Action<short[]> onBuffer)
{
    foreach (var segment in EmbeddedCmd.ParseSegments(text))
    {
        if (segment.IsCommand)
        {
            // Embedded voice command: changes engine state, produces no audio.
            ApplyCommand(segment.Cmd!.Value);
        }
        else if (segment.IsKlattsch)
        {
            ProcessKlattsch(segment.KlattschText!, onBuffer);
        }
        else if (segment.IsSinging)
        {
            // Singing tokens are rendered as one sentence with end punctuation 0.
            ProcessSentence(segment.Singing!.ToArray(), 0, onBuffer, null, ref _dummy);
        }
        else
        {
            // Plain text: the front end splits it into sentences.
            foreach (var (tokens, endPunct) in _fe.TextToSentenceTokens(segment.PlainText!))
            {
                ProcessSentence(tokens, endPunct, onBuffer, null, ref _dummy);
            }
        }
    }
}
/// <summary>
/// Like Speak, but also returns a timeline of phoneme events with start times
/// in seconds relative to the start of the returned audio.
/// </summary>
/// <param name="text">Text to speak; parsed into segments by EmbeddedCmd.ParseSegments.</param>
/// <returns>All rendered samples and the events collected while rendering them.</returns>
public (short[] audio, PhonemeEvent[] events) SpeakWithEvents(string text)
{
    var pcm = new List<short>();
    var timeline = new List<PhonemeEvent>();
    int sampleOffset = 0;
    Action<short[]> appendPcm = chunk => pcm.AddRange(chunk);

    foreach (var segment in EmbeddedCmd.ParseSegments(text))
    {
        if (segment.IsCommand)
        {
            ApplyCommand(segment.Cmd!.Value);
        }
        else if (segment.IsKlattsch)
        {
            var klattTokens = KlattschParser.CompileToTokens(KlattschParser.Tokenize(segment.KlattschText!));
            if (klattTokens.Count == 0)
            {
                continue;
            }

            // Klattsch events are timed from each token's UserDur (accumulated as
            // milliseconds), starting at the current position in the output.
            // Silence tokens advance the clock but are not reported.
            // NOTE(review): the other branches time events from the processed dump and
            // do not skip silence; confirm the two timelines are meant to differ.
            float cursorMs = (float)sampleOffset * 1000f / SampleRate;
            foreach (var token in klattTokens)
            {
                if (token.Phon != _SIL_)
                {
                    timeline.Add(new PhonemeEvent(token.Phon, cursorMs / 1000f));
                }

                cursorMs += token.UserDur;
            }

            var rendered = ProcessSentenceToBuffer(klattTokens.ToArray(), 0);
            pcm.AddRange(rendered);
            sampleOffset += rendered.Length;
        }
        else if (segment.IsSinging)
        {
            ProcessSentence(segment.Singing!.ToArray(), 0, appendPcm, timeline, ref sampleOffset);
        }
        else
        {
            foreach (var (tokens, endPunct) in _fe.TextToSentenceTokens(segment.PlainText!))
            {
                ProcessSentence(tokens, endPunct, appendPcm, timeline, ref sampleOffset);
            }
        }
    }

    return (pcm.ToArray(), timeline.ToArray());
}
// Internal helpers
// Throwaway target for ProcessSentence's "ref sampleOffset" argument when the caller
// collects no events; it is only ever incremented and nothing in this class uses the result.
// NOTE(review): the field is static, so all engine instances (and threads) write to the
// same variable; a per-call local would avoid the shared write. Confirm before changing.
static int _dummy;
/// <summary>
/// Synthesizes <paramref name="text"/> and streams the audio to
/// <paramref name="onBuffer"/> in chunks as it is produced.
/// </summary>
/// <param name="text">Text to speak; parsed into segments by EmbeddedCmd.ParseSegments.</param>
/// <param name="onBuffer">Awaited once per audio chunk.</param>
/// <param name="ct">Checked before every segment and every plain-text sentence, and during streaming.</param>
public async Task SpeakAsync(string text, Func<short[], Task> onBuffer, System.Threading.CancellationToken ct = default)
{
    foreach (var segment in EmbeddedCmd.ParseSegments(text))
    {
        ct.ThrowIfCancellationRequested();

        if (segment.IsCommand)
        {
            ApplyCommand(segment.Cmd!.Value);
        }
        else if (segment.IsKlattsch)
        {
            var klattTokens = KlattschParser.CompileToTokens(KlattschParser.Tokenize(segment.KlattschText!));
            if (klattTokens.Count > 0)
            {
                await ProcessSentenceStreaming(klattTokens.ToArray(), 0, onBuffer, ct);
            }
        }
        else if (segment.IsSinging)
        {
            await ProcessSentenceStreaming(segment.Singing!.ToArray(), 0, onBuffer, ct);
        }
        else
        {
            foreach (var (tokens, endPunct) in _fe.TextToSentenceTokens(segment.PlainText!))
            {
                ct.ThrowIfCancellationRequested();
                await ProcessSentenceStreaming(tokens, endPunct, onBuffer, ct);
            }
        }
    }
}
/// <summary>
/// Compiles a Klattsch segment into phoneme tokens and renders it as a single
/// sentence. Does nothing when the segment compiles to no tokens.
/// </summary>
/// <param name="text">Klattsch source text.</param>
/// <param name="onBuffer">Receives the rendered samples.</param>
private void ProcessKlattsch(string text, Action<short[]> onBuffer)
{
    var compiled = KlattschParser.CompileToTokens(KlattschParser.Tokenize(text));
    if (compiled.Count <= 0)
    {
        return;
    }

    ProcessSentence(compiled.ToArray(), 0, onBuffer, null, ref _dummy);
}
/// <summary>
/// Runs one sentence through the audio processor and streams the result to
/// <paramref name="onBuffer"/>.
/// </summary>
/// <param name="tokens">Phoneme tokens of the sentence.</param>
/// <param name="endPunct">End-punctuation code passed to the audio processor.</param>
/// <param name="onBuffer">Awaited once per audio chunk.</param>
/// <param name="ct">Cancellation token observed while streaming.</param>
private async Task ProcessSentenceStreaming(PhonemeToken[] tokens, short endPunct, Func<short[], Task> onBuffer, System.Threading.CancellationToken ct)
{
    await ProcessSentenceStreamingFromDump(_be.Process(tokens, endPunct), onBuffer, ct);
}
/// <summary>
/// Renders an already-processed sentence frame by frame and delivers the audio
/// in chunks of up to ten frames.
/// </summary>
/// <param name="dump">Output of the audio processor for one sentence.</param>
/// <param name="onBuffer">Awaited with each full chunk, and once more with a shorter final chunk if frames remain.</param>
/// <param name="ct">Checked before every frame and before the final partial chunk.</param>
private async Task ProcessSentenceStreamingFromDump(SynthInputDump dump, Func<short[], Task> onBuffer, System.Threading.CancellationToken ct)
{
    const int framesPerChunk = 10;

    short[] pending = new short[framesPerChunk * _synth.SampFrameLen];
    int filled = 0;

    foreach (var frame in _renderer.RenderStreaming(dump))
    {
        ct.ThrowIfCancellationRequested();

        // Write this frame's samples directly into its slot of the chunk.
        _synth.SynthesizeFrame(frame, pending, filled * _synth.SampFrameLen);
        filled++;

        if (filled < framesPerChunk)
        {
            continue;
        }

        await onBuffer(pending);

        // The consumer now owns the delivered array, so start a fresh one.
        pending = new short[framesPerChunk * _synth.SampFrameLen];
        filled = 0;

        // Give other work a chance to run between chunks.
        await Task.Yield();
    }

    if (filled == 0)
    {
        return;
    }

    // Deliver the leftover frames in an array trimmed to their exact length.
    ct.ThrowIfCancellationRequested();
    var tail = new short[filled * _synth.SampFrameLen];
    Array.Copy(pending, 0, tail, 0, tail.Length);
    await onBuffer(tail);
}
/// <summary>
/// Streams synthesized audio like SpeakAsync, but reports the complete phoneme
/// timeline before any audio is rendered.
/// Phase 1 runs the audio processor on every sentence, collecting the events and
/// the processed dumps, then awaits <paramref name="onEventsReady"/> so the caller
/// can set up tracking. Phase 2 streams audio from the pre-computed dumps.
/// </summary>
/// <remarks>
/// NOTE(review): embedded commands are all applied during phase 1. A Volume command
/// reconfigures the synthesizer that phase 2 renders with, so it will also affect
/// sentences that preceded it in the text; confirm whether that is intended.
/// </remarks>
/// <param name="text">Text to speak; parsed into segments by EmbeddedCmd.ParseSegments.</param>
/// <param name="onBuffer">Awaited once per audio chunk during phase 2.</param>
/// <param name="onEventsReady">Awaited once, after phase 1, with every collected event (silence phonemes excluded).</param>
/// <param name="ct">Checked before every segment in phase 1 and before every sentence and frame in phase 2.</param>
public async Task SpeakAsyncWithEvents(
    string text,
    Func<short[], Task> onBuffer,
    Func<List<PhonemeEvent>, Task> onEventsReady,
    System.Threading.CancellationToken ct = default)
{
    var events = new List<PhonemeEvent>();
    var dumps = new List<SynthInputDump>();
    int sampleOffset = 0;

    // Phase 1: process every sentence, collecting events and dumps.
    foreach (var seg in EmbeddedCmd.ParseSegments(text))
    {
        ct.ThrowIfCancellationRequested();

        if (seg.IsCommand)
        {
            ApplyCommand(seg.Cmd!.Value);
            continue;
        }

        if (seg.IsKlattsch)
        {
            var klattTokens = KlattschParser.CompileToTokens(KlattschParser.Tokenize(seg.KlattschText!));
            if (klattTokens.Count > 0)
            {
                sampleOffset = QueueSentenceForStreaming(klattTokens.ToArray(), 0, events, dumps, sampleOffset);
            }
            continue;
        }

        if (seg.IsSinging)
        {
            sampleOffset = QueueSentenceForStreaming(seg.Singing!.ToArray(), 0, events, dumps, sampleOffset);
            continue;
        }

        foreach (var (tokens, endPunct) in _fe.TextToSentenceTokens(seg.PlainText!))
        {
            sampleOffset = QueueSentenceForStreaming(tokens, endPunct, events, dumps, sampleOffset);
        }
    }

    await onEventsReady(events);

    // Phase 2: render and stream the queued sentences in order.
    foreach (var dump in dumps)
    {
        ct.ThrowIfCancellationRequested();
        await ProcessSentenceStreamingFromDump(dump, onBuffer, ct);
    }
}

/// <summary>
/// Processes one sentence, appends an event for each non-silence phoneme in the
/// resulting dump, and queues the dump for later rendering.
/// </summary>
/// <param name="tokens">Phoneme tokens of the sentence.</param>
/// <param name="endPunct">End-punctuation code passed to the audio processor.</param>
/// <param name="events">Timeline to append to.</param>
/// <param name="dumps">Render queue to append the processed dump to.</param>
/// <param name="sampleOffset">Sample position at which this sentence starts.</param>
/// <returns>The sample position at which the next sentence starts.</returns>
private int QueueSentenceForStreaming(
    PhonemeToken[] tokens,
    short endPunct,
    List<PhonemeEvent> events,
    List<SynthInputDump> dumps,
    int sampleOffset)
{
    var dump = _be.Process(tokens, endPunct);

    // DurBuf entries are treated as frame counts; each frame is SampFrameLen samples.
    int frameOff = 0;
    for (int i = 0; i < dump.PhonBuf2InIndex; i++)
    {
        if (dump.PhonBuf2[i] != _SIL_)
        {
            events.Add(new PhonemeEvent(dump.PhonBuf2[i],
                (float)(sampleOffset + frameOff * _synth.SampFrameLen) / SampleRate));
        }

        // Silence is not reported but still advances the clock.
        frameOff += dump.DurBuf[i];
    }

    dumps.Add(dump);
    return sampleOffset + frameOff * _synth.SampFrameLen;
}
/// <summary>
/// Processes one sentence and renders it completely into a single sample array.
/// </summary>
/// <param name="tokens">Phoneme tokens of the sentence.</param>
/// <param name="endPunct">End-punctuation code passed to the audio processor.</param>
/// <returns>All frames of the sentence, concatenated in render order.</returns>
private short[] ProcessSentenceToBuffer(PhonemeToken[] tokens, short endPunct)
{
    var processed = _be.Process(tokens, endPunct);
    var rendered = new List<short>();

    foreach (var frame in _renderer.RenderStreaming(processed))
    {
        // Each frame is synthesized into its own fresh array, then appended.
        short[] block = new short[_synth.SampFrameLen];
        _synth.SynthesizeFrame(frame, block, 0);
        rendered.AddRange(block);
    }

    return rendered.ToArray();
}
/// <summary>
/// Processes one sentence, optionally records its phoneme timeline, renders it and
/// hands the audio to <paramref name="onBuffer"/>.
/// The sentence is run through the audio processor exactly once; the same dump
/// drives both the event timeline and the rendered audio.
/// </summary>
/// <param name="tokens">Phoneme tokens of the sentence.</param>
/// <param name="endPunct">End-punctuation code passed to the audio processor.</param>
/// <param name="onBuffer">Receives the rendered samples of the whole sentence.</param>
/// <param name="events">When non-null, receives one event per phoneme in the dump (silence included).</param>
/// <param name="sampleOffset">Sample position at which this sentence starts; advanced by the number of samples rendered.</param>
private void ProcessSentence(PhonemeToken[] tokens, short endPunct, Action<short[]> onBuffer,
    List<PhonemeEvent>? events, ref int sampleOffset)
{
    var dump = _be.Process(tokens, endPunct);

    if (events != null)
    {
        // DurBuf entries are treated as frame counts; each frame is SampFrameLen samples.
        int frameOffset = 0;
        for (int i = 0; i < dump.PhonBuf2InIndex; i++)
        {
            float t = (float)(sampleOffset + frameOffset * _synth.SampFrameLen) / SampleRate;
            events.Add(new PhonemeEvent(dump.PhonBuf2[i], t));
            frameOffset += dump.DurBuf[i];
        }
    }

    // Render from the dump computed above instead of processing the tokens a second time.
    var audio = RenderDumpToBuffer(dump);
    onBuffer(audio);
    sampleOffset += audio.Length;
}

/// <summary>
/// Renders an already-processed sentence completely into a single sample array.
/// </summary>
/// <param name="dump">Output of the audio processor for one sentence.</param>
/// <returns>All frames of the sentence, concatenated in render order.</returns>
private short[] RenderDumpToBuffer(SynthInputDump dump)
{
    var audio = new List<short>();
    foreach (var frame in _renderer.RenderStreaming(dump))
    {
        var frameAudio = new short[_synth.SampFrameLen];
        _synth.SynthesizeFrame(frame, frameAudio, 0);
        audio.AddRange(frameAudio);
    }
    return audio.ToArray();
}
/// <summary>
/// Applies an embedded voice command to the current voice.
/// Rate is clamped to 40..600 and Pitch to 40..500, each followed by a rebuild of
/// the audio processor; Volume is clamped to 0..100 and pushed to the synthesizer.
/// Any other command kind is ignored.
/// </summary>
/// <param name="cmd">Command to apply.</param>
void ApplyCommand(EmbeddedCmd.VoiceCommand cmd)
{
    var kind = cmd.Type;

    if (kind == EmbeddedCmd.VoiceCommand.Kind.Rate)
    {
        _voice.Rate = (short)Math.Clamp(cmd.Value, 40, 600);
        _be = new AudioProcessor(_voice);
    }
    else if (kind == EmbeddedCmd.VoiceCommand.Kind.Pitch)
    {
        _voice.PitchHz = (short)Math.Clamp(cmd.Value, 40, 500);
        _be = new AudioProcessor(_voice);
    }
    else if (kind == EmbeddedCmd.VoiceCommand.Kind.Volume)
    {
        _voice.VGain = (short)Math.Clamp(cmd.Value, 0, 100);

        // Same InvDFT call that RebuildPipeline makes, now with the new gain.
        _synth.InvDFT(_voice.VWave, _voice.VWave1, (short)_voice.VGain);
    }
}
/// <summary>
/// Recreates the audio processor, renderer and synthesizer from the current voice
/// and sample rate, then configures the synthesizer with the voice's parameters.
/// </summary>
void RebuildPipeline()
{
    var voice = _voice;

    _be = new AudioProcessor(voice);
    _renderer = new SpeechRenderer(voice);
    _synth = new FormantSynth(SampleRate);

    // NOTE(review): the meaning of the literal 'true' is not visible here; see FormantSynth.SetVoice.
    _synth.SetVoice(
        voice.NGain, true,
        voice.F4Freq, voice.F4BW,
        voice.F5Freq, voice.F5BW,
        voice.F4pFreq, voice.F4pBW,
        voice.F5pFreq, voice.F5pBW,
        voice.F6pFreq, voice.F6pBW,
        voice.NasalBase, voice.NasalBW,
        voice.AGain, voice.ACycle);

    _synth.InvDFT(voice.VWave, voice.VWave1, (short)voice.VGain);
}
}
} // namespace