// Code/FormantSynth.cs
#nullable enable
using System;
namespace SharpTalk
{
public class FormantSynth
{
// Upper limit applied to every bandwidth passed to Calc_Pole_Coefficients / Calc_Zero_Coefficients.
public const int KMaxBandWidth = 1225;
// Fixed-point constants: 0x2000 == 1 << KPrecision. The floating-point code visible in this
// file does not reference them (KPrecision is only mentioned in a comment in SynthesizeFrame).
public const int KPrecision = 13;
public const int KOnePtOh = 0x2000;
// Noise gain constant: scales breathGain in SetVoice and the parallel-bank input gains
// in InitFixedFormants (as KNoiseGain / 8192).
public const int KNoiseGain = 3200;
// Reference rate and frame length; the constructor scales the frame length and the
// noise level relative to these.
public const int KDefaultSampleRate = 22050;
public const int KDefaultSampFrameLen = 112;
// Output rate and the rate used for filter/oscillator math; the constructor sets both to the same value.
private readonly int _sampleRate;
private readonly int _internalRate;
public int SampleRate => _sampleRate;
// Number of samples SynthesizeFrame writes per call (at least 2, see constructor).
public int SampFrameLen { get; }
// Filter coefficients (A = input gain, B = first-delay weight, C = second-delay weight).
// 1..3: cascade formants recomputed every frame from the frame's F1..F3 / Bw1..Bw3.
private float Acoeff1, Bcoeff1, Ccoeff1;
private float Acoeff2, Bcoeff2, Ccoeff2;
private float Acoeff3, Bcoeff3, Ccoeff3;
// 4 and 5c: fixed cascade formants, computed in InitFixedFormants from the voice F4/F5 settings.
private float Acoeff4, Bcoeff4, Ccoeff4;
private float Acoeff5c, Bcoeff5c, Ccoeff5c;
// 4p, 5, 6: fixed parallel-bank formants; InitFixedFormants scales their A coefficient by KNoiseGain / 8192.
private float Acoeff4p, Bcoeff4p, Ccoeff4p;
private float Acoeff5, Bcoeff5, Ccoeff5;
private float Acoeff6, Bcoeff6, Ccoeff6;
// NZ: nasal zero (recomputed per frame when frame.FNZ differs from fNP); NP: nasal pole (fixed).
private float AcoeffNZ, BcoeffNZ, CcoeffNZ;
private float AcoeffNP, BcoeffNP, CcoeffNP;
// IIR delay taps: Na* holds the value from one sample ago, Nb* from two samples ago.
// For the pole sections these are previous outputs; for the nasal zero (NaNZ/NbNZ) previous inputs.
// Cascade sections 1..4 and 5c:
private float Na1, Nb1;
private float Na2, Nb2;
private float Na3, Nb3;
private float Na4, Nb4;
private float Na5c, Nb5c;
// Parallel sections 5 and 6:
private float Na5, Nb5;
private float Na6, Nb6;
// Parallel sections 2..4 (the "a" suffix separates them from the cascade taps above):
private float Na2a, Nb2a;
private float Na3a, Nb3a;
private float Na4a, Nb4a;
// Nasal zero / nasal pole:
private float NaNZ, NbNZ;
private float NaNP, NbNP;
// Parallel bank input gains: amp2..amp6 are frame.A2..A6 divided by 32; ab is frame.AB * speechVolume.
private float amp2, amp3, amp4, amp5, amp6, ab;
// State
// 24-bit phase accumulator of the primary glottal oscillator; bits 16..23 index the 256-entry waveform.
private int glotIndex;
// Last clamped output sample; written by SynthesizeFrame, not read in the code visible here.
private float lastnSamp;
// Voicing amplitude ramp: curAmp moves from lastAmp toward Av in eight steps of ampStep per frame.
private float curAmp;
// Previous (pre-emphasis) sample used by the high-frequency emphasis stage.
private float lastSample;
private float ampStep;
private float lastAmp;
// Glottal excitation
// Per-sample phase increments (Hz * 2^24 / rate) for the primary and chorus oscillators.
private int glotInc;
private int glotInc1;
// Phase accumulator of the chorus oscillator.
private int glotIndex1;
// One-period waveforms for the primary and chorus oscillators; filled by InvDFT.
private short[] voiceWaveform = new short[256];
private short[] voiceWaveform1 = new short[256];
// Pitch-code offset added to the frame F0 for the chorus oscillator; 0 disables the chorus.
public short VoiceChorus { get; set; }
// Selects the voiced source: KUseHarm uses the harmonic tables; any other value uses SampleWave.
public int GlotType { get; set; } = KUseHarm;
// Optional 8-bit sample table (values are offset by -128 when read) used when GlotType != KUseHarm.
public byte[]? SampleWave { get; set; }
// 24-bit increment and phase for stepping through SampleWave (index = SampleIndex >> 16).
public int SampleInc { get; set; }
public int SampleIndex { get; set; }
// Klattsch DSP state
// Running phases (radians, wrapped at 2*pi) of the vibrato and tremolo oscillators.
private float _vibratoPhase;
private float _tremoloPhase;
// Previous un-tilted voiced pulse, used by the one-tap spectral tilt filter.
private float _tiltPrev;
// Current interpolated values of the per-frame Klattsch parameters; stepped every sample toward the frame targets.
private float _lastVibDepth, _lastVibRate;
private float _lastTremDepth, _lastTremRate;
private float _lastAsp, _lastTilt;
// GlotType values. NOTE(review): the visible code only distinguishes KUseHarm from "anything else";
// KUseSnd and KUseSyncSnd take the same path in SynthesizeFrame.
public const int KUseHarm = 0;
public const int KUseSnd = 1;
public const int KUseSyncSnd = 2;
// Noise excitation
// Seed/state of the linear congruential generator in NextNoise.
private int _noiseSeed = 0x12345;
/// <summary>
/// Advances the linear congruential noise generator and returns the next noise byte.
/// </summary>
/// <returns>A pseudo-random value in the range 0..255 (callers subtract 128 to centre it).</returns>
private int NextNoise()
{
    // The multiply is expected to wrap around; make that explicit so the generator
    // does not throw OverflowException when the project is built with overflow checking.
    unchecked
    {
        _noiseSeed = (_noiseSeed * 1103515245 + 12345) & 0x7FFFFFFF;
    }

    // Use bits 16..23: the low bits of an LCG state are the least random.
    return (_noiseSeed >> 16) & 0xFF;
}
// Gain
// sqrt(sampleRate / KDefaultSampleRate); set in the constructor and applied to every noise sample.
private float _noiseScale;
// Per-frame amplitudes: Av = frame.Av * speechVolume (voicing), Af = frame.Af * speechVolume * 4 (noise in the cascade path).
private float Av, Af;
// Multiplier for the SampleWave pulse. Initialised to 0 and not assigned anywhere in the visible code.
private float wavesampleGain = 0;
// Gain of the noise source feeding the parallel bank; set by SetVoice.
private float voiceNoiseGain;
// Enables the high-frequency emphasis stage at the end of SynthesizeFrame.
private bool hfEmph = true;
// Master multiplier applied to the frame's Av, Af and AB values.
private float speechVolume = 150;
// Copy of voiceNoiseGain stored by SetVoice; not read in the visible code.
private float setNoiseGain = 3200;
// Pitch-code offsets added to the frame's F1..F3 before the cascade coefficients are computed.
private short voiceF1Gain = 0, voiceF2Gain = 0, voiceF3Gain = 0;
// Pitch-code offset added to frame.FNZ when computing the nasal zero.
private short nasalAmt = 0;
// Nasal pole pitch code and bandwidth (set by SetVoice). bNP is also used as the nasal zero's bandwidth;
// a frame whose FNZ equals fNP bypasses the nasal pole/zero pair.
private short fNP;
private short bNP;
// Breath noise: breathGain scales it; it is injected only while (glotIndex >> 16) exceeds breathCycle.
private float breathGain;
private short breathCycle;
// NOTE(review): not read in the visible code; Calc_Pole_Coefficients has a parameter of the same
// name (default 50) that shadows this field.
private short voiceMinBW = 50;
// Parallel F-bank params (from voiceData): pitch codes and bandwidths of parallel formants 4..6, set by SetVoice.
private short f4_Par;
private short bw4_Par;
private short f5_Par;
private short bw5_Par;
private short f6_Par;
private short bw6_Par;
// Cascade F4/F5 pitch codes and bandwidths, set by SetVoice.
private short voice_F4_Freq;
private short voice_F4_BW;
private short voice_F5_Freq;
private short voice_F5_BW;
/// <summary>
/// Creates a synthesizer running at <paramref name="sampleRate"/> Hz.
/// </summary>
/// <param name="sampleRate">
/// Output sample rate in Hz; zero or negative values fall back to <see cref="KDefaultSampleRate"/>.
/// </param>
public FormantSynth(int sampleRate = KDefaultSampleRate)
{
    _sampleRate = sampleRate > 0 ? sampleRate : KDefaultSampleRate;
    _internalRate = _sampleRate;

    // Keep the frame duration constant: scale the reference frame length by the rate ratio,
    // never going below two samples.
    int scaledFrameLen = (int)Math.Round(
        _sampleRate * (KDefaultSampFrameLen / (double)KDefaultSampleRate),
        MidpointRounding.AwayFromZero);
    SampFrameLen = scaledFrameLen < 2 ? 2 : scaledFrameLen;

    _noiseScale = MathF.Sqrt(_sampleRate / (float)KDefaultSampleRate);

    // Delay line and its eight tap positions.
    maxRvbDelay = 4096;
    delayBuffer = new short[maxRvbDelay];
    short[] tapPositions = { 404, 1058, 1362, 2318, 2909, 3723, 4030, 4096 };
    Array.Copy(tapPositions, tapBuffer, tapPositions.Length);
}
/// <summary>
/// Installs the per-voice settings and recomputes the fixed formant filters.
/// </summary>
/// <param name="nGain">Noise gain in percent.</param>
/// <param name="bit16">When true the noise gain is additionally scaled by 0xCCCC / 65536.</param>
/// <param name="f4_Freq">Cascade F4 frequency in Hz.</param>
/// <param name="f4_BW">Cascade F4 bandwidth.</param>
/// <param name="f5_Freq">Cascade F5 frequency in Hz.</param>
/// <param name="f5_BW">Cascade F5 bandwidth.</param>
/// <param name="f4p_Freq">Parallel F4 frequency in Hz.</param>
/// <param name="bw4p_BW">Parallel F4 bandwidth.</param>
/// <param name="f5p_Freq">Parallel F5 frequency in Hz.</param>
/// <param name="bw5p_BW">Parallel F5 bandwidth.</param>
/// <param name="f6p_Freq">Parallel F6 frequency in Hz.</param>
/// <param name="bw6p_BW">Parallel F6 bandwidth.</param>
/// <param name="nasal_Base">Nasal pole frequency in Hz.</param>
/// <param name="nasal_BW">Nasal pole bandwidth.</param>
/// <param name="aGain">Breath gain in percent of <see cref="KNoiseGain"/>.</param>
/// <param name="aCycle">Waveform position (0..255) above which breath noise is injected.</param>
public void SetVoice(short nGain, bool bit16, short f4_Freq, short f4_BW, short f5_Freq, short f5_BW, short f4p_Freq, short bw4p_BW, short f5p_Freq, short bw5p_BW, short f6p_Freq, short bw6p_BW, short nasal_Base, short nasal_BW, short aGain = 0, short aCycle = 192)
{
    // Breath source.
    breathCycle = aCycle;
    breathGain = (aGain * KNoiseGain) / 100.0f;

    // Parallel-bank noise gain; the stored copy always mirrors the effective value.
    voiceNoiseGain = (nGain / 100.0f);
    if (bit16)
    {
        voiceNoiseGain *= (0xCCCC / 65536.0f);
    }
    setNoiseGain = voiceNoiseGain;

    // Bandwidths are stored as given.
    voice_F4_BW = f4_BW;
    voice_F5_BW = f5_BW;
    bw4_Par = bw4p_BW;
    bw5_Par = bw5p_BW;
    bw6_Par = bw6p_BW;
    bNP = nasal_BW;

    // Frequencies arrive in Hz and are kept as pitch codes.
    voice_F4_Freq = HzToPitch(f4_Freq);
    voice_F5_Freq = HzToPitch(f5_Freq);
    f4_Par = HzToPitch(f4p_Freq);
    f5_Par = HzToPitch(f5p_Freq);
    f6_Par = HzToPitch(f6p_Freq);
    fNP = HzToPitch(nasal_Base);

    InitFixedFormants();
}
/// <summary>
/// Computes the coefficients of every filter that does not change from frame to frame:
/// cascade F4/F5, parallel F4/F5/F6 and the nasal pole.
/// </summary>
private void InitFixedFormants()
{
    // Input gain applied to each parallel-bank resonator.
    const float parallelInputScale = KNoiseGain / 8192.0f;

    // Cascade branch.
    Calc_Pole_Coefficients(out Acoeff4, out Bcoeff4, out Ccoeff4, voice_F4_Freq, voice_F4_BW);
    Calc_Pole_Coefficients(out Acoeff5c, out Bcoeff5c, out Ccoeff5c, voice_F5_Freq, voice_F5_BW);

    // Nasal pole.
    Calc_Pole_Coefficients(out AcoeffNP, out BcoeffNP, out CcoeffNP, fNP, bNP);

    // Parallel branch.
    Calc_Pole_Coefficients(out Acoeff4p, out Bcoeff4p, out Ccoeff4p, f4_Par, bw4_Par);
    Calc_Pole_Coefficients(out Acoeff5, out Bcoeff5, out Ccoeff5, f5_Par, bw5_Par);
    Calc_Pole_Coefficients(out Acoeff6, out Bcoeff6, out Ccoeff6, f6_Par, bw6_Par);
    Acoeff4p *= parallelInputScale;
    Acoeff5 *= parallelInputScale;
    Acoeff6 *= parallelInputScale;
}
// Reverb state
// Number of entries in tapBuffer.
private const int KNumOfTaps = 8;
// Tap positions; the constructor fills them with 404, 1058, 1362, 2318, 2909, 3723, 4030, 4096.
private short[] tapBuffer = new short[KNumOfTaps];
// Delay line of maxRvbDelay samples (4096), allocated in the constructor.
private short[] delayBuffer;
private int maxRvbDelay;
// NOTE(review): delay_Index and lastRevbSample are not used by the code visible in this file;
// presumably the write position and previous output of the reverb — confirm against the reverb code.
private int delay_Index;
private long lastRevbSample;
/// <summary>
/// Computes the coefficients of a two-pole resonator
/// <c>y[n] = A*x[n] + B*y[n-1] + C*y[n-2]</c> with unity gain at DC.
/// </summary>
/// <param name="Acoeff">Receives the input gain, <c>1 - B - C</c>.</param>
/// <param name="Bcoeff">Receives <c>2*r*cos(w)</c>.</param>
/// <param name="Ccoeff">Receives <c>-r*r</c>.</param>
/// <param name="pitch">Centre frequency as a pitch code; values below 256 are raised to 256.</param>
/// <param name="bandWidth">Bandwidth; capped at <see cref="KMaxBandWidth"/>, then raised to <paramref name="voiceMinBW"/>.</param>
/// <param name="voiceMinBW">Minimum bandwidth.</param>
public void Calc_Pole_Coefficients(out float Acoeff, out float Bcoeff, out float Ccoeff, short pitch, short bandWidth, int voiceMinBW = 50)
{
    // Cap first, then apply the floor, so the floor wins if the two limits ever conflict.
    short limitedBw = bandWidth > KMaxBandWidth ? (short)KMaxBandWidth : bandWidth;
    if (limitedBw < voiceMinBW)
    {
        limitedBw = (short)voiceMinBW;
    }

    short limitedPitch = pitch < 256 ? (short)256 : pitch;
    float centreHz = PitchToHz(limitedPitch);

    // Pole radius from the bandwidth, pole angle from the centre frequency.
    float radius = (float)Math.Exp(-Math.PI * limitedBw / _internalRate);
    float angle = (float)(2.0 * Math.PI * centreHz / _internalRate);

    Bcoeff = 2.0f * radius * (float)Math.Cos(angle);
    Ccoeff = -(radius * radius);
    Acoeff = 1.0f - Bcoeff - Ccoeff;
}
/// <summary>
/// Computes the coefficients of a two-zero (anti-resonator) section for the given
/// centre frequency and bandwidth.
/// </summary>
/// <param name="Acoeff">Receives <c>1 + B + C</c>.</param>
/// <param name="Bcoeff">Receives <c>-2*r*cos(w)</c>.</param>
/// <param name="Ccoeff">Receives <c>r*r</c>.</param>
/// <param name="pitch">Centre frequency as a pitch code; values below 256 are raised to 256.</param>
/// <param name="bandWidth">Bandwidth; capped at <see cref="KMaxBandWidth"/> (no lower limit is applied).</param>
public void Calc_Zero_Coefficients(out float Acoeff, out float Bcoeff, out float Ccoeff, short pitch, short bandWidth)
{
    short limitedBw = bandWidth > KMaxBandWidth ? (short)KMaxBandWidth : bandWidth;
    short limitedPitch = pitch < 256 ? (short)256 : pitch;
    float centreHz = PitchToHz(limitedPitch);

    // Zero radius from the bandwidth, zero angle from the centre frequency.
    float radius = (float)Math.Exp(-Math.PI * limitedBw / _internalRate);
    float angle = (float)(2.0 * Math.PI * centreHz / _internalRate);

    Bcoeff = -2.0f * radius * (float)Math.Cos(angle);
    Ccoeff = radius * radius;
    Acoeff = 1.0f + Bcoeff + Ccoeff;
}
/// <summary>
/// Builds the two 256-sample glottal waveforms (primary and chorus) by summing sine
/// harmonics whose amplitudes are given in <paramref name="vWave"/> and <paramref name="vWave1"/>.
/// </summary>
/// <param name="vWave">
/// Harmonic amplitudes of the primary waveform; entry <c>i</c> is the amplitude of harmonic <c>i</c>.
/// Only the first 48 entries are used; missing entries count as zero.
/// </param>
/// <param name="vWave1">Harmonic amplitudes of the chorus waveform, same layout.</param>
/// <param name="vGain">Overall gain; amplitudes are multiplied by <c>vGain / 200</c>.</param>
/// <remarks>If either array is null both waveforms are cleared to silence.</remarks>
public void InvDFT(short[]? vWave, short[]? vWave1, short vGain)
{
    const int TableSize = 256;
    const int HarmonicCount = 48;

    if (vWave is null || vWave1 is null)
    {
        Array.Clear(voiceWaveform, 0, voiceWaveform.Length);
        Array.Clear(voiceWaveform1, 0, voiceWaveform1.Length);
        return;
    }

    // Match prior fixed-point scaling:
    // sine15 ~= round(16383*sin(..)); sample += (amp*sine15)>>16
    // Only 256 distinct phases ever occur, so evaluate the sine once per phase
    // instead of once per (harmonic, sample) pair.
    var sine15 = new float[TableSize];
    for (int k = 0; k < TableSize; k++)
    {
        sine15[k] = 16383.0f * (float)Math.Sin(2.0 * Math.PI * k / 256.0);
    }

    var w0 = new float[TableSize];
    var w1 = new float[TableSize];
    float gain = vGain / 200.0f;
    for (int i = 0; i < HarmonicCount; i++)
    {
        // Harmonics beyond the end of a short amplitude array contribute nothing.
        float amp = i < vWave.Length ? vWave[i] * gain : 0f;
        float amp1 = i < vWave1.Length ? vWave1[i] * gain : 0f;

        // Harmonic i advances i table steps per output sample.
        int sIndex = 0;
        for (int j = 0; j < TableSize; j++)
        {
            float s = sine15[sIndex];
            w0[j] += (amp * s) / 65536.0f;
            w1[j] += (amp1 * s) / 65536.0f;
            sIndex = (sIndex + i) & 0xFF;
        }
    }

    float max = 0, max1 = 0;
    for (int j = 0; j < TableSize; j++)
    {
        float a0 = Math.Abs(w0[j]);
        if (a0 > max) max = a0;
        float a1 = Math.Abs(w1[j]);
        if (a1 > max1) max1 = a1;
    }

    // Match prior behavior: only scale the chorus waveform to match the primary waveform's peak.
    float chorusScale = (max1 > 0) ? (max / max1) : 0;
    for (int j = 0; j < TableSize; j++)
    {
        voiceWaveform[j] = (short)Math.Clamp(MathF.Round(w0[j]), short.MinValue, short.MaxValue);
        voiceWaveform1[j] = (short)Math.Clamp(MathF.Round(w1[j] * chorusScale), short.MinValue, short.MaxValue);
    }
}
/// <summary>
/// Renders one frame of speech: exactly <see cref="SampFrameLen"/> 16-bit samples are written
/// to <paramref name="outputBuffer"/> starting at <paramref name="offset"/>.
/// </summary>
/// <param name="frame">Synthesis parameters for this frame.</param>
/// <param name="outputBuffer">Destination buffer.</param>
/// <param name="offset">Index of the first sample to write.</param>
/// <exception cref="ArgumentNullException"><paramref name="outputBuffer"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// The buffer has no room for <see cref="SampFrameLen"/> samples at <paramref name="offset"/>.
/// </exception>
public void SynthesizeFrame(Frame frame, short[] outputBuffer, int offset)
{
    // Validate before touching any filter state so a bad call cannot leave the
    // synthesizer half-way through a frame.
    if (outputBuffer is null) throw new ArgumentNullException(nameof(outputBuffer));
    if (offset < 0 || offset > outputBuffer.Length - SampFrameLen)
    {
        throw new ArgumentOutOfRangeException(nameof(offset), "outputBuffer must have room for SampFrameLen samples starting at offset.");
    }

    // Starting from silence: restart the oscillators and flush the cascade filter memory.
    if ((curAmp == 0) && (Af == 0))
    {
        glotIndex = 0;
        glotIndex1 = 0;
        Na1 = Nb1 = Na2 = Nb2 = Na3 = Nb3 = Na4 = Nb4 = Na5c = Nb5c = 0;
        NaNP = NbNP = NaNZ = NbNZ = 0;
        lastAmp = 0;
    }

    // Per-frame cascade formants F1..F3.
    Calc_Pole_Coefficients(out Acoeff1, out Bcoeff1, out Ccoeff1, (short)(frame.F1 + voiceF1Gain), frame.Bw1);
    Calc_Pole_Coefficients(out Acoeff2, out Bcoeff2, out Ccoeff2, (short)(frame.F2 + voiceF2Gain), frame.Bw2);
    Calc_Pole_Coefficients(out Acoeff3, out Bcoeff3, out Ccoeff3, (short)(frame.F3 + voiceF3Gain), frame.Bw3);

    // Nasal pole/zero pair is bypassed when the zero sits exactly on the pole.
    bool noNasal;
    float nGain = 0;
    if (frame.FNZ != fNP)
    {
        noNasal = false;
        Calc_Zero_Coefficients(out AcoeffNZ, out BcoeffNZ, out CcoeffNZ, (short)(frame.FNZ + nasalAmt), bNP);
        nGain = AcoeffNZ != 0 ? (AcoeffNP / AcoeffNZ) : 0;
    }
    else
    {
        noNasal = true;
    }

    // Source amplitudes. ampBank records whether any noise-driven path is active.
    bool ampBank = false;
    short rawAv = frame.Av;
    Av = rawAv * speechVolume;
    Af = frame.Af * speechVolume * 4.0f;
    ab = frame.AB * speechVolume;
    if (Af > 0 || ab > 0) ampBank = true;
    float totalBreathGain = (breathGain * Av) / 8192.0f;

    // Parallel bank: an inactive resonator has its gain zeroed and its memory flushed.
    float Acoeff2q = 0, Acoeff3q = 0, Acoeff4q = 0, Acoeff5q = 0, Acoeff6q = 0;
    if (frame.A2 > 0) { amp2 = frame.A2 / 32.0f; Acoeff2q = Acoeff2 * amp2; ampBank = true; }
    else { amp2 = 0; Nb2a = 0; Na2a = 0; }
    if (frame.A3 > 0) { amp3 = frame.A3 / 32.0f; Acoeff3q = Acoeff3 * amp3; ampBank = true; }
    else { amp3 = 0; Nb3a = 0; Na3a = 0; }
    if (frame.A4 > 0) { amp4 = frame.A4 / 32.0f; Acoeff4q = Acoeff4p * amp4; ampBank = true; }
    else { amp4 = 0; Nb4a = 0; Na4a = 0; }
    if (frame.A5 > 0) { amp5 = frame.A5 / 32.0f; Acoeff5q = Acoeff5 * amp5; ampBank = true; }
    else { amp5 = 0; Nb5 = 0; Na5 = 0; }
    if (frame.A6 > 0) { amp6 = frame.A6 / 32.0f; Acoeff6q = Acoeff6 * amp6; ampBank = true; }
    else { amp6 = 0; Nb6 = 0; Na6 = 0; }

    // Base oscillator frequencies are constant for the whole frame, so decode the pitch
    // codes once here; the per-sample loop only adds the vibrato offset.
    short chorusOffset = VoiceChorus;
    float baseF0 = PitchToHz(frame.F0);
    float baseF0Chorus = 0;
    if (chorusOffset != 0)
    {
        // A negative pitch code has no meaning; floor the detuned pitch at 0.
        int chorusPitch = frame.F0 + chorusOffset;
        if (chorusPitch < 0) chorusPitch = 0;
        baseF0Chorus = PitchToHz((short)chorusPitch);
    }

    // Klattsch interpolation steps
    float targetVibDepth = frame.VibDepth;
    float targetVibRate = frame.VibRate / 10.0f;
    float targetTremDepth = frame.TremDepth / 100.0f;
    float targetTremRate = frame.TremRate / 10.0f;
    float targetAsp = frame.Aspiration / 100.0f;
    float targetTilt = (frame.Tilt / 100.0f) * 1.9f - 0.95f; // Map 0..100 to -0.95..0.95
    float vibDStep = (targetVibDepth - _lastVibDepth) / SampFrameLen;
    float vibRStep = (targetVibRate - _lastVibRate) / SampFrameLen;
    float tremDStep = (targetTremDepth - _lastTremDepth) / SampFrameLen;
    float tremRStep = (targetTremRate - _lastTremRate) / SampFrameLen;
    float aspStep = (targetAsp - _lastAsp) / SampFrameLen;
    float tiltStep = (targetTilt - _lastTilt) / SampFrameLen;

    // Voicing amplitude ramps from the previous frame's level to Av over the first 8 samples.
    ampStep = (Av - lastAmp) / 8.0f;
    curAmp = lastAmp;
    lastAmp = Av;
    int local_ampCtr = 0;

    for (int sampCtr = SampFrameLen - 1; sampCtr >= 0; --sampCtr)
    {
        if (local_ampCtr < 8) { curAmp += ampStep; local_ampCtr++; }
        else { curAmp = Av; }

        // Step Klattsch params
        _lastVibDepth += vibDStep;
        _lastVibRate += vibRStep;
        _lastTremDepth += tremDStep;
        _lastTremRate += tremRStep;
        _lastAsp += aspStep;
        _lastTilt += tiltStep;

        // Vibrato modulation
        _vibratoPhase += (float)(2 * Math.PI * _lastVibRate / _sampleRate);
        if (_vibratoPhase > (float)(2 * Math.PI)) _vibratoPhase -= (float)(2 * Math.PI);
        float vibOffset = _lastVibDepth * MathF.Sin(_vibratoPhase);
        float effF0 = baseF0 + vibOffset;
        glotInc = (int)Math.Round(effF0 * (double)(1 << 24) / _internalRate);
        if (chorusOffset != 0)
        {
            float effF0_1 = baseF0Chorus + vibOffset;
            glotInc1 = (int)Math.Round(effF0_1 * (double)(1 << 24) / _internalRate);
        }

        // Tremolo modulation
        _tremoloPhase += (float)(2 * Math.PI * _lastTremRate / _sampleRate);
        if (_tremoloPhase > (float)(2 * Math.PI)) _tremoloPhase -= (float)(2 * Math.PI);
        float tremMod = 1.0f - _lastTremDepth * (0.5f + 0.5f * MathF.Sin(_tremoloPhase));
        float localAmp = curAmp * tremMod;

        float sourceC = 0, SampV = 0, sourceP = 0, SampAB = 0, Samp2 = 0, Samp3 = 0, Samp4 = 0, Samp5 = 0, Samp6 = 0;
        if (localAmp > 0 || ampBank || totalBreathGain > 0 || _lastAsp > 0)
        {
            if (localAmp > 0)
            {
                float vPulse;
                if (GlotType == KUseHarm)
                {
                    // Harmonic table oscillator, optionally averaged with the detuned chorus oscillator.
                    glotIndex = (glotInc + glotIndex) & 0xFFFFFF;
                    vPulse = voiceWaveform[glotIndex >> 16];
                    if (chorusOffset != 0)
                    {
                        glotIndex1 = (glotInc1 + glotIndex1) & 0xFFFFFF;
                        vPulse = (vPulse + voiceWaveform1[glotIndex1 >> 16]) * 0.5f;
                    }
                }
                else
                {
                    // Sampled source: glotIndex still advances so the breath gate keeps working.
                    glotIndex = (glotInc + glotIndex) & 0xFFFFFF;
                    if (SampleWave != null)
                    {
                        SampleIndex = (SampleInc + SampleIndex) & 0xFFFFFF;
                        vPulse = (SampleWave[SampleIndex >> 16] - 128) * wavesampleGain;
                    }
                    else vPulse = 0;
                }

                // Apply spectral tilt to voiced source
                float tilted = vPulse - _lastTilt * _tiltPrev;
                _tiltPrev = vPulse;
                vPulse = tilted;
                sourceC = vPulse * localAmp / 8192.0f;
            }
            else
            {
                // No voicing, but still advance glotIndex for breathCycle gating
                if (totalBreathGain > 0) glotIndex = (glotInc + glotIndex) & 0xFFFFFF;
                else { lastnSamp = 0; glotIndex = 0; glotIndex1 = 0; }
                sourceC = 0;
            }

            // Breath (aspiration) source — injected when cycle position exceeds breathCycle
            if (totalBreathGain > 0 && (glotIndex >> 16) > breathCycle)
                sourceC += (NextNoise() - 128) * totalBreathGain * _noiseScale / 2048.0f;

            // Klattsch aspiration (continuous)
            if (_lastAsp > 0)
            {
                float aspGain = _lastAsp * localAmp * 0.5f;
                sourceC += (NextNoise() - 128) * aspGain * _noiseScale / 8192.0f;
            }

            // Cascade branch: nasal zero/pole (optional), then F1..F5 in series.
            if (localAmp > 0 || Af > 0 || totalBreathGain > 0 || _lastAsp > 0)
            {
                sourceC += (NextNoise() - 128) * Af * _noiseScale / 8192.0f;
                if (noNasal) SampV = sourceC;
                else
                {
                    SampV = sourceC + (BcoeffNZ * NaNZ) + (CcoeffNZ * NbNZ);
                    NbNZ = NaNZ; NaNZ = sourceC;
                    SampV *= nGain;
                    SampV = SampV + (BcoeffNP * NaNP) + (CcoeffNP * NbNP);
                    NbNP = NaNP; NaNP = SampV;
                }
                SampV = (Acoeff1 * SampV) + (Bcoeff1 * Na1) + (Ccoeff1 * Nb1);
                Nb1 = Na1; Na1 = SampV;
                SampV = (Acoeff2 * SampV) + (Bcoeff2 * Na2) + (Ccoeff2 * Nb2);
                Nb2 = Na2; Na2 = SampV;
                SampV = (Acoeff3 * SampV) + (Bcoeff3 * Na3) + (Ccoeff3 * Nb3);
                Nb3 = Na3; Na3 = SampV;
                SampV = (Acoeff4 * SampV) + (Bcoeff4 * Na4) + (Ccoeff4 * Nb4);
                Nb4 = Na4; Na4 = SampV;
                SampV = (Acoeff5c * SampV) + (Bcoeff5c * Na5c) + (Ccoeff5c * Nb5c);
                Nb5c = Na5c; Na5c = SampV;
            }

            // Parallel branch, driven by its own noise sample.
            // Match prior fixed-point scaling: sourceP = MMul2(noise, voiceNoiseGain, KPrecision)
            sourceP = (NextNoise() - 128) * voiceNoiseGain * _noiseScale;
            if (ab > 0) SampAB = sourceP * ab / 4096.0f;
            if (amp2 > 0) { Samp2 = (Acoeff2q * sourceP) + (Bcoeff2 * Na2a) + (Ccoeff2 * Nb2a); Nb2a = Na2a; Na2a = Samp2; }
            if (amp3 > 0) { Samp3 = (Acoeff3q * sourceP) + (Bcoeff3 * Na3a) + (Ccoeff3 * Nb3a); Nb3a = Na3a; Na3a = Samp3; }
            if (amp4 > 0) { Samp4 = (Acoeff4q * sourceP) + (Bcoeff4p * Na4a) + (Ccoeff4p * Nb4a); Nb4a = Na4a; Na4a = Samp4; }
            if (amp5 > 0) { Samp5 = (Acoeff5q * sourceP) + (Bcoeff5 * Na5) + (Ccoeff5 * Nb5); Nb5 = Na5; Na5 = Samp5; }
            if (amp6 > 0) { Samp6 = (Acoeff6q * sourceP) + (Bcoeff6 * Na6) + (Ccoeff6 * Nb6); Nb6 = Na6; Na6 = Samp6; }

            // Mix: parallel outputs are summed with alternating sign.
            float nSamp = SampV + (SampAB - Samp3 + Samp4 - Samp5 + Samp6 - Samp2);
            if (hfEmph)
            {
                // High-frequency emphasis: boost, then difference against the previous boosted sample.
                nSamp += (nSamp * 0.25f);
                float tSamp = nSamp - (lastSample - (lastSample * 0.25f));
                lastSample = nSamp;
                nSamp = tSamp + (nSamp * 0.5f);
            }
            nSamp = Math.Clamp(nSamp, -8191.0f, 8191.0f);
            outputBuffer[offset++] = (short)Math.Clamp(MathF.Round(nSamp * 4.0f), short.MinValue, short.MaxValue);
            lastnSamp = nSamp;
        }
        else
        {
            // Nothing is sounding: emit silence and park the oscillators.
            lastnSamp = 0; glotIndex = 0; glotIndex1 = 0;
            outputBuffer[offset++] = 0;
        }
    }

    // Land exactly on the frame targets. Repeated float additions leave a small residue, and a
    // residue above zero in _lastAsp would keep the "something is sounding" test true forever.
    _lastVibDepth = targetVibDepth;
    _lastVibRate = targetVibRate;
    _lastTremDepth = targetTremDepth;
    _lastTremRate = targetTremRate;
    _lastAsp = targetAsp;
    _lastTilt = targetTilt;
}
/// <summary>
/// Converts a frequency in Hz to the synthesizer's logarithmic pitch code:
/// the high byte is the octave above 50 Hz, the low byte is the fraction of an octave in 1/256 steps.
/// </summary>
/// <param name="hz">Frequency in Hz.</param>
/// <returns>The pitch code, or 0 when <paramref name="hz"/> is zero or negative.</returns>
public static short HzToPitch(short hz)
{
    const int ratioK = 2621;
    if (hz <= 0) return 0;

    // Find the octave: band boundaries are 100, 200, 400, 800, 1600 and 3200 Hz;
    // everything from 3200 Hz up shares the top octave (6).
    int octave = 0;
    for (int upper = 100; octave < 6 && hz >= upper; upper <<= 1)
    {
        octave++;
    }

    // Bring the frequency into the reference octave that starts at 400.
    int shift = 3 - octave;
    int normalized = shift >= 0 ? hz << shift : hz >> -shift;

    // Position inside the octave as a 0..511 ratio.
    int ratio = ((normalized - 400) * ratioK) >> 11;
    ratio = Math.Max(0, Math.Min(511, ratio));

    // Runtime logOf2Tbl replacement: floor(256*log2(1 + ratio/512))
    int fraction = (int)(256.0 * Math.Log(1.0 + (ratio / 512.0), 2.0));
    return (short)(fraction + (octave << 8));
}
/// <summary>
/// Converts a pitch code back to Hz: bits 8..11 select the octave above 50 Hz and
/// the low byte is the fraction of an octave in 1/256 steps.
/// </summary>
/// <param name="pitch">Pitch code; negative values are treated as 0.</param>
/// <returns>
/// The frequency in Hz, saturated at <see cref="short.MaxValue"/> for codes whose
/// frequency does not fit in a 16-bit result.
/// </returns>
public static short PitchToHz(short pitch)
{
    // A negative code would otherwise decode through its sign bits as octave 8..15.
    if (pitch < 0) pitch = 0;

    // Runtime OctFreqTbl + ExpOf2Tbl replacement:
    // OctFreqTbl[oct] = 50<<oct, ExpOf2Tbl[i] = round(32768*2^(i/256))
    int oct = (pitch & 0xF00) >> 8;
    int frac = pitch & 0xFF;
    long baseFreq = 50L << oct;
    long exp = (long)Math.Round(32768.0 * Math.Pow(2.0, frac / 256.0), MidpointRounding.AwayFromZero);

    // 64-bit arithmetic: the product exceeds int range from octave 10 upwards, and the
    // frequency exceeds short range from roughly pitch code 0x85C upwards. Saturate instead of wrapping.
    long hz = (baseFreq * exp) >> 15;
    return (short)Math.Min(hz, short.MaxValue);
}
}
// One frame of synthesis parameters consumed by FormantSynth.SynthesizeFrame.
// Frequencies are pitch codes (see FormantSynth.HzToPitch / PitchToHz), not Hz.
public struct Frame
{
// Voicing amplitude (multiplied by the synthesizer's speech volume).
public short Av;
// Noise amplitude injected into the cascade branch (multiplied by speech volume * 4).
public short Af;
// Fundamental frequency as a pitch code.
public short F0;
// Cascade formant frequencies 1..3 as pitch codes.
public short F1;
public short F2;
public short F3;
// Parallel-bank amplitudes for formants 2..6; divided by 32 when used, 0 or less disables the resonator.
public short A2;
public short A3;
public short A4;
public short A5;
public short A6;
// Nasal zero frequency as a pitch code; equal to the voice's nasal pole code means "no nasalisation".
public short FNZ;
// Bypass amplitude: unfiltered parallel-branch noise (multiplied by speech volume).
public short AB;
// Bandwidths of cascade formants 1..3.
public short Bw1;
public short Bw2;
public short Bw3;
// NOTE(review): PhonEdge and Marker are not read by SynthesizeFrame in this file; meaning not visible here.
public short PhonEdge;
public long Marker;
// Klattsch parameters
// Continuous aspiration amount in percent (used as Aspiration / 100).
public byte Aspiration;
// Spectral tilt, 0..100, mapped linearly to a filter coefficient of -0.95..0.95.
public byte Tilt;
// NOTE(review): not read by SynthesizeFrame in this file.
public byte Effort;
// Vibrato depth, added to F0 in Hz (used unscaled).
public byte VibDepth;
// Vibrato rate in tenths of a Hz (used as VibRate / 10).
public byte VibRate;
// Tremolo depth in percent (used as TremDepth / 100).
public byte TremDepth;
// Tremolo rate in tenths of a Hz (used as TremRate / 10).
public byte TremRate;
}
}