// AudioProcessor.cs
#nullable enable
using System;

namespace SharpTalk
{

    /// <summary>
    /// One input phoneme plus its control word and per-phoneme user overrides.
    /// An array of these is the input to <c>AudioProcessor.Process</c>.
    /// </summary>
    public readonly struct PhonemeToken
    {
        /// <summary>Phoneme ID (same width as the <c>AudioProcessor._XX_</c> ID constants).</summary>
        public short Phon { get; init; }
        /// <summary>Control bit field; bits are the <c>AudioProcessor.k*</c> Ctrl-buffer flags.</summary>
        public long Ctrl { get; init; }  // kWord_Start, stress flags, etc.
        /// <summary>User pitch override for this phoneme (units not visible in this file).</summary>
        public short UserPitch { get; init; }
        /// <summary>User duration override for this phoneme.</summary>
        // NOTE(review): the original note says 0 stands for kDur_One (no scaling); kDur_One
        // itself is 0x100 and the 0 -> kDur_One mapping is not visible here - confirm in the loader.
        public short UserDur { get; init; } // 0 = kDur_One (no scaling)
        /// <summary>User note value for this phoneme (presumably used for singing - TODO confirm).</summary>
        public short UserNote { get; init; }
        /// <summary>User speech-rate override for this phoneme (semantics not visible in this file).</summary>
        public short UserRate { get; init; }
    }

    public sealed partial class AudioProcessor
    {
        // Phoneme IDs
        // 16-bit codes held in the phoneme buffers (short[]).
        // IDs 0-55 are contiguous; 67-70 are the punctuation codes. IDs 56-66 are not
        // declared in this part of the class.
        // _SIL_ is the value ClearBuffers writes into every phoneme slot and the value
        // GetPhon2 returns for an out-of-range index. _Period_ is the default
        // endPunctuation argument of Process.
        public const short _IY_ = 0; public const short _IH_ = 1;
        public const short _EH_ = 2; public const short _AE_ = 3;
        public const short _AA_ = 4; public const short _AH_ = 5;
        public const short _AO_ = 6; public const short _UH_ = 7;
        public const short _AX_ = 8; public const short _ER_ = 9;
        public const short _EY_ = 10; public const short _AY_ = 11;
        public const short _OY_ = 12; public const short _AW_ = 13;
        public const short _OW_ = 14; public const short _UW_ = 15;
        public const short _YU_ = 16; public const short _IR_ = 17;
        public const short _XR_ = 18; public const short _AR_ = 19;
        public const short _OR_ = 20; public const short _UR_ = 21;
        public const short _IX_ = 22; public const short _SIL_ = 23;
        public const short _RX_ = 24; public const short _LX_ = 25;
        public const short _EL_ = 26; public const short _EN_ = 27;
        public const short _w_ = 28; public const short _y_ = 29;
        public const short _r_ = 30; public const short _l_ = 31;
        public const short _h_ = 32; public const short _m_ = 33;
        public const short _n_ = 34; public const short _NG_ = 35;
        public const short _f_ = 36; public const short _v_ = 37;
        public const short _TH_ = 38; public const short _DH_ = 39;
        public const short _s_ = 40; public const short _z_ = 41;
        public const short _SH_ = 42; public const short _ZH_ = 43;
        public const short _p_ = 44; public const short _b_ = 45;
        public const short _t_ = 46; public const short _d_ = 47;
        public const short _k_ = 48; public const short _g_ = 49;
        public const short _CH_ = 50; public const short _JH_ = 51;
        public const short _TX_ = 52; public const short _DX_ = 53;
        public const short _QX_ = 54; public const short _DD_ = 55;
        // Punctuation codes (sentence/clause terminators).
        public const short _Comma_ = 67;
        public const short _Period_ = 68;
        public const short _Quest_ = 69;
        public const short _Exclam_ = 70;

        // Ctrl-buffer flags (for PhonemeToken.Ctrl)
        // Names ending in "Field" are multi-bit masks; the other names are single bits or
        // values inside one of those fields. Several constants share a numeric value.

        // Bits 0-3: end-of-unit markers. kSyllableTypeField is the OR of the four *_End bits.
        public const long kSyllableTypeField = 0x0F;
        public const long kWord_End = 0x0001;
        public const long kPrep_End = 0x0002;
        public const long kVerb_End = 0x0004;
        public const long kTerm_End = 0x0008;
        public const long kWord_Initial_Consonant = 0x0080;
        // Bits 8-9: a 2-bit enumerated value (first=1, mid=2, last=3), not independent flags.
        // kLast_Syllable_In_Word and kMore_Than_One_Syllable_In_Word equal the field mask itself.
        public const long kSyllableOrderField = 0x0300;
        public const long kFirst_Syllable_In_Word = 0x0100;
        public const long kMid_Syllable_In_Word = 0x0200;
        public const long kLast_Syllable_In_Word = 0x0300;
        public const long kMore_Than_One_Syllable_In_Word = 0x0300;
        // Bits 10-12: stress. kStressField / kIsStressed = primary | secondary | emphatic;
        // kPrimOrEmphStress = primary | emphatic.
        public const long kPrimaryStress = 0x0400;
        public const long kSecondaryStress = 0x0800;
        public const long kEmphaticStress = 0x1000;
        public const long kStressField = 0x1C00;
        public const long kIsStressed = 0x1C00;
        public const long kPrimOrEmphStress = 0x1400;
        public const long kContent_Word = 0x2000;
        // Bits 16-19: start-of-unit / boundary markers. kBoundryTypeField is the OR of the four.
        public const long kBoundryTypeField = 0xF0000L;
        public const long kWord_Start = 0x10000L;
        public const long kPrep_Start = 0x20000L;
        public const long kVerb_Start = 0x40000L;
        public const long kTerm_Bound = 0x80000L;
        // Bits 20-23: 4-bit silence type; shift right by kSilenceTypeShift after masking.
        // NOTE(review): presumably holds one of the kBND_* values below - confirm against the writer.
        public const long kSilenceTypeField = 0x00F00000L;
        public const int kSilenceTypeShift = 20;
        public const long kSilenceDuration = 0x01000000L;
        public const long kSingingDuration = 0x40000000L;
        public const long kSyllable_Start = 0x10000000L;
        // Pitch gesture markers: two low bits (5, 6) and two high bits (26, 27).
        public const long kPitchRise = 0x0020L;
        public const long kPitchFall = 0x0040L;
        public const long kPitchRise1 = 0x04000000L;
        public const long kPitchFall1 = 0x08000000L;
        public const long kLowVibrato = 0x10L;
        // kNoteDur (bits 8-11, shift kNoteDurShift) and kNotePitch (bits 0-7) overlap numerically
        // with the syllable/stress/end bits above.
        // NOTE(review): presumably they decode a different word (e.g. a user note value) rather
        // than Ctrl itself - confirm before combining with the flags above.
        public const long kNoteDur = 0x0F00L;
        public const int kNoteDurShift = 8;
        public const long kNotePitch = 0x00FFL;
        public const long kCompoundNoun = 0x8000L;
        // Stress field combined with the word-initial-consonant bit (0x1C80).
        public const long kStressedWInitial = kIsStressed | kWord_Initial_Consonant;
        public const long kSampleMarker = 0x02000000L;

        // BND types for silence (index into BoundryDurTbl)
        // Small integers 1-4; BoundryDurTbl is not defined in this part of the class.
        public const int kBND_Pause = 1;
        public const int kBND_Decl = 2;
        public const int kBND_Quest = 3;
        public const int kBND_Emph = 4;

        // Private implementation constants
        // InitPitchParams computes the clause start time as 10 / kFrameTime.
        // NOTE(review): presumably milliseconds per frame - confirm.
        private const int kFrameTime = 5;
        // Reference pitch value that the kHZ_* offsets below are measured from.
        private const int kNormalPitch = 323;
        // Length of every phoneme-indexed buffer.
        private const int kPhonBufSize = 512;
        // Ten slots short of the buffer size; not referenced in this part of the class
        // (presumably a fill limit that leaves headroom - TODO confirm).
        private const int kPhonBuf_Red_Zone = kPhonBufSize - 10;
        // Length of the three pitch event buffers: six entries per phoneme slot.
        private const int kPitchBufSize = kPhonBufSize * 6;
        // Length of the _rampSteps table.
        private const int kMaxRamps = 16;
        // Left shift applied to the baseline-fall start value when seeding the pitch filter outputs.
        private const int kStepSizeRes = 3;
        // Sentinel that InitPitchParams stores in _pitchBoundry.
        private const int kNeverHappens = -10000;
        // Fill value ClearBuffers writes into the user-duration buffers.
        private const int kDur_One = 0x100;
        private const int kDurStepRes = 8;
        // Reference rate and lower clamp used by InitRateParams.
        private const int kNormal_Speech_Rate = 180;
        private const int kMinRate = 40;
        // 655 is k100percent / 100 truncated, i.e. one percent on the 0x10000 scale.
        private const int k1pct = 655;
        // Same value as k1pct under a second name; kept because other parts of this partial
        // class may reference it.
        private const int pct = 655;
        // Half and unity on the 16.16 fixed-point scale.
        private const int kOneHalf = 0x8000;
        private const int k100percent = 0x10000;
        private const int k100pct_Dur = 128;

        // Hz-based pitch offsets from kNormalPitch
        // Each value is (absolute pitch value - kNormalPitch); the trailing comment is the
        // resulting offset in pitch units. The number in the name is not the offset itself
        // (presumably the nominal size of the step in Hz - TODO confirm).
        private const int kHZ_4 = 335 - kNormalPitch;  // 12
        private const int kHZ_6 = 341 - kNormalPitch;  // 18
        private const int kHZ_7 = 344 - kNormalPitch;  // 21
        private const int kHZ_8 = 347 - kNormalPitch;  // 24
        private const int kHZ_9 = 350 - kNormalPitch;  // 27
        private const int kHZ_10 = 352 - kNormalPitch;  // 29
        private const int kHZ_12 = 358 - kNormalPitch;  // 35
        private const int kHZ_14 = 364 - kNormalPitch;  // 41
        private const int kHZ_18 = 374 - kNormalPitch;  // 51
        private const int kHZ_20 = 380 - kNormalPitch;  // 57
        private const int kHZ_25 = 393 - kNormalPitch;  // 70
        private const int kHZ_28 = 400 - kNormalPitch;  // 77

        // Phoneme flags (from phonFlags2 table)
        // Single-bit masks for the uint entries of Tables.PhonFlags2, which GetPhonFlags1
        // returns per phoneme. The bit positions are sparse; bits not listed here are not
        // declared in this part of the class.
        private const uint kVowelF = 1u << 0;
        private const uint kConsonantF = 1u << 1;
        private const uint kVoicedF = 1u << 2;
        private const uint kVowel1F = 1u << 3;
        private const uint kSonorantF = 1u << 4;
        private const uint kNasalF = 1u << 6;
        private const uint kSonorConsonF = 1u << 8;
        private const uint kPlosFricF = 1u << 10;
        private const uint kStopF = 1u << 12;
        private const uint kGStopF = 1u << 20;
        private const uint kAffricateF = 1u << 24;
        private const uint kVocLiq = 1u << 26;
        private const uint kFric = 1u << 27;

        // Pitch buffer event flags
        // Single-bit short masks (0x01-0x20).
        // NOTE(review): presumably stored in _pitchBufFlags, which has the same element type - confirm.
        private const short kPitchStress_Flg = 0x1;
        private const short kPitchRiseFall_Flg = 0x2;
        private const short kPitchBoundry_Flg = 0x4;
        private const short kResetDecline = 0x8;
        private const short kPhraseReset = 0x10;
        private const short kPitchRiseFall1_Flg = 0x20;

        // Internal buffers
        // All buffers are allocated once per instance and reused by every Process call.

        // Stage-1 buffers: parallel arrays with one slot per phoneme, kPhonBufSize long.
        // GetPhonFlags1 treats slots below _phonBuf1InIndex as in use.
        private readonly short[] _phonBuf1 = new short[kPhonBufSize];
        private readonly long[] _phonCtrlBuf1 = new long[kPhonBufSize];
        private readonly short[] _userPitchBuf1 = new short[kPhonBufSize];
        private readonly short[] _userDurBuf1 = new short[kPhonBufSize];
        private readonly short[] _userNoteBuf1 = new short[kPhonBufSize];
        private readonly short[] _userRateBuf1 = new short[kPhonBufSize];

        // Stage-2 buffers: same layout as stage 1. GetPhon2 / GetCtrl2 treat slots below
        // _phonBuf2InIndex as in use; BuildSynthInputDump exports slots 0.._phonBuf2InIndex.
        private readonly short[] _phonBuf2 = new short[kPhonBufSize];
        private readonly long[] _phonCtrlBuf2 = new long[kPhonBufSize];
        private readonly short[] _userPitchBuf2 = new short[kPhonBufSize];
        private readonly short[] _userDurBuf2 = new short[kPhonBufSize];
        private readonly short[] _userNoteBuf2 = new short[kPhonBufSize];
        private readonly short[] _userRateBuf2 = new short[kPhonBufSize];

        // Per-phoneme durations, exported alongside the stage-2 buffers (same index range).
        private readonly short[] _durBuf = new short[kPhonBufSize];
        // Pitch event list as three parallel arrays (frequency, time, flags), kPitchBufSize long;
        // BuildSynthInputDump exports entries 0.._pitchBufInIndex.
        private readonly short[] _pitchBufFreq = new short[kPitchBufSize];
        private readonly short[] _pitchBufTime = new short[kPitchBufSize];
        private readonly short[] _pitchBufFlags = new short[kPitchBufSize];
        // Ramp step table, kMaxRamps entries; exported in full by BuildSynthInputDump.
        private readonly long[] _rampSteps = new long[kMaxRamps];

        // Voice params (set from VoiceData)
        // Assigned by InitFromVoice. _speechRate may later be raised to kMinRate by InitRateParams.
        private short _speechRate;
        // InitFromVoice computes these two as (raw << 16) / 100.
        private long _vpPitchRange;    // 16.16 fixed
        private long _vpStressGain;    // 16.16 fixed
        private short _vpRiseAmt;
        private short _vpFallAmt;
        private short _vpRiseAmt1;
        private short _vpFallAmt1;
        private int _vpAssertiveness; // 16.16 fixed
        // Added to / subtracted from kHZ_7 by InitPitchParams to form the baseline fall endpoints.
        private short _vpBaselineFall;
        // Used as the pitch filter input gain; the feedback gain is k100percent minus this value.
        private int _vpQuickness;
        private short _stressDurTime;   // frames (already >>1 from raw)
        // InitFromVoice computes the two depths as (raw << 16) / 1000.
        private long _vibratoDepth1;
        private long _vibratoDepth2;
        // InitFromVoice computes this as (((raw << 16) / 10) * 256) / 200.
        private long _vibratoFreq;
        // InitFromVoice computes this as (raw << 16) / 100.
        private long _vpIntonation;
        // Result of HzToPitch applied to the voice's PitchHz.
        private short _voiceNaturalPitch;

        // State computed during pipeline
        // Fill indices for the stage-1, stage-2 and pitch buffers. The code that advances them
        // is not in this part of the class.
        private int _phonBuf1InIndex;
        private int _phonBuf2InIndex;
        private int _pitchBufInIndex;
        private int _scanIndex;
        private bool _isCompoundNoun;
        // Copy of the endPunctuation argument of the current Process call.
        private short _endPunctuation;
        // Reset to false at the start of every Process call; exported as 1/0 in the pitch state.
        private bool _singing;

        // Rate params
        // Computed by InitRateParams from _speechRate; both ratios are 16.16 fixed point.
        private long _rateRatio;
        private long _rateRatioLowGain;
        // _stressDurTime scaled by _rateRatio.
        private short _stressDuration;

        // Pitch params
        // Initialised by InitPitchParams at the start of every Process call.
        private short _vpBaselinePitch;
        private short _baselineFallStart;
        private short _baselineFallEnd;
        private long _pFilterOut1;
        private long _pFilterOut2;
        private long _pFilterInGain;
        private long _pFilterFbGain;
        private short _pitchClauseStartTime;
        private short _pitchBoundry;

        // Fill_Pitch_Buf helpers
        private int _pitchTimeOffset;

        // Calc_Ramp_Steps result
        private short _curRamp;

        // StartNew_PitchClause output
        private short _baselineStartOffset;
        private short _baselineEndOffset;

        // Constructor

        /// <summary>
        /// Builds a processor whose voice parameters are taken from <paramref name="voice"/>.
        /// </summary>
        /// <param name="voice">Voice definition; forwarded unchanged to <c>InitFromVoice</c>.</param>
        public AudioProcessor(VoiceData voice) => InitFromVoice(voice);

        /// <summary>
        /// Copies the voice definition into this instance's voice-parameter fields,
        /// converting the scaled quantities to 16.16 fixed point.
        /// </summary>
        /// <param name="vd">Source of the raw voice values.</param>
        private void InitFromVoice(VoiceData vd)
        {
            // (raw << 16) / divisor: 16.16 fixed-point value of raw expressed in 1/divisor units.
            static long ToFixed16(long raw, int divisor) => (raw << 16) / divisor;

            // Values taken over exactly as supplied.
            _speechRate = vd.Rate;
            _vpRiseAmt = vd.RiseAmt;
            _vpFallAmt = vd.FallAmt;
            _vpRiseAmt1 = vd.RiseAmt1;
            _vpFallAmt1 = vd.FallAmt1;
            _vpAssertiveness = vd.Assertiveness;
            _vpBaselineFall = vd.BaselineFall;
            _vpQuickness = vd.Quickness;
            _stressDurTime = vd.StressDurTime;

            // Hundredths -> 16.16.
            _vpPitchRange = ToFixed16((long)vd.PitchRange, 100);
            _vpStressGain = ToFixed16((long)vd.StressGain, 100);

            // Thousandths -> 16.16.
            _vibratoDepth1 = ToFixed16((long)vd.VibratoDepth1Raw, 1000);
            _vibratoDepth2 = ToFixed16((long)vd.VibratoDepth2Raw, 1000);

            // Tenths -> 16.16, then rescaled by 256/200.
            long vibratoFixed = ToFixed16((long)vd.VibratoFreqRaw, 10);
            _vibratoFreq = (vibratoFixed * 256) / 200;

            _vpIntonation = ToFixed16((long)vd.Intonation, 100);

            // The natural pitch is kept in internal pitch units rather than Hz.
            _voiceNaturalPitch = HzToPitch(vd.PitchHz);
        }

        // Public entry point

        /// <summary>
        /// Runs the whole processing pipeline over one token sequence and returns the
        /// resulting synthesizer input snapshot.
        /// </summary>
        /// <param name="tokens">Input phoneme tokens; passed to <c>LoadPhonemes</c> (defined outside this part of the class).</param>
        /// <param name="endPunctuation">Punctuation code stored in <c>_endPunctuation</c> for this call; defaults to <c>_Period_</c>.</param>
        /// <returns>The value produced by <c>BuildSynthInputDump</c> once all stages have run.</returns>
        /// <remarks>
        /// Every call rewrites this instance's working buffers and state fields.
        /// NOTE(review): because that state is shared, overlapping calls on one instance are
        /// presumably unsafe - confirm before using an instance from several threads.
        /// </remarks>
        public SynthInputDump Process(PhonemeToken[] tokens, short endPunctuation = _Period_)
        {
            _endPunctuation = endPunctuation;
            _singing = false;

            // Reset per-call state before any stage reads it.
            ClearBuffers();
            InitRateParams();
            InitPitchParams();

            // Pipeline stages. All but the last are defined outside this part of the class;
            // they are run strictly in this order.
            LoadPhonemes(tokens);
            FlagPhonBuf1();
            FillPhonBuf2();
            PitchRaiseAndFall();
            ModDuration();
            CalcRampSteps();
            FillPitchBuf();
            StartNewPitchClause();
            InsertPlosiveRelease();

            return BuildSynthInputDump();
        }

        // Pipeline setup helpers

        /// <summary>
        /// Resets every working buffer to its idle value so that the result of a
        /// <c>Process</c> call does not depend on an earlier call on the same instance.
        /// </summary>
        /// <remarks>
        /// Phoneme slots become <c>_SIL_</c>, user-duration slots become <c>kDur_One</c>, and
        /// everything else becomes zero, which is the state of a freshly constructed instance.
        /// The stage-2 user pitch/note/rate buffers, the duration buffer, the pitch event
        /// buffers and the ramp table are cleared as well: <c>BuildSynthInputDump</c> exports
        /// one slot past the fill index (and the whole ramp table), so stale values from a
        /// previous, longer utterance would otherwise leak into the output.
        /// </remarks>
        private void ClearBuffers()
        {
            // Buffers whose idle value is not zero need an explicit fill.
            for (int i = 0; i < kPhonBufSize; i++)
            {
                _phonBuf1[i] = _SIL_;
                _phonBuf2[i] = _SIL_;
                _userDurBuf1[i] = kDur_One;
                _userDurBuf2[i] = kDur_One;
            }

            // Stage-1 buffers with a zero idle value.
            Array.Clear(_phonCtrlBuf1, 0, _phonCtrlBuf1.Length);
            Array.Clear(_userPitchBuf1, 0, _userPitchBuf1.Length);
            Array.Clear(_userNoteBuf1, 0, _userNoteBuf1.Length);
            Array.Clear(_userRateBuf1, 0, _userRateBuf1.Length);

            // Stage-2 counterparts (only the control buffer was cleared previously).
            Array.Clear(_phonCtrlBuf2, 0, _phonCtrlBuf2.Length);
            Array.Clear(_userPitchBuf2, 0, _userPitchBuf2.Length);
            Array.Clear(_userNoteBuf2, 0, _userNoteBuf2.Length);
            Array.Clear(_userRateBuf2, 0, _userRateBuf2.Length);

            // Derived buffers that BuildSynthInputDump copies into the result.
            Array.Clear(_durBuf, 0, _durBuf.Length);
            Array.Clear(_pitchBufFreq, 0, _pitchBufFreq.Length);
            Array.Clear(_pitchBufTime, 0, _pitchBufTime.Length);
            Array.Clear(_pitchBufFlags, 0, _pitchBufFlags.Length);
            Array.Clear(_rampSteps, 0, _rampSteps.Length);
        }

        /// <summary>
        /// Derives the rate-dependent scaling factors from <c>_speechRate</c>.
        /// </summary>
        /// <remarks>
        /// Raises <c>_speechRate</c> to <c>kMinRate</c> when it is lower, then computes
        /// <c>_rateRatio</c> and <c>_rateRatioLowGain</c> (both 16.16 fixed point) and
        /// <c>_stressDuration</c>.
        /// </remarks>
        private void InitRateParams()
        {
            // The floor also keeps both divisors below strictly positive.
            _speechRate = Math.Max(_speechRate, (short)kMinRate);

            long normalRateFixed = (long)kNormal_Speech_Rate << 16;

            // Full-gain ratio: normal rate / actual rate.
            _rateRatio = normalRateFixed / _speechRate;

            // Low-gain ratio: only about 60% (k1pct * 60 on the 16.16 scale) of the
            // deviation from the normal rate is applied.
            long rateDelta = _speechRate - kNormal_Speech_Rate;
            long dampedDelta = (rateDelta * (k1pct * 60)) >> 16;
            _rateRatioLowGain = normalRateFixed / (dampedDelta + kNormal_Speech_Rate);

            // Stress time scaled by the full-gain ratio, back to whole units.
            _stressDuration = (short)((_rateRatio * _stressDurTime) >> 16);
        }

        /// <summary>
        /// Resets the pitch-related state to its start-of-utterance values: baseline pitch,
        /// baseline fall endpoints, pitch filter state and gains, clause start time and the
        /// pitch boundary sentinel.
        /// </summary>
        private void InitPitchParams()
        {
            // The baseline fall is centred on kHZ_7 and spans +/- _vpBaselineFall around it.
            short fallStart = (short)(kHZ_7 + _vpBaselineFall);
            short fallEnd = (short)(kHZ_7 - _vpBaselineFall);

            // Both filter outputs start at the fall start value, scaled up by kStepSizeRes bits.
            long filterSeed = (long)fallStart << kStepSizeRes;

            _vpBaselinePitch = _voiceNaturalPitch;
            _baselineFallStart = fallStart;
            _baselineFallEnd = fallEnd;

            _pFilterOut1 = filterSeed;
            _pFilterOut2 = filterSeed;

            // Input and feedback gains sum to k100percent.
            _pFilterInGain = _vpQuickness;
            _pFilterFbGain = k100percent - _vpQuickness;

            _pitchClauseStartTime = (short)(10 / kFrameTime);
            _pitchBoundry = kNeverHappens;
        }

        /// <summary>
        /// Converts a frequency in Hz to the internal pitch scale using <c>Tables.logOf2Tbl</c>.
        /// </summary>
        /// <param name="hz">Frequency in Hz.</param>
        /// <returns>
        /// 0 when <paramref name="hz"/> is not positive; otherwise the table value for the
        /// frequency's position within its band plus 0x100 per band.
        /// </returns>
        private static short HzToPitch(short hz)
        {
            const int ratioK = 2621;

            if (hz <= 0)
            {
                return 0;
            }

            // Band number: each band's upper limit is twice the previous one (100, 200, ... 3200).
            int octave = hz switch
            {
                < 100 => 0,
                < 200 => 1,
                < 400 => 2,
                < 800 => 3,
                < 1600 => 4,
                < 3200 => 5,
                _ => 6,
            };

            // Shift the frequency so that band 3 (400-799 Hz) is the reference band.
            long freq = octave <= 3 ? hz << (3 - octave) : hz >> (octave - 3);
            long fk = octave << 8;

            // Position within the reference band, scaled by ratioK / 2048 and clamped to the table.
            long ratio = ((freq - 400) * ratioK) >> 11;
            ratio = Math.Max(ratio, 0);
            ratio = Math.Min(ratio, Tables.logOf2Tbl.Length - 1);

            return (short)(Tables.logOf2Tbl[ratio] + fk);
        }

        // BuildSynthInputDump

        /// <summary>
        /// Packages the pipeline results into a <c>SynthInputDump</c>: copies of the in-use
        /// part of the stage-2 and pitch event buffers plus an initial <c>PitchState</c>.
        /// </summary>
        /// <returns>The object returned by <c>SynthInputDump.Create</c>.</returns>
        private SynthInputDump BuildSynthInputDump()
        {
            // Returns a new array holding the first `length` elements of `source`.
            static T[] TakePrefix<T>(T[] source, int length)
            {
                T[] prefix = new T[length];
                for (int n = 0; n < prefix.Length; n++)
                {
                    prefix[n] = source[n];
                }

                return prefix;
            }

            // Phoneme-indexed buffers: one slot past the fill index is exported too
            // (the lookahead SIL slot).
            int phonCount = _phonBuf2InIndex + 1;
            short[] phonCopy = TakePrefix(_phonBuf2, phonCount);
            long[] ctrlCopy = TakePrefix(_phonCtrlBuf2, phonCount);
            short[] durCopy = TakePrefix(_durBuf, phonCount);
            short[] userPitchCopy = TakePrefix(_userPitchBuf2, phonCount);
            short[] userNoteCopy = TakePrefix(_userNoteBuf2, phonCount);

            // Pitch event buffers, likewise including the slot at the fill index.
            int pitchCount = _pitchBufInIndex + 1;
            short[] freqCopy = TakePrefix(_pitchBufFreq, pitchCount);
            short[] timeCopy = TakePrefix(_pitchBufTime, pitchCount);
            short[] flagsCopy = TakePrefix(_pitchBufFlags, pitchCount);

            // _rampSteps always holds exactly kMaxRamps entries, so a clone is a full copy.
            long[] rampCopy = (long[])_rampSteps.Clone();

            short firstEventTime = pitchCount > 0 ? timeCopy[0] : (short)0;
            long firstRampStep = _rampSteps.Length > 0 ? _rampSteps[0] : 0;
            short singingFlag = _singing ? (short)1 : (short)0;

            var initialPitch = new PitchState
            {
                // Pitch event cursor
                NextPitchBufTime = firstEventTime,
                PitchBufOutIndex = 0,
                CurPitchBufTime = (short)(_pitchClauseStartTime >> 1),
                CurPitchBufPitch = 0,
                CurPitchBufFlags = 0,

                // Phoneme cursors
                PhonIndexTarg = -1,
                PhonIndexCp = -1,
                TimeIntoPhonTarg = _pitchClauseStartTime,
                TimeIntoPhonCp = 0,
                CurPhonDurCc = 0,
                CurPhonDurCp = 0,
                PhonDurDelay = 0,

                // Per-phoneme pitch offsets
                UvPhonPitchTarg = 0,
                PhonPitchOffset = 0,
                PhonPitchOffset1 = 0,

                // Rise/fall and stress
                FallRiseOffset = 0,
                FallRise1Offset = 0,
                StressTarget = 0,
                PunctOffset = 0,
                StressActiveTime = 0,
                StressDuration = _stressDuration,

                // Baseline
                BaseLineOffset = 0,
                BasePitchOffset = 0,
                PitchBoundry = _pitchBoundry,
                LowGainCp = 0,

                BaselineFallStart = _baselineFallStart,
                BaselineFallEnd = _baselineFallEnd,
                BaselineStartOffset = _baselineStartOffset,
                BaselineEndOffset = _baselineEndOffset,

                // Down ramp
                DownRampOffset = 0,
                DownRampStep = firstRampStep,
                RampSteps = rampCopy,
                CurRamp = _curRamp,

                // Pitch filter
                PFilterOut1 = _pFilterOut1,
                PFilterOut2 = _pFilterOut2,
                PFilterInGain = _pFilterInGain,
                PFilterFbGain = _pFilterFbGain,

                // Voice parameters
                VpIntonation = _vpIntonation,
                VpPitchRange = _vpPitchRange,
                VpBaselinePitch = _vpBaselinePitch,

                // Vibrato
                VibratoDepth1 = _vibratoDepth1,
                VibratoDepth2 = _vibratoDepth2,
                VibratoFreq = _vibratoFreq,
                VibratoPhase1 = 0,

                // Singing / portamento
                Singing = singingFlag,
                HzGlide = 0,
                MusicalNoteActive = 0,
                PortamentoAccum = 0,
                PortamentoStep = 0,
                NewPortaTarget = 0,
                NewSentence = 1,
                SpeechRate = _speechRate,
            };

            return SynthInputDump.Create(
                phonBuf2InIndex: _phonBuf2InIndex,
                phonBuf2: phonCopy,
                controls: ctrlCopy,
                durBuf: durCopy,
                userPitchBuf2: userPitchCopy,
                userNoteBuf2: userNoteCopy,
                pitchBufInIndex: (uint)_pitchBufInIndex,
                pitchBufFreq: freqCopy,
                pitchBufTime: timeCopy,
                pitchBufFlags: flagsCopy,
                pitch: initialPitch
            );
        }

        // Inline helpers

        /// <summary>
        /// Reads the stage-2 phoneme at <paramref name="i"/>, treating every index outside
        /// the filled range as silence.
        /// </summary>
        /// <param name="i">Slot index; may be negative or beyond the filled range.</param>
        /// <returns>The stored phoneme, or <c>_SIL_</c> when <paramref name="i"/> is outside [0, _phonBuf2InIndex).</returns>
        private short GetPhon2(int i)
            => i >= 0 && i < _phonBuf2InIndex ? _phonBuf2[i] : _SIL_;

        /// <summary>
        /// Reads the stage-2 control word at <paramref name="i"/>, treating every index
        /// outside the filled range as having no flags set.
        /// </summary>
        /// <param name="i">Slot index; may be negative or beyond the filled range.</param>
        /// <returns>The stored control word, or 0 when <paramref name="i"/> is outside [0, _phonBuf2InIndex).</returns>
        private long GetCtrl2(int i)
            => i >= 0 && i < _phonBuf2InIndex ? _phonCtrlBuf2[i] : 0;

        /// <summary>
        /// Looks up the <c>Tables.PhonFlags2</c> entry for the stage-1 phoneme at <paramref name="i"/>.
        /// </summary>
        /// <param name="i">Slot index; may be negative or beyond the filled range.</param>
        /// <returns>
        /// The flag word for the phoneme, or 0 when <paramref name="i"/> is outside
        /// [0, _phonBuf1InIndex) or the stored phoneme ID has no table entry.
        /// </returns>
        private uint GetPhonFlags1(int i)
        {
            bool slotInUse = i >= 0 && i < _phonBuf1InIndex;
            if (slotInUse)
            {
                short phon = _phonBuf1[i];

                // IDs outside the table (e.g. the punctuation codes, if the table is shorter) yield no flags.
                if (phon >= 0 && phon < Tables.PhonFlags2.Length)
                {
                    return Tables.PhonFlags2[phon];
                }
            }

            return 0;
        }
    }
}  // namespace