// Code/PitchInterpolator.cs
#nullable enable
using System;

namespace SharpTalk
{

    public sealed class PitchInterpolator
    {
        // Source of the phoneme, duration, control, note and pitch-event buffers read by this class.
        private readonly SynthInputDump _dump;

        // Mutable state mirroring vv fields touched by Interpolate_Pitch / Phon_Boundry_Pitch

        // --- Pitch-event buffer cursor ---
        // Ticks that must have elapsed (since the previously consumed event) before the next event fires.
        private short _nextPitchBufTime;
        // Index of the next unread entry in _dump.PitchBufFreq / PitchBufFlags / PitchBufTime.
        private int _pitchBufOutIndex;
        // Ticks elapsed since the last consumed event; incremented once per Interpolate_Pitch call.
        private int _curPitchBufTime;
        // Pitch value and flag word of the most recently consumed event.
        private short _curPitchBufPitch;
        private short _curPitchBufFlags;

        // --- Phoneme cursor used for pitch targets ---
        private int _phonIndexTarg;
        // Ticks spent in the current target phoneme.
        private int _timeIntoPhonTarg;
        // Duration of the current target phoneme, taken from _dump.DurBuf (0 past the end of the buffer).
        private int _curPhonDurCc;
        // Extra ticks to wait beyond the duration before advancing to the next target phoneme.
        private int _phonDurDelay;

        // --- Phoneme cursor used by Phon_Boundry_Pitch ---
        private int _phonIndexCp;
        private int _timeIntoPhonCp;
        private int _curPhonDurCp;

        // Per-phoneme pitch offset for unvoiced phonemes, pre-shifted by kStepSizeRes; fed into the
        // second filter stage.
        private int _uvPhonPitchTarg;
        // Per-phoneme pitch offset for voiced phonemes; added to the target and scaled down every tick.
        private int _phonPitchOffset1;

        // Running sum of the pitch values of kPitchRiseFall_Flg events.
        private int _fallRiseOffset;
        // Running sum of the pitch values of kPitchRiseFall1_Flg events.
        // NOTE(review): written by the event loop but never read anywhere in this class.
        private int _fallRise1Offset;
        // Pitch of the latest stress event; zeroed when _stressActiveTime runs out or on a phrase reset.
        private int _stressTarget;
        // Twice the pitch of the latest event that carried none of the recognised flags.
        private int _punctOffset;
        // Countdown, in ticks, of how long _stressTarget remains in force.
        private int _stressActiveTime;

        // --- Baseline declination ---
        private int _baselineStartOffset;
        private int _baselineEndOffset;
        // Amount subtracted from _baselineStartOffset, scaled by 2^16.
        private long _downRampOffset;
        // Per-tick increment of _downRampOffset while the baseline is still above _baselineEndOffset.
        private long _downRampStep;
        // Step values selected by _curRamp; the index advances on each phrase-reset event.
        private long[] _rampSteps;
        private int _curRamp;

        // --- Two-stage smoothing filter ---
        // Stage outputs (carried between ticks) and the input / feedback gains, gains scaled by 2^16.
        private long _pFilterOut1;
        private long _pFilterOut2;
        private long _pFilterInGain;
        private long _pFilterFbGain;

        // --- Voice parameters ---
        // Scale (2^16 = 1.0) applied to the summed pitch target before filtering.
        private long _vpIntonation;
        // Scale (2^16 = 1.0) applied to the filtered offset.
        private long _vpPitchRange;
        // Pitch added to the scaled offset in speech mode; the glide target in singing mode.
        private int _vpBaselinePitch;

        // --- Vibrato ---
        // Depths are scaled by 2^16. _vibratoFreq is the per-tick phase increment and
        // _vibratoPhase1 is kept within 24 bits.
        private long _vibratoDepth1;
        private long _vibratoDepth2;
        private long _vibratoFreq;
        private int _vibratoPhase1;

        // --- Singing / portamento ---
        // Selects the singing branch of Interpolate_Pitch.
        private bool _singing;
        // Set by DoNote for negative (raw Hz) notes; suppresses vibrato while true.
        private bool _hzGlide;
        // Set by DoNote for positive notes on singing phonemes; enables vibrato in singing mode.
        private bool _musicalNoteActive;
        // Current singing pitch scaled by 2^16.
        private long _portamentoAccum;
        // Per-tick change of _portamentoAccum; 0 selects the quarter-distance settle instead of a linear glide.
        private long _portamentoStep;
        // True while a glide toward _vpBaselinePitch is still in progress.
        private bool _newPortaTarget;
        // One-shot flag: re-seeds the filter (speech) or the portamento accumulator (singing).
        private bool _newSentence;
        // Chooses between _vibratoDepth1 (rate of 100 or more) and _vibratoDepth2 in speech mode.
        private int _speechRate;

        // --- Phoneme-boundary pitch dip ---
        // Tick position, relative to the current Cp phoneme, on which the dip is centred;
        // kNeverHappens disables it.
        private int _pitchBoundry;
        // True selects the shallow dip (depth 10) instead of the deep one (depth 80).
        private bool _lowGainCp;
        // Values staged at a phoneme change and latched into _pitchBoundry / _lowGainCp later.
        private int _pbHold;
        private bool _pbLowGain;

        // Constants from mt4.h
        // Left shift applied to pitch targets before filtering and removed again afterwards.
        private const int kStepSizeRes = 3;
        // Sentinel meaning "no pitch boundary scheduled".
        private const int kNeverHappens = -10000;
        // Divisor that turns the 25 / 30 / 45 / 50 literals used below into tick counts
        // (presumably milliseconds per frame - confirm against mt4.h).
        private const int kFrameTime = 5;
        // Approximately one percent on a 2^16 scale (655 / 65536); used as "98 * pct".
        private const int pct = 655;
        // Not referenced anywhere in this class.
        private const int k100percent = 0x10000;

        // Pitch buffer event flags (tested against _curPitchBufFlags, first match wins)
        private const int kResetDecline = 0x8;
        private const int kPhraseReset = 0x10;
        private const int kPitchRiseFall_Flg = 0x2;
        private const int kPitchRiseFall1_Flg = 0x20;
        private const int kPitchStress_Flg = 0x1;

        // Phoneme flags (tested against Tables.PhonFlags2 entries)
        private const uint kVoicedF = (1 << 2);
        private const uint kVowelF = (1 << 0);
        private const uint kVowel1F = (1 << 3);
        private const uint kGStopF = (1 << 20);
        private const uint kStopF = (1 << 12);

        // PhonCtrl field masks
        // Low nibble of the control word; the masked value is compared against kWord_End / kPrep_End.
        private const long kSyllableTypeField = 0x0F;
        private const long kWord_End = 0x0001;
        private const long kPrep_End = 0x0002;
        private const long kMid_Syllable_In_Word = 0x0200;
        private const long kPrimOrEmphStress = 0x1400;

        // _SIL_ phoneme index (also what GetPhon returns for an out-of-range index)
        private const int _SIL_ = 23;
        // _YU_ phoneme index
        private const int _YU_ = 16;

        /// <summary>
        /// Creates an interpolator whose mutable state is seeded from <c>dump.Pitch</c>.
        /// The dump itself is kept and read on every tick for the phoneme, duration, control,
        /// note and pitch-event buffers.
        /// </summary>
        /// <param name="dump">Synthesizer input snapshot; must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="dump"/> is null.</exception>
        public PitchInterpolator(SynthInputDump dump)
        {
            // Fail with a descriptive exception instead of a NullReferenceException on dump.Pitch.
            _dump = dump ?? throw new ArgumentNullException(nameof(dump));
            PitchState s = _dump.Pitch;

            // Pitch-event buffer cursor
            _nextPitchBufTime = s.NextPitchBufTime;
            _pitchBufOutIndex = s.PitchBufOutIndex;
            _curPitchBufTime = s.CurPitchBufTime;
            _curPitchBufPitch = s.CurPitchBufPitch;
            _curPitchBufFlags = s.CurPitchBufFlags;

            // Phoneme cursor for pitch targets
            _phonIndexTarg = s.PhonIndexTarg;
            _timeIntoPhonTarg = s.TimeIntoPhonTarg;
            _curPhonDurCc = s.CurPhonDurCc;
            _phonDurDelay = s.PhonDurDelay;

            // Phoneme cursor for boundary dips
            _phonIndexCp = s.PhonIndexCp;
            _timeIntoPhonCp = s.TimeIntoPhonCp;
            _curPhonDurCp = s.CurPhonDurCp;

            _uvPhonPitchTarg = s.UvPhonPitchTarg;
            _phonPitchOffset1 = s.PhonPitchOffset1;

            // Event-driven offsets
            _fallRiseOffset = s.FallRiseOffset;
            _fallRise1Offset = s.FallRise1Offset;
            _stressTarget = s.StressTarget;
            _punctOffset = s.PunctOffset;
            _stressActiveTime = s.StressActiveTime;

            // Baseline declination
            _baselineStartOffset = s.BaselineStartOffset;
            _baselineEndOffset = s.BaselineEndOffset;
            _downRampOffset = s.DownRampOffset;
            _downRampStep = s.DownRampStep;
            _rampSteps = s.RampSteps;
            _curRamp = s.CurRamp;

            // Smoothing filter
            _pFilterOut1 = s.PFilterOut1;
            _pFilterOut2 = s.PFilterOut2;
            _pFilterInGain = s.PFilterInGain;
            _pFilterFbGain = s.PFilterFbGain;

            // Voice parameters
            _vpIntonation = s.VpIntonation;
            _vpPitchRange = s.VpPitchRange;
            _vpBaselinePitch = s.VpBaselinePitch;

            // Vibrato
            _vibratoDepth1 = s.VibratoDepth1;
            _vibratoDepth2 = s.VibratoDepth2;
            _vibratoFreq = s.VibratoFreq;
            _vibratoPhase1 = s.VibratoPhase1;

            // Singing / portamento (the dump stores the booleans as numbers, non-zero meaning true)
            _singing = s.Singing != 0;
            _hzGlide = s.HzGlide != 0;
            _musicalNoteActive = s.MusicalNoteActive != 0;
            _portamentoAccum = s.PortamentoAccum;
            _portamentoStep = s.PortamentoStep;
            _newPortaTarget = s.NewPortaTarget != 0;
            _newSentence = s.NewSentence != 0;
            _speechRate = s.SpeechRate;

            _pitchBoundry = s.PitchBoundry;
            _lowGainCp = s.LowGainCp != 0;

            // Remember the voice's own baseline so DoNote can offset semitone notes from it
            // even after _vpBaselinePitch has been overwritten.
            _voiceNaturalPitch = s.VpBaselinePitch;

            // No boundary is staged until the first phoneme change.
            _pbHold = kNeverHappens;
            _pbLowGain = false;
        }

        // Pitch produced by the latest Interpolate_Pitch call (clamped to be non-negative);
        // Step() returns it cast to short.
        private int _controlF0;
        // DoNote offsets EC_note semitone values from this, not from the mutable _vpBaselinePitch.
        private int _voiceNaturalPitch; // initialized to vpBaselinePitch at construction
        // Bit kLowVibrato of this word selects _vibratoDepth2 in singing mode.
        private long _curPhonCtrlSinging; // ctrl of phoneme currently being rendered (singing path)

        /// <summary>
        /// Advances the interpolator by one tick and reports the pitch computed for that tick.
        /// </summary>
        /// <returns>The current control F0 value, narrowed to a short.</returns>
        public short Step()
        {
            Interpolate_Pitch();

            short f0 = (short)_controlF0;
            return f0;
        }

        // Control-word bit tested on _curPhonCtrlSinging: when set, singing vibrato uses
        // _vibratoDepth2 instead of _vibratoDepth1.
        private const long kLowVibrato = 0x10L;

        /// <summary>
        /// Applies the note attached to the phoneme at <paramref name="phonIndex"/>, if any.
        /// Called by SpeechRenderer at the start of each phoneme (equivalent to StartNewPhon/DoNote in C).
        /// </summary>
        /// <param name="phonIndex">
        /// Index into the dump's per-phoneme buffers. An index outside a buffer is treated as
        /// "control word 0" / "no note" for that buffer.
        /// </param>
        public void DoNote(int phonIndex)
        {
            // Every phoneme starts without a raw-Hz glide; the branch below may switch it back on.
            _hzGlide = false;
            _curPhonCtrlSinging = GetPhonCtrl(phonIndex);

            long phonCtrl = 0;
            if (phonIndex >= 0 && phonIndex < _dump.PhonCtrlBuf2.Length)
                phonCtrl = _dump.PhonCtrlBuf2[phonIndex];

            // If outside a singing block, musical note context ends
            bool isSingingPhon = (phonCtrl & kSingingPhon) != 0;
            if (!isSingingPhon)
                _musicalNoteActive = false;

            short userNote = 0;
            if (phonIndex >= 0 && phonIndex < _dump.UserNoteBuf2.Length)
                userNote = _dump.UserNoteBuf2[phonIndex];

            // Nothing to do without a note, or when the phoneme is a timed silence.
            if (userNote == 0 || (phonCtrl & kSilenceDuration) != 0)
                return;

            if (!isSingingPhon)
            {
                // EC_note, semitone offset above voiceNaturalPitch
                int scaled = (userNote & 0xFF) << 8;
                if (scaled == 0x7F00)
                    return;

                int shifted = _voiceNaturalPitch + ((scaled * 0x1555) >> 16);
                _vpBaselinePitch = Math.Max(shifted, 0);
                return;
            }

            if (userNote > 0)
            {
                // Note name (positive Hz) — IIR settle (~100ms) then vibrato
                _vpBaselinePitch = HzToPitch(userNote);
                _portamentoStep = 0;
                _newPortaTarget = true;
                _musicalNoteActive = true;
                return;
            }

            // Raw Hz, portamento glide, no vibrato
            int goalPitch = HzToPitch(-userNote);
            int startPitch = (int)(_portamentoAccum >> 16);

            // Spread the glide over the phoneme's duration, using at least one tick.
            int glideTicks = 1;
            if (phonIndex < _dump.DurBuf.Length)
                glideTicks = _dump.DurBuf[phonIndex];
            glideTicks = Math.Max(glideTicks, 1);

            _vpBaselinePitch = goalPitch;
            _portamentoStep = ((long)(goalPitch - startPitch) << 16) / glideTicks;
            _newPortaTarget = true;
            _hzGlide = true;
        }

        /// <summary>
        /// Converts a frequency in Hz to the pitch scale used by this class.
        /// </summary>
        /// <remarks>
        /// The frequency is placed in one of seven bands whose upper limits double from 100 Hz
        /// (100, 200, 400, 800, 1600, 3200, and everything above). The band contributes
        /// <c>band * 0x100</c>; the position inside the band is looked up in
        /// <c>Tables.logOf2Tbl</c> after shifting the frequency so that band 3 is left unshifted.
        /// </remarks>
        /// <param name="hz">Frequency in Hz.</param>
        /// <returns>0 for a non-positive input, otherwise table value plus band offset.</returns>
        private static int HzToPitch(int hz)
        {
            if (hz <= 0)
                return 0;

            // Count how many of the doubling limits (100 ... 3200) the frequency reaches.
            int band = 0;
            for (int upper = 100; band < 6 && hz >= upper; upper <<= 1)
                band++;

            // Bands below 3 are scaled up, bands above 3 are scaled down.
            int freq = band <= 3 ? hz << (3 - band) : hz >> (band - 3);
            int fk = band << 8;

            // Map the distance above 400 onto a table index and keep it inside the table.
            int lastIndex = Tables.logOf2Tbl.Length - 1;
            int ratio = ((freq - 400) * 2621) >> 11;
            ratio = Math.Min(Math.Max(ratio, 0), lastIndex);

            return Tables.logOf2Tbl[ratio] + fk;
        }

        // Control-word bits tested against _dump.PhonCtrlBuf2 entries in DoNote.
        // kSingingDuration is not referenced anywhere in this class.
        private const long kSingingDuration = 0x40000000L;
        // Marks a phoneme as part of a singing block.
        private const long kSingingPhon = 0x20000000L;
        // When set, DoNote ignores any note attached to the phoneme.
        private const long kSilenceDuration = 0x01000000L;

        /// <summary>
        /// Returns the phoneme code stored at <paramref name="index"/>, or <c>_SIL_</c> when the
        /// index is negative or not below <c>_dump.PhonBuf2InIndex</c>.
        /// </summary>
        private short GetPhon(int index)
        {
            bool outsideBuffer = index < 0 || index >= _dump.PhonBuf2InIndex;
            if (outsideBuffer)
                return _SIL_;

            return _dump.PhonBuf2[index];
        }

        /// <summary>
        /// Returns the control word stored at <paramref name="index"/>, or 0 when the index is
        /// negative or not below <c>_dump.PhonBuf2InIndex</c>.
        /// </summary>
        private long GetPhonCtrl(int index)
        {
            bool outsideBuffer = index < 0 || index >= _dump.PhonBuf2InIndex;
            if (outsideBuffer)
                return 0;

            return _dump.PhonCtrlBuf2[index];
        }

        /// <summary>
        /// Maintains the phoneme cursor used for pitch-boundary dips and decides where, if
        /// anywhere, the current phoneme gets one.
        /// </summary>
        /// <remarks>
        /// When the cursor moves on to a new phoneme, a candidate boundary position is staged in
        /// <c>_pbHold</c> / <c>_pbLowGain</c>. The staged values are copied into
        /// <c>_pitchBoundry</c> / <c>_lowGainCp</c> only on tick 10 of the phoneme or on its last
        /// tick. A phoneme flagged kGStopF returns early on the tick it is entered, so the latch
        /// check is skipped for that tick.
        /// </remarks>
        private void Phon_Boundry_Pitch()
        {
            if (_timeIntoPhonCp >= _curPhonDurCp)
            {
                // Move the cursor to the next phoneme, carrying over any overshoot.
                _timeIntoPhonCp -= _curPhonDurCp;
                _phonIndexCp++;
                _curPhonDurCp = (_phonIndexCp < _dump.DurBuf.Length) ? _dump.DurBuf[_phonIndexCp] : 0;

                // Phoneme being entered. Its control word is read at _phonIndexCp, not at the
                // following index, so the syllable-type tests below describe this phoneme.
                int curPhon = GetPhon(_phonIndexCp);
                uint curFlags = Tables.PhonFlags2[curPhon];
                long curCtrl = GetPhonCtrl(_phonIndexCp);

                // Phoneme that follows it.
                int nextPhon = GetPhon(_phonIndexCp + 1);
                uint nextFlags = Tables.PhonFlags2[nextPhon];
                long nextCtrl = GetPhonCtrl(_phonIndexCp + 1);

                // A boundary that was placed later in the previous phoneme now sits at the start
                // of this one (position 0); one that was already at 0 has been used up.
                if (_pitchBoundry == 0)
                    _pitchBoundry = kNeverHappens;
                if (_pitchBoundry > 0)
                    _pitchBoundry = 0;

                _pbHold = kNeverHappens;
                _pbLowGain = false;

                if ((curFlags & kVowel1F) != 0
                    && (nextCtrl & kMid_Syllable_In_Word) == 0
                    && ((curCtrl & kSyllableTypeField) >= kWord_End)
                    && nextPhon != _YU_)
                {
                    if ((curFlags & kVowelF) != 0)
                    {
                        if (curPhon == nextPhon && (nextCtrl & kPrimOrEmphStress) != 0)
                        {
                            // Same vowel repeated, with the second one stressed: full-depth dip.
                            _pbHold = _curPhonDurCp;
                        }
                        else if ((curCtrl & kSyllableTypeField) >= kPrep_End)
                        {
                            // Otherwise only a shallow dip, and only for syllable type kPrep_End or above.
                            _pbHold = _curPhonDurCp;
                            _pbLowGain = true;
                        }
                    }
                    else
                    {
                        if ((curFlags & kStopF) == 0
                            && curPhon != 53 // _DX_
                            && (nextCtrl & kPrimOrEmphStress) != 0)
                        {
                            _pbHold = _curPhonDurCp;
                        }
                    }
                }

                // A kGStopF phoneme on either side always stages a boundary at the end of this phoneme.
                if ((nextFlags & kGStopF) != 0)
                    _pbHold = _curPhonDurCp;

                if ((curFlags & kGStopF) != 0)
                {
                    _pbHold = _curPhonDurCp;
                    return; // goto Exit
                }
            }

            // Latch the staged boundary 50 time units into the phoneme, or on its last tick.
            int timeAt50 = 50 / kFrameTime;  // = 10
            int lastFrame = _curPhonDurCp - 1;
            if (_timeIntoPhonCp == timeAt50 || _timeIntoPhonCp == lastFrame)
            {
                _pitchBoundry = _pbHold;
                _lowGainCp = _pbLowGain;
            }
        }

        /// <summary>
        /// Advances the pitch model by one tick and stores the result in <c>_controlF0</c>.
        /// </summary>
        /// <remarks>
        /// Each call first consumes every pitch-buffer event that has come due, then computes the
        /// pitch through either the speech or the singing path, clamps it to be non-negative and
        /// finally advances the three tick counters.
        /// </remarks>
        private void Interpolate_Pitch()
        {
            CollectPitchBufEvents();

            if (_singing)
                InterpolateSingingPitch();
            else
                InterpolateSpeechPitch();

            if (_controlF0 < 0) _controlF0 = 0;

            _curPitchBufTime++;
            _timeIntoPhonTarg++;
            _timeIntoPhonCp++;
        }

        // Consumes all pitch-buffer events whose time has arrived and applies their effect.
        private void CollectPitchBufEvents()
        {
            while (_curPitchBufTime >= _nextPitchBufTime
                   && _pitchBufOutIndex < (int)_dump.PitchBufInIndex)
            {
                _curPitchBufPitch = _dump.PitchBufFreq[_pitchBufOutIndex];
                _curPitchBufFlags = _dump.PitchBufFlags[_pitchBufOutIndex];

                _curPitchBufTime -= _nextPitchBufTime;
                _pitchBufOutIndex++;

                // After the last event there may be no following time entry. In that case keep
                // the old delay; the index test above ends the loop anyway.
                if (_pitchBufOutIndex < _dump.PitchBufTime.Length)
                    _nextPitchBufTime = _dump.PitchBufTime[_pitchBufOutIndex];

                // Flags are checked in priority order; only the first match is applied.
                if ((_curPitchBufFlags & kResetDecline) != 0)
                {
                    _downRampOffset = 0;
                }
                else if ((_curPitchBufFlags & kPhraseReset) != 0)
                {
                    // Restart the declination a quarter of the way down (shift by 14 on a 2^16
                    // scale) and move to the next ramp step, staying on the last one.
                    _downRampOffset = (long)(_baselineStartOffset - _baselineEndOffset) << 14;
                    if (_curRamp < _rampSteps.Length - 1)
                        _curRamp++;
                    _downRampStep = _rampSteps[_curRamp];
                    _fallRiseOffset = 0;
                    _stressTarget = 0;
                    _punctOffset = 0;
                }
                else if ((_curPitchBufFlags & kPitchRiseFall_Flg) != 0)
                {
                    _fallRiseOffset += _curPitchBufPitch;

                    // Drop a stress target that points the opposite way from this rise or fall.
                    if (_curPitchBufPitch < 0)
                    {
                        if (_stressTarget > 0) _stressTarget = 0;
                    }
                    else
                    {
                        if (_stressTarget < 0) _stressTarget = 0;
                    }
                }
                else if ((_curPitchBufFlags & kPitchRiseFall1_Flg) != 0)
                {
                    // NOTE(review): this sum is never read in this class - confirm whether the
                    // pitch target is supposed to include it.
                    _fallRise1Offset += _curPitchBufPitch;
                }
                else if ((_curPitchBufFlags & kPitchStress_Flg) != 0)
                {
                    _stressTarget = _curPitchBufPitch;
                    _stressActiveTime = (int)_dump.Pitch.StressDuration;
                }
                else
                {
                    _punctOffset = _curPitchBufPitch << 1;
                }
            }
        }

        // Moves the pitch-target cursor to the next phoneme once the current one (plus any
        // delay) has elapsed, and loads that phoneme's pitch offset.
        private void AdvancePhonTarget()
        {
            if (_timeIntoPhonTarg > _curPhonDurCc + _phonDurDelay
                && _phonIndexTarg < _dump.PhonBuf2InIndex)
            {
                _timeIntoPhonTarg -= _curPhonDurCc;
                _phonIndexTarg++;
                _curPhonDurCc = (_phonIndexTarg < _dump.DurBuf.Length) ? _dump.DurBuf[_phonIndexTarg] : 0;
                _phonDurDelay = 0;

                int curPhon = GetPhon(_phonIndexTarg);
                uint curFlags = Tables.PhonFlags2[curPhon];
                int nextPhon = GetPhon(_phonIndexTarg + 1);
                uint nextFlags = Tables.PhonFlags2[nextPhon];

                int phonPitchOffset = Tables.phonPitchTbl[curPhon];
                phonPitchOffset >>= 1; // always 50%

                if ((nextFlags & kVoicedF) == 0)
                    _phonDurDelay = 25 / kFrameTime; // = 5

                if ((curFlags & kVoicedF) != 0)
                {
                    _phonPitchOffset1 = phonPitchOffset << 1;
                    _uvPhonPitchTarg = 0;
                }
                else
                {
                    _uvPhonPitchTarg = phonPitchOffset << kStepSizeRes;
                    _phonPitchOffset1 = 0;

                    // For an unvoiced phoneme the delay depends only on whether it is a stop;
                    // this overrides the value chosen from the next phoneme above.
                    if ((curFlags & kStopF) != 0)
                        _phonDurDelay = 30 / kFrameTime; // = 6
                    else
                        _phonDurDelay = 0;
                }
            }
        }

        // Advances the 24-bit vibrato phase by one tick and returns the sine of the new phase
        // scaled by 128.
        private int NextVibratoSample()
        {
            // Mask before narrowing so the cast can never overflow, whatever _vibratoFreq holds.
            _vibratoPhase1 = (int)((_vibratoPhase1 + _vibratoFreq) & 0xFFFFFF);

            double phaseNorm = (double)_vibratoPhase1 / 16777216.0; // 2^24
            double angle = phaseNorm * 2.0 * Math.PI;
            return (int)(Math.Sin(angle) * 128.0);
        }

        // Speech path: declining baseline + event offsets, smoothed by the two-stage filter,
        // then boundary dip, range scaling and vibrato.
        private void InterpolateSpeechPitch()
        {
            // Baseline fall
            int userPitch = (_phonIndexTarg >= 0 && _phonIndexTarg < _dump.UserPitchBuf2.Length)
                            ? _dump.UserPitchBuf2[_phonIndexTarg] : 0;
            int baseLineOffset = _baselineStartOffset - (int)(_downRampOffset >> 16) + userPitch;

            if (baseLineOffset > _baselineEndOffset)
                _downRampOffset += _downRampStep;

            // Stress timer
            _stressActiveTime--;
            if (_stressActiveTime < 0)
                _stressTarget = 0;

            AdvancePhonTarget();
            Phon_Boundry_Pitch();

            // Pitch target
            int phonPitchTarget = (int)(((long)(_stressTarget + _fallRiseOffset + _punctOffset + baseLineOffset) * _vpIntonation) >> 16);
            phonPitchTarget = (short)phonPitchTarget; // C truncates to short here
            phonPitchTarget = (phonPitchTarget + _phonPitchOffset1) << kStepSizeRes;

            // IIR filter init on new sentence
            if (_newSentence)
            {
                _pFilterOut1 = _vpBaselinePitch;
                _pFilterOut2 = _vpBaselinePitch;
                _newSentence = false;
            }

            // IIR filter: two cascaded stages; the unvoiced offset enters at the second stage.
            _pFilterOut1 = ((_pFilterInGain * phonPitchTarget) + (_pFilterFbGain * _pFilterOut1)) >> 16;
            _pFilterOut2 = ((_pFilterInGain * (_pFilterOut1 + _uvPhonPitchTarg)) + (_pFilterFbGain * _pFilterOut2)) >> 16;

            int basePitchOffset = (int)(_pFilterOut2 >> kStepSizeRes);

            // Phoneme boundary envelope: distance in ticks from the scheduled boundary
            int pbIndex = _timeIntoPhonCp - _pitchBoundry;
            if (pbIndex < 0) pbIndex = -pbIndex;

            const int kPbWindow = 45 / kFrameTime; // 9
            if (pbIndex <= kPbWindow)
            {
                // Integer division: the per-tick slopes are 10 / 9 = 1 and 80 / 9 = 8.
                if (_lowGainCp)
                    basePitchOffset += pbIndex * (10 / kPbWindow) - 10;
                else
                    basePitchOffset += pbIndex * (80 / kPbWindow) - 80;
            }

            // controlF0; the voiced phoneme offset shrinks to roughly 98% each tick
            _phonPitchOffset1 = (int)(((long)_phonPitchOffset1 * 98 * pct) >> 16);
            _controlF0 = (int)((((long)basePitchOffset * _vpPitchRange) >> 16) + _vpBaselinePitch);

            // Vibrato
            int vibrato = NextVibratoSample();
            long depth = _speechRate >= 100 ? _vibratoDepth1 : _vibratoDepth2;
            _controlF0 += (int)((vibrato * depth) >> 16);
        }

        // Singing path: glide _portamentoAccum toward _vpBaselinePitch, then add vibrato when a
        // musical note is active and no raw-Hz glide is running.
        private void InterpolateSingingPitch()
        {
            long target = (long)_vpBaselinePitch << 16;

            if (_newSentence)
            {
                // Start the sentence exactly on the target, with no glide pending.
                _portamentoAccum = target;
                _newSentence = false;
                _newPortaTarget = false;
            }
            else if (_newPortaTarget)
            {
                if (_portamentoStep > 0)
                {
                    // Linear glide upwards; snap to the target once it is reached.
                    _portamentoAccum += _portamentoStep;
                    if ((_portamentoAccum >> 16) >= _vpBaselinePitch)
                    {
                        _portamentoAccum = target;
                        _newPortaTarget = false;
                    }
                }
                else if (_portamentoStep < 0)
                {
                    // Linear glide downwards; snap to the target once it has been passed.
                    _portamentoAccum += _portamentoStep;
                    if ((_portamentoAccum >> 16) < _vpBaselinePitch)
                    {
                        _portamentoAccum = target;
                        _newPortaTarget = false;
                    }
                }
                else
                {
                    // No step: close a quarter of the remaining distance per tick and finish
                    // when less than one whole pitch unit was left before this tick.
                    long diff = target - _portamentoAccum;
                    _portamentoAccum += diff >> 2;
                    if (diff > -0x10000L && diff < 0x10000L)
                    {
                        _portamentoAccum = target;
                        _newPortaTarget = false;
                    }
                }
            }

            _controlF0 = (int)(_portamentoAccum >> 16);

            // The oscillator keeps running even while its output is not applied.
            int vibrato = NextVibratoSample();

            if (!_hzGlide && _musicalNoteActive)
            {
                long depth = (_curPhonCtrlSinging & kLowVibrato) != 0 ? _vibratoDepth2 : _vibratoDepth1;
                _controlF0 += (int)((vibrato * depth) >> 16);
            }
        }
    }
}  // namespace